In [None]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m35.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl 

In [None]:
# Mount Google Drive at /content/drive; the trained model, tokenizer and label
# encoder are written under /content/drive/MyDrive later in the notebook.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
import torch

from torch.utils.data import DataLoader
from datasets import Dataset


from sklearn.metrics import classification_report, confusion_matrix

import matplotlib.pyplot as plt
import seaborn as sns

import numpy as np

# Load Dataset

In [None]:
# Read the raw CSV (malformed lines are skipped), discard rows containing any
# missing value, then remove the "Unnamed: 0" column.
data = (
    pd.read_csv("/content/Combined Data.csv", on_bad_lines='skip')
    .dropna()
    .drop(columns=["Unnamed: 0"])
)
data.head()

Unnamed: 0,statement,status
0,oh my gosh,Anxiety
1,"trouble sleeping, confused mind, restless hear...",Anxiety
2,"All wrong, back off dear, forward doubt. Stay ...",Anxiety
3,I've shifted my focus to something else but I'...,Anxiety
4,"I'm restless and restless, it's been a month n...",Anxiety


In [None]:
# Keep a reproducible random subset of 6000 rows, renumbered 0..5999.
data = data.sample(n=6000, random_state=42, ignore_index=True)

In [None]:
# Inspect the sampled frame.
data

Unnamed: 0,statement,status
0,I'm lazy to complain about it ba ihh,Normal
1,i think the wifi on my iphone is broken it wil...,Normal
2,Good tracking apps? I've been trying to find a...,Bipolar
3,I have recently looked into reddit and found t...,Depression
4,that's your favorite thing to do?,Normal
...,...,...
5995,What is around you right now? A photo? A comfo...,Depression
5996,I am tired. I am ready for everything to be ov...,Suicidal
5997,? What if I am at a friends party and a man wh...,Stress
5998,if it add any kind of info m almost the thing ...,Depression


# Data Preprocessing

1. Clean


In [None]:
import re
import nltk
from nltk.corpus import stopwords

# Download the NLTK stop-word corpus and keep the English list as a set,
# because clean_statement tests membership once per word.
nltk.download("stopwords")
stop_words  = set(stopwords.words('english'))

def clean_statement(text, stop_word_set=None):
  """Normalise one statement: lower-case, letters only, stop words removed.

  Two problems of the previous version are addressed:

  * HTML entities such as ``&quot;`` used to survive as the pseudo-word
    ``quot``; they are now decoded before punctuation is stripped.
  * Apostrophes were removed before the stop-word lookup, so contractions
    that are stop words in their apostrophe form (e.g. ``i'm``) were never
    filtered and ended up as ``im``. Apostrophes are now kept until after
    the lookup.

  Args:
    text: Raw statement string.
    stop_word_set: Optional collection of lower-case stop words. Defaults to
      the module-level ``stop_words`` set.

  Returns:
    The remaining words joined by single spaces (possibly an empty string).
  """
  import html  # local import keeps this cell self-contained

  if stop_word_set is None:
    stop_word_set = stop_words

  # Decode entities first so "&quot;" becomes a quote character (dropped below)
  # instead of the letters "quot".
  text = html.unescape(text).lower()

  # Treat typographic apostrophes like ASCII ones so contractions are recognised.
  text = text.replace("\u2019", "'")

  # Keep letters, whitespace and (for now) apostrophes.
  text = re.sub(r"[^a-z'\s]", "", text)

  kept = []
  for token in text.split():
    unquoted = token.strip("'")        # 'the' -> the
    letters = token.replace("'", "")   # final letters-only form: john's -> johns
    if token in stop_word_set or unquoted in stop_word_set or letters in stop_word_set:
      continue
    if letters:
      kept.append(letters)

  return " ".join(kept)



# Overwrite the raw text with its cleaned form, row by row.
data["statement"] = data["statement"].apply(clean_statement)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Inspect the frame after cleaning.
data

Unnamed: 0,statement,status
0,im lazy complain ba ihh,Normal
1,think wifi iphone broken quot connect quot act...,Normal
2,good tracking apps ive trying find apps track ...,Bipolar
3,recently looked reddit found place actually qu...,Depression
4,thats favorite thing,Normal
...,...,...
5995,around right photo comfortable place sit beaut...,Depression
5996,tired ready everything life pointless painful ...,Suicidal
5997,friends party man whispers ear likes stare tou...,Stress
5998,add kind info almost thing ive always felt kin...,Depression


In [None]:
# Rows per class before balancing.
data["status"].value_counts()

Unnamed: 0_level_0,count
status,Unnamed: 1_level_1
Normal,1894
Depression,1730
Suicidal,1219
Anxiety,429
Stress,304
Bipolar,293
Personality disorder,131


2. Balance dataset


In [None]:
from imblearn.over_sampling import RandomOverSampler

# Randomly oversample every class ("auto" strategy) with a fixed seed.
# NOTE(review): oversampling adds duplicated rows, so copies of one statement must
# stay on the same side of any later train/test split.
features = data.drop(columns="status")
target = data["status"]

oversampler = RandomOverSampler(sampling_strategy="auto", random_state=42)
features_balanced, target_balanced = oversampler.fit_resample(features, target)

# Re-attach the labels to the resampled features.
data = pd.concat([features_balanced, target_balanced], axis=1)

data["status"].value_counts()



Unnamed: 0_level_0,count
status,Unnamed: 1_level_1
Normal,1894
Bipolar,1894
Depression,1894
Suicidal,1894
Stress,1894
Personality disorder,1894
Anxiety,1894


3. Encoding


In [None]:
# Fit the encoder on the status names, then store the resulting integer id of
# each row in a new "label" column. The fitted encoder is kept for decoding.
label_encoder = LabelEncoder().fit(data["status"])
data["label"] = label_encoder.transform(data["status"])
data

Unnamed: 0,statement,status,label
0,im lazy complain ba ihh,Normal,3
1,think wifi iphone broken quot connect quot act...,Normal,3
2,good tracking apps ive trying find apps track ...,Bipolar,1
3,recently looked reddit found place actually qu...,Depression,2
4,thats favorite thing,Normal,3
...,...,...,...
13253,cannot afford therapist social anxiety depress...,Suicidal,6
13254,would like die simplei would really like suffe...,Suicidal,6
13255,death day time set affairs might even get bedr...,Suicidal,6
13256,hard seeing others live life would kill know w...,Suicidal,6


4. Train Test Split

In [None]:
from sklearn.model_selection import GroupShuffleSplit

# The frame was oversampled earlier, so identical statements occur many times.
# A plain row-wise split would put copies of the same statement into both train
# and test, leaking training data into the evaluation and inflating every metric.
# Grouping by the statement text keeps all copies of a statement on one side.
# Note: test_size is the share of *distinct statements* held out, so the
# row-level ratio is only approximately 80/20.
splitter = GroupShuffleSplit(n_splits = 1, test_size = 0.2, random_state = 42)
train_idx, test_idx = next(splitter.split(data["statement"], data["label"], groups = data["statement"]))

train_texts, test_texts = data["statement"].iloc[train_idx], data["statement"].iloc[test_idx]
train_labels, test_labels = data["label"].iloc[train_idx], data["label"].iloc[test_idx]

# Leakage guard: no statement may appear in both splits.
assert set(train_texts).isdisjoint(set(test_texts))

5. Tokenization

In [None]:
# Longest cleaned statement, measured in characters (not tokens).
max(map(len, data["statement"]))

5893

In [None]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Both splits are tokenized with the same settings: pad, truncate, at most 128 tokens.
tokenize_kwargs = dict(padding = True, truncation = True, max_length = 128)

train_encodings = tokenizer(train_texts.tolist(), **tokenize_kwargs)
test_encodings = tokenizer(test_texts.tolist(), **tokenize_kwargs)

train_encodings

In [None]:
# NOTE(review): repeats the display at the end of the previous cell.
train_encodings

In [None]:
def _build_dataset(encodings, labels):
  """Bundle tokenizer output and the matching label Series into a Dataset."""
  return Dataset.from_dict({
      'input_ids': encodings['input_ids'],
      'attention_mask': encodings['attention_mask'],
      'labels': labels.tolist(),
  })

# Same column layout for both splits.
train_dataset = _build_dataset(train_encodings, train_labels)
test_dataset = _build_dataset(test_encodings, test_labels)



In [None]:
# Look at one encoded training example.
train_dataset[0]

Fine-Tune Model

In [None]:
# Number of distinct classes; used as num_labels when the model is created.
len(label_encoder.classes_)

In [None]:
# Pretrained encoder plus a freshly initialised classification head with one
# output per encoded class.
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels = len(label_encoder.classes_))

# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` - confirm against the installed version.
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",     # evaluate once per epoch
    save_strategy="epoch",           # checkpoint once per epoch
    learning_rate=2e-5,
    per_device_train_batch_size=2,   # small batch size
    per_device_eval_batch_size=2,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=100,               # log the training loss every 100 steps
    fp16=torch.cuda.is_available(),  # mixed precision needs CUDA; plain fp32 otherwise
    gradient_accumulation_steps=2,   # gradients from 2 batches are accumulated per optimizer step
    warmup_steps=0,                  # no learning-rate warmup
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    report_to="none",                # no experiment tracker: avoids the interactive wandb API-key prompt
)

# NOTE(review): the test split is also used as the evaluation set, so the "best"
# checkpoint is selected on the same data the final report is computed on.
trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = train_dataset,
    eval_dataset = test_dataset
)

# Fine-tune the model.
trainer.train()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss
1,0.9273,0.47843
2,0.4735,0.431264


TrainOutput(global_step=7953, training_loss=0.8221371786318289, metrics={'train_runtime': 791.8564, 'train_samples_per_second': 40.182, 'train_steps_per_second': 10.043, 'total_flos': 2092353039974400.0, 'train_loss': 0.8221371786318289, 'epoch': 2.999057137469357})

Evaluation

In [None]:
# Run the fine-tuned model on the held-out split.
prediction_output = trainer.predict(test_dataset)
predictions = prediction_output.predictions
labels = prediction_output.label_ids

# Highest-scoring class per example.
predicted_labels = np.argmax(predictions, axis = 1)
class_names = label_encoder.classes_

print(classification_report(test_labels, predicted_labels, target_names = class_names))

cm = confusion_matrix(test_labels, predicted_labels)

# Confusion matrix: rows are true classes, columns are predicted classes.
fig, ax = plt.subplots(figsize = (10,7))
sns.heatmap(cm, annot = True, fmt = "d", cmap = "Blues", xticklabels = class_names, yticklabels = class_names, ax = ax)
ax.set_xlabel("Predicted Labels")
ax.set_ylabel("True Labels")
plt.show()

Save model and Load Model


In [None]:
# Persist everything inference needs: the model, the tokenizer and the fitted
# label encoder.
trainer.save_model("/content/drive/MyDrive/Bert_Model")
tokenizer.save_pretrained("/content/drive/MyDrive/save_mental_status_bertl")

import pickle

# pickle.dump takes the object first and the open file second (the object was
# missing before, which raises TypeError). The context manager closes the file.
with open("/content/drive/MyDrive/label_encoder.pkl", "wb") as encoder_file:
  pickle.dump(label_encoder, encoder_file)

from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Reload all three artefacts from disk to confirm the saved copies are usable.
model = AutoModelForSequenceClassification.from_pretrained("/content/drive/MyDrive/Bert_Model")
tokenizer = AutoTokenizer.from_pretrained("/content/drive/MyDrive/save_mental_status_bertl")

# NOTE: only unpickle files written by this notebook; pickle.load can execute
# arbitrary code from an untrusted file.
with open("/content/drive/MyDrive/label_encoder.pkl", "rb") as encoder_file:
  label_encoder = pickle.load(encoder_file)


Detection System

In [None]:
def detection(text):
  """Predict the status label for one raw statement.

  The text is cleaned with ``clean_statement``, tokenized with the same
  truncation limit used for training (128 tokens), passed through ``model``,
  and the highest-scoring class id is decoded with ``label_encoder``.

  Args:
    text: Raw statement string.

  Returns:
    The predicted class name.
  """
  text = clean_statement(text)
  inputs = tokenizer(text, padding = True, truncation = True, return_tensors = "pt",max_length = 128)
  # Put the input tensors on whatever device the model lives on.
  inputs = inputs.to(model.device)

  # Inference only: disable dropout and gradient tracking.
  model.eval()
  with torch.no_grad():
    outputs = model(**inputs)

  logits = outputs.logits
  predicted_label = torch.argmax(logits, dim = 1).item()

  # inverse_transform expects a 1-D sequence of ids, not a bare scalar.
  return label_encoder.inverse_transform([predicted_label])[0]

# Smoke-test the detector on a single hand-written sentence.
text = "I am feeling very stressful and having headache"

detection(text)

In [None]:
# List the contents of the current working directory.
!ls -lh

total 4.0K
drwxr-xr-x 1 root root 4.0K Jan  6 14:19 sample_data
