# Import Libraries

In [1]:
import numpy as np
import pandas as pd
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer
)
import evaluate

# Load Dataset

In [2]:
df_train = pd.read_csv("/workspaces/codespaces-jupyter/data/mental_heath_unbanlanced.csv")

In [3]:
df_test = pd.read_csv("/workspaces/codespaces-jupyter/data/mental_health_combined_test.csv")

In [4]:
df_train

Unnamed: 0,Unique_ID,text,status
0,0.0,oh my gosh,Anxiety
1,1.0,"trouble sleeping, confused mind, restless hear...",Anxiety
2,2.0,"All wrong, back off dear, forward doubt. Stay ...",Anxiety
3,3.0,I've shifted my focus to something else but I'...,Anxiety
4,4.0,"I'm restless and restless, it's been a month n...",Anxiety
...,...,...,...
49607,,i can't explain it but i know that i don't wan...,Depression
49608,,nobody ever told me that when i started treatm...,Depression
49609,,my wife and i split up in 2012/2013. she had a...,Depression
49610,,A close family member committed suicideI just ...,Suicidal


In [5]:
df_train['status'].value_counts()

status
Normal        18391
Depression    14506
Suicidal      11212
Anxiety        5503
Name: count, dtype: int64

In [6]:
label_map = {
    "Normal":0,
    "Anxiety": 1,
    "Depression": 2,
    "Suicidal": 3
}

df_train["label"] = df_train["status"].map(label_map)
df_test["label"] = df_test["status"].map(label_map)


In [7]:
df_train_sampled = df_train.groupby('label').sample(n=25, random_state=42)
display(df_train_sampled['label'].value_counts())

label
0    25
1    25
2    25
3    25
Name: count, dtype: int64

In [8]:
df_test_sampled = df_train.groupby('label').sample(n=50, random_state=42)
display(df_test_sampled['label'].value_counts())

label
0    50
1    50
2    50
3    50
Name: count, dtype: int64

In [9]:
train_ds = df_train_sampled[["text", "label"]]
val_ds  = df_test_sampled[["text", "label"]]

# Load Pretrained BERT

In [10]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=4
)

Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]

[1mBertForSequenceClassification LOAD REPORT[0m from: bert-base-uncased
Key                                        | Status     | 
-------------------------------------------+------------+-
cls.predictions.transform.dense.weight     | UNEXPECTED | 
cls.predictions.transform.dense.bias       | UNEXPECTED | 
cls.seq_relationship.bias                  | UNEXPECTED | 
cls.predictions.transform.LayerNorm.weight | UNEXPECTED | 
cls.seq_relationship.weight                | UNEXPECTED | 
cls.predictions.transform.LayerNorm.bias   | UNEXPECTED | 
cls.predictions.bias                       | UNEXPECTED | 
classifier.weight                          | MISSING    | 
classifier.bias                            | MISSING    | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING[3m	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.[0m


# Tokenization

In [11]:
def tokenize_f(batch):
    return tokenizer(
        batch['text'],
        truncation=True,
        padding = "max_length",
        max_length=256
    )

In [12]:
from datasets import Dataset

# Convert pandas DataFrame to Hugging Face Dataset
hf_dataset = Dataset.from_pandas(train_ds)
fh_dataset = Dataset.from_pandas(val_ds)

# Now, use the map method from the Hugging Face datasets library
tokenized_dataset = hf_dataset.map(
    tokenize_f,
    batched=True,
    remove_columns=["text"]
)

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [13]:
tokenized_dataset_test = fh_dataset.map(
    tokenize_f,
    batched=True,
    remove_columns=["text"]
)

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

In [14]:
tokenized_dataset.set_format("torch")
tokenized_dataset_test.set_format("torch")


# Define Evaluation Metrics

In [15]:
accuracy = evaluate.load("accuracy")
precision = evaluate.load("precision")
recall = evaluate.load("recall")
f1 = evaluate.load("f1")


In [16]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=1)

    return {
        "accuracy": accuracy.compute(predictions=predictions, references=labels)["accuracy"],
        "precision": precision.compute(predictions=predictions, references=labels, average="weighted")["precision"],
        "recall": recall.compute(predictions=predictions, references=labels, average="weighted")["recall"],
        "f1": f1.compute(predictions=predictions, references=labels, average="weighted")["f1"],
    }

# Training Arguments

In [17]:
training_args = TrainingArguments(
    output_dir="./bert-sentiment",
    save_strategy="steps",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    eval_strategy="steps",
    weight_decay=0.01,
    logging_steps=50,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    report_to="none"
)

# Trainer

In [18]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    eval_dataset=tokenized_dataset_test,
    compute_metrics=compute_metrics
)

# Train the Model

In [19]:
trainer.train()



Step,Training Loss,Validation Loss


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

TrainOutput(global_step=35, training_loss=1.266311536516462, metrics={'train_runtime': 494.5742, 'train_samples_per_second': 1.011, 'train_steps_per_second': 0.071, 'total_flos': 65778945024000.0, 'train_loss': 1.266311536516462, 'epoch': 5.0})

# Final Evaluation

In [24]:
results = trainer.evaluate()
print(results)


{'eval_loss': 1.1866683959960938, 'eval_accuracy': 0.67, 'eval_precision': 0.7139332706766918, 'eval_recall': 0.67, 'eval_f1': 0.669108669108669, 'eval_runtime': 52.763, 'eval_samples_per_second': 3.791, 'eval_steps_per_second': 0.246, 'epoch': 5.0}


In [25]:
import torch
import torch.nn.functional as F

def predict_sentiment(text):
    model.eval()

    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=256
    )

    with torch.no_grad():
        outputs = model(**inputs)

    logits = outputs.logits
    probs = F.softmax(logits, dim=1)
    predicted_class = torch.argmax(probs, dim=1).item()

    return {
        "text": text,
        "predicted_label": predicted_class,
        "confidence": probs[0][predicted_class].item()
    }


    "Normal":0,
    "Anxiety": 1,
    "Depression": 2,
    "Suicidal": 3

In [26]:
predict_sentiment("i  want everythinthing i just want peace")


{'text': 'i  want everythinthing i just want peace',
 'predicted_label': 2,
 'confidence': 0.3049008548259735}