In [1]:
import numpy as np
import pandas as pd
import os, re
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer, AdamW, get_linear_schedule_with_warmup
import torch.nn.functional as F



In [2]:
print(torch.__version__)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)
if device.type == 'cuda':
    print(torch.cuda.get_device_name(0))


2.3.0+cu121
Using device: cuda
Tesla T4


In [3]:
df = pd.read_excel('PHQ9DepressionNLP_elaborated.xlsx')
print('Number of training sentences: {:,}\n'.format(df.shape[0]))
df.sample(10)

Number of training sentences: 520



Unnamed: 0,AcademicDescription,ViolenceDescription,SocialDescription,PHQDescription,GeneralDescription,PressureDescription,PHQ9_levels
436,I am currently in my 2 year with a CGPA of 2.1...,"I Agree that I experienced physical violence,...",I Neutral that My understanding with my paren...,"I often feel a lack of interest in activities,...","I am 21 years old, studying in my 2 year with ...",I Neutral that my Parents are Strict. I Neut...,Moderate
272,I am currently in my 1 year with a CGPA of 2.5...,I Strongly disagree that I experienced physic...,I Neutral that My understanding with my paren...,"I often feel a lack of interest in activities,...","I am 23 years old, studying in my 1 year with ...",I Agree that my Parents are Strict. I Neutra...,Mild
230,I am currently in my 4 year with a CGPA of 3.4...,I Disagree that I experienced physical violen...,I Disagree that My understanding with my pare...,"I often feel a lack of interest in activities,...","I am 22 years old, studying in my 4 year with ...",I Disagree that my Parents are Strict. I Dis...,Moderate
256,I am currently in my 1 year with a CGPA of 2.1...,I Strongly disagree that I experienced physic...,I Neutral that My understanding with my paren...,"I often feel a lack of interest in activities,...","I am 20 years old, studying in my 1 year with ...",I Strongly agree that my Parents are Strict. ...,Mild
270,I am currently in my 1 year with a CGPA of 3.7...,I Strongly disagree that I experienced physic...,I Strongly disagree that My understanding wit...,"I often feel a lack of interest in activities,...","I am 35 years old, studying in my 1 year with ...",I Not Applicable that my Parents are Strict. ...,Mild
35,I am currently in my 3 year with a CGPA of 3.0...,I Disagree that I experienced physical violen...,I Neutral that My understanding with my paren...,"I often feel a lack of interest in activities,...","I am 23 years old, studying in my 3 year with ...",I Agree that my Parents are Strict. I Disagr...,Minimal
245,I am currently in my 3 year with a CGPA of 3.0...,I Disagree that I experienced physical violen...,I Disagree that My understanding with my pare...,"I often feel a lack of interest in activities,...","I am 23 years old, studying in my 3 year with ...",I Neutral that my Parents are Strict. I Neut...,Minimal
56,I am currently in my 4 year with a CGPA of 3.7...,I Strongly disagree that I experienced physic...,I Disagree that My understanding with my pare...,"I often feel a lack of interest in activities,...","I am 22 years old, studying in my 4 year with ...",I Strongly agree that my Parents are Strict. ...,Moderate
466,I am currently in my 2 year with a CGPA of 3.2...,I Strongly disagree that I experienced physic...,I Strongly disagree that My understanding wit...,"I often feel a lack of interest in activities,...","I am 21 years old, studying in my 2 year with ...",I Strongly agree that my Parents are Strict. ...,Minimal
321,I am currently in my 1 year with a CGPA of 2.5...,I Strongly disagree that I experienced physic...,I Neutral that My understanding with my paren...,"I often feel a lack of interest in activities,...","I am 21 years old, studying in my 1 year with ...",I Neutral that my Parents are Strict. I Stro...,Mild


In [4]:
df['statements'] = df.apply(lambda row: ' '.join([str(row['PHQDescription']),
                                                  str(row['ViolenceDescription']),
                                                  str(row['PressureDescription']),
                                                  str(row['SocialDescription']),
                                                  str(row['AcademicDescription'])]), axis=1)


In [5]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token
model = GPT2LMHeadModel.from_pretrained('gpt2')
model = model.to(device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [6]:
class PHQ9Dataset(Dataset):
    def __init__(self, descriptions, labels, tokenizer, max_len=512):
        self.descriptions = descriptions
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.descriptions)

    def __getitem__(self, idx):
        description = self.descriptions[idx]
        inputs = self.tokenizer.encode_plus(
            description,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors="pt"
        )
        input_ids = inputs['input_ids'].squeeze()
        attention_mask = inputs['attention_mask'].squeeze()
        return input_ids, attention_mask, torch.tensor(self.labels[idx], dtype=torch.long)

In [7]:
descriptions = df['statements'].tolist()
label_map = {'Minimal': 0, 'Mild': 1, 'Moderate': 2, 'ModeratelySevere': 3, 'Severe': 4}
labels = df['PHQ9_levels'].map(label_map).tolist()
dataset = PHQ9Dataset(descriptions, labels, tokenizer)
train_data_loader = DataLoader(dataset, batch_size=2, shuffle=True)

In [20]:
class GPT2ForSequenceClassification(torch.nn.Module):
    def __init__(self, gpt2_model, num_labels):
        super(GPT2ForSequenceClassification, self).__init__()
        self.gpt2 = gpt2_model
        self.classifier = torch.nn.Linear(gpt2_model.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask):
        outputs = self.gpt2(input_ids, attention_mask=attention_mask, labels=None)

        # Extract hidden states or logits based on the model's output type
        if isinstance(outputs, tuple):
            # In this case, GPT2LMHeadModel is used, and we need to access the first element
            hidden_states = outputs[0]
        else:
            # If using a different model variant, check the structure of outputs
            hidden_states = outputs.hidden_states[-1]  # Assuming last layer's hidden states

        cls_representation = hidden_states[:, 0, :]  # (batch_size, hidden_size)
        logits = self.classifier(cls_representation)  # (batch_size, num_labels)

        return logits


In [21]:
num_labels = len(label_map)
classification_model = GPT2ForSequenceClassification(model, num_labels)
classification_model = classification_model.to(device)

In [22]:
optimizer = AdamW(classification_model.parameters(), lr=2e-5, eps=1e-8)
total_steps = len(train_data_loader) * 5  # Assuming 5 epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)
loss_fn = torch.nn.CrossEntropyLoss()

In [23]:
epochs = 5
for epoch in range(epochs):
    classification_model.train()
    total_loss = 0.0

    for batch in train_data_loader:
        input_ids, attention_mask, labels = batch
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        logits = classification_model(input_ids, attention_mask)
        loss = loss_fn(logits, labels)
        loss.backward()
        optimizer.step()
        scheduler.step()
        total_loss += loss.item()

    avg_train_loss = total_loss / len(train_data_loader)
    print(f'Epoch [{epoch + 1}/{epochs}], Average Train Loss: {avg_train_loss:.4f}')

TypeError: 'NoneType' object is not subscriptable

In [None]:
# Evaluation on test data (assuming you have a test DataLoader named test_data_loader)
classification_model.eval()
test_predictions, test_labels = [], []
with torch.no_grad():
    for batch in test_data_loader:
        input_ids, attention_mask, labels = batch
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)

        logits = classification_model(input_ids, attention_mask)
        preds = torch.argmax(logits, dim=1)
        test_predictions.extend(preds.cpu().numpy())
        test_labels.extend(labels.cpu().numpy())

In [None]:
# Calculate evaluation metrics
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

accuracy = accuracy_score(test_labels, test_predictions)
precision, recall, f1, _ = precision_recall_fscore_support(test_labels, test_predictions, average='weighted')
print(f'Test Accuracy: {accuracy:.4f}')
print(f'Test Precision: {precision:.4f}, Test Recall: {recall:.4f}, Test F1 Score: {f1:.4f}')