In [2]:
# Mount Google drive
from google.colab import drive
drive.mount('/content/drive')

#  working directory
import os
PROJECT_DIR = "/content/drive/MyDrive/Colab_Notebooks"

# Create the folder if it doesn’t exist (harmless if it already exists)
os.makedirs(PROJECT_DIR, exist_ok=True)

# Move into this directory
os.chdir(PROJECT_DIR)
print("Working directory:", os.getcwd())

Mounted at /content/drive
Working directory: /content/drive/MyDrive/Colab_Notebooks


In [13]:
# install required libraries
# Colab already has PyTorch, but we install Transformers, sklearn, etc.
!pip install -q transformers datasets scikit-learn pandas

# import required libraries
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, DistilBertModel, Trainer, TrainingArguments
from datasets import Dataset
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.optim as optim

In [15]:
## Load the dataset ##
'''
Checks if the file is there.
Loads it with pandas.
Prints column names and shows a small preview.'''
# Path to your dataset file in Drive
DATASET_PATH = "/content/drive/MyDrive/Colab_Notebooks/depression_dataset_reddit_cleaned.csv"
# Check that the file really exists
print("Dataset exists?", os.path.exists(DATASET_PATH))
# Load the CSV file into a pandas DataFrame
df = pd.read_csv(DATASET_PATH)
# Show the names of the columns
print("Columns in the dataset:")
print(df.columns)
# Show the first 5 rows, just to see how the data looks
df.head()
#-------------------------------------------------------
## Basic cleaning: drop missing values ##
'''
Makes sure there are no empty texts or labels.
Keeps the dataset clean.'''
# Name of the columns we will use
TEXT_COL = "clean_text"      # column with the Reddit post text
LABEL_COL = "is_depression"  # column with 0/1 label
# Remove any rows where text or label is missing (NaN)
df = df.dropna(subset=[TEXT_COL, LABEL_COL])
# Show how many rows we have after dropping missing values
print("Number of rows after dropping missing values:", len(df))
# Look again at the first 5 rows to confirm it's okay
df.head()
#-------------------------------------------------------
## Look at class balance (how many 0 vs 1) ##
# Count how many samples in each class (0 = not depressed, 1 = depressed)
class_counts = df[LABEL_COL].value_counts()
print("Class counts (label -> number of examples):")
print(class_counts)
#-------------------------------------------------------
## Shuffle the data (mix the order) ##
'''
Often datasets are sorted (e.g., all depressed first, then all not depressed).
Shuffling avoids the model seeing all of one class first, which can bias training.'''
# Shuffle the rows of the DataFrame
# frac=1.0 means "use all rows"
# random_state is for reproducibility (same shuffle every time)
df_shuffled = df.sample(frac=1.0, random_state=42).reset_index(drop=True)
# Check the first 5 rows after shuffling
df_shuffled.head()
#-------------------------------------------------------
## Split into train / validation / test sets
'''
Makes 3 splits with stratify so both classes stay balanced in each split.
70% train, 15% validation, 15% test'''
from sklearn.model_selection import train_test_split
# First split: 70% train, 30% temp (which will later be split into val + test)
train_df, temp_df = train_test_split(
    df_shuffled,
    test_size=0.3,                # 30% will go to temp_df
    stratify=df_shuffled[LABEL_COL],  # keep class balance
    random_state=42
)
# Second split: split temp_df into 15% val and 15% test
val_df, test_df = train_test_split(
    temp_df,
    test_size=0.5,                  # half of 30% (so 15% of total) becomes test
    stratify=temp_df[LABEL_COL],    # keep class balance
    random_state=42
)
print("Train size:", len(train_df))
print("Val size:", len(val_df))
print("Test size:", len(test_df))
#-------------------------------------------------------
## Save the splits as separate CSV files (in the same folder) ##
# File names for the splits
train_path = os.path.join(PROJECT_DIR, "train.csv")
val_path   = os.path.join(PROJECT_DIR, "val.csv")
test_path  = os.path.join(PROJECT_DIR, "test.csv")
# Save each split as a CSV file (without row index)
train_df.to_csv(train_path, index=False)
val_df.to_csv(val_path, index=False)
test_df.to_csv(test_path, index=False)
print("Saved files:")
print("Train:", train_path)
print("Val  :", val_path)
print("Test :", test_path)


Dataset exists? True
Columns in the dataset:
Index(['clean_text', 'is_depression'], dtype='object')
Number of rows after dropping missing values: 7731
Class counts (label -> number of examples):
is_depression
0    3900
1    3831
Name: count, dtype: int64
Train size: 5411
Val size: 1160
Test size: 1160
Saved files:
Train: /content/drive/MyDrive/Colab_Notebooks/train.csv
Val  : /content/drive/MyDrive/Colab_Notebooks/val.csv
Test : /content/drive/MyDrive/Colab_Notebooks/test.csv


In [16]:
## Define Dataset and DataLoaders ##
# The pre-trained model name from HuggingFace
MODEL_NAME = "distilbert-base-uncased"
# Define a custom Dataset class
'''
This class will:
Read a CSV file
Keep the text and labels
Use a tokenizer to convert text into IDs and attention masks'''
# Our custom dataset for depression text classification
class DepressionDataset(Dataset):
    def __init__(self, csv_path, text_col="clean_text", label_col="is_depression", max_length=128):
        # Read the CSV file
        self.df = pd.read_csv(csv_path)
        # Save text and labels as lists
        self.texts = self.df[text_col].astype(str).tolist()
        self.labels = self.df[label_col].astype(int).tolist()
        # Load the DistilBERT tokenizer
        self.tokenizer = DistilBertTokenizerFast.from_pretrained(MODEL_NAME)
        # Maximum sequence length (number of tokens)
        self.max_length = max_length

    def __len__(self):
        # Return number of samples
        return len(self.texts)

    def __getitem__(self, idx):
        # Get one text and one label by index
        text = self.texts[idx]
        label = self.labels[idx]
        # Tokenize the text:
        # - convert to IDs
        # - pad/truncate to max_length
        # - create attention_mask
        encoding = self.tokenizer(
            text,
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt"
        )
        # Squeeze to remove the extra batch dimension
        item = {
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "labels": torch.tensor(label, dtype=torch.long)
        }
        return item

print("DepressionDataset class defined.")

#----------------------------------------------------
## Create DataLoaders for train / val / test ##
'''
The DataLoader will:
Give data in batches
Shuffle train data'''
def create_dataloaders(
    train_csv,
    val_csv,
    test_csv,
    batch_size=16,
    max_length=128
):
    # Create dataset objects for each split
    train_dataset = DepressionDataset(train_csv, max_length=max_length)
    val_dataset   = DepressionDataset(val_csv,   max_length=max_length)
    test_dataset  = DepressionDataset(test_csv,  max_length=max_length)
    # Create dataloaders from the datasets
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader   = DataLoader(val_dataset,   batch_size=batch_size, shuffle=False)
    test_loader  = DataLoader(test_dataset,  batch_size=batch_size, shuffle=False)
    return train_loader, val_loader, test_loader

# Actual creation of the dataloaders
train_loader, val_loader, test_loader = create_dataloaders(
    train_csv=train_path,
    val_csv=val_path,
    test_csv=test_path,
    batch_size=16,
    max_length=128
)

print("DataLoaders created.")
print("Train batches:", len(train_loader))
print("Validation batches:  ", len(val_loader))
print("Test batches: ", len(test_loader))

#--------------------------------------------------
## Inspect one batch to understand the shapes ##
# This is just to see what one batch looks like to understand what the model will receive.
# Get one batch from the train_loader
batch = next(iter(train_loader))
print("Keys in the batch:", batch.keys())
# The following will show: Batch size = 16, Each sequence length = 128 tokens
print("input_ids shape    :", batch["input_ids"].shape)
print("attention_mask shape:", batch["attention_mask"].shape)
print("labels shape       :", batch["labels"].shape) # 16 labels per batch
# Show the first 2 labels
print("First 2 labels:", batch["labels"][:2])

DepressionDataset class defined.
DataLoaders created.
Train batches: 339
Validation batches:   73
Test batches:  73
Keys in the batch: dict_keys(['input_ids', 'attention_mask', 'labels'])
input_ids shape    : torch.Size([16, 128])
attention_mask shape: torch.Size([16, 128])
labels shape       : torch.Size([16])
First 2 labels: tensor([0, 1])


In [18]:
## Build and train the base model ##
'''
We’ll use: DistilBERT as encoder
A small linear layer on top for binary classification (0/1)'''
# If a GPU is available, use it. Otherwise use CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

## Define the model (DistilBERT + classifier head) ##
# We use the same model name as before (for the tokenizer)
MODEL_NAME = "distilbert-base-uncased"

class DepressionClassifier(nn.Module):
    def __init__(self, hidden_size=768, num_labels=2, dropout_prob=0.2):
        super().__init__()
        # Load pre-trained DistilBERT
        self.bert = DistilBertModel.from_pretrained(MODEL_NAME)
        # Dropout helps prevent overfitting
        self.dropout = nn.Dropout(dropout_prob)
        # Final linear layer: from hidden_size -> 2 classes (0 or 1)
        self.classifier = nn.Linear(hidden_size, num_labels)

    def forward(self, input_ids, attention_mask):
        # Get outputs from DistilBERT
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # outputs.last_hidden_state shape: (batch_size, seq_len, hidden_size)
        last_hidden_state = outputs.last_hidden_state
        # Take the embedding of the first token as the sentence representation
        cls_embedding = last_hidden_state[:, 0, :]  # shape: (batch_size, hidden_size)
        # Apply dropout
        x = self.dropout(cls_embedding)
        # Get logits for each class
        logits = self.classifier(x)  # shape: (batch_size, 2)
        return logits
print("Model class defined.")

#--------------------------------------------
## Create the model instance ##
# Create the model object
model = DepressionClassifier()
# Move model to the device (CPU or GPU)
model.to(device)
print("Model created and moved to device.")

#---------------------------------------------
## Test the model with one batch (sanity check) ##
# This is to make sure shapes work correctly.
# If this runs without error and you see a shape like [16, 2], we’re good.
# Get one batch from the train loader
batch = next(iter(train_loader))
# Move batch tensors to device
input_ids = batch["input_ids"].to(device)
attention_mask = batch["attention_mask"].to(device)
labels = batch["labels"].to(device)
# Forward pass through the model
with torch.no_grad():
    logits = model(input_ids=input_ids, attention_mask=attention_mask)
print("Logits shape:", logits.shape)  # should be (batch_size, 2)
print("Example logits (first 2 rows):")
print(logits[:2])

#-------------------------------------------------
## Define loss function and optimizer ##
# Loss function for classification (suitable for logits + integer labels)
criterion = nn.CrossEntropyLoss()
# AdamW optimizer (common for transformers)
optimizer = optim.AdamW(model.parameters(), lr=2e-5)
print("Loss function and optimizer are ready.")

#------------------------------------------------
## Define one epoch of training ##
'''
This function will:
Put model in train mode
Loop over batches
Compute loss and update weights
Track average loss and accuracy'''
def train_one_epoch(model, dataloader, optimizer, criterion, device):
    # Set model to training mode
    model.train()
    total_loss = 0.0
    all_preds = []
    all_labels = []
    # Loop over all batches
    for batch in dataloader:
        # Move data to device
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)
        # Reset gradients
        optimizer.zero_grad()
        # Forward pass
        logits = model(input_ids=input_ids, attention_mask=attention_mask)
        # Compute loss
        loss = criterion(logits, labels)
        # Backward pass (compute gradients)
        loss.backward()
        # Update weights
        optimizer.step()
        # Add batch loss
        total_loss += loss.item() * input_ids.size(0)
        # Get predicted classes (0 or 1)
        preds = torch.argmax(logits, dim=1).detach().cpu().numpy()
        all_preds.extend(preds)
        # True labels
        all_labels.extend(labels.cpu().numpy())

    # Compute average loss and accuracy over the epoch
    avg_loss = total_loss / len(dataloader.dataset)
    acc = accuracy_score(all_labels, all_preds)
    return avg_loss, acc
print("train_one_epoch function defined.")

#------------------------------------------------------
##  Define evaluation function (validation / test) ##
# Same idea, but no gradient updates
def evaluate(model, dataloader, criterion, device):
    # Set model to evaluation mode
    model.eval()
    total_loss = 0.0
    all_preds = []
    all_labels = []
    # We don't need gradients during evaluation
    with torch.no_grad():
        for batch in dataloader:
            # Move data to device
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)
            # Forward pass
            logits = model(input_ids=input_ids, attention_mask=attention_mask)
            # Loss
            loss = criterion(logits, labels)
            total_loss += loss.item() * input_ids.size(0)
            # Predictions
            preds = torch.argmax(logits, dim=1).detach().cpu().numpy()
            all_preds.extend(preds)
            all_labels.extend(labels.cpu().numpy())
    # Average loss and accuracy
    avg_loss = total_loss / len(dataloader.dataset)
    acc = accuracy_score(all_labels, all_preds)
    return avg_loss, acc
print("evaluate function defined.")

#---------------------------------------------------
## Train the model for a few epochs ##
''' We’ll start with 10 epochs to keep it light.
Later we can increase to 5 or more if we want.
We should see the loss and accuracy values printed for each epoch.'''
num_epochs = 5  # we can change this later
for epoch in range(num_epochs):
    print(f"Epoch {epoch+1}/{num_epochs}")
    # Train on training data
    train_loss, train_acc = train_one_epoch(model, train_loader, optimizer, criterion, device)
    # Evaluate on validation data
    val_loss, val_acc = evaluate(model, val_loader, criterion, device)
    print(f"  Train loss: {train_loss:.4f}, Train acc: {train_acc:.4f}")
    print(f"  Val   loss: {val_loss:.4f}, Val   acc: {val_acc:.4f}")

#--------------------------------------------------------------
## Final test performance and save the model ##
'''
This gives a Final test accuracy for the base model.
A saved model file in my Drive '''
# Evaluate on test set
test_loss, test_acc = evaluate(model, test_loader, criterion, device)
print(f"Final Test loss: {test_loss:.4f}, Test acc: {test_acc:.4f}")
# Path to save the model in your Drive folder
model_path = os.path.join(PROJECT_DIR, "depression_base_model.pt")
# Save model weights
torch.save(model.state_dict(), model_path)
print("Saved base model to:", model_path)

Using device: cpu
Model class defined.
Model created and moved to device.
Logits shape: torch.Size([16, 2])
Example logits (first 2 rows):
tensor([[ 0.1426, -0.0832],
        [ 0.2609,  0.4497]])
Loss function and optimizer are ready.
train_one_epoch function defined.
evaluate function defined.
Epoch 1/5


KeyboardInterrupt: 