<a href="https://colab.research.google.com/github/subham73/depression_detection/blob/main/Untitled2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install pytorch-lightning
!pip install fasttext

Collecting pytorch-lightning
  Downloading pytorch_lightning-2.3.3-py3-none-any.whl (812 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m812.3/812.3 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
Collecting torchmetrics>=0.7.0 (from pytorch-lightning)
  Downloading torchmetrics-1.4.0.post0-py3-none-any.whl (868 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m868.8/868.8 kB[0m [31m13.0 MB/s[0m eta [36m0:00:00[0m
Collecting lightning-utilities>=0.10.0 (from pytorch-lightning)
  Downloading lightning_utilities-0.11.3.post0-py3-none-any.whl (26 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=2.0.0->pytorch-lightning)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=2.0.0->pytorch-lightning)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=2.0

In [None]:
import os
import pytorch_lightning as pl

## 1. Data Preparation

Kaggle dataset = [Depression: Reddit Dataset (Cleaned)](https://www.kaggle.com/datasets/infamouscoder/depression-reddit-cleaned/data)

Column : text | is_depression 1:*0*

In [None]:
# !unzip "/content/drive/MyDrive/depression_classification/archive.zip" -d "/content/drive/MyDrive/depression_classification/"

Archive:  /content/drive/MyDrive/depression_classification/archive.zip
replace /content/drive/MyDrive/depression_classification/depression_dataset_reddit_cleaned.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: n


In [None]:
"""Set the directory path inside your drive"""
PROCESSED_DATA_DIR = "/content/drive/MyDrive/depression_classification/data"

In [None]:
"""Util Functions"""

import os

def check_if_exists(path):
  """
  Checks if a file or folder exists in the Google Drive.
  Args:
    path: The path to the file or folder.
  Returns:
    True if the file or folder exists, False otherwise.
  """
  drive_path = "/content/drive/MyDrive/"
  full_path = os.path.join(drive_path, path)
  return os.path.exists(full_path)

################################################################################

import pandas as pd
from sklearn.model_selection import train_test_split

def make_dataset():
  """
  Prepares train and test data.csv files, saves them to PROCESSED_DATA_DIR.
  """
  data = pd.read_csv("/content/drive/MyDrive/depression_classification/depression_dataset_reddit_cleaned.csv")
  print("full dataset shape: ", data.shape)

  data = data.sample(frac = 0.6, random_state=0)
  print("sample dataset shape: ", data.shape)

  train, test = train_test_split(data, test_size=0.2, random_state=0)
  print("train dataset shape: ", train.shape)
  print("test dataset shape: ", test.shape)

  train.to_csv(os.path.join(PROCESSED_DATA_DIR, "train.csv"), index=False)
  test.to_csv(os.path.join(PROCESSED_DATA_DIR, "test.csv"), index=False)
  print("sample training and testing dataset created and saved at PROCESSED_DATA_DIR...")

################################################################################

import re
def preprocess(x):
  """
  Preprocesses the text data.
  Args:
    x: The text data.
  Returns:
    The preprocessed text data.
  """
  # Make text lower case
  x = x.lower()

  # Remove tags of other people
  x = re.sub(r"@\w*", " ", x)

  # Remove special characters
  x = re.sub(r"#|^\*|\*$|&quot;|&gt;|&lt;|&lt;3", " ", x)
  x = x.replace("&amp;", " and ")

  # Remove links
  x = re.sub(r"ht+p+s?://\S*", " ", x)

  # Remove non-ascii
  x = re.sub(r"[^\x00-\x7F]", " ", x)

  # Remove time
  x = re.sub(r"((a|p).?m)?\s?(\d+(:|.)?\d+)\s?((a|p).?m)?", " ", x)

  # Remove brackets if left after removing time
  x = re.sub(r"\(\)|\[\]|\{\}", " ", x)

  # For words we want to keep at least two occurences of
  #  each word(e.g not change good to god)
  # x = re.sub(r"([a-z])\1+", r"\1\1", x)

  # Remove any string that starts with number
  x = re.sub(r"\d[\w]*", " ", x)

  # Remove all special characters left
  x = re.sub(r"[^a-zA-Z0-9 ]", "", x)

  # Remove single letters that left except i and a
  x = re.sub(r"\s[b-gj-z]\s", " ", x)

  # Remove multiple space chars
  x = " ".join(x.split()).strip()

  return x

In [None]:
"""Managing All Data Operation"""

from torch.utils.data import Dataset, DataLoader

class CustomDataset(Dataset):
  def __init__(self, data: pd.DataFrame):
    self.data = data
    self.texts = list(data.clean_text)
    self.labels = list(data.is_depression)

  def __len__(self):
    """Denotes the total number of samples"""
    return len(self.data)

  def __getitem__(self, index):
    return {'text': self.texts[index],
            'label': self.labels[index]}

################################################################################

from tqdm.auto import tqdm
tqdm.pandas()

class CustomDataModule(pl.LightningDataModule):
  def __init__(self,
                batch_size: int = 32,
                split: float = 0.1,
                recreate_data: bool = False):
    super().__init__()

    self.batch_size = batch_size
    self.split = split
    self.recreate_data = recreate_data
    self.has_setup = False

  def prepare_data(self):
    if self.recreate_data or not check_if_exists(PROCESSED_DATA_DIR) or os.listdir(PROCESSED_DATA_DIR) == 0:
      make_dataset()

  def setup(self, stage: str):
    """
    Assign train/val datasets for use in dataloaders
    """
    if stage == "fit":
      if not self.has_setup:
        self.has_setup = True

        data = self.process_data(stage)
        # split data
        self.train_data, self.val_data = train_test_split(data, test_size=self.split, random_state=0)

        self.train_dataset = CustomDataset(self.train_data)
        self.val_dataset = CustomDataset(self.val_data)

      if stage in (None, "test", "predict"):
        self.test_data = self.process_data(stage)
        self.test_dataset = CustomDataset(self.test_data)

  def train_dataloader(self):
    return DataLoader(self.train_dataset, batch_size=self.batch_size, shuffle=True)

  def val_dataloader(self):
    return DataLoader(self.val_dataset, batch_size=self.batch_size)

  def test_dataloader(self):
    return DataLoader(self.test_dataset, batch_size=self.batch_size)

  def predict_dataloader(self):
    return DataLoader(self.test_dataset, batch_size=self.batch_size)

  def process_data(self, stage: str):
    """
    Help setup process by providing required data after preprocessing it
    """
    file_name = None

    if stage == "fit":
      file_name = "train.csv"

    if stage in (None, "test", "predict"):
      file_name = "test.csv"

    if file_name is None:
      raise ValueError(f"Stage {stage} is not valid.")

    data = pd.read_csv(os.path.join(PROCESSED_DATA_DIR, file_name))
    print("process_Data runned with data shape", data.shape)
    data["clean_text"] = data.clean_text.progress_apply(lambda x: preprocess(x))
    return data

## 2. Building Model

In [161]:
import torch
import torch.nn.functional as F
from torch import nn

class FCL(nn.Module):
  def __init__( self, embeddings_matrix: torch.Tensor):
    super().__init__()

    self.embedding_dim = embeddings_matrix.shape[1]
    self.embedding_layer = nn.Embedding.from_pretrained(torch.FloatTensor(embeddings_matrix))
    self.sequential_model = nn.Sequential(
        # nn.Embedding.from_pretrained(torch.FloatTensor(embeddings_matrix)), #torch.Size([6682, 300])
        # nn.Dropout(0.25),
        nn.Conv1d(in_channels=self.embedding_dim, out_channels=128, kernel_size=3),
        nn.ReLU(),
        nn.MaxPool1d(kernel_size= 2),
        nn.Dropout(0.25),
        nn.Conv1d(in_channels=128, out_channels=64, kernel_size=3),
        nn.ReLU(),
        nn.MaxPool1d(kernel_size= 2),
        nn.Flatten(),
        nn.Linear(64, 64),
        nn.ReLU(),
        nn.LSTM(input_size=64, hidden_size=64, num_layers=1, batch_first=True),
        nn.Flatten(),
        nn.Linear(64, 2)
    )

  def forward(self, x):
    seq, mask, lengths = x[0], x[1], x[2]
    emb_seq = self.embedding_layer(seq)
    emb_seq = emb_seq.transpose(1, 2)
    return self.sequential_model(emb_seq)

  @property
  def name(self):
    return self.__class__.__name__


In [None]:
fcl_model = FCL(embeddings_matrix)

In [None]:
fcl_model

FCL(
  (sequential_model): Sequential(
    (0): Embedding(6682, 300)
    (1): Dropout(p=0.25, inplace=False)
    (2): Conv1d(300, 128, kernel_size=(3,), stride=(1,))
    (3): ReLU()
    (4): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (5): Dropout(p=0.25, inplace=False)
    (6): Conv1d(128, 64, kernel_size=(3,), stride=(1,))
    (7): ReLU()
    (8): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (9): Flatten(start_dim=1, end_dim=-1)
    (10): Linear(in_features=64, out_features=64, bias=True)
    (11): ReLU()
    (12): LSTM(64, 64, batch_first=True)
    (13): Flatten(start_dim=1, end_dim=-1)
    (14): Linear(in_features=64, out_features=2, bias=True)
  )
)

In [None]:
import torch
from torchtext.data import get_tokenizer
from tqdm import tqdm
import json


class Tokenizer:
    def __init__(self):
        self.vocab = {}
        self.inv_vocab = {}
        self.tokenizer = get_tokenizer('basic_english')
        self.special_tokens = ['<pad>', '<unk>', '<sos>', '<eos>']

    def fit_on_texts_and_embeddings(self, sentences: list[str], embeddings):
        # Add special tokens to the start of the vocab
        for special_token in self.special_tokens:
            self.vocab[special_token] = len(self.vocab)

        # Add each unique word in the sentences to the vocab if there is also an embedding for it
        for sentence in tqdm(sentences):
            for word in self.tokenizer(sentence):
                if word not in self.vocab and word in embeddings.words:
                    self.vocab[word] = len(self.vocab)

        self.inv_vocab = {v: k for k, v in self.vocab.items()}

    # create an embbeding matrix from a set of pretrained embeddings based on the vocab
    def get_embeddings_matrix(self, embeddings):
        # Create a matrix of zeroes of the shape of the vocab size
        embeddings_matrix = torch.zeros((len(self.vocab), embeddings.get_dimension()))

        # For each word in the vocab get its index and add its embedding to the matrix
        for word, idx in self.vocab.items():
            if word in self.special_tokens:
                continue
            if word in embeddings.words:
                embeddings_matrix[idx] = torch.tensor(embeddings.get_word_vector(word))
            else:
                raise KeyError(f"Word {word} not in embeddings. Please create tokenizer based on embeddings")

        # Initialize the <pad> token with the mean of the embeddings of the vocab
        embeddings_matrix[1] = torch.mean(embeddings_matrix[len(self.special_tokens):], dim=0)

        # Initialize the <sos> and <eos> tokens with the mean of the embeddings of the vocab
        # plus or minus a small amount of noise to avoid them matching the <unk> token
        # and avoiding having identical embeddings which the model can not distinguish
        noise = torch.normal(mean=0, std=0.1, size=(embeddings.get_dimension(),))
        embeddings_matrix[2] = torch.mean(embeddings_matrix[len(self.special_tokens):] + noise, dim=0)
        embeddings_matrix[3] = torch.mean(embeddings_matrix[len(self.special_tokens):] - noise, dim=0)

        return embeddings_matrix

    # add start of sentence and end of sentence tokens to the tokenizer sentence
    def add_special_tokens(self, tokens):
        return ["<sos>"] + tokens + ["<eos>"]

    # convert a sequence of words to a sequence of indices based on the vocab
    def convert_tokens_to_ids(self, tokens):
        return [self.vocab.get(token, self.vocab['<unk>']) for token in tokens]


    def pad_sequences(self, sequences, max_length=None):
        # Pads the vectorized sequences

        # If max_length is not specified, pad to the length of the longest sequence
        if not max_length:
            max_length = max(len(seq) for seq in sequences)

        # Create a tensor for the lengths of the sequences
        sequence_lengths = torch.LongTensor([min(len(seq), max_length) for seq in sequences])

        # Create a tensor for the sequences with zeros
        seq_tensor = torch.zeros((len(sequences), max_length)).long()

        # Create a tensor for the masks with zeros
        seq_mask = torch.zeros((len(sequences), max_length)).long()

        # For each sequence add the values to the seq_tensor
        #  and add 1s to the seq_mask according to its length
        for idx, (seq, seq_len) in enumerate(zip(sequences, sequence_lengths)):
            # truncate the sequence if it exceeds the max length
            seq = seq[:seq_len]

            seq_tensor[idx, :seq_len] = torch.LongTensor(seq)
            seq_mask[idx, :seq_len] = torch.LongTensor([1])

        return seq_tensor, seq_mask, sequence_lengths

    # split the text into tokens
    def tokenize(self, text):
        return self.tokenizer(text)

    def encode(self, texts, max_length=None):
        if isinstance(texts, str):
            texts = [texts]

        sequences = []
        for text in texts:
            tokens = self.tokenize(text)
            tokens = self.add_special_tokens(tokens)
            ids = self.convert_tokens_to_ids(tokens)
            sequences.append(ids)

        seq_tensor, seq_mask, sequence_lengths = self.pad_sequences(sequences, max_length)

        return seq_tensor, seq_mask, sequence_lengths

    # save the tokenizer to a json file
    def save(self, file_path: str, filename: str = "tokenizer.json"):
        if not os.path.exists(file_path):
            os.makedirs(file_path, exist_ok=True)
        json_data = {}
        with open(os.path.join(file_path, filename), 'w') as tokenizer_file:
            json_data["vocab"] = self.vocab
            json_data["inv_vocab"] = self.inv_vocab
            json_data["special_tokens"] = self.special_tokens
            json.dump(json_data, tokenizer_file)
            # logger.info(f"Successfully saved tokenizer {os.path.join(file_path, filename)}")

    # load the tokenizer from a json file
    def load(self, file_path: str, filename: str = "tokenizer.json"):
        if os.path.exists(file_path):
            with open(os.path.join(file_path, filename)) as tokenizer_file:
                json_data = json.load(tokenizer_file)
                self.vocab = json_data["vocab"]
                self.inv_vocab = json_data["inv_vocab"]
                self.special_tokens = json_data["special_tokens"]
                # logger.info(f"Successfully loaded tokenizer from {os.path.join(file_path, filename)}")
        else:
            raise FileNotFoundError("The file path does not exist")

    def __call__(self, texts, max_length=None):
        return self.encode(texts, max_length)

In [None]:
import fasttext
from huggingface_hub import hf_hub_download
model_path = hf_hub_download(repo_id="facebook/fasttext-et-vectors", filename="model.bin")
model = fasttext.load_model(model_path)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


model.bin:   0%|          | 0.00/7.24G [00:00<?, ?B/s]

In [None]:
textDataMoudle = CustomDataModule()
textDataMoudle.prepare_data()
textDataMoudle.setup(stage = "fit")

process_Data runned with data shape (3711, 2)


  0%|          | 0/3711 [00:00<?, ?it/s]

In [155]:
train_load = textDataMoudle.train_dataloader()
cnt = 0
sam = None
for batch in train_load:
    cnt = cnt+1
    texts = batch['text']
    labels = batch['label']
    sam = texts
    # print(texts[0])
    # print(labels[0])
    break
# print("numberr of lines", cnt)

In [None]:
pretrained_embeddings = model
tokenizer = Tokenizer()
# tokenizer.fit_on_texts_and_embeddings(textDataMoudle.train_data.clean_text, pretrained_embeddings)
# tokenizer.save("/content/drive/MyDrive/depression_classification/")

In [None]:
tokenizer.load("/content/drive/MyDrive/depression_classification/")
embeddings_matrix = tokenizer.get_embeddings_matrix(pretrained_embeddings)

In [160]:
print(sam)
print(len(sam))
it = tokenizer(sam)
print(type(it))
print(len(it))
print(it[0].shape)
print(it[1].shape)
print(it[2].shape)

['i can tell you how many time i ve hoped for someone to show up to a place i at and shoot me maybe then i ll make it onto the news maybe then someone will give a damn maybe', 'idk what it is with depression but it always make you feel like the bad guy me breath me right after what a po always breathing wrong', 'went to the doctor today and my blood sugar level wa', 'my night went to the bar felt up a marred woman went home hard and alone', 'poor sock luvvvvv the golden retriever i want one sighhhh', 'ok wonder why twitpix isn an option for this new phone i got i can win i ll leave the photo to my cuz i guess', 'pilvlp my luck i probably get stopped by a cop or something stupid', 'imohumoren that where i have a problem not knowing who is who till i got con of usd con another neighbour of sold the gen in my family house now have till april end before i get kicked out all i have now is depression desperation without smoking', 'wwwicked i think i have tried everything but feel free to try

In [149]:
from torch.optim import Adam
from torchmetrics.classification import Accuracy

class DepressionDetection(pl.LightningModule):
  def __init__(self,
               model: nn.Module,
               tokenizer: Tokenizer,
               learning_rate= 0.01,
               ):
    super().__init__()
    self.model = model
    self.tokenizer = tokenizer
    self.learning_rate = learning_rate
    self.criterion = nn.CrossEntropyLoss()
    self.train_accuracy = Accuracy(task='binary')
    self.my_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    self.model.to(self.my_device)

  def forward(self, x):
    input = self.tokenizer(x)
    input = tuple([i.to(self.device) for i in input])
    return self.model(input).view(-1)

  def training_step(self, batch, batch_idx):
    # training_step defines the train loop.
    x, y = batch["clean_text"], batch["is_depression"]

    pred = self(x)
    loss = self.criterion(pred, y.float())

    self.train_accuracy(pred, y)

    self.log('train_acc', self.train_accuracy, on_step=True, on_epoch=True, prog_bar=True)
    self.log('train_loss', loss, on_step=True, on_epoch=True, prog_bar=True)

    return loss

  def validation_step(self, batch, batch_idx):
    # this is the validation loop
    self._shared_eval(batch, batch_idx, "val")

  def test_step(self, batch, batch_idx):
    # this is the test loop
    self._shared_eval(batch, batch_idx, "test")

  def _shared_eval(self, batch, batch_idx, prefix):
    x, y = batch["text"], batch["label"]

    pred = self(x)
    loss = self.criterion(pred, y.float())

    if prefix == "val":
      self.val_accuracy(pred, y)
      self.log(f"{prefix}_acc", self.val_accuracy, on_step=True, on_epoch=True, prog_bar=True)

    if prefix == "test":
      self.test_accuracy(pred, y)
      self.log(f"{prefix}_acc", self.test_accuracy, on_step=True, on_epoch=True, prog_bar=True)

    self.log(f"{prefix}_loss", loss, on_step=True, on_epoch=True, prog_bar=True)

  def predict_step(self, batch, batch_idx, dataloader_idx=0):
    x, _ = batch["tweet"], batch["label"]

    pred = self(x)
    return pred

  def configure_optimizers(self):
    optimizer = Adam(self.model.parameters(), lr=self.learning_rate)
    return optimizer


In [74]:
import os
import sys
from logging import config
from pytorch_lightning.callbacks import TQDMProgressBar, EarlyStopping, ModelCheckpoint
from pytorch_lightning.loggers import TensorBoardLogger

tqdm.pandas(file=sys.stdout)


def get_callbacks():
    tqdm_callback = TQDMProgressBar(refresh_rate=1)
    checkpoint_callback = ModelCheckpoint(save_last=True, save_top_k=1,
                                          filename="best-loss-model-{epoch:02d}-{val_loss:.2f}",
                                          monitor="val_loss",
                                          mode="min")
    checkpoint_callback.CHECKPOINT_NAME_LAST = "last-model-{epoch:02d}-{val_loss:.2f}"
    early_stopping_callback = EarlyStopping(monitor="val_loss", mode="min", patience=5, verbose=True)

    return [tqdm_callback, checkpoint_callback, early_stopping_callback]

In [106]:
from pytorch_lightning import Trainer

EXPERIMENTS_DIR = "/content/drive/MyDrive/depression_classification/exp"

def train():

  textDataMoudle = CustomDataModule()
  textDataMoudle.prepare_data()
  textDataMoudle.setup(stage = "fit")

  # pretrained_embeddings already loaded
  #TODO: will implement inside it

  tokenizer = Tokenizer()
  # tokenizer.fit_on_texts_and_embeddings(textDataMoudle.train_data.clean_text, pretrained_embeddings)
  tokenizer.load("/content/drive/MyDrive/depression_classification/")
  # embeddings_matrix = tokenizer.get_embeddings_matrix(pretrained_embeddings)  # already created


  model = FCL(embeddings_matrix)
  depression_detection = DepressionDetection(model=model, tokenizer=tokenizer)

  tb_logger = TensorBoardLogger(EXPERIMENTS_DIR, name=model.name)
  output_dir = os.path.join(EXPERIMENTS_DIR, model.name, f"version_{tb_logger.version}")
  # tokenizer.save(output_dir)

  trainer = Trainer(devices="auto",
                      accelerator="auto",
                      max_epochs=3,
                      callbacks=get_callbacks(),
                      logger=tb_logger,
                      log_every_n_steps=1
                      )

  trainer.fit(model=depression_detection, datamodule=textDataMoudle)



In [162]:
train()

process_Data runned with data shape (3711, 2)

  0%|          | 0/3711 [00:00<?, ?it/s][A
 28%|██▊       | 1055/3711 [00:00<00:00, 10548.25it/s][A
 57%|█████▋    | 2110/3711 [00:00<00:00, 10548.55it/s][A
100%|██████████| 3711/3711 [00:00<00:00, 10460.19it/s]


INFO:pytorch_lightning.utilities.rank_zero:GPU available: False, used: False
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.callbacks.model_summary:
  | Name           | Type             | Params | Mode 
------------------------------------------------------------
0 | model          | FCL              | 2.2 M  | train
1 | criterion      | CrossEntropyLoss | 0      | train
2 | train_accuracy | BinaryAccuracy   | 0      | train
------------------------------------------------------------
177 K     Trainable params
2.0 M     Non-trainable params
2.2 M     Total params
8.729     Total estimated model params size (MB)


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

RuntimeError: mat1 and mat2 shapes cannot be multiplied (32x6400 and 64x64)