<a href="https://colab.research.google.com/github/subham73/depression_detection/blob/main/running.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install pytorch-lightning
!pip install fasttext

Collecting pytorch-lightning
  Downloading pytorch_lightning-2.3.3-py3-none-any.whl (812 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/812.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m174.1/812.3 kB[0m [31m5.5 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m812.3/812.3 kB[0m [31m12.3 MB/s[0m eta [36m0:00:00[0m
Collecting torchmetrics>=0.7.0 (from pytorch-lightning)
  Downloading torchmetrics-1.4.0.post0-py3-none-any.whl (868 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m868.8/868.8 kB[0m [31m40.9 MB/s[0m eta [36m0:00:00[0m
Collecting lightning-utilities>=0.10.0 (from pytorch-lightning)
  Downloading lightning_utilities-0.11.5-py3-none-any.whl (26 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=2.0.0->pytorch-lightning)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_6

In [3]:
import os
import pytorch_lightning as pl

## 1. Data Preparation

Kaggle dataset = [Depression: Reddit Dataset (Cleaned)](https://www.kaggle.com/datasets/infamouscoder/depression-reddit-cleaned/data)

Column : text | is_depression 1:*0*

In [None]:
# !unzip "/content/drive/MyDrive/depression_classification/archive.zip" -d "/content/drive/MyDrive/depression_classification/"

Archive:  /content/drive/MyDrive/depression_classification/archive.zip
replace /content/drive/MyDrive/depression_classification/depression_dataset_reddit_cleaned.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: n


In [4]:
"""Set the directory path inside your drive"""
PROCESSED_DATA_DIR = "/content/drive/MyDrive/depression_classification/data"

In [5]:
"""Util Functions"""

import os

def check_if_exists(path):
  """
  Checks if a file or folder exists in the Google Drive.
  Args:
    path: The path to the file or folder.
  Returns:
    True if the file or folder exists, False otherwise.
  """
  drive_path = "/content/drive/MyDrive/"
  full_path = os.path.join(drive_path, path)
  return os.path.exists(full_path)

################################################################################

import pandas as pd
from sklearn.model_selection import train_test_split

def make_dataset():
  """
  Prepares train and test data.csv files, saves them to PROCESSED_DATA_DIR.
  """
  data = pd.read_csv("/content/drive/MyDrive/depression_classification/depression_dataset_reddit_cleaned.csv")
  print("full dataset shape: ", data.shape)

  data = data.sample(frac = 0.6, random_state=0)
  print("sample dataset shape: ", data.shape)

  train, test = train_test_split(data, test_size=0.2, random_state=0)
  print("train dataset shape: ", train.shape)
  print("test dataset shape: ", test.shape)

  train.to_csv(os.path.join(PROCESSED_DATA_DIR, "train.csv"), index=False)
  test.to_csv(os.path.join(PROCESSED_DATA_DIR, "test.csv"), index=False)
  print("sample training and testing dataset created and saved at PROCESSED_DATA_DIR...")

################################################################################

import re
def preprocess(x):
  """
  Preprocesses the text data.
  Args:
    x: The text data.
  Returns:
    The preprocessed text data.
  """
  # Make text lower case
  x = x.lower()

  # Remove tags of other people
  x = re.sub(r"@\w*", " ", x)

  # Remove special characters
  x = re.sub(r"#|^\*|\*$|&quot;|&gt;|&lt;|&lt;3", " ", x)
  x = x.replace("&amp;", " and ")

  # Remove links
  x = re.sub(r"ht+p+s?://\S*", " ", x)

  # Remove non-ascii
  x = re.sub(r"[^\x00-\x7F]", " ", x)

  # Remove time
  x = re.sub(r"((a|p).?m)?\s?(\d+(:|.)?\d+)\s?((a|p).?m)?", " ", x)

  # Remove brackets if left after removing time
  x = re.sub(r"\(\)|\[\]|\{\}", " ", x)

  # For words we want to keep at least two occurences of
  #  each word(e.g not change good to god)
  # x = re.sub(r"([a-z])\1+", r"\1\1", x)

  # Remove any string that starts with number
  x = re.sub(r"\d[\w]*", " ", x)

  # Remove all special characters left
  x = re.sub(r"[^a-zA-Z0-9 ]", "", x)

  # Remove single letters that left except i and a
  x = re.sub(r"\s[b-gj-z]\s", " ", x)

  # Remove multiple space chars
  x = " ".join(x.split()).strip()

  return x

In [6]:
"""Managing All Data Operation"""

from torch.utils.data import Dataset, DataLoader

class CustomDataset(Dataset):
  def __init__(self, data: pd.DataFrame):
    self.data = data
    self.texts = list(data.clean_text)
    self.labels = list(data.is_depression)

  def __len__(self):
    """Denotes the total number of samples"""
    return len(self.data)

  def __getitem__(self, index):
    return {'text': self.texts[index],
            'label': self.labels[index]}

################################################################################

from tqdm.auto import tqdm
tqdm.pandas()

class CustomDataModule(pl.LightningDataModule):
  def __init__(self,
                batch_size: int = 32,
                split: float = 0.1,
                recreate_data: bool = False):
    super().__init__()

    self.batch_size = batch_size
    self.split = split
    self.recreate_data = recreate_data
    self.has_setup = False

  def prepare_data(self):
    if self.recreate_data or not check_if_exists(PROCESSED_DATA_DIR) or os.listdir(PROCESSED_DATA_DIR) == 0:
      make_dataset()

  def setup(self, stage: str):
    """
    Assign train/val datasets for use in dataloaders
    """
    if stage == "fit":
      if not self.has_setup:
        self.has_setup = True

        data = self.process_data(stage)
        # split data
        self.train_data, self.val_data = train_test_split(data, test_size=self.split, random_state=0)

        self.train_dataset = CustomDataset(self.train_data)
        self.val_dataset = CustomDataset(self.val_data)

      if stage in (None, "test", "predict"):
        self.test_data = self.process_data(stage)
        self.test_dataset = CustomDataset(self.test_data)

  def train_dataloader(self):
    return DataLoader(self.train_dataset, batch_size=self.batch_size, shuffle=True)

  def val_dataloader(self):
    return DataLoader(self.val_dataset, batch_size=self.batch_size)

  def test_dataloader(self):
    return DataLoader(self.test_dataset, batch_size=self.batch_size)

  def predict_dataloader(self):
    return DataLoader(self.test_dataset, batch_size=self.batch_size)

  def process_data(self, stage: str):
    """
    Help setup process by providing required data after preprocessing it
    """
    file_name = None

    if stage == "fit":
      file_name = "train.csv"

    if stage in (None, "test", "predict"):
      file_name = "test.csv"

    if file_name is None:
      raise ValueError(f"Stage {stage} is not valid.")

    data = pd.read_csv(os.path.join(PROCESSED_DATA_DIR, file_name))
    print("process_Data runned with data shape", data.shape)
    data["clean_text"] = data.clean_text.progress_apply(lambda x: preprocess(x))
    return data

## 2. Building Model

In [200]:

import torch
import torch.nn.functional as F
from torch import nn

class FCL(nn.Module):
  def __init__( self, embeddings_matrix: torch.Tensor, ):
    super().__init__()
    self.embeddings_matrix = embeddings_matrix.float()
    self.embedding = nn.Embedding.from_pretrained(torch.FloatTensor(embeddings_matrix))
    self.embedding_dim = embeddings_matrix.shape[1]
    # self.linear1_length = 0

    self.cnn1 = nn.Conv1d(in_channels=self.embedding_dim, out_channels=128, kernel_size=3)
        # nn.ReLU(),
    self.pool = nn.MaxPool1d(kernel_size= 2)
    self.drop = nn.Dropout(0.25)
    self.cnn2 = nn.Conv1d(in_channels=128, out_channels=64, kernel_size=3)
        # nn.ReLU(),
        # nn.MaxPool1d(kernel_size= 2),
        # nn.Flatten(),
    self.fc1 = None
        # nn.ReLU(),
    self.lstm = nn.LSTM(input_size=64, hidden_size=64, num_layers=1, batch_first=True)

    self.fc2 = nn.Linear(64, 1)
    self.relu = nn.ReLU()

  def forward(self, x):
    seq, mask, lengths = x[0], x[1], x[2]
    # print("before", seq.shape)
    emb_seq = self.embedding(seq)
    # print("after", emb_seq.shape)
    emb_seq = emb_seq.transpose(1, 2)
    # print("after transpose", emb_seq.shape)
    # self.linear1_length = lengths
    x = self.relu(self.cnn1(emb_seq))
    x = self.drop(self.pool(x))
    x = self.relu(self.cnn2(x))
    x = self.pool(x)

    x_flattened_size = x.size(1) * x.size(2)
    if self.fc1 is None:
      self.fc1 = nn.Linear(x_flattened_size, 64)

    x = x.view(x.size(0), -1)
    x = self.relu(self.fc1(x))
    x, (hidden, _)= self.lstm(x)
    x = self.relu(x)
    return self.fc2(x)


  @property
  def name(self):
    return self.__class__.__name__


In [156]:
fcl_model = FCL(embeddings_matrix)

In [145]:
input2[0].shape

torch.Size([32, 1000])

In [157]:
fcl_model.forward(input2)

before torch.Size([32, 1000])
after torch.Size([32, 1000, 300])
after transpose torch.Size([32, 300, 1000])


tensor([[-0.0679],
        [-0.0630],
        [-0.0615],
        [-0.0610],
        [-0.0611],
        [-0.0613],
        [-0.0613],
        [-0.0614],
        [-0.0614],
        [-0.0614],
        [-0.0615],
        [-0.0614],
        [-0.0613],
        [-0.0614],
        [-0.0615],
        [-0.0613],
        [-0.0613],
        [-0.0610],
        [-0.0613],
        [-0.0612],
        [-0.0615],
        [-0.0614],
        [-0.0614],
        [-0.0612],
        [-0.0613],
        [-0.0612],
        [-0.0612],
        [-0.0611],
        [-0.0612],
        [-0.0611],
        [-0.0615],
        [-0.0614]], grad_fn=<AddmmBackward0>)

In [57]:
input[0]

tensor([[   2,   62,  404,  ...,    0,    0,    0],
        [   2,   64,  136,  ...,    0,    0,    0],
        [   2,   64,  172,  ...,    0,    0,    0],
        ...,
        [   2,   41,  125,  ...,   10, 1396,    3],
        [   2,    1,  584,  ...,    0,    0,    0],
        [   2,    1,    1,  ...,    0,    0,    0]])

NameError: name 'fcl_model' is not defined

In [163]:
import torch
from torchtext.data import get_tokenizer
from tqdm import tqdm
import json


class Tokenizer:
    def __init__(self):
        self.vocab = {}
        self.inv_vocab = {}
        self.tokenizer = get_tokenizer('basic_english')
        self.special_tokens = ['<pad>', '<unk>', '<sos>', '<eos>']

    def fit_on_texts_and_embeddings(self, sentences: list[str], embeddings):
        # Add special tokens to the start of the vocab
        for special_token in self.special_tokens:
            self.vocab[special_token] = len(self.vocab)

        # Add each unique word in the sentences to the vocab if there is also an embedding for it
        for sentence in tqdm(sentences):
            for word in self.tokenizer(sentence):
                if word not in self.vocab and word in embeddings.words:
                    self.vocab[word] = len(self.vocab)

        self.inv_vocab = {v: k for k, v in self.vocab.items()}

    # create an embbeding matrix from a set of pretrained embeddings based on the vocab
    def get_embeddings_matrix(self, embeddings):
        # Create a matrix of zeroes of the shape of the vocab size
        embeddings_matrix = torch.zeros((len(self.vocab), embeddings.get_dimension()))

        # For each word in the vocab get its index and add its embedding to the matrix
        for word, idx in self.vocab.items():
            if word in self.special_tokens:
                continue
            if word in embeddings.words:
                embeddings_matrix[idx] = torch.tensor(embeddings.get_word_vector(word))
            else:
                raise KeyError(f"Word {word} not in embeddings. Please create tokenizer based on embeddings")

        # Initialize the <pad> token with the mean of the embeddings of the vocab
        embeddings_matrix[1] = torch.mean(embeddings_matrix[len(self.special_tokens):], dim=0)

        # Initialize the <sos> and <eos> tokens with the mean of the embeddings of the vocab
        # plus or minus a small amount of noise to avoid them matching the <unk> token
        # and avoiding having identical embeddings which the model can not distinguish
        noise = torch.normal(mean=0, std=0.1, size=(embeddings.get_dimension(),))
        embeddings_matrix[2] = torch.mean(embeddings_matrix[len(self.special_tokens):] + noise, dim=0)
        embeddings_matrix[3] = torch.mean(embeddings_matrix[len(self.special_tokens):] - noise, dim=0)

        return embeddings_matrix

    # add start of sentence and end of sentence tokens to the tokenizer sentence
    def add_special_tokens(self, tokens):
        return ["<sos>"] + tokens + ["<eos>"]

    # convert a sequence of words to a sequence of indices based on the vocab
    def convert_tokens_to_ids(self, tokens):
        return [self.vocab.get(token, self.vocab['<unk>']) for token in tokens]


    def pad_sequences(self, sequences, max_length=None):
        # Pads the vectorized sequences

        # If max_length is not specified, pad to the length of the longest sequence
        if not max_length:
            max_length = max(len(seq) for seq in sequences)

        # Create a tensor for the lengths of the sequences
        sequence_lengths = torch.LongTensor([min(len(seq), max_length) for seq in sequences])

        # Create a tensor for the sequences with zeros
        seq_tensor = torch.zeros((len(sequences), max_length)).long()

        # Create a tensor for the masks with zeros
        seq_mask = torch.zeros((len(sequences), max_length)).long()

        # For each sequence add the values to the seq_tensor
        #  and add 1s to the seq_mask according to its length
        for idx, (seq, seq_len) in enumerate(zip(sequences, sequence_lengths)):
            # truncate the sequence if it exceeds the max length
            seq = seq[:seq_len]

            seq_tensor[idx, :seq_len] = torch.LongTensor(seq)
            seq_mask[idx, :seq_len] = torch.LongTensor([1])

        return seq_tensor, seq_mask, sequence_lengths

    # split the text into tokens
    def tokenize(self, text):
        return self.tokenizer(text)

    def encode(self, texts, max_length=None):
        if isinstance(texts, str):
            texts = [texts]

        sequences = []
        for text in texts:
            tokens = self.tokenize(text)
            # print("first line:", tokens)
            tokens = self.add_special_tokens(tokens)
            # print("second line:", tokens)
            ids = self.convert_tokens_to_ids(tokens)
            # print("third line:", ids)
            sequences.append(ids)

        seq_tensor, seq_mask, sequence_lengths = self.pad_sequences(sequences, max_length)

        return seq_tensor, seq_mask, sequence_lengths

    # save the tokenizer to a json file
    def save(self, file_path: str, filename: str = "tokenizer.json"):
        if not os.path.exists(file_path):
            os.makedirs(file_path, exist_ok=True)
        json_data = {}
        with open(os.path.join(file_path, filename), 'w') as tokenizer_file:
            json_data["vocab"] = self.vocab
            json_data["inv_vocab"] = self.inv_vocab
            json_data["special_tokens"] = self.special_tokens
            json.dump(json_data, tokenizer_file)
            # logger.info(f"Successfully saved tokenizer {os.path.join(file_path, filename)}")

    # load the tokenizer from a json file
    def load(self, file_path: str, filename: str = "tokenizer.json"):
        if os.path.exists(file_path):
            with open(os.path.join(file_path, filename)) as tokenizer_file:
                json_data = json.load(tokenizer_file)
                self.vocab = json_data["vocab"]
                self.inv_vocab = json_data["inv_vocab"]
                self.special_tokens = json_data["special_tokens"]
                # logger.info(f"Successfully loaded tokenizer from {os.path.join(file_path, filename)}")
        else:
            raise FileNotFoundError("The file path does not exist")

    def __call__(self, texts, max_length=None):
        return self.encode(texts, max_length = 1000)

In [8]:
import fasttext
from huggingface_hub import hf_hub_download
model_path = hf_hub_download(repo_id="facebook/fasttext-et-vectors", filename="model.bin")
pretrained_embeddings = fasttext.load_model(model_path)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


model.bin:   0%|          | 0.00/7.24G [00:00<?, ?B/s]

In [None]:
train_load = textDataMoudle.train_dataloader()
cnt = 0
sam = None
for batch in train_load:
    cnt = cnt+1
    texts = batch['text']
    labels = batch['label']
    sam = texts
    # print(texts[0])
    # print(labels[0])
    break
# print("numberr of lines", cnt)

In [None]:
pretrained_embeddings = model
tokenizer = Tokenizer()
# tokenizer.fit_on_texts_and_embeddings(textDataMoudle.train_data.clean_text, pretrained_embeddings)
# tokenizer.save("/content/drive/MyDrive/depression_classification/")

In [None]:
tokenizer.load("/content/drive/MyDrive/depression_classification/")
embeddings_matrix = tokenizer.get_embeddings_matrix(pretrained_embeddings)

In [None]:
print(sam)
print(len(sam))
it = tokenizer(sam)
print(type(it))
print(len(it))
print(it[0].shape)
print(it[1].shape)
print(it[2].shape)

['i can tell you how many time i ve hoped for someone to show up to a place i at and shoot me maybe then i ll make it onto the news maybe then someone will give a damn maybe', 'idk what it is with depression but it always make you feel like the bad guy me breath me right after what a po always breathing wrong', 'went to the doctor today and my blood sugar level wa', 'my night went to the bar felt up a marred woman went home hard and alone', 'poor sock luvvvvv the golden retriever i want one sighhhh', 'ok wonder why twitpix isn an option for this new phone i got i can win i ll leave the photo to my cuz i guess', 'pilvlp my luck i probably get stopped by a cop or something stupid', 'imohumoren that where i have a problem not knowing who is who till i got con of usd con another neighbour of sold the gen in my family house now have till april end before i get kicked out all i have now is depression desperation without smoking', 'wwwicked i think i have tried everything but feel free to try

In [9]:
  textDataMoudle = CustomDataModule()
  textDataMoudle.prepare_data()
  textDataMoudle.setup(stage = "fit")

process_Data runned with data shape (3711, 2)


  0%|          | 0/3711 [00:00<?, ?it/s]

In [10]:
train_loader = textDataMoudle.train_dataloader()

In [18]:
batch_data = next(iter(train_loader))
print(batch_data)
print("Test data shape:", len(batch_data))

{'text': ['not sure if actually anxiety related tbh recently during spring break my stress got the best of me and i ended up isolating myself and feeling unempathetic towards those who tried to contact me so i gave up on trying to express my feeling over text and wa really struggling with messaging and stuff today i realized that i wa having trouble getting any word out while speaking to those at school and my best friend yet i could talk to my family fine it making my friend uncomfortable and i believe they are upset with me yet i can even get my word out over text not sure what to expect putting this out there hoping for any explanation or help', 'it one thing to feel bad mentally but when i constantly feeling physically ill a well it make trying to get myself to do positive thing so much harder i can never just feel okay not mentally or physically always in some sort of pain and always feeling like shit then when you ask for help you just get out on a month waiting list', 'it been f

In [22]:
print(len(batch_data['text']))
for sentence in batch_data['text'] :
  print(len(sentence))

32
643
329
993
517
47
739
18
66
81
2586
56
54
45
57
132
962
220
473
65
907
125
53
166
1166
373
93
423
791
1217
7348
86
36


In [72]:
embeddings_matrix

tensor([[ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
          0.0000e+00,  0.0000e+00],
        [-8.7387e-03,  7.2767e-03,  2.4668e-02,  ..., -1.2901e-02,
         -7.5833e-03,  1.1581e-03],
        [-5.6565e-02, -1.6594e-01, -1.1204e-03,  ...,  1.1236e-01,
         -1.0972e-01,  2.1834e-01],
        ...,
        [-1.9998e-02,  1.4106e-02,  4.4227e-02,  ..., -1.6429e-02,
         -1.4284e-02, -2.7754e-02],
        [ 4.5684e-02,  2.3541e-02,  4.2600e-02,  ..., -1.3993e-02,
          9.2478e-05, -2.8274e-03],
        [-8.8838e-03, -2.7286e-03,  2.9893e-02,  ...,  5.9561e-03,
         -6.4476e-02,  3.0811e-02]])

In [133]:
  tokenizer = Tokenizer()
  # tokenizer.fit_on_texts_and_embeddings(textDataMoudle.train_data.clean_text, pretrained_embeddings)
  tokenizer.load("/content/drive/MyDrive/depression_classification/")
  # embeddings_matrix = tokenizer.get_embeddings_matrix(pretrained_embeddings)  # already created

In [134]:
tokenzied_batch_data = tokenizer(batch_data['text'])

first line: ['not', 'sure', 'if', 'actually', 'anxiety', 'related', 'tbh', 'recently', 'during', 'spring', 'break', 'my', 'stress', 'got', 'the', 'best', 'of', 'me', 'and', 'i', 'ended', 'up', 'isolating', 'myself', 'and', 'feeling', 'unempathetic', 'towards', 'those', 'who', 'tried', 'to', 'contact', 'me', 'so', 'i', 'gave', 'up', 'on', 'trying', 'to', 'express', 'my', 'feeling', 'over', 'text', 'and', 'wa', 'really', 'struggling', 'with', 'messaging', 'and', 'stuff', 'today', 'i', 'realized', 'that', 'i', 'wa', 'having', 'trouble', 'getting', 'any', 'word', 'out', 'while', 'speaking', 'to', 'those', 'at', 'school', 'and', 'my', 'best', 'friend', 'yet', 'i', 'could', 'talk', 'to', 'my', 'family', 'fine', 'it', 'making', 'my', 'friend', 'uncomfortable', 'and', 'i', 'believe', 'they', 'are', 'upset', 'with', 'me', 'yet', 'i', 'can', 'even', 'get', 'my', 'word', 'out', 'over', 'text', 'not', 'sure', 'what', 'to', 'expect', 'putting', 'this', 'out', 'there', 'hoping', 'for', 'any', 'expla

In [135]:
input = tokenzied_batch_data
input

(tensor([[  2,  62, 404,  ...,   0,   0,   0],
         [  2,  64, 136,  ...,   0,   0,   0],
         [  2,  64, 172,  ...,   0,   0,   0],
         ...,
         [  2,  41, 125,  ...,  40,  90,   1],
         [  2,   1, 584,  ...,   0,   0,   0],
         [  2,   1,   1,  ...,   0,   0,   0]]),
 tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]]),
 tensor([ 124,   67,  206,  109,   11,  150,    5,   17,   15,  544,   16,   12,
            7,   15,   25,  195,   41,   97,   14,  193,   26,   13,   36,  234,
           72,   19,   92,  158,  259, 1000,   20,    9]))

In [136]:
my_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [137]:
input2 = tuple([i.to(my_device) for i in input])

In [140]:
input2[0].shape

torch.Size([32, 1000])

In [36]:
for tokenzied_sentence in  tokenzied_batch_data :
  print((tokenzied_sentence.shape))

torch.Size([32, 1542])
torch.Size([32, 1542])
torch.Size([32])


In [185]:
from torch.optim import Adam
from torchmetrics.classification import Accuracy

class DepressionDetection(pl.LightningModule):
  def __init__(self,
               model: nn.Module,
               tokenizer: Tokenizer,
               learning_rate= 0.01,
               ):
    super().__init__()
    self.model = model
    self.tokenizer = tokenizer
    self.learning_rate = learning_rate
    self.criterion = nn.CrossEntropyLoss()
    self.train_accuracy = Accuracy(task='binary')
    self.val_accuracy = Accuracy(task='binary')
    self.test_accuracy = Accuracy(task='binary')
    self.my_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    self.model.to(self.my_device)

  def forward(self, x):
    input = self.tokenizer(x)
    input = tuple([i.to(self.device) for i in input])
    return self.model(input).view(-1)

  def training_step(self, batch, batch_idx):
    # training_step defines the train loop.
    x, y = batch["text"], batch["label"]

    pred = self(x)
    loss = self.criterion(pred, y.float())

    self.train_accuracy(pred, y)

    self.log('train_acc', self.train_accuracy, on_step=True, on_epoch=True, prog_bar=True)
    self.log('train_loss', loss, on_step=True, on_epoch=True, prog_bar=True)

    return loss

  def validation_step(self, batch, batch_idx):
    # this is the validation loop
    self._shared_eval(batch, batch_idx, "val")

  def test_step(self, batch, batch_idx):
    # this is the test loop
    self._shared_eval(batch, batch_idx, "test")

  def _shared_eval(self, batch, batch_idx, prefix):
    x, y = batch["text"], batch["label"]

    pred = self(x)
    loss = self.criterion(pred, y.float())

    if prefix == "val":
      self.val_accuracy(pred, y)
      self.log(f"{prefix}_acc", self.val_accuracy, on_step=True, on_epoch=True, prog_bar=True)

    if prefix == "test":
      self.test_accuracy(pred, y)
      self.log(f"{prefix}_acc", self.test_accuracy, on_step=True, on_epoch=True, prog_bar=True)

    self.log(f"{prefix}_loss", loss, on_step=True, on_epoch=True, prog_bar=True)

  def predict_step(self, batch, batch_idx, dataloader_idx=0):
    x, _ = batch["text"], batch["label"]

    pred = self(x)
    pred = torch.sigmoid(pred)
    return (pred > 0.5).int()

  def configure_optimizers(self):
    optimizer = Adam(self.model.parameters(), lr=self.learning_rate)
    return optimizer


In [201]:
import os
import sys
from logging import config
from pytorch_lightning.callbacks import TQDMProgressBar, EarlyStopping, ModelCheckpoint
from pytorch_lightning.loggers import TensorBoardLogger

tqdm.pandas(file=sys.stdout)


def get_callbacks():
    tqdm_callback = TQDMProgressBar(refresh_rate=1)
    checkpoint_callback = ModelCheckpoint(
                                          dirpath='/content/drive/MyDrive/depression_classification/exp',
                                          save_last=True, save_top_k=1,
                                          filename="best-loss-model-{epoch:02d}-{val_loss:.2f}",
                                          monitor="val_loss",
                                          mode="min")
    checkpoint_callback.CHECKPOINT_NAME_LAST = "last-model-{epoch:02d}-{val_loss:.2f}"
    early_stopping_callback = EarlyStopping(monitor="val_loss", mode="min", patience=5, verbose=True)

    return [tqdm_callback, checkpoint_callback, early_stopping_callback]

In [202]:
from pytorch_lightning import Trainer

EXPERIMENTS_DIR = "/content/drive/MyDrive/depression_classification/exp"

def train():

  textDataMoudle = CustomDataModule()
  textDataMoudle.prepare_data()
  textDataMoudle.setup(stage = "fit")

  # pretrained_embeddings already loaded
  #TODO: will implement inside it

  tokenizer = Tokenizer()
  # tokenizer.fit_on_texts_and_embeddings(textDataMoudle.train_data.clean_text, pretrained_embeddings)
  tokenizer.load("/content/drive/MyDrive/depression_classification/")
  # embeddings_matrix = tokenizer.get_embeddings_matrix(pretrained_embeddings)  # already created


  model = FCL(embeddings_matrix)
  depression_detection = DepressionDetection(model=model, tokenizer=tokenizer)

  tb_logger = TensorBoardLogger(EXPERIMENTS_DIR, name=model.name)
  output_dir = os.path.join(EXPERIMENTS_DIR, model.name, f"version_{tb_logger.version}")
  # tokenizer.save(output_dir)

  trainer = Trainer(devices="auto",
                      accelerator="auto",
                      max_epochs=3,
                      callbacks=get_callbacks(),
                      logger=tb_logger,
                      log_every_n_steps=1
                      )

  trainer.fit(model=depression_detection, datamodule=textDataMoudle)
  trainer.predict(model=depression_detection, datamodule=textDataMoudle)



In [203]:
train()

process_Data runned with data shape (3711, 2)
100%|██████████| 3711/3711 [00:00<00:00, 6068.11it/s]


INFO:pytorch_lightning.utilities.rank_zero:GPU available: False, used: False
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
/usr/local/lib/python3.10/dist-packages/pytorch_lightning/callbacks/model_checkpoint.py:652: Checkpoint directory /content/drive/MyDrive/depression_classification/exp exists and is not empty.
INFO:pytorch_lightning.callbacks.model_summary:
  | Name           | Type             | Params | Mode 
------------------------------------------------------------
0 | model          | FCL              | 2.2 M  | train
1 | criterion      | CrossEntropyLoss | 0      | train
2 | train_accuracy | BinaryAccuracy   | 0      | train
3 | val_accuracy   | BinaryAccuracy   | 0      | train
4 | test_accuracy  | BinaryAccuracy   | 0      | train
------------------------------------------------------------
173 K     Trainable params
2.0 M     Non-trainable params
2.2 M     

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.callbacks.early_stopping:Metric val_loss improved. New best score: 50.308


Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.callbacks.early_stopping:Metric val_loss improved by 0.420 >= min_delta = 0.0. New best score: 49.888


Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.callbacks.early_stopping:Metric val_loss improved by 0.954 >= min_delta = 0.0. New best score: 48.934
INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=3` reached.


In [172]:
EXPERIMENTS_DIR = "/content/drive/MyDrive/depression_classification/exp"

In [None]:
trainer.fit(model=depression_detection, datamodule=textDataMoudle)

In [206]:
model

NameError: name 'model' is not defined

In [204]:
import importlib
import logging
import os
import pkgutil
logger = logging.getLogger(__name__)
# from ml.utils.helpers import load_model_at_version

def load_model_at_version(model_name: str, version: str, model_type: str = "best-loss"):
    version_path = "/content/drive/MyDrive/depression_classification/exp/FCL/version_35"
    model_path = "/content/drive/MyDrive/depression_classification/exp/FCL"
    checkpoint = torch.load(model_path, map_location="cuda" if torch.cuda.is_available() else "cpu")

    # model_class = get_model_by_name(model_name)
    # model = model

    tokenizer = Tokenizer()
    tokenizer.load(file_path=version_path)

    # embeddings_matrix = checkpoint["state_dict"]["model.embedding.weight"].cpu()
    model = FCL(embeddings_matrix)

    system = DepressionDetection.load_from_checkpoint(
        checkpoint_path=model_path,
        model=model,
        tokenizer=tokenizer,
        strict=False,
        map_location="cuda" if torch.cuda.is_available() else "cpu",
    )
    logger.info(f"Successfully loaded {model_name} from {version}")

    return system
def get_model_at_version(model_name: str, version: str, model_type: str = "best-loss"):
    if not os.path.exists(os.path.join(EXPERIMENTS_DIR, model_name)):
        raise ValueError(
            f"Model {model_name} does not exist. Please select an existing model or train a new one \
            with `python train.py`"
        )

    if not os.path.exists(os.path.join(EXPERIMENTS_DIR, model_name, version)):
        raise ValueError(f"Version {version} does not exist. Please select a valid version")

    version_path = os.path.join(EXPERIMENTS_DIR, model_name, version)
    checkpoint_path = os.path.join(version_path, "checkpoints")
    model_path = None
    found = False
    for file in os.listdir(checkpoint_path):
        model_path = os.path.join(checkpoint_path, file)
        if model_type in file:
            model_path = os.path.join(checkpoint_path, file)
            found = True
            break

    if not found:
        logger.warning(f"Could not find a model checkpoint with type {model_type} in {checkpoint_path}")

    logger.info(f"Loading model from path {model_path} and from version {version_path}")
    return version_path, model_path


# get a model class by its class name
def get_model_by_name(class_name: str):
    # models_module = "ml.models"
    # parent_module = importlib.import_module(models_module)

    # for loader, name, is_pkg in pkgutil.walk_packages(parent_module.__path__):
    #     if not is_pkg:
    #         try:
    #             module = importlib.import_module(f"{models_module}.{name}")
    #             model = getattr(module, class_name, None)
    #             if model is not None:
    #                 return model
    #         except (ImportError, AttributeError):
    #             continue
    # raise ImportError(f"Cannot find class {class_name} in module {models_module}")


# get a trained model and its label encoder from a specific version of a model

def test(model_name: str, version: str, model_type: str = "best-loss"):
    # Load a model from a specific version and checkpoint
    system = load_model_at_version(model_name, version, model_type)

    data_module = DepressionDetection()

    tb_logger = TensorBoardLogger(EXPERIMENTS_DIR, name=model_name, version=version)
    trainer = Trainer(logger=tb_logger)
    trainer.test(system, datamodule=data_module)
    predictions = trainer.predict(system, datamodule=data_module)

    results = pd.DataFrame(
        {'predictions': torch.cat(predictions).tolist(), 'labels': data_module.test_data["target"]}
    )

In [207]:
checkpoint = "/content/drive/MyDrive/depression_classification/exp/FCL/version_35/checkpoints/best-loss-model-epoch=02-val_loss=48.70.ckpt"
trainer = pl.Trainer()

INFO:pytorch_lightning.utilities.rank_zero:GPU available: False, used: False
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs


In [214]:
model = FCL(embeddings_matrix)
tokenizer = Tokenizer()
tokenizer.load("/content/drive/MyDrive/depression_classification/")

In [215]:
loaded_model = DepressionDetection.load_from_checkpoint(
    checkpoint_path=checkpoint,
    model=model,
    tokenizer=tokenizer,
)
prediction = trainer.predict(loaded_model, datamodule=CustomDataModule.predict_dataloader())

RuntimeError: Error(s) in loading state_dict for DepressionDetection:
	Unexpected key(s) in state_dict: "model.fc1.weight", "model.fc1.bias". 

In [196]:
trainer.predict()

TypeError: DepressionDetection.__init__() missing 2 required positional arguments: 'model' and 'tokenizer'