In [1]:
import random
import torch
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split
from collections import defaultdict

In [2]:
# enable tqdm in pandas
tqdm.pandas()

# # set to True to use the gpu (if there is one available)
use_gpu = True

# select device
device = torch.device('cuda' if use_gpu and torch.cuda.is_available() else 'cpu')
print(f'device: {device.type}')

# random seed
seed = 1234

# set random seed
if seed is not None:
    print(f'random seed: {seed}')
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)

device: cpu
random seed: 1234


In [3]:
df = pd.read_csv("../data/SuicideAndDepression_Detection.csv")

df.head()

Unnamed: 0,text,class
0,Does life actually work for most / non-depress...,depression
1,I found my friend's bodyIt was almost nine yea...,depression
2,Ex Wife Threatening SuicideRecently I left my ...,SuicideWatch
3,Am I weird I don't get affected by compliments...,teenagers
4,Finally 2020 is almost over... So I can never ...,teenagers


In [4]:
pd.value_counts(df['class'])

  pd.value_counts(df['class'])


class
SuicideWatch    116037
teenagers       116037
depression      116036
Name: count, dtype: int64

In [5]:
print(df.loc[0, 'text'])

Does life actually work for most / non-depressed people?It doesn't seem possible to me that everyone isn't miserable. What do you think? My boyfriend told me the other week that in reality we are the minority. Most people are fine, if not happy. Oddball.


In [6]:
df['text'] = df['text'].str.lower()
df['text'] = df['text'].str.replace('\\', '', regex=False)
df['text'] = df['text'].str.replace(r'([?.!])([A-Z])', r'\1 \2', regex=True)

In [7]:
print(df.loc[0, 'text'])

does life actually work for most / non-depressed people?it doesn't seem possible to me that everyone isn't miserable. what do you think? my boyfriend told me the other week that in reality we are the minority. most people are fine, if not happy. oddball.


In [8]:
label_mapping = {
    "teenagers": 0,
    "depression": 1,
    "SuicideWatch": 2
}
df['class'] = df['class'].map(label_mapping)

In [9]:
nan_count = df['class'].isna().sum()
print(f"Number of NaN rows in 'class': {nan_count}")

Number of NaN rows in 'class': 14


In [10]:
df[df['class'].isna()]

Unnamed: 0,text,class
11557,i feel like im in a nightmare.something happen...,
11558,it's like i'm living in a nightmare and everyt...,
11559,(view post history for more info on my dad),
41048,a doodle of my struggle with depressionhttp://...,
47570,thinking of putting this as my profile picture...,
61160,if i told you i want to move on with my life a...,
141715,i think i might need someone to talk me down f...,
141716,i've known that i'll never get any love outsid...,
156657,a clip that describes how i feel when i'm tryi...,
156658,depression,


In [11]:
df = df.dropna(subset=['class']).reset_index(drop=True)

In [12]:
train_df, dev_df = train_test_split(df, train_size=0.8, random_state=seed, stratify=df['class'])
train_df.reset_index(inplace=True, drop=True)
dev_df.reset_index(inplace=True, drop=True)

print(f"Train rows: {len(train_df):,}, Dev rows: {len(dev_df):,}")

Train rows: 278,488, Dev rows: 69,622


In [13]:
train_df['text'] = train_df['text'].astype(str)
dev_df['text'] = dev_df['text'].astype(str)

In [16]:
from transformers import DistilBertTokenizerFast

tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")

# Example tokenization
train_encodings = tokenizer(
    train_df['text'].tolist(),
    truncation=True,
    padding=True,
    max_length=256
)
dev_encodings = tokenizer(
    dev_df['text'].tolist(),
    truncation=True,
    padding=True,
    max_length=256
)

# Print a sample of tokenized input
print("\nSample tokenized input:")
for key in train_encodings.keys():
    print(f"{key}: {train_encodings[key][0]}")  # first example


Sample tokenized input:
input_ids: [101, 1056, 2860, 2743, 2102, 1006, 1045, 2215, 2000, 2463, 2015, 1045, 1521, 1049, 2428, 2074, 3110, 2091, 1012, 2092, 1045, 2812, 2062, 2084, 2091, 1012, 2021, 2005, 1996, 2627, 2095, 2030, 2048, 2026, 6245, 2038, 5407, 2061, 2919, 1012, 1045, 1521, 2310, 2036, 2764, 2019, 5983, 8761, 1998, 10089, 1998, 2969, 19593, 3471, 1012, 2477, 2024, 2524, 1012, 2066, 2183, 2648, 2030, 3331, 2003, 1037, 5998, 1012, 4312, 2015, 1010, 2816, 2746, 2039, 1012, 1996, 5221, 2051, 1997, 1996, 2095, 2073, 2026, 6245, 1998, 10089, 4152, 13330, 1012, 2339, 1029, 1045, 2123, 1521, 1056, 2113, 1012, 1045, 1521, 1049, 1037, 2711, 2040, 7459, 3086, 1010, 2025, 2469, 2339, 1007, 1012, 2021, 13718, 1045, 2196, 2428, 2288, 2009, 1012, 4406, 2060, 3057, 1999, 2026, 3694, 1010, 2027, 2106, 1012, 1045, 3984, 2008, 2428, 13330, 2033, 1012, 1998, 2145, 2515, 1012, 1045, 2467, 2514, 2066, 1037, 10459, 4702, 1012, 8840, 2140, 4312, 2015, 13893, 2003, 1037, 4121, 9495, 2005, 2033, 10

In [18]:
import torch

class TransformerDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
        
    def __len__(self):
        return len(self.labels)
    
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

train_dataset = TransformerDataset(train_encodings, train_df['class'].tolist())
dev_dataset = TransformerDataset(dev_encodings, dev_df['class'].tolist())

print("\nSample from PyTorch dataset:")
sample = train_dataset[0]
for key in sample:
    if key != "labels":
        print(f"{key} shape: {sample[key].shape}, dtype: {sample[key].dtype}")
print(f"label: {sample['labels']}")


Sample from PyTorch dataset:
input_ids shape: torch.Size([256]), dtype: torch.int64
attention_mask shape: torch.Size([256]), dtype: torch.int64
label: 0.0
