In [63]:
import re
from functools import lru_cache

import nltk
import pandas as pd
from nltk.corpus import stopwords
from sklearn.utils import resample

In [64]:
# Load the labelled training posts into a DataFrame.
TRAIN_CSV_PATH = 'train_data.csv'
df = pd.read_csv(TRAIN_CSV_PATH)

In [65]:
# Display the frame exactly as loaded from train_data.csv (pandas truncates the middle rows).
df

Unnamed: 0,PID,Text data,Label
0,train_pid_1,Waiting for my mind to have a breakdown once t...,moderate
1,train_pid_2,My new years resolution : I'm gonna get my ass...,moderate
2,train_pid_3,New year : Somone else Feeling like 2020 will ...,moderate
3,train_pid_4,"My story I guess : Hi, Im from Germany and my ...",moderate
4,train_pid_5,Sat in the dark and cried myself going into th...,moderate
...,...,...,...
7196,train_pid_7197,Aren’t we all just tired? : I’ve been depresse...,severe
7197,train_pid_7198,NEED HELP COPING : I had my life pretty much f...,severe
7198,train_pid_7199,Qutting Zoloft Cold Turkey : I was on 75 mg se...,severe
7199,train_pid_7200,Crying : I’m coming off my antidepressants and...,severe


In [66]:
# Remove, in place, every row that has a missing value in any column.
df.dropna(axis=0, how='any', inplace=True)

In [68]:
# Encode the string class labels as integer ids.
label_to_id = {'not depression': 0, 'moderate': 1, 'severe': 2}
encoded_labels = df['Label'].map(label_to_id)

# Series.map yields NaN for any value that is not a key of the dict. Such rows
# would match none of the `Label == k` filters used later and silently drop out
# of the data set, so fail loudly instead. This also catches an accidental
# re-run of this cell, when the column already holds integers.
unmapped = encoded_labels.isna()
if unmapped.any():
    raise ValueError(
        "Unexpected values in 'Label': {}; expected one of {}".format(
            sorted(df.loc[unmapped, 'Label'].astype(str).unique()),
            list(label_to_id),
        )
    )

df['Label'] = encoded_labels
df

Unnamed: 0,PID,Text data,Label
0,train_pid_1,Waiting for my mind to have a breakdown once t...,1
1,train_pid_2,My new years resolution : I'm gonna get my ass...,1
2,train_pid_3,New year : Somone else Feeling like 2020 will ...,1
3,train_pid_4,"My story I guess : Hi, Im from Germany and my ...",1
4,train_pid_5,Sat in the dark and cried myself going into th...,1
...,...,...,...
7196,train_pid_7197,Aren’t we all just tired? : I’ve been depresse...,2
7197,train_pid_7198,NEED HELP COPING : I had my life pretty much f...,2
7198,train_pid_7199,Qutting Zoloft Cold Turkey : I was on 75 mg se...,2
7199,train_pid_7200,Crying : I’m coming off my antidepressants and...,2


In [70]:
@lru_cache(maxsize=1)
def _english_stop_words():
    """Return NLTK's English stop-word list as a frozenset, built once and cached."""
    return frozenset(stopwords.words('english'))


def preprocess_text(text):
    """Normalise one post into a space-separated string of content tokens.

    Steps, in order: lowercase; drop URLs (``http`` up to the next
    whitespace); drop ``@mention`` and ``#hashtag`` tokens; strip every
    character that is neither a word character nor whitespace; tokenize with
    ``nltk.word_tokenize``; remove English stop words.

    Parameters
    ----------
    text : str
        Raw text. ``None``/NaN is treated as empty text; any other
        non-string value is converted with ``str`` first.

    Returns
    -------
    str
        The remaining tokens joined by single spaces, or ``''`` when nothing
        is left after cleaning.

    Notes
    -----
    Punctuation is stripped before stop-word filtering, so contractions lose
    their apostrophe first ("isn't" -> "isnt") and therefore are not matched
    against the stop-word list.
    Requires the NLTK tokenizer and stop-word resources to be installed.
    """
    if not isinstance(text, str):
        # Missing values (None, NaN, pd.NA) carry no text at all.
        if pd.isna(text):
            return ''
        text = str(text)

    text = text.lower()
    text = re.sub(r'http\S+', '', text)   # URLs
    text = re.sub(r'[@#]\w+', '', text)   # @mentions and #hashtags
    text = re.sub(r'[^\w\s]', '', text)   # punctuation / symbols

    if not text.strip():
        # Nothing left to tokenize; same result as tokenizing, without the NLTK call.
        return ''

    # The stop-word set is loaded once instead of on every call.
    stop_words = _english_stop_words()
    return ' '.join(
        token for token in nltk.word_tokenize(text) if token not in stop_words
    )

# Clean every post in place, one row at a time.
text_column = 'Text data'
df[text_column] = df[text_column].apply(preprocess_text)

In [71]:
# Display the frame after preprocess_text has been applied to 'Text data'.
df

Unnamed: 0,PID,Text data,Label
0,train_pid_1,waiting mind breakdown new year feeling isnt a...,1
1,train_pid_2,new years resolution im gon na get ass therapi...,1
2,train_pid_3,new year somone else feeling like 2020 last ye...,1
3,train_pid_4,story guess hi im germany english mostly self ...,1
4,train_pid_5,sat dark cried going new year great start 2020,1
...,...,...,...
7196,train_pid_7197,arent tired ive depressed months lost trust pe...,2
7197,train_pid_7198,need help coping life pretty much figured toge...,2
7198,train_pid_7199,qutting zoloft cold turkey 75 mg sertraline 4 ...,2
7199,train_pid_7200,crying im coming antidepressants emotions comi...,2


In [73]:
# Split the frame by encoded label; label 1 is the one treated as the majority class.
label_column = df['Label']
df_minority1 = df.loc[label_column == 2]
df_minority2 = df.loc[label_column == 0]
df_majority = df.loc[label_column == 1]

In [74]:
# Sample each minority class with replacement until it is as large as the
# majority class. Both draws use the same fixed seed.
upsample_options = dict(replace=True, n_samples=len(df_majority), random_state=123)
df_minority1_upsampled = resample(df_minority1, **upsample_options)
df_minority2_upsampled = resample(df_minority2, **upsample_options)

In [75]:
# Stack the majority rows and the two upsampled minority sets; original index labels are kept.
balanced_parts = [df_majority, df_minority1_upsampled, df_minority2_upsampled]
df_upsampled = pd.concat(balanced_parts)

In [77]:
# Turn the integer class ids back into their string labels.
id_to_label = {0: 'not depression', 1: 'moderate', 2: 'severe'}
df_upsampled['Label'] = df_upsampled['Label'].map(id_to_label)

In [79]:
# Display the class-balanced frame; repeated index labels come from sampling with replacement.
df_upsampled

Unnamed: 0,PID,Text data,Label
0,train_pid_1,waiting mind breakdown new year feeling isnt a...,moderate
1,train_pid_2,new years resolution im gon na get ass therapi...,moderate
2,train_pid_3,new year somone else feeling like 2020 last ye...,moderate
3,train_pid_4,story guess hi im germany english mostly self ...,moderate
4,train_pid_5,sat dark cried going new year great start 2020,moderate
...,...,...,...
5032,train_pid_5033,never want rely anyone lowkey always probably,not depression
5050,train_pid_5051,anyone else feel like every single week dumb s...,not depression
1721,train_pid_1722,well well im done cant believe im saying ive f...,not depression
5372,train_pid_5373,thinking maxing two credit cards buy gaming la...,not depression


In [80]:
# Write the balanced, preprocessed frame to disk without the index column.
OUTPUT_CSV_PATH = 'preprocessed_train_data.csv'
df_upsampled.to_csv(OUTPUT_CSV_PATH, index=False)