In [1]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from sklearn.utils import resample

In [2]:
# Load the raw posts. The frame displayed below has the columns
# 'Pid', 'text data' and 'Class labels'.
# NOTE(review): it also shows an 'Unnamed: 0' column, which looks like an index
# that was written into the CSV — confirm whether it should be kept.
df = pd.read_csv(filepath_or_buffer='dev_data.csv')

In [3]:
# Show the frame exactly as loaded, before any cleaning.
df

Unnamed: 0,Pid,text data,Class labels
0,dev_pid_1,Im scared : This is it. I lie to myself every ...,moderate
1,dev_pid_2,New to this but just wanted to vent : I just f...,moderate
2,dev_pid_3,I’m sad : It’s kinda always been an issue. I w...,moderate
3,dev_pid_4,Lonely but not alone. : All of my immediately ...,moderate
4,dev_pid_5,This year has been trash. : I dont know why I’...,moderate
...,...,...,...
3240,dev_pid_3241,"Feeling lonely. : Hi reddit, I haven’t posted ...",severe
3241,dev_pid_3242,When would suicide be right? : So I got back f...,severe
3242,dev_pid_3243,Lowest I’ve ever been ever. : To make a long s...,severe
3243,dev_pid_3244,Does the Toxoplasma Gondii ruined my life ? (f...,severe


In [4]:
# Drop rows with missing values, modifying `df` in place. No `subset` is
# given, so a missing value in any column removes the row.
df.dropna(inplace=True)

In [5]:
_ENGLISH_STOP_WORDS = None  # filled on first use by _english_stop_words()


def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, reading the corpus only once."""
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOP_WORDS


def preprocess_text(text, stop_words=None):
    """Clean one post and return it as a single space-separated string.

    Steps, in order: lowercase the text; remove URLs (``http`` followed by any
    run of non-whitespace); remove ``@mention`` / ``#hashtag`` tokens; remove
    every character that is neither a word character nor whitespace; tokenise
    with ``nltk.word_tokenize``; drop stop words; join what is left with
    single spaces.

    Parameters
    ----------
    text : str
        Raw post text.
    stop_words : container of str, optional
        Tokens to discard. When omitted, NLTK's English stop-word list is
        used; it is loaded once and reused across calls instead of being
        re-read for every row.

    Returns
    -------
    str
        The cleaned text; an empty string if no token survives.

    Notes
    -----
    Relies on NLTK tokenizer and stop-word data being available; no download
    call is visible in this notebook — TODO confirm they are installed.
    """
    if stop_words is None:
        stop_words = _english_stop_words()

    text = text.lower()
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'[@#]\w+', '', text)
    # NOTE(review): punctuation (including apostrophes) is stripped *before*
    # the stop-word filter, so contractions reach the filter as e.g. "dont" /
    # "im" and survive it, as the cell output shows. Confirm this is intended.
    text = re.sub(r'[^\w\s]', '', text)

    tokens = nltk.word_tokenize(text)
    return ' '.join(token for token in tokens if token not in stop_words)

# Clean every post; the raw text in the 'text data' column is overwritten.
df['text data'] = df['text data'].map(preprocess_text)

In [6]:
# Show the frame again to check the cleaned 'text data' column.
df

Unnamed: 0,Pid,text data,Class labels
0,dev_pid_1,im scared lie every day say ill make think mig...,moderate
1,dev_pid_2,new wanted vent finally realized im kind bad m...,moderate
2,dev_pid_3,im sad kinda always issue wouldnt say bad peer...,moderate
3,dev_pid_4,lonely alone immediately family members dead d...,moderate
4,dev_pid_5,year trash dont know im posting dont even know...,moderate
...,...,...,...
3240,dev_pid_3241,feeling lonely hi reddit havent posted sub har...,severe
3241,dev_pid_3242,would suicide right got back hospital weeks ag...,severe
3242,dev_pid_3243,lowest ive ever ever make long story short mad...,severe
3243,dev_pid_3244,toxoplasma gondii ruined life first part ok go...,severe


In [7]:
# Split the frame by class label. 'moderate' is treated as the majority class
# whose size the other two classes are upsampled to in the next cell.
df_majority = df.loc[df['Class labels'] == 'moderate']
df_minority1 = df.loc[df['Class labels'] == 'severe']
df_minority2 = df.loc[df['Class labels'] == 'not depression']

In [8]:
# Upsample each minority class (sampling with replacement) to the size of the
# majority class. Both draws use the same fixed seed.
df_minority1_upsampled, df_minority2_upsampled = (
    resample(minority, replace=True, n_samples=len(df_majority), random_state=123)
    for minority in (df_minority1, df_minority2)
)

In [9]:
# Stack the majority class and the two upsampled minority classes, in that order.
# Original index labels are kept, so resampled rows repeat their source label.
df_upsampled = pd.concat(
    objs=[
        df_majority,
        df_minority1_upsampled,
        df_minority2_upsampled,
    ]
)

In [10]:
# NOTE(review): this assigns the 'Class labels' column to itself, so the label
# values are left unchanged. It looks like a leftover from a removed
# label-mapping step — confirm and delete if so.
df_upsampled['Class labels'] = df_upsampled['Class labels']

In [11]:
# Show the balanced frame (majority rows first, then the resampled minority rows).
df_upsampled

Unnamed: 0,Pid,text data,Class labels
0,dev_pid_1,im scared lie every day say ill make think mig...,moderate
1,dev_pid_2,new wanted vent finally realized im kind bad m...,moderate
2,dev_pid_3,im sad kinda always issue wouldnt say bad peer...,moderate
3,dev_pid_4,lonely alone immediately family members dead d...,moderate
4,dev_pid_5,year trash dont know im posting dont even know...,moderate
...,...,...,...
2513,dev_pid_2514,anyone feared theyll kill loved ones sleep uni...,not depression
2828,dev_pid_2829,finally got fulltime job feel even worse maybe...,not depression
2259,dev_pid_2260,deal boredomnothing satisfying anymore days ta...,not depression
2652,dev_pid_2653,got bed showered morning removed,not depression


In [12]:
# Write the balanced, cleaned frame to disk without the pandas index.
df_upsampled.to_csv(path_or_buf='preprocessed_dev_data.csv', index=False)