In [1]:
import json
import os 
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from tqdm.notebook import tqdm
from datetime import datetime  
import glob
import re
from collections import Counter

# Global plot styling for the notebook: white background, fonts scaled 1.4x.
# set_theme() is the current name of this call; sns.set() is only an alias for it.
sns.set_theme(font_scale=1.4, style="white")

In [2]:
# load data
# Directory holding the four Excel datasets. The default is the original
# location; set the STRESS_DATA_DIR environment variable to run elsewhere.
DATA_DIR = os.environ.get("STRESS_DATA_DIR", r"D:\Stress-Detection\data")


def load_excel_dataset(filename):
    """Read one .xlsx file from DATA_DIR into a DataFrame using openpyxl."""
    return pd.read_excel(os.path.join(DATA_DIR, filename), engine='openpyxl')


reddit_combi_df = load_excel_dataset("Reddit_Combi.xlsx")
reddit_title_df = load_excel_dataset("Reddit_Title.xlsx")
twitter_full_df = load_excel_dataset("Twitter_Full.xlsx")
twitter_non_ad_df = load_excel_dataset("Twitter_Non-Advert.xlsx")

# Report (rows, columns) for each dataset as a quick load check.
print(reddit_title_df.shape)
print(reddit_combi_df.shape)
print(twitter_full_df.shape)
print(twitter_non_ad_df.shape)


(5556, 2)
(3123, 4)
(8900, 3)
(2051, 2)


In [3]:
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit an extra "None" line
# after every summary.
reddit_title_df.info()
reddit_combi_df.info()
twitter_full_df.info()
twitter_non_ad_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5556 entries, 0 to 5555
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   title   5556 non-null   object
 1   label   5556 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 86.9+ KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3123 entries, 0 to 3122
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   title       3123 non-null   object
 1   body        3116 non-null   object
 2   Body_Title  3123 non-null   object
 3   label       3123 non-null   int64 
dtypes: int64(1), object(3)
memory usage: 97.7+ KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8900 entries, 0 to 8899
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   text      8900 non-null   object
 1   hashtags  8892 non-null   object
 2   labels    8900 non-null   int

In [4]:
# Drop every row that has a missing value in any column. Per the info()
# summaries, the only incomplete columns are `body` (Reddit Combi) and
# `hashtags` (Twitter Full).
# NOTE(review): this also discards tweets whose only gap is `hashtags` --
# confirm that is intended rather than dropping on `text` alone.
# Rebinding (instead of inplace=True) keeps the cell chainable and re-runnable.
reddit_combi_df = reddit_combi_df.dropna()
twitter_full_df = twitter_full_df.dropna()

# info() prints its own report and returns None, so it is not wrapped in print().
reddit_combi_df.info()
twitter_full_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3116 entries, 0 to 3122
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   title       3116 non-null   object
 1   body        3116 non-null   object
 2   Body_Title  3116 non-null   object
 3   label       3116 non-null   int64 
dtypes: int64(1), object(3)
memory usage: 121.7+ KB
None
<class 'pandas.core.frame.DataFrame'>
Index: 8892 entries, 0 to 8899
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   text      8892 non-null   object
 1   hashtags  8892 non-null   object
 2   labels    8892 non-null   int64 
dtypes: int64(1), object(2)
memory usage: 277.9+ KB
None


In [5]:
# For each dataset, report per column how many values repeat an earlier value
# in that same column (Series.duplicated marks every occurrence after the first).
named_frames = zip(
    ['Reddit Title', 'Reddit Combi', 'Twitter Full', 'Twitter Non-Ad'],
    [reddit_title_df, reddit_combi_df, twitter_full_df, twitter_non_ad_df],
)
for name, df in named_frames:
    report_lines = [f"Duplicates in {name} dataframe:"]
    report_lines += [
        f"  Column '{col}': {df[col].duplicated().sum()} duplicates"
        for col in df.columns
    ]
    # One blank line separates the reports of consecutive datasets.
    print("\n".join(report_lines), end="\n\n")

Duplicates in Reddit Title dataframe:
  Column 'title': 34 duplicates
  Column 'label': 5554 duplicates

Duplicates in Reddit Combi dataframe:
  Column 'title': 25 duplicates
  Column 'body': 4 duplicates
  Column 'Body_Title': 0 duplicates
  Column 'label': 3114 duplicates

Duplicates in Twitter Full dataframe:
  Column 'text': 459 duplicates
  Column 'hashtags': 3322 duplicates
  Column 'labels': 8890 duplicates

Duplicates in Twitter Non-Ad dataframe:
  Column 'text': 79 duplicates
  Column 'label': 2049 duplicates



In [9]:
# Remove rows whose text repeats an earlier row (the first occurrence is kept).
reddit_title_df = reddit_title_df.drop_duplicates(subset=['title'])
# Reddit Combi is de-duplicated on `title` first, then on `body`.
reddit_combi_df = (
    reddit_combi_df
    .drop_duplicates(subset=['title'])
    .drop_duplicates(subset=['body'])
)
twitter_full_df = twitter_full_df.drop_duplicates(subset=['text'])
twitter_non_ad_df = twitter_non_ad_df.drop_duplicates(subset=['text'])

# Show the remaining (rows, columns) of each dataset, one per line.
print(
    "After removing duplicates:",
    reddit_title_df.shape,
    reddit_combi_df.shape,
    twitter_full_df.shape,
    twitter_non_ad_df.shape,
    sep="\n",
)

# Re-run the per-column duplicate count on the de-duplicated frames to confirm
# that the text columns no longer contain repeated values.
frames_by_label = {
    'Reddit Title': reddit_title_df,
    'Reddit Combi': reddit_combi_df,
    'Twitter Full': twitter_full_df,
    'Twitter Non-Ad': twitter_non_ad_df,
}
for name, df in frames_by_label.items():
    print(f"Duplicates in {name} dataframe:")
    for col in df.columns:
        # True for every value that already appeared earlier in the column.
        is_repeat = df[col].duplicated()
        print(f"  Column '{col}': {is_repeat.sum()} duplicates")
    print()

After removing duplicates:
(5513, 2)
(3080, 4)
(8410, 3)
(1966, 2)
Duplicates in Reddit Title dataframe:
  Column 'title': 0 duplicates
  Column 'label': 5511 duplicates

Duplicates in Reddit Combi dataframe:
  Column 'title': 0 duplicates
  Column 'body': 0 duplicates
  Column 'Body_Title': 0 duplicates
  Column 'label': 3078 duplicates

Duplicates in Twitter Full dataframe:
  Column 'text': 0 duplicates
  Column 'hashtags': 2967 duplicates
  Column 'labels': 8408 duplicates

Duplicates in Twitter Non-Ad dataframe:
  Column 'text': 0 duplicates
  Column 'label': 1964 duplicates



In [10]:
def pre_processing(text):
    """Normalise one raw text cell into plain, space-separated tokens.

    Steps:
      1. Remove URLs (``http://``, ``https://`` and ``www.`` forms).
      2. Replace every run of characters other than ASCII letters, digits
         and ``#`` with a single space.
      3. Strip leading and trailing spaces.

    Args:
        text: The cell value. Normally a ``str``; ``None``/NaN and other
            non-string values (e.g. a number) are also accepted.

    Returns:
        str: The cleaned text, or ``""`` when the input is ``None`` or NaN.
    """
    # Missing cells show up as None or float NaN (NaN is the only float that
    # is not equal to itself). re.sub would raise TypeError on them.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        # Non-string cell (e.g. a purely numeric value): clean its text form.
        text = str(text)

    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    # This pass already turns each run of newlines, whitespace and punctuation
    # into exactly one space, so no separate newline or whitespace-collapsing
    # substitution is needed afterwards.
    text = re.sub(r"[^A-Za-z0-9#]+", ' ', text)
    return text.strip()

# Clean every object-dtype (text) column of each dataset; the cleaned values
# are written back into the same frame, column by column.
frames_to_clean = (reddit_title_df, reddit_combi_df, twitter_full_df, twitter_non_ad_df)
for df in frames_to_clean:
    text_columns = df.select_dtypes(include=['object']).columns
    for col in text_columns:
        cleaned = df[col].apply(pre_processing)
        df[col] = cleaned

In [None]:
# Shapes after text cleaning, one dataset per line.
print(
    "After processing text:",
    reddit_title_df.shape,
    reddit_combi_df.shape,
    twitter_full_df.shape,
    twitter_non_ad_df.shape,
    sep="\n",
)

# The cleaning step only rewrites cell values, so it did not remove any rows.

After processing text:
(5513, 2)
(3080, 4)
(8410, 3)
(1966, 2)


In [None]:
import nltk