## LOAD AND ANALYZE DATASET


In [66]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

In [67]:
# Read the three raw CSV files; the path order matches the assignment targets.
emotion_df, hate_df, violence_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "data/emotions_dataset.csv",
        "data/hate_speech_dataset.csv",
        "data/violence_dataset/violence_dataset.csv",
    )
)

# Helper that prints a quick textual overview of a dataframe
def display_dataset_info(name, df):
    """Print a banner, the shape, the column names and the first rows of ``df``.

    Args:
        name: Human-readable dataset name; it is upper-cased in the banner.
        df: The pandas DataFrame to summarise.

    Returns:
        None. Everything is written to standard output.
    """
    banner = f"\n{'='*20} {name.upper()} DATASET {'='*20}"
    shape_line = f"\n👉 Shape: {df.shape}"
    columns_line = f"\n👉 Columns: {df.columns.tolist()}"
    head_block = f"\n👉 Head:\n{df.head()}"
    # One print per section, exactly like four consecutive print() calls.
    for section in (banner, shape_line, columns_line, head_block):
        print(section)

# Show the overview for every dataset, in the same order as before
for dataset_name, dataset in (
    ("Emotion", emotion_df),
    ("Violence", violence_df),
    ("Hate Speech", hate_df),
):
    display_dataset_info(dataset_name, dataset)




👉 Shape: (416809, 3)

👉 Columns: ['Unnamed: 0', 'text', 'label']

👉 Head:
   Unnamed: 0                                               text  label
0           0      i just feel really helpless and heavy hearted      4
1           1  ive enjoyed being able to slouch about relax a...      0
2           2  i gave up my internship with the dmrg and am f...      4
3           3                         i dont know i feel so lost      0
4           4  i am a kindergarten teacher and i am thoroughl...      4


👉 Shape: (39650, 3)

👉 Columns: ['Tweet_ID', 'tweet', 'type']

👉 Head:
      Tweet_ID                                              tweet  \
0  ID_0022DWKP  Had a dream i got raped last night. By a guy i...   
1  ID_00395QYM  he thought the word raped means sex and told m...   
2  ID_003EOSSF  She NOT TALKING TO ME I WAS RAPED BY 2 MEN 1 M...   
3  ID_004BBHOD  I was sexually abused for 3 years at age 4 to ...   
4  ID_004F7516  Chessy Prout can do better by telling the trut...   

    

## DATA PREPROCESSING

In [68]:
# Drop columns that are not needed downstream.
# Reassigning (instead of inplace=True) keeps each statement a plain
# "new frame from old frame" step; errors='ignore' turns the drop into a
# no-op when the column is already gone, so these two lines can be re-run.
emotion_df = emotion_df.drop(columns=['Unnamed: 0'], errors='ignore')
violence_df = violence_df.drop(columns=['Tweet_ID'], errors='ignore')

# The hate-speech data has many columns to discard, so select the two we keep
# instead of listing every column to drop. .copy() makes the result an
# independent frame rather than a selection tied to the loaded one.
hate_df = hate_df[['tweet', 'class']].copy()

In [69]:
def display_dataset_info(name, df):
    """Print an overview of ``df``: banner, shape, column names and first rows.

    NOTE(review): this has the same behaviour as the helper already defined in
    the data-loading cell above; the re-definition is redundant.

    Args:
        name: Dataset name shown (upper-cased) in the banner line.
        df: The pandas DataFrame to describe.

    Returns:
        None. The overview is written to standard output.
    """
    # sep="\n" reproduces the line breaks of four separate print() calls.
    print(
        f"\n{'='*20} {name.upper()} DATASET {'='*20}",
        f"\n👉 Shape: {df.shape}",
        f"\n👉 Columns: {df.columns.tolist()}",
        f"\n👉 Head:\n{df.head()}",
        sep="\n",
    )

# Show the overview again now that the unwanted columns are gone
for title, frame in zip(
    ("Emotion", "Violence", "Hate Speech"),
    (emotion_df, violence_df, hate_df),
):
    display_dataset_info(title, frame)



👉 Shape: (416809, 2)

👉 Columns: ['text', 'label']

👉 Head:
                                                text  label
0      i just feel really helpless and heavy hearted      4
1  ive enjoyed being able to slouch about relax a...      0
2  i gave up my internship with the dmrg and am f...      4
3                         i dont know i feel so lost      0
4  i am a kindergarten teacher and i am thoroughl...      4


👉 Shape: (39650, 2)

👉 Columns: ['tweet', 'type']

👉 Head:
                                               tweet             type
0  Had a dream i got raped last night. By a guy i...  sexual_violence
1  he thought the word raped means sex and told m...  sexual_violence
2  She NOT TALKING TO ME I WAS RAPED BY 2 MEN 1 M...  sexual_violence
3  I was sexually abused for 3 years at age 4 to ...  sexual_violence
4  Chessy Prout can do better by telling the trut...  sexual_violence


👉 Shape: (24783, 2)

👉 Columns: ['tweet', 'class']

👉 Head:
                                   

We notice that the text and label column names are not consistent across the datasets.

In [70]:
# Rename columns so every dataset exposes the same 'text' / 'label' schema.
# emotion_df already has 'text' and 'label' columns.
# Reassigning avoids in-place mutation of hate_df, which was produced by a
# column selection in an earlier cell.
hate_df = hate_df.rename(columns={'tweet': 'text', 'class': 'label'})
violence_df = violence_df.rename(columns={'tweet': 'text', 'type': 'label'})

In [71]:
# Print the column index of each dataset (emotion, violence, hate speech)
for frame in (emotion_df, violence_df, hate_df):
    print(frame.columns)

Index(['text', 'label'], dtype='object')
Index(['text', 'label'], dtype='object')
Index(['text', 'label'], dtype='object')


Checking for null values

In [72]:
# Per-column null counts for the emotion, violence and hate-speech frames (in that order)
tuple(frame.isna().sum() for frame in (emotion_df, violence_df, hate_df))

(text     0
 label    0
 dtype: int64,
 text     0
 label    0
 dtype: int64,
 text     0
 label    0
 dtype: int64)

Great!! No null values!!

In [73]:
# (rows, columns) of the emotion, violence and hate-speech frames (in that order)
tuple(frame.shape for frame in (emotion_df, violence_df, hate_df))

((416809, 2), (39650, 2), (24783, 2))

We can see that the number of rows differs greatly between the datasets.

So we will downsample each dataset to the same size (12k rows each) and build 3 new datasets.

These rows should be distributed as evenly as possible among the classes.

In [74]:
# Class distribution of the emotion, violence and hate-speech frames (in that order)
tuple(frame['label'].value_counts() for frame in (emotion_df, violence_df, hate_df))

(label
 1    141067
 0    121187
 3     57317
 4     47712
 2     34554
 5     14972
 Name: count, dtype: int64,
 label
 sexual_violence                 32648
 Physical_violence                5946
 emotional_violence                651
 economic_violence                 217
 Harmful_Traditional_practice      188
 Name: count, dtype: int64,
 label
 1    19190
 2     4163
 0     1430
 Name: count, dtype: int64)

1. Firstly for Emotions dataset


In [75]:
# Draw the same number of rows from every emotion class so the subset is balanced.
SAMPLES_PER_CLASS = 2000

# Build all per-class samples first and concatenate once: this avoids
# re-copying the growing frame on every iteration and never concatenates
# with an empty DataFrame (which newer pandas releases may warn about).
# Classes are processed in order of first appearance in emotion_df, and each
# class is sampled with the same fixed seed, exactly as before.
e_df = pd.concat(
    [
        emotion_df[emotion_df['label'] == class_id].sample(
            n=SAMPLES_PER_CLASS, random_state=42
        )
        for class_id in emotion_df['label'].unique()
    ],
    ignore_index=True,
)

In [76]:
# Sanity check: 6 emotion classes x 2000 sampled rows each -> 12000 rows expected.
e_df.shape

(12000, 2)

In [77]:
# Replace the full emotion frame with an independent copy of the balanced sample,
# then confirm the per-class row counts.
emotion_df = e_df.copy(deep=True)
print(emotion_df['label'].value_counts())

label
4    2000
0    2000
2    2000
1    2000
5    2000
3    2000
Name: count, dtype: int64


2. Secondly for Violence dataset

In [78]:
# Class distribution of the violence dataset before balancing
violence_df['label'].value_counts()

label
sexual_violence                 32648
Physical_violence                5946
emotional_violence                651
economic_violence                 217
Harmful_Traditional_practice      188
Name: count, dtype: int64

Since there is a huge imbalance, we need to balance it accordingly.

Since emotional_violence, economic_violence, and Harmful_Traditional_practice are low-frequency classes,

we will use all the data for these classes.

We will also keep all of Physical_violence,

and for sexual_violence we will keep 4998 random rows to bring the total row count to 12000.

In [79]:
# Down-sample 'sexual_violence' (the majority class) and keep every row of the
# other classes, so the violence dataset ends up with TARGET_ROWS rows in total.
TARGET_ROWS = 12000
is_sexual_violence = violence_df['label'] == 'sexual_violence'
other_classes = violence_df[~is_sexual_violence]
# Number of majority-class rows needed to reach the target
# (4998 for the class counts shown above).
sexual_violence = violence_df[is_sexual_violence].sample(
    n=TARGET_ROWS - len(other_classes), random_state=42
)
# Same row order as before: all other classes first, then the sampled rows.
violence_df = pd.concat([other_classes, sexual_violence], ignore_index=True)
print(violence_df['label'].value_counts())

label
Physical_violence               5946
sexual_violence                 4998
emotional_violence               651
economic_violence                217
Harmful_Traditional_practice     188
Name: count, dtype: int64


In [80]:
# Sanity check: 7002 rows kept from the smaller classes + 4998 sampled rows -> 12000 rows expected.
violence_df.shape

(12000, 2)

3. Lastly for Hate dataframe

In [81]:
# Class distribution of the hate-speech dataset before balancing
hate_df['label'].value_counts()

label
1    19190
2     4163
0     1430
Name: count, dtype: int64

Here, too, we will keep all rows of classes 2 and 0, and

for class 1 we sample 6407 rows to bring the dataset size to 12k.

In [82]:
# Down-sample class 1 (the majority class) and keep every row of classes 0 and 2,
# so the hate-speech dataset ends up with TARGET_ROWS rows in total.
TARGET_ROWS = 12000
is_majority_class = hate_df['label'] == 1
minority_classes = hate_df[~is_majority_class]
# Number of class-1 rows needed to reach the target
# (6407 for the class counts shown above).
offensive_speech = hate_df[is_majority_class].sample(
    n=TARGET_ROWS - len(minority_classes), random_state=42
)
# Same row order as before: classes 0 and 2 first, then the sampled class-1 rows.
hate_df = pd.concat([minority_classes, offensive_speech], ignore_index=True)
print(hate_df['label'].value_counts())

label
1    6407
2    4163
0    1430
Name: count, dtype: int64


In [83]:
# (rows, columns) of the three balanced frames: emotion, violence, hate speech
tuple(frame.shape for frame in (emotion_df, violence_df, hate_df))

((12000, 2), (12000, 2), (12000, 2))

Now all our new dataframe shapes are consistent.

Since the rows were randomly sampled, we make sure each dataframe has a clean, sequential index.

So we reset the indices.

In [84]:
# Reset index for all datasets.
# The frames were rebuilt with pd.concat(..., ignore_index=True) above, so this
# mainly guarantees a clean 0..n-1 index; drop=True discards the old index
# instead of keeping it as a column. Reassigning avoids inplace mutation.
emotion_df = emotion_df.reset_index(drop=True)
violence_df = violence_df.reset_index(drop=True)
hate_df = hate_df.reset_index(drop=True)

In [85]:
# Function to display dataset info (banner and first rows only)
def display_dataset_info(name, df):
    """Print a banner for ``name`` followed by the first rows of ``df``.

    This variant prints neither the shape nor the column list.

    Args:
        name: Dataset name shown (upper-cased) in the banner line.
        df: The pandas DataFrame whose head is printed.

    Returns:
        None. The text is written to standard output.
    """
    banner = f"\n{'='*20} {name.upper()} DATASET {'='*20}"
    preview = f"\n👉 Head:\n{df.head()}"
    # sep="\n" reproduces the line breaks of two separate print() calls.
    print(banner, preview, sep="\n")

# Preview the first rows of each balanced dataset
for title, frame in {
    "Emotion": emotion_df,
    "Violence": violence_df,
    "Hate Speech": hate_df,
}.items():
    display_dataset_info(title, frame)



👉 Head:
                                                text  label
0  i feel that it creates a suspicious environmen...      4
1               i feel reluctant asking for anything      4
2  i am afraid to really show what i feel because...      4
3  i think he feels a little helpless in all this...      4
4                          i certainly feel tortured      4


👉 Head:
                                                text              label
0  My Husband Beats Me Frequently, Wife Tells Cou...  Physical_violence
1  Best thing for me to do, is remain silent when...  Physical_violence
2  My husband will never beat me, Bambam denies r...  Physical_violence
3  theyre like, i just wanna be a baby maker with...  Physical_violence
4  I was in England for a week, the longest I’ve ...  Physical_violence


👉 Head:
                                                text  label
0  !!! RT @mayasolovely: As a woman you shouldn't...      2
1    " momma said no pussy cats inside my doghouse "      

Now the indices are consistent as well..

## LABEL ENCODING

We don't need encoding for the Emotion and Hate Speech datasets because their labels are already integers; we only need to encode the Violence dataset.

In [86]:
# Fit the encoder on the violence labels and overwrite the column with integer codes.
# Judging by the value counts printed before and after this cell, the mapping is:
#   0 = Harmful_Traditional_practice, 1 = Physical_violence, 2 = economic_violence,
#   3 = emotional_violence, 4 = sexual_violence
# label_encoder.classes_ holds the original names if the codes need to be mapped back.
label_encoder = LabelEncoder()
violence_df['label'] = label_encoder.fit_transform(violence_df['label'])

In [89]:
# Confirm the labels are now integer codes and the per-class counts are unchanged.
violence_df['label'].value_counts()

label
1    5946
4    4998
3     651
2     217
0     188
Name: count, dtype: int64

Now labels in Violence Dataset have been encoded.