In [1]:
import pandas as pd
# Load dataset with specified encoding
try:
    df = pd.read_csv('Downloads/hatspeech dataset.csv', encoding='utf-8')
except UnicodeDecodeError:
    df = pd.read_csv('Downloads/hatspeech dataset.csv', encoding='latin1')

# General dataset information
print("Dataset Info:\n")
print(df.info())

Dataset Info:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41144 entries, 0 to 41143
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   comment  41144 non-null  object
 1   label    41144 non-null  object
dtypes: object(2)
memory usage: 643.0+ KB
None


In [2]:
# Display first few rows of the dataset
print("\nFirst few rows of the dataset:\n")
print(df.head())


First few rows of the dataset:

                                             comment label
0                                dalits are lowlives     N
1             gay people are a burden to our society     N
2                              Arabs are not welcome     N
3  I'm not saying we should actually eliminate he...     N
4                       bananas are for black people     N


In [4]:
# Check for duplicates
duplicates = df.duplicated().sum()
print("\nNumber of duplicate rows:", duplicates)


Number of duplicate rows: 10


In [6]:
# Display duplicate rows
if duplicates > 0:
    print("\nduplicate rows:\n")
    print(df[df.duplicated()].head())


duplicate rows:

                        comment label
41047  Blacks are such pondlife     N
41048  Blacks are such pondlife     N
41051  Blacks are such pondlife     N
41055  Blacks are such pondlife     N
41094    Muslims are a disgrace     N


In [7]:
# Remove duplicates
df = df.drop_duplicates()
print("\nDuplicates removed. Number of rows now:", len(df))


Duplicates removed. Number of rows now: 41134


In [8]:
# Check for null values
null_values = df.isnull().sum()
print("\nNull values in each column:\n", null_values)


Null values in each column:
 comment    0
label      0
dtype: int64


In [10]:
# Statistical summary of the dataset
print("\nStatistical Summary:\n")
print(df.describe(include='all'))


Statistical Summary:

                    comment  label
count                 41134  41134
unique                41134      3
top     dalits are lowlives      N
freq                      1  22151


In [12]:
# Display column names
print("\nColumn Names:\n", df.columns)


Column Names:
 Index(['comment', 'label'], dtype='object')


In [13]:
# Check data types of each column
print("\nData Types:\n", df.dtypes)


Data Types:
 comment    object
label      object
dtype: object


In [14]:
# Distribution of hate/non-hate labels
label_counts = df['label'].value_counts()
print("\nDistribution of Labels:\n", label_counts)


Distribution of Labels:
 label
N    22151
P    18947
O       36
Name: count, dtype: int64



First few hate comments:
 Empty DataFrame
Columns: [comment, label]
Index: []

First few non-hate comments:
 Empty DataFrame
Columns: [comment, label]
Index: []


In [28]:
original_negatives_df = df[df['label'] == 'P']
original_negatives_df.shape

(18947, 2)

In [29]:
original_positives_df = df[df['label'] == 'N']
original_positives_df.shape

(22151, 2)

In [30]:
from textblob import TextBlob

# Generate negated versions of the positive examples
neg_data = original_positives_df.copy()

neg_data['comment'] = neg_data['comment'].apply(lambda x: " ".join(["not " + w for w in x.split()]))

# Label the negated examples as negative (0)
neg_data['sentiment_label'] = 'P'

In [31]:
print(neg_data.shape)
neg_data.head(7)

(22151, 3)


Unnamed: 0,comment,label,sentiment_label
0,not dalits not are not lowlives,N,P
1,not gay not people not are not a not burden no...,N,P
2,not Arabs not are not not not welcome,N,P
3,not I'm not not not saying not we not should n...,N,P
4,not bananas not are not for not black not people,N,P
6,not women not can not not not reproduce not ki...,N,P
8,not Who not cares not what not Chinese not peo...,N,P


In [32]:
# Oversample 1-class and concat the DataFrames of both classes
df_positive_oversampled = original_positives_df.sample(18947, replace=True)
df_positive_oversampled.shape

(18947, 2)

In [33]:
df_negative_oversampled = pd.concat([neg_data,original_negatives_df],ignore_index =True)
df_negative_oversampled.shape

(41098, 3)

In [34]:
df_positive_oversampled = pd.concat([df_positive_oversampled,original_positives_df],ignore_index=True)
df_positive_oversampled.shape

(41098, 2)

In [35]:
# Combine the negated examples with the original positive examples
df_balanced = pd.concat([df_positive_oversampled, df_negative_oversampled], ignore_index=True)
df_balanced.sentiment_label.value_counts()

sentiment_label
P    22151
Name: count, dtype: int64