# Data Processing
This notebook processes raw/downloaded data and compiles it into our dataset

In [1]:
import pandas as pd
import numpy as np
from langdetect import detect

import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package stopwords to /home/vscode/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/vscode/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/vscode/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [2]:
dataset = pd.DataFrame(columns=["text", "class"])

In [3]:
def analyze_text_column(df: pd.DataFrame, text_column = "text"):
    df_copy = df.copy()
    df_copy["chars"] = df[text_column].apply(len)
    return df_copy["chars"].describe()

# Load Raw Data
This section loads the data from existing datasets and merges it into our dataset

## 1. Hate Speech and Offensive Language
- *Source*: https://github.com/t-davidson/hate-speech-and-offensive-language/raw/master/data/labeled_data.csv
- *Datatype*: Tweets
- *Description*: The text is classified as: hate-speech, offensive language, and neither
- *Comments*: This dataset contains mainly hate speech data and isn't really defined on `neither` class
- *Merge process*:
    - `Inappropriate`: Offensive (1) or Hate Speech (0)
    - `Appropriate`: Neither (2)

In [4]:
hsaol = pd.read_csv("downloads/hsaol.csv", sep=",")[["tweet","class"]]

In [5]:
hsaol["class"].value_counts()

1    19190
2     4163
0     1430
Name: class, dtype: int64

In [6]:
hsaol.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24783 entries, 0 to 24782
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   tweet   24783 non-null  object
 1   class   24783 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 387.4+ KB


In [7]:
analyze_text_column(hsaol, "tweet")

count    24783.000000
mean        85.436065
std         41.548238
min          5.000000
25%         52.000000
50%         81.000000
75%        119.000000
max        754.000000
Name: chars, dtype: float64

In [8]:
hsaol["tweet"] = hsaol["tweet"].str.replace("^!*", "")
hsaol["tweet"] = hsaol["tweet"].str.replace("\sRT\s@.*:", " ")
hsaol["inappro"] = (hsaol["class"] < 2).astype(int)

hsaol = hsaol[["tweet", "inappro"]].rename(columns={'tweet': 'text', 'inappro': 'class'})

  hsaol["tweet"] = hsaol["tweet"].str.replace("^!*", "")
  hsaol["tweet"] = hsaol["tweet"].str.replace("\sRT\s@.*:", " ")


In [9]:
dataset = pd.concat([dataset, hsaol], ignore_index=True)

## 2. Measuring Hate Speech
- *Source*: https://huggingface.co/datasets/ucberkeley-dlab/measuring-hate-speechv
- *Datatype*: Social media comments (Twitter, Reddit, Youtube)
- *Description*: 10 ordinal labels (sentiment, (dis)respect, insult, humiliation, inferior status, violence, dehumanization, genocide, attack/defense, hate speech), which are debiased and aggregated into a continuous hate speech severity score (hate_speech_score)
- *Comments*: This dataset contains mainly hate speech data (targeting class of individuals, not individuals directly)
- *Merge process*:
    - `Inappropriate`: `hate_speech_score` > 0.5
    - `Appropriate`: `hate_speech_score` <= 0.5

In [10]:
measuringhatespeech = pd.read_csv("downloads/measuring_hate_speech.csv", sep=",")[["text", "hate_speech_score"]]

In [11]:
measuringhatespeech["hate_speech_score"].describe()

count    135556.000000
mean         -0.567428
std           2.380003
min          -8.340000
25%          -2.330000
50%          -0.340000
75%           1.410000
max           6.300000
Name: hate_speech_score, dtype: float64

In [12]:
measuringhatespeech.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 135556 entries, 0 to 135555
Data columns (total 2 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   text               135556 non-null  object 
 1   hate_speech_score  135556 non-null  float64
dtypes: float64(1), object(1)
memory usage: 2.1+ MB


In [13]:
analyze_text_column(measuringhatespeech)

count    135556.000000
mean        151.240742
std         119.987785
min           7.000000
25%          60.000000
50%         115.000000
75%         209.000000
max         603.000000
Name: chars, dtype: float64

In [14]:
measuringhatespeech["inappro"] = (measuringhatespeech["hate_speech_score"] > 0.5).astype(int)

measuringhatespeech = measuringhatespeech[["text", "inappro"]].rename(columns={'inappro': 'class'})

In [15]:
measuringhatespeech["class"].value_counts()

0    86508
1    49048
Name: class, dtype: int64

In [16]:
dataset = pd.concat([dataset, measuringhatespeech], ignore_index=True)

## 3. Insults Dataset
- *Source*: https://www.kaggle.com/competitions/detecting-insults-in-social-commentary/
- *Datatype*: Comments
- *Description*: The label is either 0 meaning a neutral comment, or 1 meaning an insulting comment
- *Comments*: This dataset does not contain particular hate speech, but aims individual
- *Merge process*:
    - `Inappropriate`: `Insult` == 1
    - `Appropriate`: `Insult` == 0

In [17]:
insults_test = pd.read_csv("downloads/insults/test_with_solutions.csv", sep=",")[["Comment", "Insult"]]
insults_train = pd.read_csv("downloads/insults/train.csv", sep=",")[["Comment", "Insult"]]
insults = pd.concat([insults_train, insults_test], ignore_index=True)

insults = insults.rename(columns={"Comment": "text", "Insult": "class"})

In [18]:
insults["class"].value_counts()

0    4852
1    1742
Name: class, dtype: int64

In [19]:
insults.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6594 entries, 0 to 6593
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    6594 non-null   object
 1   class   6594 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 103.2+ KB


In [20]:
analyze_text_column(insults)

count     6594.000000
mean       196.484835
std        466.801387
min          6.000000
25%         55.000000
50%        102.000000
75%        204.000000
max      20030.000000
Name: chars, dtype: float64

In [21]:
dataset = pd.concat([dataset, insults], ignore_index=True)

## 4. Jigsaw Dataset
- *Source*: https://www.kaggle.com/competitions/jigsaw-toxic-comment-classification-challenge/ 
- *Datatype*: Wikipedia Comments and Chats
- *Description*: You are provided with a large number of Wikipedia comments which have been labeled by human raters for toxic behavior. The types of toxicity are: toxic, severe_toxic, obscene, threat, insult, identity_hate
- *Comments*: 
- *Merge process*:
    - `Inappropriate`: one of the class mentionned above
    - `Appropriate`: none

In [22]:
jigsaw_test_labels = pd.read_csv("downloads/jigsaw/test_labels.csv", sep=",")
jigsaw_test = pd.read_csv("downloads/jigsaw/test.csv", sep=',')
jigsaw_test = jigsaw_test.merge(jigsaw_test_labels, on="id")
jigsaw_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 153164 entries, 0 to 153163
Data columns (total 8 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   id             153164 non-null  object
 1   comment_text   153164 non-null  object
 2   toxic          153164 non-null  int64 
 3   severe_toxic   153164 non-null  int64 
 4   obscene        153164 non-null  int64 
 5   threat         153164 non-null  int64 
 6   insult         153164 non-null  int64 
 7   identity_hate  153164 non-null  int64 
dtypes: int64(6), object(2)
memory usage: 10.5+ MB


In [23]:
jigsaw_train = pd.read_csv("downloads/jigsaw/train.csv", sep=",")[["comment_text","toxic","severe_toxic","obscene","threat","insult","identity_hate"]]
jigsaw = pd.concat([jigsaw_train, jigsaw_test], ignore_index=True)
jigsaw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 312735 entries, 0 to 312734
Data columns (total 8 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   comment_text   312735 non-null  object
 1   toxic          312735 non-null  int64 
 2   severe_toxic   312735 non-null  int64 
 3   obscene        312735 non-null  int64 
 4   threat         312735 non-null  int64 
 5   insult         312735 non-null  int64 
 6   identity_hate  312735 non-null  int64 
 7   id             153164 non-null  object
dtypes: int64(6), object(2)
memory usage: 19.1+ MB


In [24]:
jigsaw["class"] = jigsaw.apply(lambda x: 1 if x['toxic'] == 1 or x['severe_toxic'] == 1 or x['obscene'] == 1 or x['threat'] == 1 or x['insult'] == 1 or x['identity_hate'] == 1 else 0, axis=1)

In [25]:
jigsaw["class"].value_counts()

0    290267
1     22468
Name: class, dtype: int64

In [26]:
jigsaw = jigsaw[["comment_text", "class"]].rename(columns={"comment_text": "text"})

In [27]:
analyze_text_column(jigsaw)

count    312735.000000
mean        379.773262
std         591.767791
min           1.000000
25%          87.000000
50%         193.000000
75%         414.000000
max        5000.000000
Name: chars, dtype: float64

In [28]:
jigsaw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 312735 entries, 0 to 312734
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   text    312735 non-null  object
 1   class   312735 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 4.8+ MB


In [29]:
dataset = pd.concat([dataset, jigsaw], ignore_index=True)

## 5. Jibes and Delight
- *Source*: https://aclanthology.org/2021.woah-1.14.pdf
- *Datatype*: Reddit Comments
- *Description*: The texts are preprocessed and fetched from different reddit channels to classify them.
- *Comments*: Individual targeting
- *Merge process*:
    - `Inappropriate`: `Insulting`
    - `Appropriate`: none

In [30]:
def constructJibesFile(data_type):
    data = {"text": [], "class": []}
    for i in [0, 1]:
        with open(f"downloads/jibesanddelights/comment.{data_type}.{i}.txt") as f:
            for line in f.readlines():
                data["text"].append(line)
                data["class"].append(i)
    
    return pd.DataFrame(data)

In [31]:
jad_dev = constructJibesFile("dev")
jad_test = constructJibesFile("test")
jad_train = constructJibesFile("train")

jad = pd.concat([jad_dev, jad_test, jad_train], ignore_index=True)

In [32]:
jad.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 119261 entries, 0 to 119260
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   text    119261 non-null  object
 1   class   119261 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 1.8+ MB


In [33]:
jad["class"].value_counts()

0    68159
1    51102
Name: class, dtype: int64

In [34]:
analyze_text_column(jad)

count    119261.000000
mean         79.466515
std          48.829422
min           8.000000
25%          49.000000
50%          69.000000
75%          96.000000
max        2073.000000
Name: chars, dtype: float64

In [35]:
dataset = pd.concat([dataset, jad], ignore_index=True)

# Data Cleaning
As the data is collected from different sources and media sources, we will try to make it more uniform.

Processes applied:
- Making all text lowercased
- Removing all stop words
- Lemmatizing all words (putting the words to there basic form)
- Try removing no english sentences

In [37]:
lemmatizer = WordNetLemmatizer()

def clean_text(text: str):
    text = text.lower().removeprefix("\"").removesuffix("\"")
    words = text.split()
    words = [word for word in words if word not in stopwords.words('english')]

    words = [lemmatizer.lemmatize(word) for word in words]
    
    text = ' '.join(words)
    return text

In [38]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 598929 entries, 0 to 598928
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   text    598929 non-null  object
 1   class   598929 non-null  object
dtypes: object(2)
memory usage: 9.1+ MB


In [39]:
dataset["class"].value_counts()

0    453949
1    144980
Name: class, dtype: int64

In [36]:
analyze_text_column(dataset)

count    598929.000000
mean        254.006525
std         454.971904
min           1.000000
25%          62.000000
50%         118.000000
75%         260.000000
max       20030.000000
Name: chars, dtype: float64

In [44]:
from tqdm import tqdm
tqdm.pandas(mininterval=3)

In [40]:
dataset["cleaned"] = dataset.progress_apply(lambda x : clean_text(x["text"]), axis=1)

100%|██████████| 598929/598929 [53:01<00:00, 188.24it/s]  


Removing non-english text


  8%|▊         | 49041/598929 [07:54<1:28:40, 103.35it/s]


KeyboardInterrupt: 

In [45]:
print("Removing non-english text")
dataset = dataset[dataset['text'].progress_apply(lambda x: detect(str(x)) == 'en')]

Removing non-english text


  1%|          | 4597/598929 [00:45<1:37:59, 101.09it/s]


KeyboardInterrupt: 

In [41]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 598929 entries, 0 to 598928
Data columns (total 3 columns):
 #   Column   Non-Null Count   Dtype 
---  ------   --------------   ----- 
 0   text     598929 non-null  object
 1   class    598929 non-null  object
 2   cleaned  598929 non-null  object
dtypes: object(3)
memory usage: 13.7+ MB


In [48]:
dataset["class"].value_counts()

0    453949
1    144980
Name: class, dtype: int64

In [47]:
analyze_text_column(dataset, "cleaned")

count    598929.000000
mean        175.127705
std         329.861527
min           0.000000
25%          42.000000
50%          80.000000
75%         177.000000
max       18980.000000
Name: chars, dtype: float64

In [57]:
dataset = dataset[["cleaned", "class"]].rename(columns={"cleaned": "text"})

# Final

In [58]:
#Display dataset info
total = len(dataset)
app = len(dataset[(dataset["class"] == 0)])
inapp = len(dataset[(dataset["class"] == 1)])
print(f"Total text:\t{total}\nClass 0:\t{app}\t\t({int(app / total * 100)}%)\nClass 1:\t{inapp}\t\t({int(inapp / total * 100)}%)")

Total text:	598929
Class 0:	453949		(75%)
Class 1:	144980		(24%)


In [None]:
analyze_text_column(dataset)

In [59]:
def describe_dataset(dataframe: pd.DataFrame, total_length):
    total = len(dataframe)
    app = len(dataframe[(dataframe["class"] == 0)])
    inapp = len(dataframe[(dataframe["class"] == 1)])
    return f"\n\t{total}\t\t{int(total/total_length*100)}%\n\t\t({int(app / total * 100)}%\t{int(inapp / total * 100)}%)"

In [60]:
# Split Dataset
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(dataset['text'], dataset['class'], test_size=0.2, stratify=dataset['class'])
X_train, X_validation, y_train, y_validation = train_test_split(X_train, y_train, test_size=0.2, stratify=y_train)

train = pd.DataFrame({"text": X_train, "class": y_train})
test = pd.DataFrame({"text": X_test, "class": y_test})
validate = pd.DataFrame({"text": X_validation, "class": y_validation})


print("Train: ", describe_dataset(train, total))
print("Test: ", describe_dataset(test, total))
print("Validation: ", describe_dataset(validate, total))

Train:  
	383314		63%
		(75%	24%)
Test:  
	119786		20%
		(75%	24%)
Validation:  
	95829		16%
		(75%	24%)


In [62]:
# Save Dataset
train.to_csv("data/cleaned/train.csv")
validate.to_csv("data/cleaned/validate.csv")
test.to_csv("data/cleaned/test.csv")
dataset.to_csv("data/cleaned/full.csv")