# Data Processing
This notebook processes raw/downloaded data and compiles it into our dataset

In [36]:
import pandas as pd
import numpy as np

import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package stopwords to /home/vscode/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/vscode/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/vscode/nltk_data...


In [2]:
dataset = pd.DataFrame(columns=["text", "class"])

# Raw Data

## 1. HSAOL

In [3]:
hsaol = pd.read_csv("downloads/hsaol.csv", sep=",")[["tweet","class"]]

In [4]:
hsaol["tweet"] = hsaol["tweet"].str.replace("^!*", "")
hsaol["tweet"] = hsaol["tweet"].str.replace("\sRT\s@.*:", " ")
hsaol["inappro"] = (hsaol["class"] < 2).astype(int)

hsaol = hsaol[["tweet", "inappro"]].rename(columns={'tweet': 'text', 'inappro': 'class'})

  hsaol["tweet"] = hsaol["tweet"].str.replace("^!*", "")
  hsaol["tweet"] = hsaol["tweet"].str.replace("\sRT\s@.*:", " ")


In [5]:
dataset = pd.concat([dataset, hsaol], ignore_index=True)

## 2. Measuring Hate Speech

In [6]:
measuringhatespeech = pd.read_csv("downloads/measuring_hate_speech.csv", sep=",")[["text", "hate_speech_score"]]

In [7]:
measuringhatespeech["inappro"] = (measuringhatespeech["hate_speech_score"] > 0.5).astype(int)

measuringhatespeech = measuringhatespeech[["text", "inappro"]].rename(columns={'inappro': 'class'})

In [8]:
dataset = pd.concat([dataset, measuringhatespeech], ignore_index=True)

## 3. Insults Dataset

In [9]:
insults_test = pd.read_csv("downloads/insults/test_with_solutions.csv", sep=",")[["Comment", "Insult"]]
insults_train = pd.read_csv("downloads/insults/train.csv", sep=",")[["Comment", "Insult"]]
insults = pd.concat([insults_train, insults_test], ignore_index=True)

insults = insults.rename(columns={"Comment": "text", "Insult": "class"})

In [10]:
dataset = pd.concat([dataset, insults], ignore_index=True)

## 4. Jigsaw Dataset

In [11]:
jigsaw_test_labels = pd.read_csv("downloads/jigsaw/test_labels.csv", sep=",")
jigsaw_test = pd.read_csv("downloads/jigsaw/test.csv", sep=',')
jigsaw_test = jigsaw_test.merge(jigsaw_test_labels, on="id")
jigsaw_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 153164 entries, 0 to 153163
Data columns (total 8 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   id             153164 non-null  object
 1   comment_text   153164 non-null  object
 2   toxic          153164 non-null  int64 
 3   severe_toxic   153164 non-null  int64 
 4   obscene        153164 non-null  int64 
 5   threat         153164 non-null  int64 
 6   insult         153164 non-null  int64 
 7   identity_hate  153164 non-null  int64 
dtypes: int64(6), object(2)
memory usage: 10.5+ MB


In [12]:
jigsaw_train = pd.read_csv("downloads/jigsaw/train.csv", sep=",")[["comment_text","toxic","severe_toxic","obscene","threat","insult","identity_hate"]]
jigsaw = pd.concat([jigsaw_train, jigsaw_test], ignore_index=True)
jigsaw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 312735 entries, 0 to 312734
Data columns (total 8 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   comment_text   312735 non-null  object
 1   toxic          312735 non-null  int64 
 2   severe_toxic   312735 non-null  int64 
 3   obscene        312735 non-null  int64 
 4   threat         312735 non-null  int64 
 5   insult         312735 non-null  int64 
 6   identity_hate  312735 non-null  int64 
 7   id             153164 non-null  object
dtypes: int64(6), object(2)
memory usage: 19.1+ MB


In [13]:
jigsaw["class"] = jigsaw.apply(lambda x: 1 if x['toxic'] == 1 or x['severe_toxic'] == 1 or x['obscene'] == 1 or x['threat'] == 1 or x['insult'] == 1 or x['identity_hate'] == 1 else 0, axis=1)

In [14]:

jigsaw = jigsaw[["comment_text", "class"]].rename(columns={"comment_text": "text"})

In [None]:
jigsaw.info()

In [15]:
dataset = pd.concat([dataset, jigsaw], ignore_index=True)

## 5. Jibes and Delight

In [16]:
def constructJibesFile(data_type):
    data = {"text": [], "class": []}
    for i in [0, 1]:
        with open(f"downloads/jibesanddelights/comment.{data_type}.{i}.txt") as f:
            for line in f.readlines():
                data["text"].append(line)
                data["class"].append(i)
    
    return pd.DataFrame(data)

In [17]:
jad_dev = constructJibesFile("dev")
jad_test = constructJibesFile("test")
jad_train = constructJibesFile("train")

jad = pd.concat([jad_dev, jad_test, jad_train], ignore_index=True)

In [18]:
dataset = pd.concat([dataset, jad], ignore_index=True)

# Data Cleaning

In [45]:
lemmatizer = WordNetLemmatizer()

def clean_text(text: str):
    text = text.lower().removeprefix("\"").removesuffix("\"")
    words = text.split()
    words = [word for word in words if word not in stopwords.words('english')]

    words = [lemmatizer.lemmatize(word) for word in words]
    
    text = ' '.join(words)
    return text

In [46]:
dataset["cleaned"] = dataset.apply(lambda x : clean_text(x["text"]), axis=1)

# Final

In [22]:
#Display dataset info
total = len(dataset)
app = len(dataset[(dataset["class"] == 0)])
inapp = len(dataset[(dataset["class"] == 1)])
print(f"Total text:\t{total}\nClass 0:\t{app}\t\t({int(app / total * 100)}%)\nClass 1:\t{inapp}\t\t({int(inapp / total * 100)}%)")

Total text:	598929
Class 0:	453949		(75%)
Class 1:	144980		(24%)


In [24]:
# Split Dataset

train, validate, test = np.split(dataset.sample(frac=1), [int(.7*len(dataset)), int(.85*len(dataset))])


print(f"Train: {len(train)} {int(len(train)/len(dataset)*100)}%")
print(f"Validate: {len(validate)} {int(len(validate)/len(dataset)*100)}%")
print(f"Test: {len(test)} {int(len(test)/len(dataset)*100)}%")

Train: 419250 69%
Validate: 89839 14%
Test: 89840 15%


In [25]:
# Save Dataset
train.to_csv("data/cleaned/train.csv")
validate.to_csv("data/cleaned/validate.csv")
test.to_csv("data/cleaned/test.csv")
dataset.to_csv("data/cleaned/full.csv")