## Preprocessing and Training Data Development

### Import Libraries

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer # Text processing

### Load Data

In [2]:
train_file_path = "../data/train_EDA.csv"
test_file_path = "../data/test_EDA.csv"
train_df = pd.read_csv(train_file_path)
test_df = pd.read_csv(test_file_path).fillna(" ")

# Examine first 5 train observations
train_df.sample(5)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,tags,is_clean
4759,7007caa09c241129,I did not abuse anything and,0,0,0,0,0,0,0,1
6780,1bc1a4a8d751c348,On Hold I'm putting the article on hold. In th...,0,0,0,0,0,0,0,1
11716,5fd6fa96a552cbb4,fucking GOOK!! \n\ngo kill urself motha fucka,1,1,1,0,1,0,4,0
9366,241895a7555a607c,"""\n\n""""I certainly didnt start the fire"""" what...",1,0,0,0,0,0,1,0
18676,3015d850b3e96095,bring it \n\nhello tommy remember me? im back ...,1,0,1,0,1,0,3,0


In [3]:
test_df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,tags,is_clean
0,32504958e6378134,is mostly inactive and visually unappealing,0,0,0,0,0,0,0,1
1,965230a40160b971,""" \n\n :A few responses: \n :#Some of your com...",0,0,0,0,0,0,0,1
2,789d5ba07879f808,== Semi-protected edit request on 9 November 2...,0,0,0,0,0,0,0,1
3,cb3cb72274f28aeb,""" \n\n == Playoffs == \n\n Hockey. My country...",0,0,0,0,0,0,0,1
4,1a45b872642fde4a,human anus,1,0,0,0,0,0,1,0


At this point, the dataset has been preprocessed. It does not have null values, nor values that do not make sense. We have added the column `is_clean` that has the value of `1` when `comment_text` is not labeled as an *offensive* comment.

### Check for NaN values

In [4]:
# Check train dataframe
print(train_df.isna().sum())

id               0
comment_text     0
toxic            0
severe_toxic     0
obscene          0
threat           0
insult           0
identity_hate    0
tags             0
is_clean         0
dtype: int64


In [5]:
# Check test DataFrame
print(test_df.isna().sum())

id               0
comment_text     0
toxic            0
severe_toxic     0
obscene          0
threat           0
insult           0
identity_hate    0
tags             0
is_clean         0
dtype: int64


### Dealing with Text Data

In [6]:
# Copy train dataset for manipulation
train_comments_df = train_df.copy(deep=True)

# Copy test dataset for transformation
test_comments_df = test_df.copy(deep=True)

In [7]:
# Print shape of both comments dataframes
print("Train dataset: ", train_comments_df.shape)
print("Test dataset: ", test_comments_df.shape)

Train dataset:  (24225, 10)
Test dataset:  (5000, 10)


In [8]:
# Print a sample comment.
print(train_comments_df.sample()["comment_text"].values[0])

You are a soldier, a dog of war, you understand nothing of power politics. You are welcome to come to Athena, but rest assured, my fellow Hellenes do not like people who come to our country and espouse anti-Greek views. You are clearly a proud American, but I am a proud Greek, and I will be DAMNED if the anti-Greeks are going to get the better of us. I bet you love good old FYROM and Albania as well. Yep, Americans love shitty pseudostates with no grounding in History (so they have to steal it off others), they would be easier to control than a strong Greece and Serbia. Make no mistake though, your reign is coming to an end, sooner or later we'll drive you and the British Murderers off the island, and we'll run the Greek Muslims (so called 'Turkish Cypriots') into the Sea.


In [9]:
def prep_comments(df):

    # Remove special characters from comments
    df["comment_text"] = df["comment_text"].str.replace("[^a-zA-Z]", " ", regex=True)

    # Convert to lower case
    df["comment_text"] = df["comment_text"].str.lower()

    # Length of text
    df["char_count"] = df["comment_text"].str.len()

    # Word counts
    df["word_count"] = df["comment_text"].str.split().str.len()

    # Average length of word
    df["avg_word_len"] = df["char_count"] / df["word_count"]
    return df

# Preprocess train and test comments
label_names = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
train_comments_df = prep_comments(train_comments_df)
test_comments_df = prep_comments(test_comments_df)

# Get rid of observations with empty comments
train_comments_df = train_comments_df[train_comments_df["word_count"] != 0]
test_comments_df = test_comments_df[test_comments_df["word_count"] != 0]

In [10]:
train_comments_df.sample(n=5)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,tags,is_clean,char_count,word_count,avg_word_len
9727,2d1b3a1d9abd6d64,hey travis deleting shit isn t something ...,1,0,1,0,1,0,3,0,242,43,5.627907
10759,46f0a3faced285a3,f off you f ing b i am telling the ...,1,1,1,0,1,0,4,0,84,18,4.666667
19868,6fe15b1eefd199e1,how s about you shut the fuck up instead ...,1,0,1,0,1,0,3,0,263,48,5.479167
21613,c9b510a174c27520,howard felsher i know you have been on here ...,1,0,0,0,0,0,1,0,209,43,4.860465
11075,4e8a2a1e269a31a7,eternal pink why have you named you main guy i...,1,0,1,0,1,0,3,0,259,53,4.886792


In [11]:
test_comments_df.sample(n=5)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,tags,is_clean,char_count,word_count,avg_word_len
3334,8028ba1b93ab6014,powerful citation it appears to be ...,0,0,0,0,0,0,0,1,215,35,6.142857
1449,07a60deabf1fd65a,understood but there is still no equality ...,0,0,0,0,0,0,0,1,192,35,5.485714
814,0d7f5c00b0c34e39,thanks hi there i just wanted to say...,0,0,0,0,0,0,0,1,107,19,5.631579
1506,e2bde576204ee903,so the news stories were based a false blog p...,0,0,0,0,0,0,0,1,307,56,5.482143
2856,4a2ae2f99b6dea9c,as far as i can tell the point with the huyge...,0,0,0,0,0,0,0,1,602,110,5.472727



### Vectorize text with scikit-learn

In [12]:
# Initializing the vectorizer
vectorizer = TfidfVectorizer(stop_words="english",
                             max_df=0.5,
                             min_df=1,
                             ngram_range=(1, 2),
                             max_features=5000)

# Transforming text
vectorizer.fit(train_comments_df["comment_text"])
train_comment_trans = vectorizer.transform(train_comments_df["comment_text"]).toarray()
test_comment_trans = vectorizer.transform(test_comments_df["comment_text"]).toarray()

In [13]:
print("Vectorized train DataFrame: ", train_comment_trans.shape)
print("Vectorized test DataFrame: ", test_comment_trans.shape)

Vectorized train DataFrame:  (24225, 5000)
Vectorized test DataFrame:  (4966, 5000)


In [14]:
# Create DataFrames with vectorized text
train_vec_df = pd.DataFrame(train_comment_trans, columns=vectorizer.get_feature_names_out())
test_vec_df = pd.DataFrame(test_comment_trans, columns=vectorizer.get_feature_names_out())

In [15]:
# Concatenate DataFrames
train_concat_df = pd.concat([train_comments_df, train_vec_df], axis=1)
test_concat_df = pd.concat([test_comments_df, test_vec_df], axis=1)

In [16]:
train_concat_df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,tags,is_clean,...,yourselfgo,yourselfgo fuck,youtube,youtube com,ytmnd,ytmnd ytmnd,zero,zionist,zuck,zuckerberg
0,2ec4b3bd396a3012,i am really confused about this there are so...,0,0,0,0,0,0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,16fb18070064a8e8,apology point taken dmcdevit i apologise fo...,0,0,0,0,0,0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2aac02a9ff8d5798,classification it has been hard for me to fi...,0,0,0,0,0,0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,35ef99361edb8a3b,okay the rundown so far my first bloc...,0,0,0,0,0,0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,b11068094c52add9,yep not as easy to do though with the lesser...,0,0,0,0,0,0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
test_concat_df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,tags,is_clean,...,yourselfgo,yourselfgo fuck,youtube,youtube com,ytmnd,ytmnd ytmnd,zero,zionist,zuck,zuckerberg
0,32504958e6378134,is mostly inactive and visually unappealing,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,965230a40160b971,a few responses some of your commen...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,789d5ba07879f808,semi protected edit request on november ...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.233029,0.271072,0.0,0.0,0.0,0.0,0.0,0.0
3,cb3cb72274f28aeb,playoffs hockey my country goe...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1a45b872642fde4a,human anus,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
# Drop NaN observations after vectorizing
test_concat_df.dropna(axis=0, inplace=True)

In [19]:
test_concat_df.shape

(4933, 5013)

In [20]:
print(train_concat_df.isna().any(axis=1).sum())
print(test_concat_df.isna().any(axis=1).sum())

0
0


In [21]:
# Preview DataFrame
train_concat_df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,tags,is_clean,...,yourselfgo,yourselfgo fuck,youtube,youtube com,ytmnd,ytmnd ytmnd,zero,zionist,zuck,zuckerberg
0,2ec4b3bd396a3012,i am really confused about this there are so...,0,0,0,0,0,0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,16fb18070064a8e8,apology point taken dmcdevit i apologise fo...,0,0,0,0,0,0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2aac02a9ff8d5798,classification it has been hard for me to fi...,0,0,0,0,0,0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,35ef99361edb8a3b,okay the rundown so far my first bloc...,0,0,0,0,0,0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,b11068094c52add9,yep not as easy to do though with the lesser...,0,0,0,0,0,0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [22]:
test_concat_df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,tags,is_clean,...,yourselfgo,yourselfgo fuck,youtube,youtube com,ytmnd,ytmnd ytmnd,zero,zionist,zuck,zuckerberg
0,32504958e6378134,is mostly inactive and visually unappealing,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,965230a40160b971,a few responses some of your commen...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,789d5ba07879f808,semi protected edit request on november ...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.233029,0.271072,0.0,0.0,0.0,0.0,0.0,0.0
3,cb3cb72274f28aeb,playoffs hockey my country goe...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1a45b872642fde4a,human anus,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Save CSV files for modeling

In [23]:
train_concat_df.to_csv("../data/train_ready.csv", index=False)
test_concat_df.to_csv("../data/test_ready.csv", index=False)