## Preprocessing and Training Data Development

### Import Libraries

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer # Text processing

### Load Data

In [2]:
train_file_path = "../data/train_EDA.csv"
test_file_path = "../data/test_EDA.csv"
train_df = pd.read_csv(train_file_path)
test_df = pd.read_csv(test_file_path).fillna(" ")

# Examine first 5 train observations
train_df.sample(5)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,tags,is_clean
9750,2db5f1302a4bc726,fuck you you dumb motherfucking asshole how da...,1,1,1,0,1,0,4,0
10100,35d75d3c23d413b6,you're hot \n\ni will rape you ;),1,1,1,1,1,0,5,0
17568,fa9bb6229a2e47ce,"""\n\n Your attitude totally stinks, Mackcan. A...",1,0,0,0,1,0,2,0
703,ca0e30c207dd6be0,"""\n\nCorenSearchBot needs to be worked on\n \n...",0,0,0,0,0,0,0,1
4305,bf7f031da1eea9c2,"""\nI know you provided the evidence for the fo...",0,0,0,0,0,0,0,1


In [3]:
test_df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,tags,is_clean
0,73e51924905f2dcf,Please refrain from removing content from Wiki...,0,0,0,0,0,0,0,1
1,13b1da3b1d0fa0d0,For example Mostafa Malekian is a thinker whic...,0,0,0,0,0,0,0,1
2,351543aa0bba57ee,REDIRECT Talk:Five pence (Irish coin),0,0,0,0,0,0,0,1
3,ebf96fb0a6a8cbb9,in published dictionaries and,0,0,0,0,0,0,0,1
4,ad01d6108ec293a4,:I wouldn't worry about it too much for the mo...,0,0,0,0,0,0,0,1


At this point, the dataset has been preprocessed. It does not have null values, nor values that do not make sense. We have added the column `is_clean` that has the value of `1` when `comment_text` is not labeled as an *offensive* comment.

### Check for NaN values

In [4]:
# Check train dataframe
print(train_df.isna().sum())

id               0
comment_text     0
toxic            0
severe_toxic     0
obscene          0
threat           0
insult           0
identity_hate    0
tags             0
is_clean         0
dtype: int64


In [5]:
# Check test DataFrame
print(test_df.isna().sum())

id               0
comment_text     0
toxic            0
severe_toxic     0
obscene          0
threat           0
insult           0
identity_hate    0
tags             0
is_clean         0
dtype: int64


### Dealing with Text Data

In [6]:
# Copy train dataset for manipulation
train_comments_df = train_df.copy(deep=True)

# Copy test dataset for transformation
test_comments_df = test_df.copy(deep=True)

In [7]:
# Print shape of both comments dataframes
print("Train dataset: ", train_comments_df.shape)
print("Test dataset: ", test_comments_df.shape)

Train dataset:  (24225, 10)
Test dataset:  (5000, 10)


In [8]:
# Print a sample comment.
print(train_comments_df.sample()["comment_text"].values[0])

I am so in support of  on this one,  is such an arrogant Wikipedia Nazi and so trying hard to be a Wikipedia Policy Police. g8crash3r


In [9]:
def prep_comments(df):

    # Remove special characters from comments
    df["comment_text"] = df["comment_text"].str.replace("[^a-zA-Z]", " ", regex=True)

    # Convert to lower case
    df["comment_text"] = df["comment_text"].str.lower()

    # Length of text
    df["char_count"] = df["comment_text"].str.len()

    # Word counts
    df["word_count"] = df["comment_text"].str.split().str.len()

    # Average length of word
    df["avg_word_len"] = df["char_count"] / df["word_count"]
    return df

# Preprocess train and test comments
label_names = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
train_comments_df = prep_comments(train_comments_df)
test_comments_df = prep_comments(test_comments_df)

# Get rid of observations with empty comments
train_comments_df = train_comments_df[train_comments_df["word_count"] != 0]
test_comments_df = test_comments_df[test_comments_df["word_count"] != 0]

In [10]:
train_comments_df.sample(n=5)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,tags,is_clean,char_count,word_count,avg_word_len
9608,2a031ad5235368e6,i know elvis was a turd burgler cos i fucked ...,1,0,1,0,1,0,3,0,156,34,4.588235
5266,eb122fe4cb8baf33,the cancellation of october and helwan gover...,0,0,0,0,0,0,0,1,596,92,6.478261
7161,89a0e83094c8b4a6,don t forget yahoo maps too http ca maps ya...,0,0,0,0,0,0,0,1,495,84,5.892857
511,03758a2d4c0becfb,if this is what you were refering to,0,0,0,0,0,0,0,1,38,8,4.75
8460,0b7985c778a59423,i am not a vandal,1,0,0,0,0,0,1,0,47,5,9.4


In [11]:
test_comments_df.sample(n=5)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,tags,is_clean,char_count,word_count,avg_word_len
1803,949cb8752e7012a6,emmy photo copyright dispute i own the c...,0,0,0,0,0,0,0,1,235,41,5.731707
4613,1637a4cf5dc0c203,welcome to wikipedia and thank you for how...,0,0,0,0,0,0,0,1,485,83,5.843373
1545,7f1922350355030a,wiki insomniacs i don t know why van...,0,0,0,0,0,0,0,1,561,80,7.0125
660,b542a3ed63f13f12,hey thanks for your clarification i wil...,0,0,0,0,0,0,0,1,129,20,6.45
2196,96c4f10b0958092e,production credits i know this will...,0,0,0,0,0,0,0,1,3938,680,5.791176



### Vectorize text with scikit-learn

In [12]:
# Initializing the vectorizer
vectorizer = TfidfVectorizer(stop_words="english",
                             max_df=0.5,
                             min_df=1,
                             ngram_range=(1, 2),
                             max_features=5000)

# Transforming text
vectorizer.fit(train_comments_df["comment_text"])
train_comment_trans = vectorizer.transform(train_comments_df["comment_text"]).toarray()
test_comment_trans = vectorizer.transform(test_comments_df["comment_text"]).toarray()

In [13]:
print("Vectorized train DataFrame: ", train_comment_trans.shape)
print("Vectorized test DataFrame: ", test_comment_trans.shape)

Vectorized train DataFrame:  (24225, 5000)
Vectorized test DataFrame:  (4976, 5000)


In [14]:
# Create DataFrames with vectorized text
train_vec_df = pd.DataFrame(train_comment_trans, columns=vectorizer.get_feature_names_out())
test_vec_df = pd.DataFrame(test_comment_trans, columns=vectorizer.get_feature_names_out())

In [15]:
# Concatenate DataFrames
train_concat_df = pd.concat([train_comments_df, train_vec_df], axis=1)
test_concat_df = pd.concat([test_comments_df, test_vec_df], axis=1)

In [16]:
train_concat_df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,tags,is_clean,...,youcaltlas continue,young,youre,yourselfgo,yourselfgo fuck,youtube,zero,zionist,zuck,zuckerberg
0,4812fdf09bc8fc46,i m afraid that you didn t follow the history ...,0,0,0,0,0,0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,d9f2f633dce07c67,style border spacing px margin px ...,0,0,0,0,0,0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,33a8b2393d346005,clans before using any words,0,0,0,0,0,0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3f6fb24b6e8c9a11,the actors names in parenthesis i m reading ...,0,0,0,0,0,0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5b5399363c42d377,further to notability mireille astore was one...,0,0,0,0,0,0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
test_concat_df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,tags,is_clean,...,youcaltlas continue,young,youre,yourselfgo,yourselfgo fuck,youtube,zero,zionist,zuck,zuckerberg
0,73e51924905f2dcf,please refrain from removing content from wiki...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,13b1da3b1d0fa0d0,for example mostafa malekian is a thinker whic...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,351543aa0bba57ee,redirect talk five pence irish coin,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,ebf96fb0a6a8cbb9,in published dictionaries and,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,ad01d6108ec293a4,i wouldn t worry about it too much for the mo...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
# Drop NaN observations after vectorizing
test_concat_df.dropna(axis=0, inplace=True)

In [19]:
test_concat_df.shape

(4952, 5013)

In [20]:
print(train_concat_df.isna().any(axis=1).sum())
print(test_concat_df.isna().any(axis=1).sum())

0
0


### Save CSV files for modeling

In [21]:
train_concat_df.to_csv("../data/train_ready.csv", index=False)
test_concat_df.to_csv("../data/test_ready.csv", index=False)