## Preprocessing and Training Data Development

### Import Libraries

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer # Text processing

### Load Data

In [2]:
train_file_path = "../data/train_EDA.csv"
test_file_path = "../data/test_EDA.csv"
train_df = pd.read_csv(train_file_path)
test_df = pd.read_csv(test_file_path).fillna(" ")

# Examine first 5 train observations
train_df.sample(5)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,tags,is_clean
13730,958b1f3b8803ff76,And you'll be pleased to know that this articl...,1,0,0,0,0,0,1,0
22249,ed9bf670451fa629,Jime1138 I never said the articled was retract...,0,0,1,0,0,0,1,0
1820,5bcbe1d66862afec,Skip again \nCan I just draw your attention to...,0,0,0,0,0,0,0,1
22399,f5fb5799b16bc7f5,He's notable alright as being a friggen racist...,1,0,0,0,0,0,1,0
18603,2c2a14cd62bacacc,"Hi ya buddy \nLook, don't be a butt humper. A...",1,0,1,0,1,0,3,0


In [3]:
test_df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,tags,is_clean
0,73e51924905f2dcf,Please refrain from removing content from Wiki...,0,0,0,0,0,0,0,1
1,13b1da3b1d0fa0d0,For example Mostafa Malekian is a thinker whic...,0,0,0,0,0,0,0,1
2,351543aa0bba57ee,REDIRECT Talk:Five pence (Irish coin),0,0,0,0,0,0,0,1
3,ebf96fb0a6a8cbb9,in published dictionaries and,0,0,0,0,0,0,0,1
4,ad01d6108ec293a4,:I wouldn't worry about it too much for the mo...,0,0,0,0,0,0,0,1


At this point, the dataset has been preprocessed. It does not have null values, nor values that do not make sense. We have added the column `is_clean` that has the value of `1` when `comment_text` is not labeled as an *offensive* comment.

### Dealing with Text Data

In [4]:
# Copy train dataset for manipulation
train_comments_df = train_df.copy(deep=True)

# Copy test dataset for transformation
test_comments_df = test_df.copy(deep=True)

In [5]:
# Print shape of both comments dataframes
print("Train dataset: ", train_comments_df.shape)
print("Test dataset: ", test_comments_df.shape)

Train dataset:  (24225, 10)
Test dataset:  (5000, 10)


In [6]:
# Print a sample comment.
print(train_comments_df.sample()["comment_text"].values[0])

"{{unblock|THANK YOU for a reasonable fucking comment!!! The ONLY USER in ANY AREA who has actually made one!!  (Ebyabe made comments which appear to be reasonable at first glance, but are actually unreasonable because one is fraudulent and the other was untrue regarding a place where she could see that it was untrue.)  Well a sort-of reasonable comment anyway because he claims that I am being ""confrontational"" when it is clearly visible at my talk page that other users are attacking me and not the other way around.  I have not caused any confrontation...   I have replied to other users who are putting a confrontation to me.  (I did ask how to report a user, but I did not actually report him because nobody answered regardless of several posts made, and the user himself did not comment there, so that was not a confrontation like it would have been if I'd been able to make the report.)  I would like to leave a message at this user User:Stwalkerster's talk page but I cannot, because for

In [7]:
def prep_comments(df):

    # Remove special characters from comments
    df["comment_text"] = df["comment_text"].str.replace("[^a-zA-Z]", " ", regex=True)

    # Convert to lower case
    df["comment_text"] = df["comment_text"].str.lower()

    # Length of text
    df["char_count"] = df["comment_text"].str.len()

    # Word counts
    df["word_count"] = df["comment_text"].str.split().str.len()

    # Average length of word
    df["avg_word_len"] = df["char_count"] / df["word_count"]
    return df

# Preprocess train and test comments
label_names = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
train_comments_df = prep_comments(train_comments_df)
test_comments_df = prep_comments(test_comments_df)

# Get rid of observations with empty comments
train_comments_df = train_comments_df[train_comments_df["word_count"] != 0]
test_comments_df = test_comments_df[test_comments_df["word_count"] != 0]

In [8]:
train_comments_df.sample(n=5)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,tags,is_clean,char_count,word_count,avg_word_len
12537,746a6c6f347221fb,idiot we both know where you re really from ...,1,0,0,0,0,0,1,0,88,17,5.176471
21079,ae68d9ed0a92058a,your definition of vandalism is wack just ...,1,0,0,0,0,0,1,0,240,44,5.454545
13576,90f7597a8c8b21c0,this article is so slanted to the socialist in...,1,0,1,0,1,0,3,0,685,110,6.227273
22948,39411f0f74479b28,yo bro i d understand that you hour banne...,1,0,0,0,0,0,1,0,101,21,4.809524
3407,12c2f4f04036a579,dembski confirms explicitly lack of acceptance...,0,0,0,0,0,0,0,1,108,17,6.352941


In [9]:
test_comments_df.sample(n=5)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,tags,is_clean,char_count,word_count,avg_word_len
4392,1e96d32f3fc30857,i apologize for supporting you i hope th...,0,0,0,0,0,0,0,1,102,19,5.368421
201,edd79e1fa074aa1d,then i d list java c objective c c ch ...,0,0,0,0,0,0,0,1,150,32,4.6875
3118,ee0a926da99367ef,a barnstar for you style ba...,0,0,0,0,0,0,0,1,420,54,7.777778
446,f64524a515a92d04,this article is full of bullshit,1,0,0,0,0,0,1,0,33,6,5.5
1052,146661845f771912,delete this you dick head,1,0,1,0,1,0,3,0,26,5,5.2



### Vectorize text with scikit-learn

In [10]:
# Initializing the vectorizer
vectorizer = TfidfVectorizer(stop_words="english",
                             max_df=0.5,
                             min_df=1,
                             ngram_range=(1, 2),
                             max_features=5000)

# Transforming text
vectorizer.fit(train_comments_df["comment_text"])
train_comment_trans = vectorizer.transform(train_comments_df["comment_text"]).toarray()
test_comment_trans = vectorizer.transform(test_comments_df["comment_text"]).toarray()

In [11]:
print("Vectorized train DataFrame: ", train_comment_trans.shape)
print("Vectorized test DataFrame: ", test_comment_trans.shape)

Vectorized train DataFrame:  (24225, 5000)
Vectorized test DataFrame:  (4976, 5000)


In [12]:
# Create DataFrames with vectorized text
train_vec_df = pd.DataFrame(train_comment_trans, columns=vectorizer.get_feature_names_out())
test_vec_df = pd.DataFrame(test_comment_trans, columns=vectorizer.get_feature_names_out())

In [13]:
# Concatenate DataFrames
train_concat_df = pd.concat([train_comments_df, train_vec_df], axis=1)
test_concat_df = pd.concat([test_comments_df, test_vec_df], axis=1)

In [14]:
train_concat_df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,tags,is_clean,...,youcaltlas continue,young,youre,yourselfgo,yourselfgo fuck,youtube,zero,zionist,zuck,zuckerberg
0,4812fdf09bc8fc46,i m afraid that you didn t follow the history ...,0,0,0,0,0,0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,d9f2f633dce07c67,style border spacing px margin px ...,0,0,0,0,0,0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,33a8b2393d346005,clans before using any words,0,0,0,0,0,0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3f6fb24b6e8c9a11,the actors names in parenthesis i m reading ...,0,0,0,0,0,0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5b5399363c42d377,further to notability mireille astore was one...,0,0,0,0,0,0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
test_concat_df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,tags,is_clean,...,youcaltlas continue,young,youre,yourselfgo,yourselfgo fuck,youtube,zero,zionist,zuck,zuckerberg
0,73e51924905f2dcf,please refrain from removing content from wiki...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,13b1da3b1d0fa0d0,for example mostafa malekian is a thinker whic...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,351543aa0bba57ee,redirect talk five pence irish coin,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,ebf96fb0a6a8cbb9,in published dictionaries and,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,ad01d6108ec293a4,i wouldn t worry about it too much for the mo...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Save CSV files for modeling

In [16]:
train_concat_df.to_csv("../data/train_ready.csv", index=False)
test_concat_df.to_csv("../data/test_ready.csv", index=False)