## Preprocessing and Training Data Development

### Import Libraries

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer # Text processing

### Load Data

In [2]:
train_file_path = "../data/train_EDA.csv"
test_file_path = "../data/test_EDA.csv"
train_df = pd.read_csv(train_file_path)
test_df = pd.read_csv(test_file_path).fillna(" ")

# Examine first 5 train observations
train_df.sample(5)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,tags,is_clean
28569,4ba3ea549132afdc,"""\n\n General editing practice - not about eu-...",0,0,0,0,0,0,0,1
111099,5253bfbf1fbeea91,"Felix, that is what they were bound to say it ...",0,0,0,0,0,0,0,1
76340,cc5f3166df5b5703,If it's not in French just remove the paramete...,0,0,0,0,0,0,0,1
131329,bea4b62bb14b04cb,"""\n\n Daddy's not happy with me and says you c...",0,0,0,0,0,0,0,1
138556,e54e0602a673845b,"""\n\n VIC tags \n\nHi, just alerting you to Us...",0,0,0,0,0,0,0,1


In [3]:
test_df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,tags,is_clean
0,0001ea8717f6de06,Thank you for understanding. I think very high...,0,0,0,0,0,0,0,1
1,000247e83dcc1211,:Dear god this site is horrible.,0,0,0,0,0,0,0,1
2,0002f87b16116a7f,"""::: Somebody will invariably try to add Relig...",0,0,0,0,0,0,0,1
3,0003e1cccfd5a40a,""" \n\n It says it right there that it IS a typ...",0,0,0,0,0,0,0,1
4,00059ace3e3e9a53,""" \n\n == Before adding a new product to the l...",0,0,0,0,0,0,0,1


At this point, the dataset has been preprocessed. It does not have null values, nor values that do not make sense. We have added the column `is_clean` that has the value of `1` when `comment_text` is not labeled as an *offensive* comment.

### Dealing with Text Data

In [4]:
# Copy train dataset for manipulation
train_comments_df = train_df.copy(deep=True)

# Copy test dataset for transformation
test_comments_df = test_df.copy(deep=True)

In [5]:
# Print shape of both comments dataframes
print("Train dataset: ", train_comments_df.shape)
print("Test dataset: ", test_comments_df.shape)

Train dataset:  (159571, 10)
Test dataset:  (63978, 10)


In [6]:
# Print a sample comment.
print(train_comments_df.loc[149313,"comment_text"])

hey 

why must you be such a shithole? fuck off


In [7]:
def prep_comments(df):

    # Remove special characters from comments
    df["comment_text"] = df["comment_text"].str.replace("[^a-zA-Z]", " ", regex=True)

    # Convert to lower case
    df["comment_text"] = df["comment_text"].str.lower()

    # Length of text
    df["char_count"] = df["comment_text"].str.len()

    # Word counts
    df["word_count"] = df["comment_text"].str.split().str.len()

    # Average length of word
    df["avg_word_len"] = df["char_count"] / df["word_count"]
    return df

# Preprocess train and test comments
label_names = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
train_comments_df = prep_comments(train_comments_df)
test_comments_df = prep_comments(test_comments_df)

# Get rid of observations with empty comments
train_comments_df = train_comments_df[train_comments_df["word_count"] != 0]
test_comments_df = test_comments_df[test_comments_df["word_count"] != 0]

In [8]:
train_comments_df.sample(n=5)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,tags,is_clean,char_count,word_count,avg_word_len
143607,ffff9d956986994e,well at least someone did to me it was qui...,0,0,0,0,0,0,0,1,112,22,5.090909
2149,05cd3f74b3b96c91,hapsford please cite sources for this article ...,0,0,0,0,0,0,0,1,134,22,6.090909
58893,9dba6ce6bd3faac2,bill james is actually a pen name of author ja...,0,0,0,0,0,0,0,1,56,11,5.090909
108560,4462a926cf64b036,berwickshire i have added the category be...,0,0,0,0,0,0,0,1,259,45,5.755556
156546,cffb6c4c96c19743,current tour protest the hero is currently t...,0,0,0,0,0,0,0,1,145,24,6.041667


In [9]:
test_comments_df.sample(n=5)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,tags,is_clean,char_count,word_count,avg_word_len
15153,3c5c2a05517fe90f,at the time i made the link there was no pag...,0,0,0,0,0,0,0,1,182,36,5.055556
49839,c67d41d52fc7ef10,origin of the name bilbo this is a qu...,0,0,0,0,0,0,0,1,874,148,5.905405
19922,4f392666adeb6b9e,http eml berkeley edu sdellavi wp fox...,0,0,0,0,0,0,0,1,557,78,7.141026
8058,207c831ff08e1cb1,supratau dabar pjaunuosi su ka kokiu ...,0,0,0,0,0,0,0,1,666,111,6.0
29491,758811e69daff981,that is a strong source to indicate his actu...,0,0,0,0,0,0,0,1,133,25,5.32



### Vectorize text with scikit-learn

In [10]:
# Initializing the vectorizer
vectorizer = TfidfVectorizer(stop_words="english",
                             max_df=0.5,
                             min_df=1,
                             ngram_range=(1, 2),
                             max_features=1000)

# Transforming text
vectorizer.fit(train_comments_df["comment_text"])
train_comment_trans = vectorizer.transform(train_comments_df["comment_text"]).toarray()
test_comment_trans = vectorizer.transform(test_comments_df["comment_text"]).toarray()

In [11]:
print("Vectorized train DataFrame: ", train_comment_trans.shape)
print("Vectorized test DataFrame: ", test_comment_trans.shape)

Vectorized train DataFrame:  (159564, 1000)
Vectorized test DataFrame:  (63578, 1000)


In [12]:
# Create DataFrames with vectorized text
train_vec_df = pd.DataFrame(train_comment_trans, columns=vectorizer.get_feature_names_out())
test_vec_df = pd.DataFrame(test_comment_trans, columns=vectorizer.get_feature_names_out())

In [13]:
# Concatenate DataFrames
train_concat_df = pd.concat([train_comments_df, train_vec_df], axis=1)
test_concat_df = pd.concat([test_comments_df, test_vec_df], axis=1)

In [14]:
train_concat_df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,tags,is_clean,...,writing,written,wrong,wrote,www,yeah,year,years,yes,york
0,0000997932d777bf,explanation why the edits made under my userna...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.371103
1,000103f0d9cfb60f,d aww he matches this background colour i m s...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,000113f07ec002fd,hey man i m really not trying to edit war it...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0001b41b1c6bb37e,more i can t make any real suggestions on im...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0001d958c54c6e35,you sir are my hero any chance you remember...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
test_concat_df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,tags,is_clean,...,writing,written,wrong,wrote,www,yeah,year,years,yes,york
0,0001ea8717f6de06,thank you for understanding i think very high...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,000247e83dcc1211,dear god this site is horrible,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0002f87b16116a7f,somebody will invariably try to add relig...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0003e1cccfd5a40a,it says it right there that it is a type ...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.154027
4,00059ace3e3e9a53,before adding a new product to the lis...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Save CSV files for modeling

In [16]:
train_concat_df.to_csv("../data/train_ready.csv")
test_concat_df.to_csv("../data/test_ready.csv")