## Preprocessing and Training Data Development

### Import Libraries

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer # Text processing

### Load Data

In [2]:
train_file_path = "../data/train_EDA.csv"
test_file_path = "../data/test_EDA.csv"
train_df = pd.read_csv(train_file_path)
test_df = pd.read_csv(test_file_path).fillna(" ")

# Examine first 5 train observations
train_df.sample(5)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,tags,is_clean
46495,7c37d4b1bc5a2906,"Text, Transliterations and Translation \n\nUrd...",0,0,0,0,0,0,0,1
8587,16cb72b4f2010f6f,"LOL i know it seems stupid, i'm just not norma...",0,0,0,0,0,0,0,1
63220,a9303ea32e6b684f,.the name palattu koman itself is synonimus wi...,0,0,0,0,0,0,0,1
153750,a201f9adb81ea3fe,"That's all this guy is, a commercial. Many rev...",0,0,0,0,0,0,0,1
45157,78bd4ead65b278df,Comments \n\nFor now this is a central reposit...,0,0,0,0,0,0,0,1


In [3]:
test_df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,tags,is_clean
0,0001ea8717f6de06,Thank you for understanding. I think very high...,0,0,0,0,0,0,0,1
1,000247e83dcc1211,:Dear god this site is horrible.,0,0,0,0,0,0,0,1
2,0002f87b16116a7f,"""::: Somebody will invariably try to add Relig...",0,0,0,0,0,0,0,1
3,0003e1cccfd5a40a,""" \n\n It says it right there that it IS a typ...",0,0,0,0,0,0,0,1
4,00059ace3e3e9a53,""" \n\n == Before adding a new product to the l...",0,0,0,0,0,0,0,1


At this point, the dataset has been preprocessed. It does not have null values, nor values that do not make sense. We have added the column `is_clean` that has the value of `1` when `comment_text` is not labeled as an *offensive* comment.

### Dealing with Text Data

In [4]:
# Copy train dataset for manipulation
train_comments_df = train_df.copy(deep=True)

# Copy test dataset for transformation
test_comments_df = test_df.copy(deep=True)

In [5]:
# Print shape of both comments dataframes
print("Train dataset: ", train_comments_df.shape)
print("Test dataset: ", test_comments_df.shape)

Train dataset:  (159571, 10)
Test dataset:  (63978, 10)


In [6]:
# Print a sample comment.
print(train_comments_df.loc[149313,"comment_text"])

hey 

why must you be such a shithole? fuck off


In [7]:
def prep_comments(df):

    # Remove special characters from comments
    df["comment_text"] = df["comment_text"].str.replace("[^a-zA-Z]", " ", regex=True)

    # Convert to lower case
    df["comment_text"] = df["comment_text"].str.lower()

    # Length of text
    df["char_count"] = df["comment_text"].str.len()

    # Word counts
    df["word_count"] = df["comment_text"].str.split().str.len()

    # Average length of word
    df["avg_word_len"] = df["char_count"] / df["word_count"]
    return df

# Preprocess train and test comments
label_names = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
train_comments_df = prep_comments(train_comments_df)
test_comments_df = prep_comments(test_comments_df)

# Get rid of observations with empty comments
train_comments_df = train_comments_df[train_comments_df["word_count"] != 0]
test_comments_df = test_comments_df[test_comments_df["word_count"] != 0]

In [8]:
train_comments_df.sample(n=5)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,tags,is_clean,char_count,word_count,avg_word_len
98493,0ee156c81606a255,well what is the sentence trying to say we h...,0,0,0,0,0,0,0,1,376,75,5.013333
61416,a462a7b2bb9927fd,not a problem good catch talk page,0,0,0,0,0,0,0,1,42,7,6.0
125517,9f639f7fef5c7e2b,yes for this i want to leave ar wikipedia to...,0,0,0,0,0,0,0,1,62,13,4.769231
723,01f3575d21b13a45,howd homeonttherange your typical probl...,0,0,0,0,0,0,0,1,651,103,6.320388
28255,4acf4d2508b3da4f,he is stealing money from the country for hims...,0,0,0,0,0,0,0,1,257,44,5.840909


In [9]:
test_comments_df.sample(n=5)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,tags,is_clean,char_count,word_count,avg_word_len
13860,373a0c91a5711d44,farecards what does this mean ...,0,0,0,0,0,0,0,1,224,35,6.4
22718,5a5c0c0b9f0fc213,tanya reinhart a wikipedia article fo...,0,0,0,0,0,0,0,1,134,21,6.380952
605,0268c4571808e8a5,since the article has been split i moved the ...,0,0,0,0,0,0,0,1,121,20,6.05
59907,ef58e5c6d3cedb4d,learn to spell yank,0,0,0,0,0,0,0,1,23,4,5.75
57129,e43ac4f0f8935f4d,chim chiminey chim chiminey chim chim cheroo,0,0,0,0,0,0,0,1,44,7,6.285714



### Vectorize text with scikit-learn

In [10]:
# Initializing the vectorizer
vectorizer = TfidfVectorizer(stop_words="english",
                             max_df=0.5,
                             min_df=1,
                             ngram_range=(1, 2),
                             max_features=5000)

# Transforming text
vectorizer.fit(train_comments_df["comment_text"])
train_comment_trans = vectorizer.transform(train_comments_df["comment_text"]).toarray()
test_comment_trans = vectorizer.transform(test_comments_df["comment_text"]).toarray()

In [11]:
print("Vectorized train DataFrame: ", train_comment_trans.shape)
print("Vectorized test DataFrame: ", test_comment_trans.shape)

Vectorized train DataFrame:  (159564, 5000)
Vectorized test DataFrame:  (63578, 5000)


In [12]:
# Create DataFrames with vectorized text
train_vec_df = pd.DataFrame(train_comment_trans, columns=vectorizer.get_feature_names_out())
test_vec_df = pd.DataFrame(test_comment_trans, columns=vectorizer.get_feature_names_out())

In [13]:
# Concatenate DataFrames
train_concat_df = pd.concat([train_comments_df, train_vec_df], axis=1)
test_concat_df = pd.concat([test_comments_df, test_vec_df], axis=1)

In [14]:
train_concat_df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,tags,is_clean,...,youbollocks youbollocks,young,yourselfgo,yourselfgo fuck,youth,youtube,youtube com,ytmnd,ytmnd ytmnd,zero
0,0000997932d777bf,explanation why the edits made under my userna...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,000103f0d9cfb60f,d aww he matches this background colour i m s...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,000113f07ec002fd,hey man i m really not trying to edit war it...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0001b41b1c6bb37e,more i can t make any real suggestions on im...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0001d958c54c6e35,you sir are my hero any chance you remember...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
test_concat_df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,tags,is_clean,...,youbollocks youbollocks,young,yourselfgo,yourselfgo fuck,youth,youtube,youtube com,ytmnd,ytmnd ytmnd,zero
0,0001ea8717f6de06,thank you for understanding i think very high...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,000247e83dcc1211,dear god this site is horrible,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0002f87b16116a7f,somebody will invariably try to add relig...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0003e1cccfd5a40a,it says it right there that it is a type ...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,00059ace3e3e9a53,before adding a new product to the lis...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Save CSV files for modeling

In [16]:
train_concat_df.to_csv("../data/train_ready.csv", index=False)
test_concat_df.to_csv("../data/test_ready.csv", index=False)