## Preprocessing and Training Data Development

### Import Libraries

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer # Text processing

### Load Data

In [2]:
train_file_path = "../data/train_EDA.csv"
test_file_path = "../data/test_EDA.csv"
train_df = pd.read_csv(train_file_path)
test_df = pd.read_csv(test_file_path).fillna(" ")

# Examine first 5 train observations
train_df.sample(5)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,tags,is_clean
7064,ccbfa10635980090,I dont understand the take down of RSK! it is ...,0,0,0,0,0,0,0,1
17446,f6d19826762b62ae,"Oh, and before I get back to not caring about ...",0,0,0,0,1,0,1,0
8112,02cbe8d8e800f936,Where is the real vagina? \n\nThe real photo o...,1,0,0,0,0,0,1,0
24191,fab1d401d504cf68,whats up gook \n\nWhats up you asian gook chin...,1,0,0,0,0,1,2,0
878,4a050ff0b2708326,Image:Hug MSG Incident.jpg \nHi there! I chan...,0,0,0,0,0,0,0,1


In [3]:
test_df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,tags,is_clean
0,73e51924905f2dcf,Please refrain from removing content from Wiki...,0,0,0,0,0,0,0,1
1,13b1da3b1d0fa0d0,For example Mostafa Malekian is a thinker whic...,0,0,0,0,0,0,0,1
2,351543aa0bba57ee,REDIRECT Talk:Five pence (Irish coin),0,0,0,0,0,0,0,1
3,ebf96fb0a6a8cbb9,in published dictionaries and,0,0,0,0,0,0,0,1
4,ad01d6108ec293a4,:I wouldn't worry about it too much for the mo...,0,0,0,0,0,0,0,1


At this point, the dataset has been preprocessed. It does not have null values, nor values that do not make sense. We have added the column `is_clean` that has the value of `1` when `comment_text` is not labeled as an *offensive* comment.

### Check for NaN values

In [4]:
# Check train dataframe
print(train_df.isna().sum())

id               0
comment_text     0
toxic            0
severe_toxic     0
obscene          0
threat           0
insult           0
identity_hate    0
tags             0
is_clean         0
dtype: int64


In [5]:
# Check test DataFrame
print(test_df.isna().sum())

id               0
comment_text     0
toxic            0
severe_toxic     0
obscene          0
threat           0
insult           0
identity_hate    0
tags             0
is_clean         0
dtype: int64


### Dealing with Text Data

In [6]:
# Copy train dataset for manipulation
train_comments_df = train_df.copy(deep=True)

# Copy test dataset for transformation
test_comments_df = test_df.copy(deep=True)

In [7]:
# Print shape of both comments dataframes
print("Train dataset: ", train_comments_df.shape)
print("Test dataset: ", test_comments_df.shape)

Train dataset:  (24225, 10)
Test dataset:  (5000, 10)


In [8]:
# Print a sample comment.
print(train_comments_df.sample()["comment_text"].values[0])

Is he really Jewish? ==
My friend Jamal said Jesus was a Jew.  How come he didnt get killed in the ovens then?
I don't think Jamal knows anything.  So whats up?

==


In [9]:
def prep_comments(df):

    # Remove special characters from comments
    df["comment_text"] = df["comment_text"].str.replace("[^a-zA-Z]", " ", regex=True)

    # Convert to lower case
    df["comment_text"] = df["comment_text"].str.lower()

    # Length of text
    df["char_count"] = df["comment_text"].str.len()

    # Word counts
    df["word_count"] = df["comment_text"].str.split().str.len()

    # Average length of word
    df["avg_word_len"] = df["char_count"] / df["word_count"]
    return df

# Preprocess train and test comments
label_names = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
train_comments_df = prep_comments(train_comments_df)
test_comments_df = prep_comments(test_comments_df)

# Get rid of observations with empty comments
train_comments_df = train_comments_df[train_comments_df["word_count"] != 0]
test_comments_df = test_comments_df[test_comments_df["word_count"] != 0]

In [10]:
train_comments_df.sample(n=5)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,tags,is_clean,char_count,word_count,avg_word_len
6372,839febe1441473cc,was elvis blond just wondering if its true i...,0,0,0,0,0,0,0,1,114,22,5.181818
9534,27fc84ebf94d89a6,yeah for real stupid,1,0,1,0,1,0,3,0,23,4,5.75
6805,41b8c79cadcca5ef,how completely dull yet another example of...,0,0,0,0,0,0,0,1,389,66,5.893939
13134,85143faecc05c313,fuck off quit lying to people chumps,1,1,1,0,1,0,4,0,39,7,5.571429
13244,881fbf355aa43b3c,i may assure you that i am sober by now someh...,1,0,0,0,0,0,1,0,695,136,5.110294


In [11]:
test_comments_df.sample(n=5)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,tags,is_clean,char_count,word_count,avg_word_len
1220,edf3a067e472eb4a,hmmm i m not sure synecdoche is a city in...,0,0,0,0,0,0,0,1,143,28,5.107143
4160,8a83598fe1447ace,tm in schools hi bwb i have put...,0,0,0,0,0,0,0,1,619,114,5.429825
1098,50e079a48d341bdc,for an encyclopedia article wp mos,0,0,0,0,0,0,0,1,36,6,6.0
1097,0c7f6331dae58fbe,but it is there and sourced the dr...,0,0,0,0,0,0,0,1,303,51,5.941176
967,aa980f8760e57aa9,fags i shit on eggs,1,0,1,0,1,0,3,0,28,5,5.6



### Vectorize text with scikit-learn

In [12]:
# Initializing the vectorizer
vectorizer = TfidfVectorizer(stop_words="english",
                             max_df=0.5,
                             min_df=1,
                             ngram_range=(1, 2),
                             max_features=5000)

# Transforming text
vectorizer.fit(train_comments_df["comment_text"])
train_comment_trans = vectorizer.transform(train_comments_df["comment_text"]).toarray()
test_comment_trans = vectorizer.transform(test_comments_df["comment_text"]).toarray()

In [13]:
print("Vectorized train DataFrame: ", train_comment_trans.shape)
print("Vectorized test DataFrame: ", test_comment_trans.shape)

Vectorized train DataFrame:  (24225, 5000)
Vectorized test DataFrame:  (4976, 5000)


In [14]:
# Create DataFrames with vectorized text
train_vec_df = pd.DataFrame(train_comment_trans, columns=vectorizer.get_feature_names_out())
test_vec_df = pd.DataFrame(test_comment_trans, columns=vectorizer.get_feature_names_out())

In [15]:
# Concatenate DataFrames
train_concat_df = pd.concat([train_comments_df, train_vec_df], axis=1)
test_concat_df = pd.concat([test_comments_df, test_vec_df], axis=1)

In [16]:
train_concat_df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,tags,is_clean,...,youcaltlas continue,young,youre,yourselfgo,yourselfgo fuck,youtube,zero,zionist,zuck,zuckerberg
0,4812fdf09bc8fc46,i m afraid that you didn t follow the history ...,0,0,0,0,0,0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,d9f2f633dce07c67,style border spacing px margin px ...,0,0,0,0,0,0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,33a8b2393d346005,clans before using any words,0,0,0,0,0,0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3f6fb24b6e8c9a11,the actors names in parenthesis i m reading ...,0,0,0,0,0,0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5b5399363c42d377,further to notability mireille astore was one...,0,0,0,0,0,0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
test_concat_df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,tags,is_clean,...,youcaltlas continue,young,youre,yourselfgo,yourselfgo fuck,youtube,zero,zionist,zuck,zuckerberg
0,73e51924905f2dcf,please refrain from removing content from wiki...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,13b1da3b1d0fa0d0,for example mostafa malekian is a thinker whic...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,351543aa0bba57ee,redirect talk five pence irish coin,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,ebf96fb0a6a8cbb9,in published dictionaries and,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,ad01d6108ec293a4,i wouldn t worry about it too much for the mo...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
# Drop NaN observations after vectorizing
test_concat_df.dropna(axis=0, inplace=True)

In [19]:
test_concat_df.shape

(4952, 5013)

In [20]:
print(train_concat_df.isna().any(axis=1).sum())
print(test_concat_df.isna().any(axis=1).sum())

0
0


### Save CSV files for modeling

In [21]:
train_concat_df.to_csv("../data/train_ready.csv", index=False)
test_concat_df.to_csv("../data/test_ready.csv", index=False)