## Preprocessing and Training Data Development

### Import Libraries

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer # Text processing

### Load Data

In [2]:
train_file_path = "../data/train_EDA.csv"
test_file_path = "../data/test_EDA.csv"
train_df = pd.read_csv(train_file_path)
test_df = pd.read_csv(test_file_path).fillna(" ")

# Examine first 5 train observations
train_df.sample(5)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,tags,is_clean
37398,63dcc7a817df2df4,Ben_Ben \n\nI hate you. Nobody on wikipedia li...,1,0,0,0,1,0,2,0
88019,eb7570a9496abd7c,"""\n\n """"was a successful American wrestler"""" \...",0,0,0,0,0,0,0,1
50332,868dc56435cabd8a,The thing is that Wikipedia does not engage in...,0,0,0,0,0,0,0,1
105393,33d2c21b9f34fba1,It's kind of odd to be described as an obvious...,0,0,0,0,0,0,0,1
92583,f798a51ec22dd649,Edit request on April 17 2013 \n\nBoth New Zea...,0,0,0,0,0,0,0,1


In [3]:
test_df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,tags,is_clean
0,0001ea8717f6de06,Thank you for understanding. I think very high...,0,0,0,0,0,0,0,1
1,000247e83dcc1211,:Dear god this site is horrible.,0,0,0,0,0,0,0,1
2,0002f87b16116a7f,"""::: Somebody will invariably try to add Relig...",0,0,0,0,0,0,0,1
3,0003e1cccfd5a40a,""" \n\n It says it right there that it IS a typ...",0,0,0,0,0,0,0,1
4,00059ace3e3e9a53,""" \n\n == Before adding a new product to the l...",0,0,0,0,0,0,0,1


At this point, the dataset has been preprocessed. It does not have null values, nor values that do not make sense. We have added the column `is_clean` that has the value of `1` when `comment_text` is not labeled as an *offensive* comment.

### Dealing with Text Data

In [4]:
# Copy train dataset for manipulation
train_comments_df = train_df.copy(deep=True)

# Copy test dataset for transformation
test_comments_df = test_df.copy(deep=True)

In [5]:
# Print shape of both comments dataframes
print("Train dataset: ", train_comments_df.shape)
print("Test dataset: ", test_comments_df.shape)

Train dataset:  (159571, 10)
Test dataset:  (63978, 10)


In [6]:
# Print a sample comment.
print(train_comments_df.loc[149313,"comment_text"])

hey 

why must you be such a shithole? fuck off


In [7]:
def prep_comments(df):

    # Remove special characters from comments
    df["comment_text"] = df["comment_text"].str.replace("[^a-zA-Z]", " ", regex=True)

    # Convert to lower case
    df["comment_text"] = df["comment_text"].str.lower()

    # Length of text
    df["char_count"] = df["comment_text"].str.len()

    # Word counts
    df["word_count"] = df["comment_text"].str.split().str.len()

    # Average length of word
    df["avg_word_len"] = df["char_count"] / df["word_count"]
    return df

# Preprocess train and test comments
label_names = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
train_comments_df = prep_comments(train_comments_df)
test_comments_df = prep_comments(test_comments_df)

# Get rid of observations with empty comments
train_comments_df = train_comments_df[train_comments_df["word_count"] != 0]
test_comments_df = test_comments_df[test_comments_df["word_count"] != 0]

In [8]:
train_comments_df.sample(n=5)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,tags,is_clean,char_count,word_count,avg_word_len
154947,b693737e7c8aae9d,do you know what a circular argument is this ...,0,0,0,0,0,0,0,1,67,13,5.153846
33973,5a9b403f7d673a67,not done it s not clear what changes you w...,0,0,0,0,0,0,0,1,153,27,5.666667
121321,891d1df791bc3baa,admin signed off on the merger months ago here,0,0,0,0,0,0,0,1,49,9,5.444444
44053,7599db3a8fa2b2eb,as it deals with the company s chairman,0,0,0,0,0,0,0,1,41,8,5.125
103856,2ba6f13596ccd97b,still no article on boylstons per se not e...,0,0,0,0,0,0,0,1,615,100,6.15


In [9]:
test_comments_df.sample(n=5)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,tags,is_clean,char_count,word_count,avg_word_len
21366,54d5c940b1fb3fe3,to kemet the possibility that charlotte...,0,0,0,0,0,0,0,1,2125,359,5.91922
6810,1b7c337d10f4eae5,good to know that main pages c...,0,0,0,0,0,0,0,1,121,21,5.761905
3324,0d89aa52e170eddc,see the near the beginning of this ...,0,0,0,0,0,0,0,1,81,11,7.363636
25621,6605c47e5557a645,airasia quality of service dear sfan...,0,0,0,0,0,0,0,1,721,121,5.958678
39844,9ee8f806d027b2e8,neutrality tag why has this tag been...,0,0,0,0,0,0,0,1,59,9,6.555556



### Vectorize text with scikit-learn

In [10]:
# Initializing the vectorizer
vectorizer = TfidfVectorizer(stop_words="english",
                             max_df=0.5,
                             min_df=1,
                             ngram_range=(1, 2),
                             max_features=1000)

# Transforming text
vectorizer.fit(train_comments_df["comment_text"])
train_comment_trans = vectorizer.transform(train_comments_df["comment_text"]).toarray()
test_comment_trans = vectorizer.transform(test_comments_df["comment_text"]).toarray()

In [11]:
print("Vectorized train DataFrame: ", train_comment_trans.shape)
print("Vectorized test DataFrame: ", test_comment_trans.shape)

Vectorized train DataFrame:  (159564, 1000)
Vectorized test DataFrame:  (63578, 1000)


In [12]:
# Create DataFrames with vectorized text
train_vec_df = pd.DataFrame(train_comment_trans, columns=vectorizer.get_feature_names_out())
test_vec_df = pd.DataFrame(test_comment_trans, columns=vectorizer.get_feature_names_out())

In [13]:
# Concatenate DataFrames
train_concat_df = pd.concat([train_comments_df, train_vec_df], axis=1)
test_concat_df = pd.concat([test_comments_df, test_vec_df], axis=1)

In [14]:
train_concat_df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,tags,is_clean,...,writing,written,wrong,wrote,www,yeah,year,years,yes,york
0,0000997932d777bf,explanation why the edits made under my userna...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.371103
1,000103f0d9cfb60f,d aww he matches this background colour i m s...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,000113f07ec002fd,hey man i m really not trying to edit war it...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0001b41b1c6bb37e,more i can t make any real suggestions on im...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0001d958c54c6e35,you sir are my hero any chance you remember...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
test_concat_df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,tags,is_clean,...,writing,written,wrong,wrote,www,yeah,year,years,yes,york
0,0001ea8717f6de06,thank you for understanding i think very high...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,000247e83dcc1211,dear god this site is horrible,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0002f87b16116a7f,somebody will invariably try to add relig...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0003e1cccfd5a40a,it says it right there that it is a type ...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.154027
4,00059ace3e3e9a53,before adding a new product to the lis...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Save CSV files for modeling

In [16]:
train_concat_df.to_csv("../data/train_ready.csv", index=False)
test_concat_df.to_csv("../data/test_ready.csv", index=False)