# Toxic Comment Classification

## Importing Libraries:

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
from wordcloud import STOPWORDS
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer, PorterStemmer
import math
from collections import Counter
from sklearn.model_selection import train_test_split,cross_val_score
import string
import os


## Reading Data:

In [None]:
# Load the training CSV with the `id` column as the index (parsed with pandas' Python engine),
# then preview the first rows.
train_df = pd.read_csv('../input/jigsaw-unintended-bias-in-toxicity-classification/train.csv', index_col='id', engine='python')
train_df.head()

In [None]:
# Load the test CSV the same way as the training data (`id` as index, Python parsing engine),
# then preview the first rows.
test_df = pd.read_csv('../input/jigsaw-unintended-bias-in-toxicity-classification/test.csv', index_col='id', engine='python')
test_df.head()

**There are no null values in the `target` and `comment_text` columns.**

In [None]:
# Report the (rows, columns) shape of the training and test frames.
print("Train and test shape: {} {}".format(train_df.shape, test_df.shape))

## Exploratory Data Analysis:

### 1. Target Feature:

In [None]:
# Label each comment "non-toxic" when its target score is below 0.5, otherwise "toxic".
temp = train_df['target'].apply(lambda x: "non-toxic" if x < 0.5 else "toxic")

fig, ax = plt.subplots(1, 1, figsize=(5, 5))
total = float(len(temp))

# Pass the labels through the `x` keyword: seaborn >= 0.12 treats the first
# positional argument as `data`, not `x`. Drawing on `ax` explicitly guarantees
# that the bars annotated below are the ones this call created.
cntplot = sns.countplot(x=temp, ax=ax)
cntplot.set_title('Percentage of non-toxic and toxic comments')

for p in ax.patches:
    # The bar height is the number of comments in that class; annotate it as a
    # percentage of all comments, centred just above the bar.
    height = p.get_height()
    ax.text(p.get_x() + p.get_width()/2.0, height + 3, '{:1.2f}%'.format(100*height/total), ha='center')

plt.show()

**About 92% of the comments in the dataset are non-toxic and only 8% are toxic.**

### 2. Toxicity Subtype Features:
- severe_toxicity
- obscene
- threat
- insult
- identity_attack

In [None]:
# Find the dominant toxicity subtype of a comment (used to count subtypes within the toxic comments data):
def get_comment_nature(row):
    """Return the name of the toxicity subtype with the highest score in ``row``.

    ``row`` is indexed by subtype name (e.g. a DataFrame row). When several
    subtypes share the highest score, the first one in the order
    severe_toxicity, obscene, identity_attack, insult, threat is returned.
    """
    subtypes = ('severe_toxicity', 'obscene', 'identity_attack', 'insult', 'threat')
    scores = np.array([row[name] for name in subtypes])
    # np.argmax yields the position of the first maximum, which maps straight
    # back onto the subtype names.
    return subtypes[np.argmax(scores)]

In [None]:
# Keep only the toxic comments, using the same cut-off (target >= 0.5) as the
# toxic / non-toxic labelling, then tag each one with its highest-scoring subtype.
temp = train_df[train_df['target'] >= 0.5]
x = temp.apply(get_comment_nature, axis=1)
fig, ax = plt.subplots(1, 1, figsize=(7, 7))
total = float(len(x))

# Pass the labels through the `x` keyword: seaborn >= 0.12 treats the first
# positional argument as `data`, not `x`. Drawing on `ax` explicitly guarantees
# that the bars annotated below are the ones this call created.
cntplot = sns.countplot(x=x, ax=ax)
cntplot.set_title('Percentage of toxicity nature in toxic comments data')

for p in ax.patches:
    # The bar height is the number of toxic comments with that dominant subtype;
    # annotate it as a percentage of all toxic comments.
    height = p.get_height()
    ax.text(p.get_x() + p.get_width()/2.0, height + 3, '{:1.2f}%'.format(100*height/total), ha='center')

plt.show()

**Only 8% of our training data is toxic. Of those toxic comments, 81% are insults, 8.37% are identity attacks, 7.20% are obscene, 3.35% are threats, and a very small share are severely toxic.**

## Preprocessing Text and Train-Test Split:

In [None]:
stemmer = SnowballStemmer("english")
stop_words = set(stopwords.words('english'))


def preprocess(text_string):
    """Turn a raw comment into a space-separated string of stemmed tokens.

    The text is lower-cased, every run of characters other than ASCII letters
    and digits is replaced by a single space, tokens found in ``stop_words``
    are dropped, and the remaining tokens are stemmed with ``stemmer``.
    """
    cleaned = re.sub('[^A-Za-z0-9]+', ' ', text_string.lower())
    # Stop words are filtered on the raw token, before stemming.
    kept_stems = [stemmer.stem(token) for token in cleaned.split() if token not in stop_words]
    return ' '.join(kept_stems)

In [None]:
%%time
# Run preprocess() on every training comment and store the result in a new column.
train_df['preprocessed_text'] = train_df['comment_text'].apply(preprocess)

In [None]:
# Preview the training frame, now including the `preprocessed_text` column.
train_df.head()

In [None]:
%%time
# Run preprocess() on every test comment and store the result in a new column.
test_df['preprocessed_text'] = test_df['comment_text'].apply(preprocess)

In [None]:
# The model input is the cleaned text; the label is the raw `target` score.
# Double brackets keep both as single-column DataFrames.
feature = train_df[['preprocessed_text']]
output = train_df[['target']]
# Fix the seed so the hold-out split is reproducible across runs; without it
# every execution produces a different partition.
X_train, X_test, y_train, y_test = train_test_split(feature, output, random_state=42)

print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

In [None]:
# Keep only the cleaned text of the test set, as a single-column DataFrame.
test = test_df[['preprocessed_text']]
test.head()

In [None]:
# Preview the first rows of the training features.
X_train.head()

In [None]:
# Preview the first rows of the held-out features.
X_test.head()

In [None]:
# Save the splits and the test features as pickle files so that we don't need to preprocess again.
X_train.to_pickle('X_train.pkl')
X_test.to_pickle('X_test.pkl')
y_train.to_pickle('y_train.pkl')
y_test.to_pickle('y_test.pkl')
test.to_pickle('test.pkl')

In [None]:
# Reload the frames from the pickle files written by the save cell, so the notebook
# can resume from here without re-running the preprocessing.
X_train = pd.read_pickle('X_train.pkl')
X_test = pd.read_pickle('X_test.pkl')
y_train = pd.read_pickle('y_train.pkl')
y_test = pd.read_pickle('y_test.pkl')
test = pd.read_pickle('test.pkl')

In [None]:
# Display the reloaded training features.
X_train

In [None]:
# Display the reloaded training labels.
y_train

### Exploratory Data Analysis and Preprocessing Done