# Rate Severity of Toxic Comments: Preprocessing

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the labelled training comments, report the frame's shape and preview
# the first rows.
train_path = "../input/jigsaw-toxic-comment-classification-challenge/train.csv"
train = pd.read_csv(train_path)
print(train.shape)
train.head()

In [None]:
# Load the comments that have to be scored, report the frame's shape and
# preview the first rows.
data_path = "../input/jigsaw-toxic-severity-rating/comments_to_score.csv"
data = pd.read_csv(data_path)
print(data.shape)
data.head()

In [None]:
# Weight applied to each label column; every label column of `train` is
# multiplied in place by its weight.
# NOTE: re-running this cell multiplies the weights in again, so reload
# `train` before re-running.
label_score = {'obscene': 0.16, 'toxic': 0.32, 'threat': 1.5,
               'insult': 0.64, 'severe_toxic': 1.5, 'identity_hate': 1.5}

for category, weight in label_score.items():
    train[category] = train[category] * weight

# Severity is the row-wise sum of all columns positioned from 'toxic' through
# 'identity_hate' (inclusive) -- assumes those are exactly the label columns.
train['severity'] = train.loc[:, 'toxic':'identity_hate'].sum(axis=1)

# Balance the data: keep every row with positive severity and an equally sized
# random sample of zero-severity rows.
is_toxic = train['severity'] > 0
count = int(is_toxic.sum())
zero_severity = train[train['severity'] == 0]
# A fixed seed makes the undersampling reproducible between runs; capping `n`
# avoids a ValueError if there are fewer zero-severity rows than toxic ones.
non_toxic = zero_severity.sample(n=min(count, len(zero_severity)), random_state=42)
train = pd.concat([train[is_toxic], non_toxic])
train.shape

In [None]:
# Histogram of the combined severity score over the balanced training frame.
n, bins, patches = plt.hist(train['severity'])
axes = plt.gca()  # the axes plt.hist just drew on
axes.set_title('Histogram of Severity')
axes.set_xlabel('severity')
axes.set_ylabel('count')
axes.grid(True)
plt.show()

## Exploratory Data Analysis & Lemmatization - NLTK

In [None]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

lemmatizer = WordNetLemmatizer()
stop = stopwords.words('english')
# Set copy of the stop-word list: membership is tested once per token, so
# O(1) lookups matter here.
stop_set = set(stop)

# Any token containing one of these substrings is collapsed to the substring
# itself. Order matters: the first match wins.
_PROFANITY_ROOTS = ("fuck", "shit", "dick", "bitch")


def _normalize_comment(text):
    """Return *text* with profanity collapsed, words lemmatized and stop words removed.

    The text is split on single spaces. Each token is replaced by the first
    profanity root it contains, or otherwise by its WordNet lemma; tokens that
    end up in the English stop-word list are dropped. Punctuation is kept on
    purpose, as it might represent emotions.
    """
    kept = []
    for word in text.split(' '):
        root = next((r for r in _PROFANITY_ROOTS if r in word), None)
        token = root if root is not None else lemmatizer.lemmatize(word)
        if token not in stop_set:
            kept.append(token)
    return ' '.join(kept)


# TODO: remove emojis / special characters
# Lower-case, flatten newlines (literal replacement, made explicit so the
# result does not depend on the pandas default for `regex`), then normalize
# every comment in one pass instead of assigning row by row through `.iloc`.
copy = (
    train['comment_text']
    .str.lower()
    .str.replace("\n", " ", regex=False)
    .apply(_normalize_comment)
)

In [None]:
# Put the preprocessed text next to the raw text, one comment per row, with a
# fresh 0..n-1 index, and preview the first ten rows.
compare = (
    pd.DataFrame([copy, train['comment_text']])
    .transpose()
    .reset_index(drop=True)
)
compare.head(10)

## Vectorization

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words counts for every preprocessed comment (rows) over the whole
# vocabulary (columns); fit_transform returns a sparse matrix.
vectorizer = CountVectorizer()
term_counts = vectorizer.fit_transform(copy)

# Keep only terms that occur more than 50 times in total. The filter is applied
# while the matrix is still sparse, so only the surviving columns are ever
# densified -- converting the full vocabulary to a dense array first is what
# makes this step memory-hungry.
total_per_term = np.asarray(term_counts.sum(axis=0)).ravel()
frequent_terms = np.flatnonzero(total_per_term > 50)

# NOTE: `copy` and `compare` are no longer needed for vectorization and can be
# released here (e.g. `del copy, compare`) if memory is tight.

# Dense frame with columns numbered 0..k-1 (the default RangeIndex).
vectorized = pd.DataFrame(term_counts[:, frequent_terms].toarray())
vectorized

In [None]:
# Weighted label columns ('toxic' through 'identity_hate'), re-indexed 0..n-1.
scores = train.loc[:, 'toxic':'identity_hate'].reset_index(drop=True)
scores.head(5)

In [None]:
# Append the label columns to the right of the term-count columns. The labels
# are re-indexed 0..n-1 first so the column-wise concat joins on that index.
labels_frame = train.loc[:, 'toxic':'identity_hate'].reset_index(drop=True)
vectorized = pd.concat([vectorized, labels_frame], axis=1)
vectorized

## Modeling

In [None]:
from sklearn.model_selection import train_test_split
# TODO: make a for loop with all score categories

# `vectorized` holds the term counts AND the label columns. The labels must be
# removed from the feature matrix: leaving them in hands the model the target
# ('toxic') as an input feature, which inflates train and test scores.
label_names = list(train.loc[:, 'toxic':'identity_hate'].columns)
X = vectorized.drop(columns=label_names)
y = vectorized['toxic']
# Hold out 20% of the rows for evaluation.
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [None]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge, LogisticRegression

In [None]:
# Ridge regression baseline.
# NOTE(review): an unusually high score was observed here -- verify that the
# feature matrix does not contain the target column.
model = Ridge()
model.fit(x_train, y_train)

# Score each split once and reuse the values; the scores are deterministic for
# a fitted model, so the reported numbers are unchanged.
train_score = model.score(x_train, y_train)
test_score = model.score(x_test, y_test)

# "Generalization Error" is reported as the absolute gap between the two scores.
score = [model, train_score, test_score, abs(test_score - train_score)]
print(pd.DataFrame(data=np.array([score], dtype=object), columns=[
          'Model', 'Train Set Score', 'Test Set Score', 'Generalization Error']))

In [None]:
# Cannot run because it takes too much RAM
# from sklearn.neighbors import KNeighborsRegressor
# models = [KNeighborsRegressor(n_neighbors=10, weights='distance')]
# result = []
# for model in models:
#     model.fit(x_train, y_train)
#     score = [model, model.score(x_train, y_train), model.score(
#         x_test, y_test), abs(model.score(x_test, y_test) - model.score(x_train, y_train))]
#     result.append(score)