# Imports

In [None]:
# base imports
import os
import numpy as np
import pandas as pd

# text cleaning imports
from bs4 import BeautifulSoup
import re
import nltk

# ml imports
import tensorflow as tf
from tensorflow.keras import callbacks, models, layers
from sklearn.model_selection import train_test_split, GridSearchCV
from xgboost import XGBRegressor

# tokenization
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# bag of words
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import Binarizer

# visualizations
from sklearn.metrics import mean_absolute_error
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Vocabulary-size constant; no code visible in this notebook reads it
# (the CountVectorizer below uses its own max_features value).
MAX_WORDS = 25_000

# The stop-word corpus has to be downloaded before its reader is used,
# which is why this import sits after the download call.
nltk.download('stopwords')
from nltk.corpus import stopwords

# Create train data

The original competition was a multi-label task: each comment carries six toxicity labels.

We collapse those labels into a single binary toxic / non-toxic target.

In [None]:
# The six per-comment toxicity labels shipped with the training CSV.
label_columns = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# Load the data, flag a comment as toxic (y = 1) when any label is set,
# and keep only the comment text (renamed to 'text') plus that flag.
df = (
    pd.read_csv("../input/jigsaw-toxic-comment-classification-challenge/train.csv")
    .assign(y=lambda frame: frame[label_columns].sum(axis=1).gt(0).astype(int))
    .loc[:, ['comment_text', 'y']]
    .rename(columns={'comment_text': 'text'})
)
df.sample(5)

# Undersample

The dataset is heavily imbalanced, so we undersample the majority (non-toxic) class down to the number of toxic comments. Other strategies might work better.

In [None]:
# Boolean mask of the toxic rows; its sum is the size both classes end up with.
is_toxic = df['y'] == 1
min_len = is_toxic.sum()

# Draw the same number of non-toxic rows (fixed seed for reproducibility).
df_y0_undersample = df.loc[df['y'] == 0].sample(n=min_len, random_state=201)

# Toxic rows first, sampled non-toxic rows after; original index labels are kept.
df = pd.concat([df.loc[is_toxic], df_y0_undersample])
df['y'].value_counts()

# transform the data

In [None]:
# English stop words, stored as a set: clean() tests every word of every
# comment for membership, and a set makes that an O(1) lookup instead of a
# linear scan over the whole list.
stop_words = set(stopwords.words("english"))

# Bag-of-words counter capped at 5000 vocabulary entries. Tokenizer,
# preprocessor and stop-word handling are left at None because clean()
# already lower-cases the text and strips stop words before vectorizing.
vectorizer = CountVectorizer(
    analyzer="word",
    tokenizer=None,
    preprocessor=None,
    stop_words=None,
    max_features=5000,
)

# Default Binarizer: turns the counts into 0/1 presence indicators.
binarizer = Binarizer()

# Compiled once; matches every character that is not an ASCII letter.
_NON_LETTER_RE = re.compile(r"[^a-zA-Z]")


def clean(comment):
    """Normalize one raw comment into a space-separated string of words.

    Steps: strip HTML markup, replace every non-letter character with a
    space, lower-case, split on whitespace, and drop the words found in the
    module-level ``stop_words`` collection.

    Args:
        comment: Raw comment text (may contain HTML).

    Returns:
        The remaining words joined by single spaces; an empty string when
        nothing survives the filtering.
    """
    # Name the parser explicitly: without it BeautifulSoup picks whichever
    # parser happens to be installed (and warns), so results could differ
    # between environments.
    text = BeautifulSoup(comment, "html.parser").get_text()
    words = _NON_LETTER_RE.sub(" ", text).lower().split()
    return " ".join(w for w in words if w not in stop_words)
    
# Normalize every comment, then turn the cleaned text into binary
# bag-of-words features (the vectorizer's vocabulary is fitted here).
df['cleaned'] = df['text'].apply(clean)
bag_of_words = vectorizer.fit_transform(df['cleaned'])
X = binarizer.fit_transform(bag_of_words)

# Hold out 20% of the rows for validation, with a fixed seed.
X_train, X_val, y_train, y_val = train_test_split(
    X, df['y'], test_size=0.2, random_state=0
)
print(*(part.shape for part in (X_train, y_train, X_val, y_val)))

In [None]:
# Vocabulary learned by the fitted CountVectorizer, as a plain list of str.
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); fall back to the old method only where the new
# one does not exist yet.
if hasattr(vectorizer, "get_feature_names_out"):
    vocab = vectorizer.get_feature_names_out().tolist()
else:
    vocab = vectorizer.get_feature_names()
vocab[:20]

# Gradient boosting

In [None]:
import inspect

# Gradient-boosted regressor trained on the 0/1 labels; its continuous
# output is used as a toxicity score further down.
model_kwargs = {"n_estimators": 1000, "learning_rate": 0.05, "n_jobs": 4}
fit_kwargs = {"eval_set": [(X_val, y_val)], "verbose": True}

# Older xgboost releases take early_stopping_rounds in fit(); newer ones
# dropped it from fit() and expect it on the estimator instead. Inspect the
# installed signature so the cell runs on either.
if "early_stopping_rounds" in inspect.signature(XGBRegressor.fit).parameters:
    fit_kwargs["early_stopping_rounds"] = 5
else:
    model_kwargs["early_stopping_rounds"] = 5

model = XGBRegressor(**model_kwargs)
model.fit(X_train, y_train, **fit_kwargs)

In [None]:
# Predict on the held-out split and report the mean absolute error between
# the 0/1 labels and the regressor's outputs.
pred = model.predict(X_val)
mean_absolute_error(y_true=y_val, y_pred=pred)

# Validate

In [None]:
# Pairs of comments; the column names mark which one of each pair is the
# less toxic and which the more toxic.
df_val = pd.read_csv("../input/jigsaw-toxic-severity-rating/validation_data.csv")

# Reuse the vectorizer exactly as fitted on the training text. It is not
# refitted here: the model was trained against that vocabulary, so the
# feature columns must stay the same, and a refit whose result is thrown
# away only costs time.
X_less_toxic = binarizer.transform(vectorizer.transform(df_val['less_toxic'].apply(clean)))
X_more_toxic = binarizer.transform(vectorizer.transform(df_val['more_toxic'].apply(clean)))

In [None]:
# Score both sides of every validation pair with the trained model:
# p1 for the 'less_toxic' comments, p2 for the 'more_toxic' ones.
p1, p2 = (
    model.predict(features) for features in (X_less_toxic, X_more_toxic)
)

In [None]:
# Validation accuracy: fraction of pairs in which the 'less_toxic' comment
# received a strictly lower score than the 'more_toxic' one.
np.mean(p1 < p2)

# Submission

In [None]:
# Comments to be scored for the submission.
df_sub = pd.read_csv("../input/jigsaw-toxic-severity-rating/comments_to_score.csv")

# Same cleaning, vectorizing and binarizing steps as for the training text,
# using the already-fitted vectorizer.
cleaned_sub = df_sub['text'].apply(clean)
X_test = binarizer.transform(vectorizer.transform(cleaned_sub))
p3 = model.predict(X_test)

In [None]:
# Attach the model output to the submission frame as the 'score' column.
df_sub['score'] = p3

In [None]:
# Number of non-null scores (to compare with the number of distinct scores).
df_sub['score'].count()

In [None]:
# Number of distinct scores. Comments that share an identical score are tied
# and cannot be ordered relative to each other; the original author noted
# 9 such comments in their run.
df_sub['score'].nunique()

In [None]:
# Write the two submission columns, without the index, to submission.csv.
df_sub.to_csv("submission.csv", columns=['comment_id', 'score'], index=False)