In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Load dataset

In [None]:
import pandas as pd

## Working data set: comments_to_score.csv.
## Despite the "train" naming, this is the file whose comment_id column is
## written to submission.csv in the final cell of this notebook.
tr = pd.read_csv('/kaggle/input/jigsaw-toxic-severity-rating/comments_to_score.csv')
# Preview the first five rows.
tr.head(5)

In [None]:
## Validation data set (validation_data.csv)
## NOTE(review): `val` is only previewed here; no later cell visible in this
## notebook reads it.
val = pd.read_csv('/kaggle/input/jigsaw-toxic-severity-rating/validation_data.csv')
val.head(5)

In [None]:
## Sample submission data set (sample_submission.csv)
## NOTE(review): `sub` is only previewed here; the submission cell builds its
## own DataFrame instead of reusing it.
sub = pd.read_csv('/kaggle/input/jigsaw-toxic-severity-rating/sample_submission.csv')
sub.head(5)


# Data cleaning

In [None]:
import re
import string
def _clean_text(text):
    """Normalize one comment string; see `clean` for the list of steps."""
    text = text.lower()
    # Turn ellipses into a space *before* punctuation is stripped, so that
    # "wait...what" becomes "wait what" rather than "waitwhat".
    text = text.replace('...', ' ')
    # Remove ASCII punctuation.
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    # Remove everything that is not a letter, a digit or whitespace.
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    # Collapse every run of whitespace (spaces, tabs, \n, \r) into one space.
    # Done last so that gaps opened by the removals above are collapsed too.
    return re.sub(r'\s+', ' ', text)


def clean(d):
    """Return a cleaned copy of a Series of comment strings.

    Steps applied to every element:
      1. lowercase,
      2. replace '...' with a space,
      3. remove punctuation and any other non-alphanumeric character,
      4. replace line breaks and repeated whitespace with a single space.

    Parameters
    ----------
    d : pandas.Series of str
        The raw texts. The Series is not modified.

    Returns
    -------
    pandas.Series of str
        A new Series with the same index holding the cleaned texts. The
        caller has to store it (e.g. ``df['text'] = clean(df['text'])``)
        for the cleaning to have any effect.
    """
    return d.apply(_clean_text)

In [None]:
# NOTE(review): clean() does not modify the Series it receives, and its result
# is not assigned here, so tr['text'] is unchanged after this cell.
clean(tr['text'])

# Feature engineering

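We introduce a sentiment feature, polarity, and use it as a proxy for the toxicity of a comment.

Sentiment analysis measures how positive or negative a piece of text is (its polarity) and how opinionated it is (its subjectivity). Only the polarity is used here; TextBlob reports it as a number between -1 (most negative) and +1 (most positive).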


In [None]:
from textblob import TextBlob

# Sentiment polarity of every comment, rounded to two decimals.
# The precision must be the second argument of round(); placed outside the
# lambda it is handed to Series.apply instead and the polarity is rounded to
# a whole number.
tr['polarity'] = tr['text'].apply(lambda x: round(TextBlob(x).sentiment.polarity, 2))

print(" 3 Comments which are positive (highest polarity)")
# Highest polarity first. The sorted index holds labels, so the rows are
# looked up with .loc rather than the positional .iloc.
top_positive = tr['polarity'].sort_values(ascending = False).index[:3]
for rank, comment in enumerate(tr.loc[top_positive, 'text'], start = 1):
    print(rank, comment, '\n')


In [None]:
# Preview the frame again, now including the 'polarity' column added above.
tr.head(5)

In [None]:
# How often each polarity value occurs; this column is the model target (y) below.
tr['polarity'].value_counts()

In [None]:
# Target vector: the polarity score of every comment.
y = tr['polarity'].values
# Feature frame: everything except the target.
x = tr.drop(columns = 'polarity')

# Keep only the text column(s), on a fresh 0..n-1 index, for the corpus loop.
texts = x.drop(columns = 'comment_id').reset_index(drop = True)
texts.head()

# Cleaning Text

* Stemming: 

        Extract the root element of a word.

        Ex: raining, rained, had rained 

        Stem word: rain
        

* Stop words removal

        Stop words are very common words that carry little meaning on their own. Ex: 'I', 'this', 'on', 'there', 'here', 'is', 'in'.

        We will use the nltk library to remove stop words from the comment text.

In [None]:
## Create a clean text corpus
## containing lower case words + no stopwords + stem words

from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
import tqdm

ps = PorterStemmer()
# Load the stop-word list once and keep it as a set: calling
# stopwords.words('english') inside the loop rebuilds the list for every
# single word, and list membership is a linear scan on top of that.
english_stopwords = set(stopwords.words('english'))

corpus = []
for raw_text in texts['text']:
    # Letters only (everything else becomes a space), lowercased, split on whitespace.
    words = re.sub('[^a-zA-Z]', ' ', raw_text).lower().split()
    # Drop stop words, then reduce the remaining words to their stems.
    stems = [ps.stem(word) for word in words if word not in english_stopwords]
    corpus.append(' '.join(stems))

## Word2Vec

Train a Word2Vec model with gensim to turn every word of the cleaned corpus into a dense numeric vector.

In [None]:
import gensim

# Length of every word vector; reused for the embedding matrix and layer.
DIM = 100

# Word2Vec is fed one list of tokens per document.
X = [document.split() for document in corpus]
w2v_model = gensim.models.Word2Vec(
    sentences = X,
    vector_size = DIM,
    window = 10,
    min_count = 1,
)

# Text Tokenization

Tokenization is the process of splitting a text into a list of tokens. A token is a unit of a larger whole: a word is a token of a sentence, and a sentence is a token of a paragraph.

Each token is then mapped to an integer id, and every sequence is padded (or truncated) to the same length so that all comments fit into one fixed-size array.

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Learn the word -> integer id mapping from the token lists.
t = Tokenizer()
t.fit_on_texts(X)
vocab = t.word_index
print('Text tokens count ',len(vocab))

# One row more than there are words.
# NOTE(review): presumably because Keras word ids start at 1 and 0 is the padding id — confirm.
vocab_size = len(vocab) + 1

# Encode every document as a list of ids, then pad/truncate to 16 ids.
X = pad_sequences(t.texts_to_sequences(X), maxlen = 16)

In [None]:
# First three padded id sequences.
X[:3]

In [None]:
def get_weights_matrix(model, word_index=None, dim=None):
    """Build an embedding matrix whose row i is the vector of the word with id i.

    Parameters
    ----------
    model : object
        Anything exposing ``model.wv`` that supports ``word in model.wv`` and
        ``model.wv[word]`` (e.g. the trained gensim Word2Vec model).
    word_index : dict, optional
        Mapping word -> non-negative integer row id. Defaults to the
        notebook-level ``vocab`` (with ``vocab_size`` rows).
    dim : int, optional
        Length of each word vector. Defaults to the notebook-level ``DIM``.

    Returns
    -------
    numpy.ndarray
        Array of shape (rows, dim). ``rows`` is ``vocab_size`` when
        ``word_index`` is omitted, otherwise the largest id + 1. Rows that no
        word maps to, and rows of words unknown to ``model.wv``, stay zero.
    """
    if word_index is None:
        word_index = vocab
        n_rows = vocab_size
    else:
        n_rows = max(word_index.values(), default=0) + 1
    if dim is None:
        dim = DIM

    weights_matrix = np.zeros((n_rows, dim))

    for word, i in word_index.items():
        # A word the embedding model has never seen keeps its all-zero row
        # instead of aborting the whole build with a KeyError.
        if word in model.wv:
            weights_matrix[i] = model.wv[word]

    return weights_matrix


# Embedding matrix built from the trained Word2Vec model; it is handed to the
# Embedding layer as its initial weights when the network is defined.
embedding_vectors = get_weights_matrix(w2v_model) 

## Build the deep learning model

An LSTM (long short-term memory) network reads the padded token sequences step by step and can learn dependencies between words across a sequence.

In [None]:
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, Dropout
from tensorflow.keras.losses import BinaryCrossentropy

# Regression network: pretrained embeddings -> LSTM -> dense head.
# The target is the TextBlob polarity, which lies in [-1, 1]. A sigmoid output
# is confined to (0, 1) and can never reach a negative target, so the output
# layer uses tanh, whose range (-1, 1) matches the target.
model = Sequential([
    Embedding(vocab_size, output_dim =DIM,weights = [embedding_vectors],input_length = 16,name='embedding'),
    Dropout(0.2),
    LSTM(64),
    Dropout(0.2),
    Dense(64,activation = 'relu'),
    Dropout(0.2),
    Dense(1,activation = 'tanh')
])
# Accuracy counts exact matches and says nothing useful about a continuous
# target; mean absolute error is reported next to the MSE loss instead.
model.compile(optimizer='adam',loss= 'mean_squared_error',
             metrics = ['mae'])
model.summary()

In [None]:
from tensorflow.keras.utils import plot_model

# Draw the layer graph of the network defined above.
plot_model(model)

# Train the model



In [None]:
# Import the callbacks from tensorflow.keras like every other Keras import in
# this notebook; mixing the standalone `keras` package with `tensorflow.keras`
# objects can fail depending on the installed versions.
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from sklearn.model_selection import train_test_split

# Hold out 30% of the comments; random_state makes the split repeatable.
xtrain, xtest, ytrain, ytest = train_test_split(X, y, test_size =0.3,random_state = 0)

# Stop when the loss on the held-out data has not improved for 3 epochs.
# Monitoring the training loss ('loss') would never detect overfitting.
# NOTE: with epochs=1 below the callback cannot trigger; it only matters once
# the number of epochs is raised.
es = EarlyStopping(patience=3, 
                   monitor='val_loss', 
                   #restore_best_weights=True, 
                   mode='min', 
                   verbose=1)
# model train
# Only validation_data is passed: Keras ignores validation_split whenever
# validation_data is given, so supplying both was misleading.
history = model.fit(xtrain,ytrain,
                    validation_data = (xtest, ytest)
                    ,batch_size= 100, 
                    epochs= 1,
                    callbacks=[es],
                    shuffle=True,
                    )


# Prediction

In [None]:
import matplotlib.pyplot as plt
# Predict for every comment and scale the raw network output by 1000.
preds = 1000 * model.predict(X)

# Histogram of the scaled predictions on an explicit figure/axes pair.
fig, ax = plt.subplots()
ax.hist(preds, label='Model Prediction')
ax.legend()

# Submission

In [None]:
from sklearn.preprocessing import binarize
# Build the submission: one continuous score per comment_id.
u = pd.DataFrame()
u['comment_id'] = tr['comment_id']
# Keep the continuous prediction. binarize() maps every value above its
# default threshold of 0 to 1, which collapses the scores to (nearly) a
# single value and throws away the ranking between comments.
# The model predicts sentiment polarity (higher = more positive), so the sign
# is flipped to make more negative comments receive the higher score.
# NOTE(review): assumes a higher score must mean "more toxic" — confirm
# against the competition rules.
# ravel() flattens the (n, 1) prediction array into one value per row.
u['score'] = -preds.ravel()
u.to_csv('submission.csv',index=False)