In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input directory and print the full path of every file found.
for folder, _, files_in_folder in os.walk('/kaggle/input'):
    for file_name in files_in_folder:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Load dataset

In [None]:
import pandas as pd

## Comments to score. The frame is named `tr` because the rest of this notebook
## uses it as the training frame: it is lemmatized, labelled with polarity,
## tokenized and fed to the model.
tr = pd.read_csv('/kaggle/input/jigsaw-toxic-severity-rating/comments_to_score.csv')
tr.head(5)

In [None]:
## Validation data: read the CSV and preview its first five rows
val = pd.read_csv('/kaggle/input/jigsaw-toxic-severity-rating/validation_data.csv')
val.head()

In [None]:
## Sample submission: read the CSV and preview its first five rows
sub = pd.read_csv('/kaggle/input/jigsaw-toxic-severity-rating/sample_submission.csv')
sub.head()


# EDA on Comments to score dataset

In [None]:
## The 5 texts that occur most often in the `less_toxic` column of the validation set
## (value_counts orders by frequency, not by toxicity)

val['less_toxic'].value_counts().head(5)


In [None]:
# Distinct comment texts, shown raw to eyeball what needs cleaning.
tr['text'].unique()

In [None]:
# ## There are several unwanted punctuation marks in the text.
# ## Some comments have repeated dots and words. We have to clean them

# tr['text'] = tr['text'].apply(lambda x:x.split('...')[0])
# tr['text']

# Data cleaning

In [None]:
import re
import string

# Compiled once so the patterns are not rebuilt for every comment.
_PUNCTUATION_RE = re.compile('[%s]' % re.escape(string.punctuation))
_NON_ALNUM_RE = re.compile(r'[^a-zA-Z0-9\s]')
_MULTI_SPACE_RE = re.compile(' +')


def _clean_text(text):
    """Normalise one comment string; see `clean` for the individual steps."""
    text = text.lower()
    # Line breaks and ellipses become spaces. This must happen before the
    # punctuation is stripped, otherwise '...' is already gone and the words
    # on either side of it are glued together.
    text = text.replace('\n', ' ').replace('\r', ' ').replace('...', ' ')
    text = _PUNCTUATION_RE.sub('', text)
    # Drop anything that is not an ASCII letter, digit or whitespace.
    text = _NON_ALNUM_RE.sub('', text)
    # Collapse space runs last, so spaces introduced above are squeezed too.
    return _MULTI_SPACE_RE.sub(' ', text)


def clean(d):
    """Return a cleaned copy of a Series of comment strings.

    Each string is lowercased, has line breaks and '...' turned into spaces,
    has punctuation and other non-alphanumeric characters removed, and has
    runs of spaces collapsed to a single space.

    Parameters
    ----------
    d : pandas.Series of str
        Raw comment texts. The input Series is not modified.

    Returns
    -------
    pandas.Series of str
        The cleaned texts, with the same index as `d`.
    """
    return d.apply(_clean_text)

In [None]:
# NOTE(review): the result of clean() is not assigned back, so tr['text'] is
# left unchanged by this cell and later cells work on the uncleaned text.
clean(tr['text'])

## Stopwords removal & Lemmatization

Stopwords are the most commonly occurring words in a text; they carry little meaning on their own. Ex: 'I', 'This', 'on', 'there', 'here', 'is', 'in'.

We will use spaCy, an NLP library (separate from NLTK), to remove stopwords from the cleaned train set.

Lemmatization removes inflectional endings only and returns the base or dictionary form of a word, which is known as the lemma.
Ex: rained, raining, has rained, going to rain will be reduced to 'rain'

In [None]:
import spacy

# Small English pipeline; the parser and NER components are switched off.
nlp = spacy.load('en_core_web_sm',disable=['parser', 'ner'])


def _lemmas_without_stopwords(text):
    """Return the lemmas of all non-stopword tokens in `text`, joined by single spaces."""
    kept_lemmas = []
    for tok in nlp(text):
        if not tok.is_stop:
            kept_lemmas.append(tok.lemma_)
    return ' '.join(kept_lemmas)


# Lemmatization with stopwords removal (overwrites the text column in place)
tr['text'] = tr['text'].apply(_lemmas_without_stopwords)

In [None]:
# Preview the first five comments after stopword removal and lemmatization.
tr['text'].head(5)

# Feature engineering

Introducing the feature Polarity to score toxicity of comments.

Sentiment analysis measures how positive or negative a piece of text is (its polarity) and how opinionated it is (its subjectivity). Here only the polarity score is used.


In [None]:
from textblob import TextBlob

# Polarity of each comment, rounded to two decimals. The ndigits argument
# belongs to round(), not to Series.apply(); passing it to apply() rounded
# every score to a whole number.
tr['polarity'] = tr['text'].apply(lambda x: round(TextBlob(x).sentiment.polarity, 2))

print(" 3 Comments which are positive (highest polarity)")
# Sort descending so the listing really is the highest-polarity comments, and
# select by label (.loc) because the sorted index holds labels, not positions.
top_positive_idx = tr['polarity'].sort_values(ascending=False).head(3).index
for rank, comment in enumerate(tr.loc[top_positive_idx, 'text'], start=1):
    print(rank, comment, '\n')

# Bucket the score by its sign: > 0 positive, < 0 negative, exactly 0 neutral.
tr['polarity_category'] = ['positive' if score > 0
                           else 'negative' if score < 0
                           else 'neutral'
                           for score in tr['polarity']]


In [None]:
# Summary statistics for each polarity category.
tr.groupby(by=['polarity_category']).describe()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# One bar per polarity category, summarising the polarity scores in that category.
ax = sns.barplot(data=tr, x='polarity_category', y='polarity')
ax.set_title('Sentiment analysis of comments')

In [None]:
# Preview the frame with the new polarity columns added.
tr.head(5)

# Text Tokenization

Tokenization is the process of splitting a string of text into a list of tokens. A token is a unit of the text: a word is a token in a sentence, and a sentence is a token in a paragraph.

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Fit the tokenizer's vocabulary on the comment text.
t = Tokenizer()
input_text = tr['text'].values
t.fit_on_texts(input_text)
# Encode each comment as a list of integer token ids, then bring every
# sequence to length 512 so they stack into one 2-D array.
txt_sequences = t.texts_to_sequences(input_text)
txt_vectors = pad_sequences(txt_sequences, maxlen = 512)
# NOTE(review): this expression is not the last in the cell, so its value is not displayed.
txt_vectors.shape

# Number of distinct tokens the tokenizer learned.
print('Text tokens count ',len(t.word_index))


## Build the deep learning model

An LSTM (Long Short-Term Memory) model reads the padded token sequences one step at a time and can learn and remember patterns that span many tokens.

In [None]:
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, Dropout
from tensorflow.keras.losses import BinaryCrossentropy

# Size the embedding table from the fitted tokenizer instead of a hard-coded
# guess: token ids run from 1 to len(word_index) and 0 is the padding value,
# so the table needs len(word_index) + 1 rows to cover every id.
vocab_size = len(t.word_index) + 1
embedding_dim = 100

# Embedding -> LSTM -> small dense head with a single sigmoid output.
model = Sequential([
    Embedding(vocab_size, embedding_dim, name='embedding'),
    LSTM(64),
    Dropout(0.2),
    Dense(16, activation='relu'),
    Dropout(0.2),
    Dense(1, activation='sigmoid')
])

# The last layer already applies a sigmoid, so the loss receives probabilities.
# from_logits=True would apply a second sigmoid inside the loss.
model.compile(optimizer='adam',
              loss=BinaryCrossentropy(from_logits=False),
              metrics=['accuracy'])
model.summary()

In [None]:
from tensorflow.keras.utils import plot_model

# Draw the layer graph of the model built above.
plot_model(model)

# Train the model



In [None]:
# Import callbacks from tensorflow.keras, the same package the model was built
# with, rather than from the standalone keras package.
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
import datetime

# Keep only the best weights seen so far. The held-out validation loss is
# monitored because validation_split is set below; training loss tends to keep
# falling and would favour the most over-fitted epoch.
cp_file = './lstm_model.h5'
cp = ModelCheckpoint(cp_file,
                     monitor='val_loss',
                     verbose=0,
                     save_best_only=True, mode='min')

# Stop once validation loss has not improved for 3 consecutive epochs.
es = EarlyStopping(patience=3,
                   monitor='val_loss',
                   #restore_best_weights=True,
                   mode='min',
                   verbose=1)

# Write TensorBoard logs to a per-run, timestamped sub-directory. The original
# computed `logdir` but then passed the literal "logs", mixing all runs together.
logdir = os.path.join("logs", datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))
tensorboard_callback = tf.keras.callbacks.TensorBoard(logdir)

# Binary cross-entropy is defined for targets in [0, 1], while polarity lies
# in [-1, 1]. Rescale linearly so -1 -> 0, 0 -> 0.5, 1 -> 1; the ordering of
# the targets is unchanged.
targets = (tr['polarity'].values + 1.0) / 2.0

# model train
history = model.fit(txt_vectors, targets,
                    batch_size=500,
                    epochs=10,
                    validation_split=0.1,
                    callbacks=[es, cp, tensorboard_callback],
                    shuffle=True,
                    )





In [None]:

# %reload_ext tensorboard
# !kill 302
# %tensorboard --logdir logs

# Prediction

In [None]:
# Predict on the same padded comment vectors the model was fitted on, then
# plot a histogram to inspect how the predictions are distributed.
preds = model.predict(txt_vectors)
plt.hist(preds,label='Model Prediction')
plt.legend()

# Submission

In [None]:
from scipy.stats import rankdata

# Pair every comment id with the rank of its prediction and write the
# submission file without the index column.
# NOTE(review): the model is fitted on sentiment polarity, so a higher rank
# means a higher predicted polarity. Confirm this is the ordering the
# submission score is expected to have.
u = tr[['comment_id']].assign(score=rankdata(preds))
u.to_csv('submission.csv',index=False)