In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Ridge, LinearRegression
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import TransformerMixin, BaseEstimator
import re 
import scipy
from scipy import sparse
import gc 
from IPython.display import display, HTML
from pprint import pprint
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestRegressor

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere beneath the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    paths = (os.path.join(root, name) for name in names)
    for path in paths:
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the zipped train/test CSVs of the Jigsaw toxic-comment challenge into DataFrames.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/jigsaw-toxic-comment-classification-challenge/train.csv.zip',
        '/kaggle/input/jigsaw-toxic-comment-classification-challenge/test.csv.zip',
    )
)
# Preview the first ten training rows.
df_train.head(10)

In [None]:
# Print column dtypes and non-null counts for the training frame.
# Author's observation from this output: there are no null values in the dataset.
df_train.info()

In [None]:
# Column names of the training frame.
df_train.columns

In [None]:
# Percentage of training rows (two decimals) in which each label is set.
total_rows = df_train.shape[0]
label_columns = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
for col_name in label_columns:
    flagged_rows = int((df_train[col_name] > 0).sum())
    print(col_name, "%.2f" % (flagged_rows / total_rows * 100))

In [None]:
# 'threat', 'identity_hate' and 'severe_toxic' are the rarest labels, so they are
# up-weighted (x4, x3, x2) before the labels are summed into a single toxicity score.
#
# The weight is applied to the binarised flag (value > 0) rather than multiplying the
# column by itself in place, so re-running this cell leaves each column at 0 / weight
# instead of compounding the multiplier (1 -> 4 -> 16 ...).
# NOTE(review): assumes the raw label columns are 0/1 flags — confirm against the data.
LABEL_WEIGHTS = {'threat': 4, 'identity_hate': 3, 'severe_toxic': 2}
for label, weight in LABEL_WEIGHTS.items():
    df_train[label] = (df_train[label] > 0).astype('int64') * weight

In [None]:
# Collapse the six label columns into one per-comment toxicity score (row-wise sum).
score_columns = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
df_train['toxicity_score'] = df_train[score_columns].sum(axis='columns')

In [None]:
# Distinct toxicity_score values that occur in the training data.
df_train['toxicity_score'].unique()

In [None]:
# Show up to five of the most toxic statements: rows whose toxicity score exceeds 11.
highly_toxic = df_train['toxicity_score'] > 11
df_train.loc[highly_toxic].head(5)

In [None]:
# Count the occurrences of each distinct toxicity score and draw them as a bar chart.
df_train_count = df_train['toxicity_score'].value_counts().sort_index()
scores, counts = df_train_count.index, df_train_count.values
plt.bar(scores, counts)
plt.title("Count of statements with toxicity score")

In [None]:
# Normalise the toxicity score by its largest observed value, so the maximum becomes 1.
max_score = df_train['toxicity_score'].max()
df_train['toxicity_score_norm'] = df_train['toxicity_score'].div(max_score)
# Frequency of each normalised score, ordered by score.
df_train['toxicity_score_norm'].value_counts().sort_index()

**The text needs to be cleaned so as to improve model performance.**
Reference - 
https://towardsdatascience.com/nlp-in-python-data-cleaning-6313a404a470

In [None]:
# Punctuation removal: display the characters in string.punctuation, which the
# cleaning functions below strip from the text.
import string
string.punctuation

In [None]:
# Worked example: strip punctuation characters from a sample sentence to check the approach.
example_sentence = "I love to wear 'adidas' shoes"
get_words = list(filter(lambda ch: ch not in string.punctuation, example_sentence))
print(''.join(get_words))

In [None]:
# Translation table that deletes every character in string.punctuation. It is built once
# so each call is a single pass over the string instead of a per-character Python loop.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)


def remove_punctuation(sentence):
    """Return `sentence` with every `string.punctuation` character removed.

    Args:
        sentence: the text to clean (a `str`).

    Returns:
        A new string with the same characters in the same order, minus punctuation.
        Whitespace and all other characters are left untouched.
    """
    return sentence.translate(_PUNCT_TABLE)
# Strip punctuation from every training comment, then preview the result.
df_train['text_wo_punct'] = df_train['comment_text'].apply(remove_punctuation)
df_train.head(5)

In [None]:
# Stop-word removal: load NLTK's English stop-word list and print its first 11 entries.
# NOTE(review): this needs the NLTK "stopwords" corpus to be available in the runtime —
# confirm it is installed, otherwise the lookup fails.
import nltk
stopword = nltk.corpus.stopwords.words('english')
print(stopword[:11])

In [None]:
def remove_stopwords(sentence, stopwords=None):
    """Return `sentence` with stop words removed.

    The previous body was a copy of `remove_punctuation`: it only stripped punctuation
    again and never consulted the stop-word list, so no stop word was ever removed.

    Args:
        sentence: the text to clean (a `str`); it is split on whitespace.
        stopwords: optional iterable of words to drop. Defaults to the module-level
            `stopword` list loaded from NLTK earlier in the notebook.

    Returns:
        The remaining words joined by single spaces. Matching is case-insensitive,
        and the original casing of the kept words is preserved.
    """
    if stopwords is None:
        # Resolved at call time so the notebook-level list is picked up by default.
        stopwords = stopword
    stop_set = {word.lower() for word in stopwords}
    # NOTE(review): the input has already had punctuation stripped, so any stop words
    # that contain an apostrophe will not match here — verify whether that matters.
    kept = [word for word in sentence.split() if word.lower() not in stop_set]
    return ' '.join(kept)
# Run the stop-word step over the punctuation-free text and preview ten rows.
df_train['text_wo_punct_and_stop'] = df_train['text_wo_punct'].apply(remove_stopwords)
df_train['text_wo_punct_and_stop'].head(10)



In [None]:
# Tokenizing and lemmatizing the sentences: create NLTK's whitespace tokenizer and
# WordNet lemmatizer.
# NOTE: the name `tokenizer` is rebound to a Keras Tokenizer further down in this notebook.
tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()



In [None]:
def lemmatize_sent(sentence):
    """Lemmatize each whitespace-separated token of `sentence`.

    Tokens come from `str.split()` instead of the module-level `tokenizer`. That name
    is rebound to a Keras `Tokenizer` later in the notebook, and the Keras object has
    no `tokenize` method, so calling this function on the test set after that point
    raised AttributeError. Splitting on whitespace and discarding empty strings gives
    the same tokens as NLTK's WhitespaceTokenizer.

    Args:
        sentence: the text to lemmatize (a `str`).

    Returns:
        The lemmatized tokens joined by single spaces.
    """
    lemmas = [lemmatizer.lemmatize(token) for token in sentence.split()]
    return ' '.join(lemmas)

In [None]:
# Lemmatize the cleaned training text and preview ten rows.
df_train['text_lemmatized'] = df_train['text_wo_punct_and_stop'].apply(lemmatize_sent)
df_train['text_lemmatized'].head(10)


**Approach #1: Using TF-IDF and Linear Regression analysis**

In [None]:
# TF-IDF vectorizer restricted to n-grams of length 3 to 5.
# NOTE(review): ngram_range=(3,5) excludes unigrams and bigrams — confirm this is intended.
vec_tokens = TfidfVectorizer(ngram_range=(3,5))

In [None]:
# Learn the n-gram vocabulary from the lemmatized training text and build its TF-IDF matrix.
token_words = vec_tokens.fit_transform(df_train['text_lemmatized'].values)
# To inspect the learned vocabulary: vec_tokens.vocabulary_

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn import metrics

In [None]:
# Train a linear regression that maps the TF-IDF features to the normalized toxicity score.
regressor = LinearRegression()
regressor.fit(token_words,df_train['toxicity_score_norm'])

In [None]:
# Predict with the fitted linear model. `X` was never defined in this notebook; the only
# feature matrix available at this point is the training TF-IDF matrix `token_words`,
# so these are in-sample predictions.
y_pred = regressor.predict(token_words)
# The predictions have no `to_csv` of their own, so wrap them in a pandas Series and
# export that to the working directory.
pd.Series(y_pred, name='toxicity_score_pred').to_csv('linear_regression_predictions.csv', index=False)

In [None]:
# Random forest with 5 trees and a fixed seed, fitted on the same TF-IDF features and target.
# NOTE: despite the `RFC_classifier` name, this is a RandomForestRegressor, not a classifier.
RFC_classifier = RandomForestRegressor(n_estimators = 5, random_state = 0)
RFC_classifier.fit(token_words,df_train['toxicity_score_norm'])

In [None]:
# Predict with the fitted random forest. `X` was never defined in this notebook; the only
# feature matrix available at this point is the training TF-IDF matrix `token_words`,
# so these are in-sample predictions.
y_pred = RFC_classifier.predict(token_words)
# The predictions have no `to_csv` of their own, so wrap them in a pandas Series and
# export that to the working directory.
pd.Series(y_pred, name='toxicity_score_pred').to_csv('random_forest_predictions.csv', index=False)

# Using a deep learning LSTM network to improve the model's R2 score

In [None]:
from tensorflow.keras.layers import Dense, Dropout, GRU, Embedding, LSTM, Bidirectional
from tensorflow.keras.models import Sequential, Model, load_model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# Hyperparameters shared by the tokenizer, the LSTM model and the training call below.
vocab_size = 50000      # passed to Embedding (input size) and Tokenizer (num_words)
max_seq_length = 100    # passed to Embedding (input_length) and pad_sequences (maxlen)
embedding_dim = 128     # output size of the Embedding layer
epochs = 10             # number of epochs passed to model.fit
batch_size = 64         # batch size passed to model.fit
lr = 0.001              # learning rate passed to the optimizer

In [None]:
# LSTM model: an embedding layer, three stacked LSTM layers (128 / 64 / 32 units) each
# followed by 10% dropout, and a single-unit ReLU output layer.
# Only the last LSTM collapses the sequence (return_sequences=False).
model = Sequential()
model.add(Embedding(vocab_size, embedding_dim, input_length = max_seq_length))
for units, keep_sequence in ((128, True), (64, True), (32, False)):
    model.add(LSTM(units, return_sequences = keep_sequence))
    model.add(Dropout(0.1))
model.add(Dense(1, activation="relu"))

In [None]:
# Fit a Keras Tokenizer (capped via num_words=vocab_size) on the lemmatized training text.
# NOTE: this rebinds the name `tokenizer`, which earlier held the NLTK whitespace tokenizer.
tokenizer = Tokenizer(num_words = vocab_size)
tokenizer.fit_on_texts(df_train['text_lemmatized'].values)
# `X_train` was never defined in this notebook. The sequences are built from the same
# column the tokenizer was fitted on, which also keeps them row-aligned with the
# `toxicity_score_norm` target used for training.
token_words_lstm = tokenizer.texts_to_sequences(df_train['text_lemmatized'].values)
# Bring every sequence to length `max_seq_length`.
token_words_lstm = pad_sequences(token_words_lstm, maxlen = max_seq_length)

In [None]:
# Compile the already-built `model` for regression with a mean-squared-error loss.
# Fixes over the previous cell:
#   * `lstm_model()` is not defined anywhere in the notebook, so the call is dropped;
#   * the stray indentation before `model.compile` made the cell a syntax error;
#   * `RMSProp` was never imported, so it is replaced by `Adam`, which the notebook
#     imports from tensorflow.keras.optimizers.
model.compile(
    optimizer=Adam(learning_rate=lr),
    loss='mean_squared_error',
    metrics=['MSE'],
)

In [None]:
# Train on the padded training sequences against the normalized toxicity score and keep
# the returned training history.
history = model.fit(
    x = token_words_lstm,
    y = df_train['toxicity_score_norm'],
    epochs = epochs,
    batch_size = batch_size,
)

In [None]:
# Apply the same three cleaning steps to the test comments that were applied to the training text.
df_test['text_wo_punct'] = df_test['comment_text'].apply(remove_punctuation)
df_test['text_wo_punct_and_stop'] = df_test['text_wo_punct'].apply(remove_stopwords)
df_test['text_lemmatized'] = df_test['text_wo_punct_and_stop'].apply(lemmatize_sent)


# Encode the test text with the already-fitted tokenizer and pad to the model's input length.
token_words_lstm_test = tokenizer.texts_to_sequences(df_test['text_lemmatized'].values)
# Fixed NameError: the argument was misspelled `oken_words_lstm_test`.
token_words_lstm_test = pad_sequences(token_words_lstm_test, maxlen = max_seq_length)
pred = model.predict(token_words_lstm_test)