In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf

#### Let us prepare the data

In [None]:
# Load the annotated comment pairs and preview the first rows.
validation_csv = "../input/jigsaw-toxic-severity-rating/validation_data.csv"
df_comments = pd.read_csv(validation_csv)
df_comments.head()

### Let us label the data

In [None]:
# Every row of df_comments holds a (less_toxic, more_toxic) pair from one worker.
# Turn each side of the pair into its own labelled row: 1 = more toxic, 0 = less toxic.
df_toxic = (
    df_comments[["worker", "more_toxic"]]
    .rename(columns={"more_toxic": "Text"})
    .assign(Target=1)
)

df_normal = (
    df_comments[["worker", "less_toxic"]]
    .rename(columns={"less_toxic": "Text"})
    .assign(Target=0)
)

# Stack both halves row-wise; the original row labels are kept as-is.
df_final = pd.concat([df_toxic, df_normal], axis=0)

df_final.shape




In [None]:
# Preview only the first rows instead of rendering the whole frame.
df_final.head()

### Let us use a deep learning approach for the model

## Step 1:
#### Converting text to lower case

In [None]:
# Vectorised lower-casing; unlike a per-row lambda it leaves missing values
# untouched instead of raising on them.
df_final["Text"] = df_final["Text"].str.lower()
df_final.head()

### Import Deep Learning Libraries

In [None]:
from tensorflow.keras import Sequential
from tensorflow.keras.layers import LSTM,Dense,Dropout,Embedding
from tensorflow.keras.preprocessing.text import Tokenizer

In [None]:
import re

# Split on runs of whitespace, swallowing any of , ! ? . newline = that sit
# directly in front of the whitespace. A raw string keeps the regex escapes
# (\? \. \s) from being treated as invalid Python string escapes.
pattern = r"[,!\?\.\n=]*\s+"
df_final["Text"] = df_final["Text"].apply(lambda x: re.split(pattern, x))
Tc = Tokenizer()
# Inspect one tokenised comment as a sanity check.
df_final["Text"].iloc[3]

### Let us remove punctuation from every word

In [None]:
def remove_puntuations(words):
    """Strip punctuation characters from each word and drop empty results.

    Parameters
    ----------
    words : iterable of str
        Tokens of a single comment.

    Returns
    -------
    list of str
        The tokens with newline . ? ! , : removed. Tokens that are empty,
        or that become empty once the punctuation is removed, are dropped.
    """
    # Strip first, filter afterwards: a token made only of punctuation (e.g. ":")
    # would otherwise survive as an empty string.
    stripped = (re.sub(r"[\n\.\?!,:]", "", word) for word in words)
    return [word for word in stripped if word != ""]

# Clean every tokenised comment, then look at one sample to check the result.
df_final["Text"] = df_final["Text"].map(remove_puntuations)
df_final["Text"].iloc[3]

### Let us prepare the vocabulary for training

In [None]:
sent = df_final["Text"].iloc[3]
sentences = list(df_final["Text"])

# Create the tokenizer that is actually fitted and used below. The previous
# `tc = Tokenizer()` was a typo that built an object nothing referenced.
Tc = Tokenizer()
Tc.fit_on_texts(sentences)
# Each word of the sample comment is mapped to its own (possibly empty) id list.
Tc.texts_to_sequences(sent)

# Let us try on unseen words 

In [None]:
# Probe the fitted tokenizer with a handful of words to see how each is encoded.
unseen_probe = ["hey","cricketer", "Down", "To", "Earth", "Still", "You", "Leave", "Wicket"]
Tc.texts_to_sequences(unseen_probe)

### As the word "cricketer" is not in the corpus, it has been assigned an empty token

#### An RNN needs fixed-length input sequences, so let us pad and truncate the data

- Padding is added at the start (pre-padding)
- Truncation is applied at the end (post-truncation)

In [None]:

def generate_seq(sent,length=30):
    """Turn per-word id lists into one fixed-length list of ids.

    Parameters
    ----------
    sent : list of list of int
        One entry per word; an entry with exactly one id contributes that id,
        any other entry (empty or multi-id) contributes 0.
    length : int, default 30
        Size of the returned list.

    Returns
    -------
    list of int
        Ids left-padded with zeros when too short, cut at the end when too long.
    """
    # Collapse each word to a single id, then keep at most `length` of them.
    ids = [token[0] if len(token) == 1 else 0 for token in sent][:length]
    # Pre-pad with zeros; the multiplier is <= 0 when no padding is needed.
    return [0] * (length - len(ids)) + ids

### Let us experiment on sample sequence

In [None]:
# Encode the sample comment, pad it to 30 ids and show the result with its length.
x = Tc.texts_to_sequences(sent)
padded_sample = generate_seq(x, length=30)
print(padded_sample, len(padded_sample))

### Let us experiment on large sequence

In [None]:
# 51 single-id entries (0..50) truncated to 50: everything except 50 should remain.
seq = [[token_id] for token_id in range(51)]

print(generate_seq(seq, length=50))

### Let us build LSTM on top of data

#### Let us split the data to train and validation

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the labelled comments for validation (fixed seed for repeatability).
X_train, X_test, y_train, y_test = train_test_split(
    df_final["Text"],
    df_final["Target"],
    train_size=0.7,
    random_state=30,
)

In [None]:
def prepare_training(X_train,y_train,tokenizer=Tc,length=300):
    """Encode tokenised comments and one-hot encode their labels.

    Parameters
    ----------
    X_train : iterable of list of str
        One list of words per comment.
    y_train : array-like of int
        Binary targets; 0 maps to column 0, anything else to column 1.
    tokenizer : fitted tokenizer, default ``Tc``
        Object whose ``texts_to_sequences`` maps each word to a list of ids.
    length : int, default 300
        Fixed sequence length passed on to ``generate_seq``.

    Returns
    -------
    tuple of (numpy.ndarray, numpy.ndarray)
        The ``(n_samples, length)`` id matrix and the ``(n_samples, 2)``
        one-hot label matrix.
    """
    # Encode word by word, then pad/truncate every comment to `length` ids.
    sequences = [
        generate_seq(tokenizer.texts_to_sequences(words), length=length)
        for words in X_train
    ]
    features = np.asarray(sequences)

    # One-hot encode in a single indexed assignment instead of a Python loop.
    targets = np.asarray(y_train)
    label = np.zeros((targets.shape[0], 2))
    label[np.arange(targets.shape[0]), (targets != 0).astype(int)] = 1

    return features, label
    

In [None]:
# Encode both splits with the same fitted tokenizer.
train_x, train_y = prepare_training(X_train, y_train, Tc)
val_x, val_y = prepare_training(X_test, y_test, Tc)

### Build LSTM

In [None]:
# Vocabulary size: the largest word index plus one extra slot for id 0, which
# generate_seq uses for padding and unknown words.
num_words = max(Tc.word_index.values()) + 1
# Take the sequence length from the prepared data so the model's input shape
# always matches what is fed to fit() (it was hard-coded to 100 while the
# sequences are padded to a different length).
seq_len = train_x.shape[1]

model = Sequential()
# The embedding is randomly initialised (no pretrained weights are loaded), so
# it has to stay trainable; freezing it leaves the LSTM with random features.
model.add(Embedding(num_words, 200, input_length=seq_len))
# No input_shape here: it is only meaningful on the first layer of the model.
model.add(LSTM(256))
model.add(Dropout(0.1))
model.add(Dense(2, activation="softmax"))

model.compile(
    optimizer="adam",
    loss="categorical_crossentropy",
    metrics=["categorical_accuracy"],
)
model.summary()

In [None]:
# Train while tracking loss/accuracy on the held-out split after every epoch.
history = model.fit(
    train_x,
    train_y,
    validation_data=(val_x, val_y),
    batch_size=300,
    epochs=50,
)

### LSTM did not give good results
1. Vanishing and exploding gradient problem

In [None]:
# Rebuild the labelled frame from the untouched per-class frames so the
# classical-ML pipeline starts again from raw text rather than token lists.
df_final = pd.concat([df_toxic, df_normal], axis=0)
# Preview only the first rows instead of rendering the whole frame.
df_final.head()

In [None]:
# Training loss per epoch, as recorded by model.fit.
epoch_losses = history.history["loss"]
plt.plot(range(len(epoch_losses)), epoch_losses)
plt.xlabel("Epoch")
plt.ylabel("loss")

In [None]:
# Lower-case, then replace each run of punctuation (plus trailing whitespace)
# with a single space. The pattern is a raw string so the regex escapes
# (\. \? \$ \s) are not treated as invalid Python string escapes.
df_final["Text"] = df_final["Text"].str.lower()
puntuations = r"[\.\?\n!=#\$,;]+\s*"
df_final["Text"] = df_final["Text"].str.replace(puntuations, " ", regex=True)
df_final.head()

### Let us vectorize the text

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# 80/20 split of the cleaned text with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    df_final["Text"],
    df_final["Target"],
    train_size=0.8,
    random_state=30,
)

# Instantiate the vectorizer and learn its vocabulary from the training split only.
vc = TfidfVectorizer()
X_train = vc.fit_transform(X_train)

# Encode the held-out text with the vocabulary learned above.
x_test = vc.transform(X_test)

### Let us use a random forest classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV

In [None]:
# Depth-capped random forest with a fixed seed.
rf = RandomForestClassifier(max_depth=100, random_state=300)

# Fit on the TF-IDF training matrix; the fit result is bound to `model`.
model = rf.fit(X_train, y_train)



### Let us evaluate

In [None]:
from sklearn.metrics import confusion_matrix

In [None]:
# Confusion matrix on the training data (rows: true class, columns: predicted class).
y_train_pred = rf.predict(X_train)
cf = confusion_matrix(y_true=y_train, y_pred=y_train_pred)
cf

In [None]:
def metric(cf):
    """Print accuracy, recall and specificity derived from a confusion matrix.

    Parameters
    ----------
    cf : numpy.ndarray
        Matrix indexed as ``cf[row, col]``; row/column 1 is treated as the
        positive class and row/column 0 as the negative class.

    Returns
    -------
    None
        The three figures are printed, not returned.
    """
    true_neg, false_pos = cf[0,0], cf[0,1]
    false_neg, true_pos = cf[1,0], cf[1,1]

    acc = (true_neg + true_pos)/np.sum(cf)
    # Recall: share of row-1 samples that land in column 1.
    si = true_pos/(false_neg + true_pos)
    # Specificity: share of row-0 samples that land in column 0.
    spec = true_neg/(false_pos + true_neg)
    print(f"acc= {acc} recall= {si} specificity={spec}")

# Print accuracy / recall / specificity for the confusion matrix currently held in `cf`.
metric(cf)

In [None]:

# Reuse the TF-IDF matrix of the held-out text (`x_test`) that was computed
# right after fitting the vectorizer. Overwriting X_test with its own transform
# made this cell fail on a second run, because the vectorizer would then be
# handed a sparse matrix instead of raw strings.
y_test_pred = rf.predict(x_test)
cf = confusion_matrix(y_test, y_test_pred)
cf

In [None]:
# Print accuracy / recall / specificity for the confusion matrix currently held in `cf`.
metric(cf)

### The model is still overfitting

### Let us still run the model on the test set

In [None]:
# Load the comments that have to be scored and check how many there are.
test_csv = "../input/jigsaw-toxic-severity-rating/comments_to_score.csv"
df_test = pd.read_csv(test_csv)
df_test.shape

In [None]:
# Apply the same normalisation as the training text before vectorising.
df_test["text"] = df_test["text"].str.lower()
df_test["text"] = df_test["text"].str.replace(puntuations, " ", regex=True)
X_final = vc.transform(df_test["text"])

# Use the predicted probability of the toxic class (Target == 1) as the score:
# hard 0/1 labels collapse every comment onto two values and cannot rank severity.
toxic_col = list(rf.classes_).index(1)
y_final = rf.predict_proba(X_final)[:, toxic_col]

In [None]:
# Write an id/score table instead of the raw text plus an unnamed pandas index column.
# NOTE(review): assumes comments_to_score.csv provides a `comment_id` column and
# that the expected submission format is (comment_id, score) — confirm against
# the competition's sample submission.
df_submission = df_test[["comment_id"]].assign(score=y_final)

df_submission.to_csv("submission.csv", index=False)