In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below /kaggle/input.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib import style
# matplotlib 3.6 renamed the bundled seaborn styles to "seaborn-v0_8-*" and
# later releases dropped the old names, so use whichever spelling this
# installation provides and fall back to the default style otherwise.
style.use(
    next(
        (
            name
            for name in ("seaborn-v0_8-darkgrid", "seaborn-darkgrid")
            if name in style.available
        ),
        "default",
    )
)
import spacy

In [None]:
# Load validation_data.csv; later cells use its "worker", "less_toxic" and
# "more_toxic" columns.
data = pd.read_csv("../input/jigsaw-toxic-severity-rating/validation_data.csv")

In [None]:
# Preview the first rows of the freshly loaded frame.
data.head()

In [None]:
# Column dtypes, non-null counts and memory usage.
data.info()

In [None]:
# The "worker" column is not used by any later cell. Rebinding instead of
# dropping in place, with errors="ignore", keeps this cell safe to re-run:
# the original in-place drop raised KeyError on a second execution.
data = data.drop(columns="worker", errors="ignore")

In [None]:
# Scratch check: draw one uniform sample in [0, 1). The result is not stored.
np.random.rand()

In [None]:
# Start every pair with label 1.0 (float), one entry per row of `data`.
data["target"] = np.ones(len(data))

In [None]:
# Randomly swap the two comment columns for about half of the rows and set
# target = 0 on those rows; untouched rows keep target = 1.
#
# * A seeded generator makes the swap reproducible across kernel restarts.
# * Only rows still labelled 1 are eligible, so re-running this cell on its
#   own is a no-op instead of flipping rows back while leaving them at 0.
# * A boolean mask replaces the per-row `.loc[i, ...]` loop, so the code no
#   longer assumes the index is 0..n-1.
SWAP_SEED = 42
swap_rng = np.random.default_rng(SWAP_SEED)

pair_cols = ["less_toxic", "more_toxic"]
swap_mask = (swap_rng.random(len(data)) < 0.5) & data["target"].eq(1).to_numpy()

# .to_numpy() strips the column labels so the values are written positionally
# (more -> less, less -> more) rather than being re-aligned by column name.
data.loc[swap_mask, pair_cols] = data.loc[swap_mask, pair_cols[::-1]].to_numpy()
data.loc[swap_mask, "target"] = 0

In [None]:
# Bar chart of how many rows carry each target value.
fig, ax = plt.subplots(figsize=(18, 8))
sns.countplot(data=data, x="target", palette="winter", ax=ax)
plt.show()

In [None]:
# Labels are the "target" column; features are every other column.
y = data["target"]
x = data.drop(columns=["target"])

In [None]:
from nltk.stem import WordNetLemmatizer
 
# Single shared lemmatizer instance, used by the text-cleaning cell below.
lemmatizer = WordNetLemmatizer()

The text-cleaning code a couple of cells below is an excruciatingly ugly way to remove punctuation and certain stopwords.  
In general we would use nltk and regular expressions, but oh well, it works.

In [None]:
# Toy demonstration: strip every occurrence of "a" from a list in place.
l = ["a", 'b', "a", 'a', "k"]
for _ in range(l.count("a")):
    l.remove("a")
print(l)

In [None]:
stopwords = ['a','this','the','and','is','are', 'have', 'has', 'had', 'to']
_stopword_set = frozenset(stopwords)
# Characters removed from every comment: the double quote plus ? ! . , : ;
_strip_table = str.maketrans("", "", '"?!.,:;')


def _clean_comment(text):
    """Turn one raw comment into a list of lower-cased, lemmatized tokens.

    Steps: delete the punctuation characters listed in ``_strip_table``,
    lower-case, split on whitespace, drop stopwords, lemmatize, and discard
    any token that ends up empty.

    Fixes relative to the previous per-row loop:
    * both comment columns get the same treatment (the old ``less_toxic``
      branch stopped after the first punctuation type it found in a token,
      while the ``more_toxic`` branch removed all of them);
    * stopwords are matched after lower-casing, so "The" is removed like "the".

    A value that is already a list is returned unchanged, so the cell can be
    re-run without failing on tokenised rows.
    """
    if isinstance(text, list):
        return text
    words = text.translate(_strip_table).lower().split()
    lemmas = (lemmatizer.lemmatize(word) for word in words if word not in _stopword_set)
    return [lemma for lemma in lemmas if lemma]


# The first two columns of `x` hold the two comments of each pair. Assigning a
# whole column at once avoids list-valued `.iloc[i, j] = [...]` writes and the
# Python-level loop over every row.
for _col in x.columns[:2]:
    x[_col] = x[_col].map(_clean_comment)

In [None]:
# Scratch check: len() of a string counts characters (13 here), not words.
len("yes of course")

In [None]:
# Inspect the first rows after text cleaning.
x.head()

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 10% of the pairs for validation. A fixed random_state makes the
# split, and therefore every downstream metric, reproducible across runs.
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.1, random_state=42)

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# Fit an uncapped tokenizer on both training columns (first column's rows,
# then the second column's rows).
tok = Tokenizer()
tok.fit_on_texts([*xtrain.iloc[:, 0], *xtrain.iloc[:, 1]])

In [None]:
# Number of distinct tokens the tokenizer found in the training text.
len(tok.word_index)

In [None]:
# Refit with the vocabulary capped at 40000 entries and an explicit
# out-of-vocabulary token, again on both training columns.
tok = Tokenizer(oov_token="oov", num_words=40000)
train_corpus = xtrain.iloc[:, 0].tolist() + xtrain.iloc[:, 1].tolist()
tok.fit_on_texts(train_corpus)

In [None]:
# Replace the token lists in the first two columns of each split with integer
# id sequences from the fitted tokenizer.
#
# The splits are row subsets of `x`, so work on explicit copies rather than
# writing through `.iloc` (which can trigger pandas' chained-assignment
# warning). Each column is assigned as an object Series because the sequences
# have different lengths; writing a ragged list of lists through
# `.iloc[:, k] = ...` makes pandas/numpy try to interpret it as a 2-D array.
xtrain, xtest = xtrain.copy(), xtest.copy()
for _frame in (xtrain, xtest):
    for _col in _frame.columns[:2]:
        _frame[_col] = pd.Series(
            tok.texts_to_sequences(_frame[_col]),
            index=_frame.index,
            dtype=object,
        )

In [None]:
# (rows, columns) of the training split.
xtrain.shape

In [None]:
# Show the first training column after conversion to id sequences.
xtrain.iloc[:,0]

In [None]:
# Column names of the training split (used for label-based access below).
xtrain.columns

In [None]:
# Pad/truncate every id sequence to length 300, producing one array per
# comment column for the training split (xtr*) and the test split (xts*).
xtr1, xtr2 = (
    pad_sequences(sequences=xtrain[column], maxlen=300)
    for column in ("less_toxic", "more_toxic")
)
xts1, xts2 = (
    pad_sequences(sequences=xtest[column], maxlen=300)
    for column in ("less_toxic", "more_toxic")
)

In [None]:
# Shape of the padded first-comment array for the test split.
xts1.shape

In [None]:
import tensorflow as tf
from keras import layers, Model
from tensorflow.compat.v1.keras.layers import CuDNNGRU as gru
from tensorflow.compat.v1.keras.layers import CuDNNLSTM as lstm

In [None]:
def encoder(inp, d, reg, vocab_size=40000, embed_dim=256, rnn_units=128, dense_units=64):
    """Build one sentence-encoder branch on top of ``inp`` and return its output tensor.

    Architecture, in order: Embedding -> [Dropout, BatchNorm] -> bidirectional
    GRU (returning the full sequence) -> [Dropout, BatchNorm] -> bidirectional
    GRU (final state only) -> [Dropout, BatchNorm] -> Dense(relu) ->
    [Dropout, BatchNorm] -> Dense(relu).

    Every call creates new layer objects, so two branches built by calling
    this function twice do not share weights.

    Args:
        inp: Keras tensor of token ids to encode.
        d: Dropout rate applied before every BatchNormalization layer.
        reg: L1 coefficient used for the kernel, recurrent and activity
            regularizers of both GRU layers.
        vocab_size: Embedding input dimension. The default of 40000 equals the
            ``num_words`` cap given to the tokenizer in this notebook.
        embed_dim: Embedding output dimension.
        rnn_units: Units per direction in each GRU layer.
        dense_units: Width of the two trailing Dense layers.

    Returns:
        The output tensor of the final Dense layer.
    """

    def l1_penalty():
        # A fresh regularizer object per use, as in the original code.
        return tf.keras.regularizers.l1(reg)

    def drop_and_norm(tensor):
        # The Dropout -> BatchNormalization pair that precedes every main layer.
        tensor = layers.Dropout(d)(tensor)
        return layers.BatchNormalization()(tensor)

    def bidirectional_gru(tensor, return_sequences):
        # `gru` is the CuDNNGRU alias imported at the top of the notebook.
        return layers.Bidirectional(gru(rnn_units,
                                        return_sequences=return_sequences,
                                        kernel_regularizer=l1_penalty(),
                                        activity_regularizer=l1_penalty(),
                                        recurrent_regularizer=l1_penalty()))(tensor)

    x = layers.Embedding(vocab_size, embed_dim)(inp)
    x = bidirectional_gru(drop_and_norm(x), return_sequences=True)
    x = bidirectional_gru(drop_and_norm(x), return_sequences=False)
    x = layers.Dense(dense_units, activation="relu")(drop_and_norm(x))
    x = layers.Dense(dense_units, activation="relu")(drop_and_norm(x))
    return x

In [None]:
def build_model(d, reg, seq_len=300, dense_units=64):
    """Build the two-input pair classifier.

    Each input is passed through its own ``encoder`` branch. The two encodings
    are concatenated and fed through [Dropout, BatchNorm] -> Dense(relu) ->
    [Dropout, BatchNorm] -> Dense(1, sigmoid).

    Args:
        d: Dropout rate, used in both encoder branches and in the head.
        reg: L1 regularization coefficient forwarded to ``encoder``.
        seq_len: Length of each input sequence. It must equal the ``maxlen``
            used when padding the inputs (300 in this notebook).
        dense_units: Width of the hidden Dense layer in the head.

    Returns:
        An uncompiled Keras ``Model`` taking ``[sentence_1, sentence_2]`` and
        producing a single sigmoid output per pair.
    """
    sen1 = layers.Input(shape=(seq_len,))
    sen2 = layers.Input(shape=(seq_len,))

    # Two independent encoder branches, one per sentence.
    enc1 = encoder(sen1, d, reg)
    enc2 = encoder(sen2, d, reg)

    enc = layers.concatenate([enc1, enc2])
    enc = layers.Dropout(d)(enc)
    enc = layers.BatchNormalization()(enc)
    enc = layers.Dense(dense_units, activation="relu")(enc)
    enc = layers.Dropout(d)(enc)
    enc = layers.BatchNormalization()(enc)
    out = layers.Dense(1, activation="sigmoid")(enc)
    return Model(inputs=[sen1, sen2], outputs=out)

In [None]:
# Instantiate the network with dropout rate 0.2 and L1 coefficient 0.01.
model = build_model(d=0.2, reg=0.01)

In [None]:
# Import plot_model from the public tf.keras utilities. The private
# `keras.utils.vis_utils` module path is not available in newer Keras releases.
from tensorflow.keras.utils import plot_model

In [None]:
# Render the layer graph to ./model.png, without tensor shapes or layer names.
plot_model(
    model,
    show_layer_names=False,
    show_shapes=False,
    to_file='./model.png',
)

In [None]:
# Print the layer-by-layer summary of the model.
model.summary()

In [None]:
# Binary classification setup: Adam optimizer, cross-entropy loss, accuracy metric.
model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

In [None]:
# Train for 20 epochs on the padded training pairs, evaluating on the held-out
# pairs after every epoch.
model.fit(
    x=[xtr1, xtr2],
    y=ytrain,
    epochs=20,
    batch_size=32,
    validation_data=([xts1, xts2], ytest),
)