In [2]:
import  numpy as np
import pandas as pd 
from tqdm import tqdm 
from sklearn.model_selection import train_test_split
import tensorflow as tf 
from keras.models import Sequential
from keras.layers import LSTM, GRU,SimpleRNN
from keras.layers.core import Dense, Activation, Dropout
from keras.layers import Embedding
from keras.layers import BatchNormalization
from keras.utils import np_utils
from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline
from keras.layers import GlobalMaxPooling1D, Conv1D, MaxPooling1D, Flatten, Bidirectional, SpatialDropout1D
from keras.preprocessing import text

from keras.callbacks import EarlyStopping
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from plotly import graph_objs as go
import plotly.express as px
import plotly.figure_factory as ff

  shapely_geos_version, geos_capi_version_string


In [3]:
# Directory holding the competition CSVs. Kept in one place so the notebook can
# be pointed at another copy of the data by editing a single line.
DATA_DIR='/kaggle/input/jigsaw-multilingual-toxic-comment-classification'

# Training, validation and test tables, each read into its own DataFrame.
train=pd.read_csv(f'{DATA_DIR}/jigsaw-toxic-comment-train.csv')
validation=pd.read_csv(f'{DATA_DIR}/validation.csv')
test=pd.read_csv(f'{DATA_DIR}/test.csv')

In [4]:
# (rows, columns) of each freshly loaded table, shown as the cell output.
(train.shape, validation.shape, test.shape)

((223549, 8), (8000, 4), (63812, 3))

In [5]:
# Only the `toxic` column is used as the target later in this notebook, so the
# remaining label columns are removed. Rebinding `train` (instead of
# inplace=True) together with errors='ignore' keeps the cell safe to re-run:
# a second execution is a no-op rather than a KeyError on columns that are
# already gone.
train=train.drop(columns=['severe_toxic','obscene','threat','insult','identity_hate'],errors='ignore')

In [6]:
# Keep only a leading subset of the training data (presumably to keep training
# time manageable -- confirm). `.loc` slices by label and includes the end
# label, so rows labelled 0 through 12000 are kept: 12001 rows, not 12000.
train = train.loc[:12000]
train.shape

(12001, 3)

In [7]:
# Largest number of whitespace-separated words in any single comment; used to
# pick a padding length. str() guards against non-string cells.
train['comment_text'].map(lambda comment: len(str(comment).split())).max()

1403

In [8]:
def roc_auc(predictions,target):
    """Return the area under the ROC curve for the given predictions.

    predictions: scores handed to ``metrics.roc_curve`` as the predicted values.
    target: true labels handed to ``metrics.roc_curve`` as the ground truth.

    The ROC points are computed with ``metrics.roc_curve`` and integrated with
    ``metrics.auc``; the thresholds it also returns are not needed here.
    """
    false_pos_rate, true_pos_rate, _ = metrics.roc_curve(target, predictions)
    return metrics.auc(false_pos_rate, true_pos_rate)

In [9]:
# Stratified 80/20 split of the comment texts and their `toxic` labels.
# Stratifying on the label keeps the class ratio equal in both parts, and the
# fixed random_state makes the shuffle repeatable.
xtrain, xvalid, ytrain, yvalid = train_test_split(
    train['comment_text'].values,
    train['toxic'].values,
    test_size=0.2,
    shuffle=True,
    random_state=42,
    stratify=train['toxic'].values,
)

In [10]:
# Sizes of the training and validation parts of the split.
(xtrain.shape, xvalid.shape)

((9600,), (2401,))

In [11]:
# Every sequence is later padded/truncated to this many ids. The word-count
# cell reports a maximum of 1403 whitespace-separated words per comment, so
# 1500 leaves some headroom.
max_len = 1500

# Keras text tokenizer with no cap on vocabulary size (num_words=None).
token = text.Tokenizer(num_words=None)

In [12]:
# Build the vocabulary from both splits together, then turn each comment into
# a list of integer word ids using that vocabulary.
token.fit_on_texts([*xtrain, *xvalid])
xtrainseq, xvalidseq = (token.texts_to_sequences(split) for split in (xtrain, xvalid))
len(xtrainseq), len(xvalidseq)

(9600, 2401)

In [13]:
from tensorflow.keras.preprocessing import sequence

# Word -> integer id mapping learned by the tokenizer; its size determines the
# embedding table size when the model is built.
word_index = token.word_index

# Bring every id sequence to exactly max_len entries so the splits become
# rectangular arrays. pad_sequences is called with its default padding and
# truncation modes.
xtrain_pad, xvalid_pad = (
    sequence.pad_sequences(seqs, maxlen=max_len) for seqs in (xtrainseq, xvalidseq)
)

In [14]:
# After padding, both arrays should be (n_samples, max_len).
(xtrain_pad.shape, xvalid_pad.shape)

((9600, 1500), (2401, 1500))

In [15]:

    # Baseline classifier: learned 300-dimensional word embeddings feed a
    # 100-unit SimpleRNN, whose final state goes to a single sigmoid unit that
    # outputs the toxicity score. The embedding table has len(word_index)+1
    # rows: one per vocabulary word plus one extra (the tokenizer's ids are
    # assumed to start at 1, leaving id 0 for padding -- confirm).
    model = Sequential([
        Embedding(len(word_index) + 1, 300, input_length=max_len),
        SimpleRNN(100),
        Dense(1, activation='sigmoid'),
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    model.summary()
    

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 1500, 300)         13049100  
                                                                 
 simple_rnn (SimpleRNN)      (None, 100)               40100     
                                                                 
 dense (Dense)               (None, 1)                 101       
                                                                 
Total params: 13,089,301
Trainable params: 13,089,301
Non-trainable params: 0
_________________________________________________________________


In [16]:
# Train for 5 passes over the padded training sequences in mini-batches of 64,
# with the fit call issued inside an explicit first-GPU device scope.
with tf.device('/device:GPU:0'):
    model.fit(x=xtrain_pad, y=ytrain, batch_size=64, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [18]:
# Model outputs for the validation part of the split.
scores=model.predict(xvalid_pad)
# roc_auc returns the area under the ROC curve (a value between 0 and 1), so it
# is reported as AUC rather than as an accuracy percentage.
print('AUC: %.4f'%roc_auc(scores,yvalid))


Accuracy:0.78%


In [20]:
# Running log of per-model validation results, started here with the
# SimpleRNN baseline's ROC AUC.
scores_model = [{'Model': 'SimpleRNN', 'AUCscore': roc_auc(scores, yvalid)}]

In [22]:
# Show the results collected so far (one dict per model).
print(scores_model)

[{'Model': 'SimpleRNN', 'AUCscore': 0.7821490664602491}]
