In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for folder, _, names in os.walk('/kaggle/input'):
    # Print the full path of every file found beneath the input directory.
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the Jigsaw toxic-comment training set from the zipped CSV in the Kaggle input directory.
df = pd.read_csv('../input/jigsaw-toxic-comment-classification-challenge/train.csv.zip')

In [None]:
# Preview the first rows of the training frame.
df.head()

In [None]:
# Column names, dtypes and non-null counts of the training frame.
df.info()

In [None]:
# Length, in characters, of the longest comment.
# `Series.max()` on a string column returns the lexicographically greatest
# comment, so `len(df.comment_text.max())` does not measure the longest one;
# compute the per-comment lengths first and take their maximum instead.
df.comment_text.str.len().max()

In [None]:
# Summary statistics of the training frame.
df.describe()

In [None]:
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 
from nltk.stem import PorterStemmer

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense,Input,GlobalMaxPooling1D
from keras.layers import Conv1D,MaxPooling1D,Embedding
from keras.models import Sequential
from sklearn.metrics import roc_auc_score


In [None]:
# Configuration shared by the tokenization, embedding and training cells.
max_sequence_length = 100  # length every tokenized comment is padded/truncated to (pad_sequences maxlen)
max_vocab_size = 20000  # vocabulary cap given to the Tokenizer and used to size the embedding matrix
embedding_dim = 100  # embedding width; also selects which glove.6B.<dim>d.txt file is read
validation_split = 0.2  # fraction of the data model.fit holds out for validation
batch_size = 128  # mini-batch size passed to model.fit
epoch = 10  # maximum number of epochs passed to model.fit (the EarlyStopping callback may end training sooner)

In [None]:
# Text normalizers. In the visible cells `ps` is only referenced by the
# commented-out preprocessing loop and `lem` is not referenced at all.
lem = WordNetLemmatizer()
ps = PorterStemmer()

In [None]:
# Number of rows (comments) in the training frame.
len(df)

In [None]:

#corpus = []
#for i in range(0,len(df)):
 #   review = re.sub('[^0-9a-zA-Z]',' ',df['comment_text'][i])
  #  review = review.lower()
   # review = review.split()
    
    #review = [ps.stem(word) for word in review if not word in stopwords.words('english')]
    #review = ' '.join(review)
    #corpus.append(review)
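# The loop above is very slow on the full dataset (it rebuilds the stopword list
# for every single word), so it is left disabled. Instead, the raw comments are
# tokenized directly and mapped onto the pre-trained GloVe vectors loaded below.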

In [None]:
# Load the pre-trained GloVe vectors (glove.6B.<embedding_dim>d.txt) into a
# word -> vector dictionary.
print('Loading word vectors...')
word2vec = {}
glove_path = '../input/glove6b/glove.6B.%sd.txt' % embedding_dim
# Read the file as UTF-8 explicitly so parsing does not depend on the locale's
# default encoding.
with open(glove_path, encoding='utf-8') as f:
    # Each line has the form: word vec[0] vec[1] vec[2] ...
    for line in f:
        values = line.split()
        if not values:
            # Skip blank lines instead of failing on values[0].
            continue
        word = values[0]
        vec = np.asarray(values[1:], dtype='float32')
        word2vec[word] = vec
# Report the total once, after the whole file has been read, rather than
# printing a line for every vector that is loaded.
print('Found %s word vectors,.' % len(word2vec))

In [None]:

# Count missing values per column.
df.isnull().sum()

In [None]:
# Prepare the text samples (only the comments are extracted in this cell).
print('loading in comments...')
sentences = df['comment_text'].values
# `sentences` holds the raw comments as an array; show the first one as a sanity check.
sentences[0]


In [None]:
# Check the container type that `.values` returned.
type(sentences)

In [None]:
# The six label columns to predict, and the matching label matrix taken from
# `df` (one column per label). The spelling `possible_lables` is kept as-is
# because the model cell references the name.
possible_lables = ['toxic','severe_toxic','obscene','threat','insult','identity_hate']
targets = df[possible_lables].values

In [None]:
# Length statistics of the raw comments. Lengths are `len()` of each entry of
# `sentences`, i.e. characters of the raw text, not tokens.
s = sorted(map(len, sentences))
print('max sequence length:', s[-1])
print('min sequence length:', s[0])
print('median sequence length', s[len(s) // 2])

In [None]:
# Convert the sentences into sequences of integer token ids.
# The tokenizer is capped at max_vocab_size words via num_words.
tokenizer = Tokenizer(num_words=max_vocab_size)
tokenizer.fit_on_texts(sentences)  # build the vocabulary from the raw comments
sequences = tokenizer.texts_to_sequences(sentences)  # integer-encode every comment

In [None]:
# Get the word -> integer mapping learned by the tokenizer.
# NOTE(review): this count may exceed max_vocab_size; the embedding-matrix cell
# guards for that by skipping indices >= max_vocab_size.
word2idx = tokenizer.word_index
print('Found %s unique tokens.' % len(word2idx))

In [None]:
# Pad the sequences to a common length so that we get an N x T matrix
# (T = max_sequence_length, passed as maxlen).
data = pad_sequences(sequences,maxlen=max_sequence_length)
print('shape of our data tensor', data.shape)

In [None]:
# Build the (num_words x embedding_dim) matrix of pre-trained vectors, one row
# per tokenizer index.
print('Filling pre-trained embeddings...')
num_words = min(max_vocab_size, len(word2idx) + 1)
embedding_matrix = np.zeros((num_words, embedding_dim))
for token, idx in word2idx.items():
    # Indices at or beyond the vocabulary cap have no row in the matrix.
    if idx >= max_vocab_size:
        continue
    vector = word2vec.get(token)
    # Words not found in the embedding index keep their all-zero row.
    if vector is None:
        continue
    embedding_matrix[idx] = vector

In [None]:
# Load the pre-trained word embeddings into an embedding layer.
# weights=[embedding_matrix] seeds the layer with the matrix built above and
# trainable=False keeps those vectors fixed during training.
embedding_layer = Embedding(num_words,
                           embedding_dim,
                           weights=[embedding_matrix],
                           input_length= max_sequence_length,
                           trainable = False)

print('Building Model...')

In [None]:
# 1D CNN over frozen pre-trained embeddings with one sigmoid output per label.
model = Sequential()

# Reuse the frozen embedding layer that was already built from
# `embedding_matrix` instead of constructing a second, identical copy here.
# (The previously created `input_` tensor was never used and has been dropped.)
model.add(embedding_layer)

# Two convolution + local max-pooling stages.
model.add(Conv1D(128, 3, activation='relu'))
model.add(MaxPooling1D(3))

model.add(Conv1D(128, 3, activation='relu'))
model.add(MaxPooling1D(3))

# A final convolution collapsed over the sequence axis by global max pooling.
model.add(Conv1D(128, 3, activation='relu'))
model.add(GlobalMaxPooling1D())

# One sigmoid unit per label column; together with binary cross-entropy each
# label is scored independently of the others.
model.add(Dense(len(possible_lables), activation='sigmoid'))

model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [None]:
# Print the layer-by-layer summary of the model.
model.summary()

In [None]:
from tensorflow.keras.callbacks import EarlyStopping
# Stop training once val_loss has failed to improve for 2 consecutive epochs.
# restore_best_weights=True rolls the model back to the weights of the best
# epoch; without it the model keeps the weights from the last (worse) epochs
# that triggered the stop.
early_stop = EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)

In [None]:
print('Training model')

# Train on the padded sequences. `validation_split` of the data is held out for
# validation and `early_stop` may end training before `epoch` epochs are done.
r = model.fit(
    x=data,
    y=targets,
    batch_size=batch_size,
    epochs=epoch,
    validation_split=validation_split,
    callbacks=[early_stop],
)

In [None]:
# Training history returned by model.fit, as a DataFrame.
model_losses = pd.DataFrame(r.history)

In [None]:
# Display the recorded training history.
model_losses

In [None]:
# Training vs. validation loss over the recorded history.
model_losses[['loss','val_loss']].plot()

In [None]:
# Training vs. validation accuracy over the recorded history.
model_losses[['accuracy','val_accuracy']].plot()

In [None]:
# Mean per-label ROC AUC of the trained model.
p = model.predict(data)
aucs = []
# Iterate over however many label columns `targets` has instead of a
# hard-coded 6, so the loop stays correct if the label list changes.
for j in range(targets.shape[1]):
    auc = roc_auc_score(targets[:, j], p[:, j])
    aucs.append(auc)
# NOTE: this figure is computed over all of `data`, which includes the rows the
# model was trained on, so it overstates how well the model generalises.
print(np.mean(aucs))

# Held-out score: Keras takes the validation split from the last
# `validation_split` fraction of the samples passed to fit, so score that tail
# separately.
split_at = int(len(data) * (1. - validation_split))
val_aucs = [
    roc_auc_score(targets[split_at:, j], p[split_at:, j])
    for j in range(targets.shape[1])
]
print('validation-only mean AUC:', np.mean(val_aucs))