# Load packages

In [1]:
import numpy as np
import pandas as pd
import sklearn
import zipfile
import string
import re
import nltk

import tensorflow as tf

# Load data

In [2]:
with zipfile.ZipFile("../input/jigsaw-toxic-comment-classification-challenge/train.csv.zip","r") as zip_ref:
    zip_ref.extractall("./")

df = pd.read_csv("train.csv")
df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


# Few preprocessing
A few preprocessing before tokenization. Tensorflow Keras' built-in tokenizer already include some preprocessing, here we do some preprocessing that are not included but suitable for a deep learning model approach.
## Contractions

In [3]:
!pip install contractions

Collecting contractions
  Downloading contractions-0.0.43-py2.py3-none-any.whl (6.0 kB)
Collecting textsearch
  Downloading textsearch-0.0.17-py2.py3-none-any.whl (7.5 kB)
Collecting pyahocorasick
  Downloading pyahocorasick-1.4.0.tar.gz (312 kB)
[K     |████████████████████████████████| 312 kB 6.2 MB/s eta 0:00:01
[?25hBuilding wheels for collected packages: pyahocorasick
  Building wheel for pyahocorasick (setup.py) ... [?25ldone
[?25h  Created wheel for pyahocorasick: filename=pyahocorasick-1.4.0-cp37-cp37m-linux_x86_64.whl size=99057 sha256=23753ea8e40036ff7fab63e7406dcc7477e209dbd76931113e9eaf1d82229c5a
  Stored in directory: /root/.cache/pip/wheels/9b/6b/f7/62dc8caf183b125107209c014e78c340a0b4b7b392c23c2db4
Successfully built pyahocorasick
Installing collected packages: pyahocorasick, textsearch, contractions
Successfully installed contractions-0.0.43 pyahocorasick-1.4.0 textsearch-0.0.17


In [4]:
import contractions

df["text_clean"] = df["comment_text"].apply(lambda x: contractions.fix(x))

## Remove URL and HTTP tags

In [5]:
def remove_URL(text):
    """
        Remove URLs from a sample string
    """
    return re.sub(r"https?://\S+|www\.\S+", "", text)


def remove_html(text):
    """
        Remove the html in sample text
    """
    html = re.compile(r"<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});")
    return re.sub(html, "", text)

df["text_clean"] = df["text_clean"].apply(lambda x: remove_URL(x))
df["text_clean"] = df["text_clean"].apply(lambda x: remove_html(x))

## Remove Non-ASCI

In [6]:
def remove_non_ascii(text):
    """
        Remove non-ASCII characters 
    """
    return re.sub(r'[^\x00-\x7f]',r'', text)

df["text_clean"] = df["text_clean"].apply(lambda x: remove_non_ascii(x))

## Remove special characters

In [7]:
def remove_special_characters(text):
    """
        Remove special special characters, including symbols, emojis, and other graphic characters
    """
    emoji_pattern = re.compile(
        '['
        u'\U0001F600-\U0001F64F'  # emoticons
        u'\U0001F300-\U0001F5FF'  # symbols & pictographs
        u'\U0001F680-\U0001F6FF'  # transport & map symbols
        u'\U0001F1E0-\U0001F1FF'  # flags (iOS)
        u'\U00002702-\U000027B0'
        u'\U000024C2-\U0001F251'
        ']+',
        flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

df["text_clean"] = df["text_clean"].apply(lambda x: remove_special_characters(x))

# Split and store data

In [8]:
label_col = list(df.columns[2:8])
x = df['text_clean'].values
y = df[label_col].values

from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.2,random_state=42)

# Tokenization

In [9]:
from tensorflow.keras.preprocessing.text import Tokenizer
vocab_size = 20000

tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(list(x))

In [10]:
len(tokenizer.word_index)

192394

In [None]:
tokenizer.word_index

**A note here is although our vocab_size is set to 20000, the tokenizer still keeps 192395 in the word_index.
word_index is computed the same way no matter how many most frequent words you will use later. So when you call any transformative method - Tokenizer will use only the vocab_size most common words and at the same time, it will keep the counter of all words - even when it's obvious that it will not use it later.**

In [11]:
x_train = tokenizer.texts_to_sequences(x_train)
x_test = tokenizer.texts_to_sequences(x_test)

# Add padding

In [12]:
from keras.preprocessing.sequence import pad_sequences

maxlen = 150
x_train = pad_sequences(x_train, padding='post', maxlen=maxlen)
x_test = pad_sequences(x_test, padding='post', maxlen=maxlen)

In [13]:
x[0]

'Explanation\nWhy the edits made under my username Hardcore Metallica Fan were reverted? They were not vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please do not remove the template from the talk page since I am retired now.89.205.38.27'

In [14]:
example = tokenizer.texts_to_sequences([x[0]])
example = pad_sequences(example, padding='post', maxlen=maxlen)
example

array([[  661,    75,     1,   123,   127,   172,    29,   644,  4448,
        11794,  1068,    83,   313,    48,    83,    10, 11223,    53,
         6775,    15,    60,  2699,   144,     3,  2863,    36,   114,
         1176, 15683,  2756,     5,    47,    20,    10,   232,     1,
          349,    32,     1,    40,    28,   140,     3,    37,  3372,
           87,  3003,  4523,  2235,   997,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
      

In [15]:
tokenizer.sequences_to_texts(example)

['explanation why the edits made under my username hardcore metallica fan were reverted they were not vandalisms just closure on some gas after i voted at new york dolls fac and please do not remove the template from the talk page since i am retired now 89 205 38 27']

# Load pretrained embedding

In [16]:
embeddings_dictionary = dict()

glove_file = open('../input/glove6b100dtxt/glove.6B.100d.txt', encoding="utf8")

for line in glove_file:
    records = line.split()
    word = records[0]
    vector_dimensions = np.asarray(records[1:], dtype='float32')
    embeddings_dictionary[word] = vector_dimensions
glove_file.close()



def create_glove(word_index,embeddings_index):
    emb_mean,emb_std = -0.005838499,0.48782197
    all_embs = np.stack(embeddings_index.values())
    embed_size = all_embs.shape[1]
    nb_words = min(vocab_size, len(word_index))
    embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
    count_found = nb_words
    for word, i in word_index.items():
        if i >= vocab_size: continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] =  embedding_vector
        else:
                count_found-=1
    print("Got embedding for ",count_found," words.")
    return embedding_matrix


In [17]:
embedding_matrix = create_glove(tokenizer.word_index,embeddings_dictionary)

  exec(code_obj, self.user_global_ns, self.user_ns)


Got embedding for  18783  words.


In [18]:
all(embedding_matrix[1] == embeddings_dictionary['the'])

True

In [19]:
all(embedding_matrix[tokenizer.word_index['which']] == embeddings_dictionary['which'])

True

# Model building with pretrained Glove embeddings

In [20]:
from tensorflow.keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from tensorflow.keras.layers import Bidirectional, GlobalMaxPool1D
from tensorflow.keras.models import Model
from tensorflow.keras import initializers, regularizers, constraints, optimizers, layers
import tensorflow_addons as tfa

def model_add():
    inputs = Input(shape=(maxlen, ))
    x = Embedding(vocab_size, 100, weights=[embedding_matrix], trainable=False)(inputs)
    x = Bidirectional(LSTM(50))(x)
    x = Dropout(0.1)(x)
    x = Dense(50, activation="relu")(x)
    x = Dropout(0.1)(x)
    outputs = Dense(6, activation="sigmoid")(x)
    model = Model(inputs=inputs, outputs=outputs)
    f1 = tfa.metrics.F1Score(num_classes=1, average='micro',threshold=0.5)
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['acc',f1])
    return model
model_w_glove = model_add()
print(model_w_glove.summary())

Model: "functional_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 150)]             0         
_________________________________________________________________
embedding (Embedding)        (None, 150, 100)          2000000   
_________________________________________________________________
bidirectional (Bidirectional (None, 100)               60400     
_________________________________________________________________
dropout (Dropout)            (None, 100)               0         
_________________________________________________________________
dense (Dense)                (None, 50)                5050      
_________________________________________________________________
dropout_1 (Dropout)          (None, 50)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 6)                

In [21]:
from keras.callbacks import EarlyStopping, ModelCheckpoint
es = EarlyStopping(monitor="val_loss", mode="min", patience=20)
history = model_w_glove.fit(x_train, y_train, batch_size=32, epochs=5, validation_data=(x_test,y_test), callbacks=es)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [22]:
y_pred2 = model_w_glove.predict(x_test)
y_pred2 = y_pred2 > 0.5
from sklearn.metrics import f1_score
f1_score(y_test, y_pred2, average='micro')

0.7380073800738006