[View in Colaboratory](https://colab.research.google.com/github/shubhamitradas/Toxicity-Challenge/blob/master/YoonKim_Glove_Resnet_POSTag.ipynb)

In [2]:
import os
import re
import csv
import codecs
import numpy as np
import pandas as pd

from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from string import punctuation

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM,Bidirectional, Embedding, Dropout, Activation,GlobalMaxPool1D
from keras.layers.merge import concatenate
from keras.models import Model
from keras.layers.normalization import BatchNormalization
from keras.callbacks import EarlyStopping, ModelCheckpoint

import sys
from keras import backend as K
from keras.engine.topology import Layer
from keras import initializers, regularizers, constraints


#This code below for the Attention implementation borrowed from various sources.
class Attention(Layer):
    def __init__(self, step_dim,
                 W_regularizer=None, b_regularizer=None,
                 W_constraint=None, b_constraint=None,
                 bias=True, **kwargs):
        """
        Keras Layer that implements an Attention mechanism for temporal data.
        Supports Masking.
        Follows the work of Raffel et al. [https://arxiv.org/abs/1512.08756]
        # Input shape
            3D tensor with shape: `(samples, steps, features)`.
        # Output shape
            2D tensor with shape: `(samples, features)`.
        :param kwargs:
        Just put it on top of an RNN Layer (GRU/LSTM/SimpleRNN) with return_sequences=True.
        The dimensions are inferred based on the output shape of the RNN.
        Example:
            model.add(LSTM(64, return_sequences=True))
            model.add(Attention())
        """
        self.supports_masking = True
        #self.init = initializations.get('glorot_uniform')
        self.init = initializers.get('glorot_uniform')

        self.W_regularizer = regularizers.get(W_regularizer)
        self.b_regularizer = regularizers.get(b_regularizer)

        self.W_constraint = constraints.get(W_constraint)
        self.b_constraint = constraints.get(b_constraint)

        self.bias = bias
        self.step_dim = step_dim
        self.features_dim = 0
        super(Attention, self).__init__(**kwargs)

    def build(self, input_shape):
        assert len(input_shape) == 3

        self.W = self.add_weight((input_shape[-1],),
                                 initializer=self.init,
                                 name='{}_W'.format(self.name),
                                 regularizer=self.W_regularizer,
                                 constraint=self.W_constraint)
        self.features_dim = input_shape[-1]

        if self.bias:
            self.b = self.add_weight((input_shape[1],),
                                     initializer='zero',
                                     name='{}_b'.format(self.name),
                                     regularizer=self.b_regularizer,
                                     constraint=self.b_constraint)
        else:
            self.b = None

        self.built = True

    def compute_mask(self, input, input_mask=None):
        # do not pass the mask to the next layers
        return None

    def call(self, x, mask=None):
        # eij = K.dot(x, self.W) TF backend doesn't support it

        # features_dim = self.W.shape[0]
        # step_dim = x._keras_shape[1]

        features_dim = self.features_dim
        step_dim = self.step_dim

        eij = K.reshape(K.dot(K.reshape(x, (-1, features_dim)), K.reshape(self.W, (features_dim, 1))), (-1, step_dim))

        if self.bias:
            eij += self.b

        eij = K.tanh(eij)

        a = K.exp(eij)

        # apply mask after the exp. will be re-normalized next
        if mask is not None:
            # Cast the mask to floatX to avoid float64 upcasting in theano
            a *= K.cast(mask, K.floatx())

        # in some cases especially in the early stages of training the sum may be almost zero
        a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())

        a = K.expand_dims(a)
        weighted_input = x * a
    #print weigthted_input.shape
        return K.sum(weighted_input, axis=1)

    def compute_output_shape(self, input_shape):
        #return input_shape[0], input_shape[-1]
        return input_shape[0],  self.features_dim
        






Using TensorFlow backend.


In [3]:
EMBEDDING_FILE_RES  = 'numberbatch-en-17.06.txt' 
EMBEDDING_FILE  = 'glove.840B.300d.txt'

TRAIN_DATA_FILE = 'train_toxic.csv'
TEST_DATA_FILE  = 'test_toxic.csv'

MAX_SEQUENCE_LENGTH = 100
MAX_NB_WORDS = 100000
EMBEDDING_DIM = 300
VALIDATION_SPLIT = 0.1

num_lstm = 100
num_dense = 210
rate_drop_lstm = 0.25
rate_drop_dense = 0.25
act = 'relu'

########################################
## index word vectors,Globe and Resnet
########################################
print('Indexing word vectors')

#Glove Vectors
embeddings_index = {}
f = open(EMBEDDING_FILE)
for line in f:
    values = line.split()
    word = ' '.join(values[:-300])
    coefs = np.asarray(values[-300:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

#Resnet Vectors
embeddings_index_res = {}
f = open(EMBEDDING_FILE_RES)
for line in f:
    values = line.split()
    word = ' '.join(values[:-300])
    coefs = np.asarray(values[-300:], dtype='float32')
    embeddings_index_res[word] = coefs
f.close()


print('Total %s word vectors for Glove.' % len(embeddings_index))
print('Total %s word vectors for Resnet.' % len(embeddings_index_res))

Indexing word vectors
Total 2195896 word vectors for Glove.
Total 417195 word vectors for Resnet.


Here we include both the pre-trained word vectors from Glove and Resnet and concatenate them.
The last column of the embedding matrix  also includes the POS Tag information from NLTK,just trying to make the embedding layer more feature rich.

In [7]:
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
import nltk

########################################
## process texts in datasets
########################################
print('Processing text dataset')
train_df = pd.read_csv(TRAIN_DATA_FILE)
test_df = pd.read_csv(TEST_DATA_FILE)


#Regex to remove all Non-Alpha Numeric and space
special_character_removal=re.compile(r'[^a-z\d ]',re.IGNORECASE)

#regex to replace all numerics
replace_numbers=re.compile(r'\d+',re.IGNORECASE)

def text_to_wordlist(text, remove_stopwords=False, stem_words=False):
    # Clean the text, with the option to remove stopwords and to stem words.
    
    # Convert words to lower case and split them
    #print("Raw text ",text)
    text = text.lower().split()

    # Optionally, remove stop words
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        text = [w for w in text if not w in stops]
    
    text = " ".join(text)
    
    #Remove Special Characters
    text=special_character_removal.sub('',text)
    
    #Replace Numbers
    text=replace_numbers.sub('n',text)

    # Optionally, shorten words to their stems
    if stem_words:
        text = text.split()
        stemmer = SnowballStemmer('english')
        stemmed_words = [stemmer.stem(word) for word in text]
        text = " ".join(stemmed_words)
    
    # Return a list of words
    #print("Processed text ",text)
    return(text)


list_sentences_train = train_df["comment_text"].fillna("NA").values
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
y = train_df[list_classes].values
list_sentences_test = test_df["comment_text"].fillna("NA").values


comments = []
for text in list_sentences_train:
    comments.append(text_to_wordlist(text))
    
print("Comments :",comments)    

    
test_comments=[]
for text in list_sentences_test:
    test_comments.append(text_to_wordlist(text))


tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(comments + test_comments)


sequences = tokenizer.texts_to_sequences(comments)
test_sequences = tokenizer.texts_to_sequences(test_comments)

print(sequences)
#print(test_comments)

word_index = tokenizer.word_index
print('Found %s unique tokens' % len(word_index))

data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', y.shape)

test_data = pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of test_data tensor:', test_data.shape)



pos_code_map={'CC':1,'CD':2,'DT':3,'EX':4,'FW':5,'IN':6,'JJ':7,'JJR':8,'JJS':9,'LS':10,'MD':11,'NN':12,'NNS':13,
'NNP':14,'NNPS':15,'PDT':16,'POS':17,'PRP':18,'PRP$':19,'RB':20,'RBR':21,'RBS':22,'RP':23,'SYM':24,'TO':25,'UH':26,
'VB':27,'VBD':28,'VBG':29,'VBN':30,'VBP':31,'VBZ':32,'WDT':33,'WP':34,'WP$':35,'WRB':39,'?':40,'UNK':41}

code_pos_map = {v: k for k, v in  pos_code_map.items()}

def convert(tag):
    try:
        code=pos_code_map[tag]
    except:
        code='UNK'
            
    return code

########################################
## prepare embeddings
########################################
print('Preparing embedding matrix')
nb_words = min(MAX_NB_WORDS, len(word_index))
embedding_matrix = np.zeros((nb_words, EMBEDDING_DIM+EMBEDDING_DIM+1))
for word, i in word_index.items():
    if i >= MAX_NB_WORDS:
         continue     
    embedding_vector = embeddings_index.get(word)
    embedding_vector_res = embeddings_index_res.get(word)
    tag = nltk.tag.pos_tag([word])
    tag_no = convert(tag[0][1])
    if embedding_vector  is not None:
        if embedding_vector_res is not None :
            embedding_vector = np.append(embedding_vector,embedding_vector_res)
            embedding_vector = np.append(embedding_vector,int(tag_no))
            # words not found in embedding index will be all-zeros.
            embedding_matrix[i] = embedding_vector
   

    
    


print('Null word embeddings: %d' % np.sum(np.sum(embedding_matrix, axis=1) == 0))



Processing text dataset
Comments : 

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Shape of data tensor: (159571, 100)
Shape of label tensor: (159571, 6)
Shape of test_data tensor: (153164, 100)
Preparing embedding matrix
Null word embeddings: 41407


In [0]:
train_df["comment_text"].head()

0    Explanation\nWhy the edits made under my usern...
1    D'aww! He matches this background colour I'm s...
2    Hey man, I'm really not trying to edit war. It...
3    "\nMore\nI can't make any real suggestions on ...
4    You, sir, are my hero. Any chance you remember...
Name: comment_text, dtype: object

In [11]:
from keras.layers import Dense, Input, CuDNNLSTM,CuDNNGRU,Bidirectional, Embedding,ActivityRegularization
from sklearn.metrics import roc_auc_score
from keras.callbacks import Callback
from sklearn.model_selection import train_test_split
import tensorflow as tf
from keras.callbacks import LearningRateScheduler
import keras.backend as K


VALIDATION_SPLIT = 0.1

########################################
## sample train/validation data
########################################
np.random.seed(1234)
perm = np.random.permutation(len(data))
idx_train = perm[:int(len(data)*(1-VALIDATION_SPLIT))]
idx_val = perm[int(len(data)*(1-VALIDATION_SPLIT)):]

data_train=data[idx_train]
labels_train=y[idx_train]
print(data_train.shape,labels_train.shape)

data_val=data[idx_val]
labels_val=y[idx_val]

print(data_val.shape,labels_val.shape)

  
def swish(x):
    return (K.sigmoid(x) * x)
  

def selu(x):
    alpha = 1.6732632423543772848170429916717
    scale = 1.0507009873554804934193349852946
    return scale*tf.where(x>=0.0, x, alpha*tf.nn.elu(x))
  
act = 'relu'  




class RocAucEvaluation(Callback):
    def __init__(self, validation_data=(), interval=1):
        super(Callback, self).__init__()

        self.interval = interval
        self.X_val, self.y_val = validation_data

    def on_epoch_end(self, epoch, logs={}):
        if epoch % self.interval == 0:
            y_pred = self.model.predict(self.X_val, verbose=0)
            score = roc_auc_score(self.y_val, y_pred)
            print("\n ROC-AUC - epoch: {:d} - score: {:.6f}".format(epoch, score))
            
            
ra_val = RocAucEvaluation(validation_data=(data_val, labels_val), interval=1)



def scheduler(epoch):
    if  epoch > 7:
        lr = K.get_value(model.optimizer.lr)
        K.set_value(model.optimizer.lr, lr*.9)
        print("lr changed to {}".format(lr*.9))
    return K.get_value(model.optimizer.lr)

lr_decay = LearningRateScheduler(scheduler)

lrate = 0.0005
epochs = 10
num_dense = 210
num_lstm = 70
decay = lrate/epochs
#sgd = SGD(lr=lrate, momentum=0.90, decay=decay, nesterov=False)
#rmsprop = RMSprop(lr=0.0001, rho=0.9, epsilon=None, decay=0.0)




(143613, 100) (143613, 6)
(15958, 100) (15958, 6)


In [12]:

###################### Model  ###################################


from keras.optimizers import RMSprop,SGD,Adam,Nadam
from keras.layers import Dense, Input, LSTM,Bidirectional, Embedding, Dropout, Activation,GlobalMaxPooling1D,SpatialDropout1D,GlobalAveragePooling1D,Concatenate



########################################
## define the model structure
########################################
embedding_layer = Embedding(nb_words,
        EMBEDDING_DIM+EMBEDDING_DIM+1,
        weights=[embedding_matrix],
        input_length=MAX_SEQUENCE_LENGTH,
        trainable=False)

########yoon kim###############################
from keras.layers import Embedding, Input, BatchNormalization, SpatialDropout1D, Conv1D
from keras.layers import Dense, GlobalMaxPooling1D
conv_filters = 128
# Specify each convolution layer and their kernel siz i.e. n-grams 
comment_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
emb= embedding_layer(comment_input)
conv1_1 = Conv1D(filters=conv_filters, kernel_size=3)(emb)
btch1_1 = BatchNormalization()(conv1_1)
drp1_1  = Dropout(0.2)(btch1_1)
actv1_1 = Activation(act)(drp1_1)
glmp1_1 = GlobalMaxPooling1D()(actv1_1)

conv1_2 = Conv1D(filters=conv_filters, kernel_size=4)(emb)
btch1_2 = BatchNormalization()(conv1_2)
drp1_2  = Dropout(0.2)(btch1_2)
actv1_2 = Activation(act)(drp1_2)
glmp1_2 = GlobalMaxPooling1D()(actv1_2)

conv1_3 = Conv1D(filters=conv_filters, kernel_size=5)(emb)
btch1_3 = BatchNormalization()(conv1_3)
drp1_3  = Dropout(0.2)(btch1_3)
actv1_3 = Activation(act)(drp1_3)
glmp1_3 = GlobalMaxPooling1D()(actv1_3)

conv1_4 = Conv1D(filters=conv_filters, kernel_size=6)(emb)
btch1_4 = BatchNormalization()(conv1_4)
drp1_4  = Dropout(0.2)(btch1_4)
actv1_4 = Activation(act)(drp1_4)
glmp1_4 = GlobalMaxPooling1D()(actv1_4)

# Gather all convolution layers
cnct = concatenate([glmp1_1, glmp1_2, glmp1_3, glmp1_4], axis=1)
drp1 = Dropout(0.2)(cnct)

dns1  = Dense(32, activation=act)(drp1)
btch1 = BatchNormalization()(dns1)
drp2  = Dropout(0.2)(btch1)

output_layer = Dense(6, activation="sigmoid")(drp2)
model = Model(inputs=comment_input, outputs=output_layer)
model.compile(loss='binary_crossentropy',
                  optimizer=Adam(),#(clipvalue=1, clipnorm=1),
                  metrics=['accuracy'])

#########################################LearningRateScheduler#######



print(model.summary())


early_stopping =EarlyStopping(monitor='acc', patience=3)
bst_model_path = "relu" + '.h5'
model_checkpoint = ModelCheckpoint(bst_model_path, save_best_only=True, save_weights_only=True)
#model.load_weights(bst_model_path)
hist = model.fit(data_train, labels_train, \
        validation_data=(data_val, labels_val), \
        epochs=epochs, batch_size=256,verbose=2, shuffle=True, \
         callbacks=[model_checkpoint,lr_decay])
         
model.load_weights(bst_model_path)
bst_val_score = min(hist.history['val_loss'])

#######################################
## make the submission
########################################
print('Start making the submission before fine-tuning')

from sklearn import metrics
valid_preds = model.predict(data_val, verbose=0)
roc = metrics.roc_auc_score(labels_val, valid_preds)
print("ROC:", roc)

STAMP = 'glove_resnet_postag_concat_%.4f'%(roc)

y_test = model.predict([test_data], batch_size=1024, verbose=2)

sample_submission = pd.read_csv("sample_toxic.csv")
sample_submission[list_classes] = y_test

sample_submission.to_csv('%.4f_'%(bst_val_score)+STAMP+'.csv', index=False)




__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            (None, 100)          0                                            
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 100, 601)     60100000    input_2[0][0]                    
__________________________________________________________________________________________________
conv1d_5 (Conv1D)               (None, 98, 128)      230912      embedding_2[0][0]                
__________________________________________________________________________________________________
conv1d_6 (Conv1D)               (None, 97, 128)      307840      embedding_2[0][0]                
__________________________________________________________________________________________________
conv1d_7 (

Epoch 8/10
 - 77s - loss: 0.0411 - acc: 0.9842 - val_loss: 0.0487 - val_acc: 0.9818
Epoch 9/10
lr changed to 0.0009000000427477062
 - 77s - loss: 0.0387 - acc: 0.9851 - val_loss: 0.0455 - val_acc: 0.9829
Epoch 10/10
lr changed to 0.0008100000384729356
 - 77s - loss: 0.0370 - acc: 0.9858 - val_loss: 0.0479 - val_acc: 0.9825
Start making the submission before fine-tuning
ROC: 0.9861232269195801


In [0]:
#########################################LearningRateScheduler#######
from keras.callbacks import LearningRateScheduler
import keras.backend as K

epochs = 10

def scheduler(epoch):
    if  epoch > 7:
        lr = K.get_value(model_attn.optimizer.lr)
        K.set_value(model_attn.optimizer.lr, lr*.9)
        print("lr changed to {}".format(lr*.9))
    return K.get_value(model_attn.optimizer.lr)

lr_decay = LearningRateScheduler(scheduler)



###################### Attention Model  ###################################

from keras.optimizers import RMSprop,SGD,Adam,Nadam
from keras.layers import Bidirectional, Dropout, CuDNNGRU
recurrent_units = 84

comment_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences= embedding_layer(comment_input)
x = Bidirectional(CuDNNGRU(recurrent_units, return_sequences=True))(embedded_sequences)
x = Attention(MAX_SEQUENCE_LENGTH)(embedded_sequences)
x = Dropout(0.2)(x)
x = Dense(64, activation=act)(x)
x = Dropout(0.1)(x)
x = Dense(32, activation=act)(x)
output_layer = Dense(6, activation="sigmoid")(x)
model_attn = Model(inputs=comment_input, outputs=output_layer)


model_attn.compile(loss='binary_crossentropy',
                  optimizer=Adam(lr=0.0005),#(clipvalue=1, clipnorm=1),
                  metrics=['accuracy'])

print(model_attn.summary())


early_stopping =EarlyStopping(monitor='acc', patience=3)
bst_model_path = "relu" + '.h5'
model_checkpoint = ModelCheckpoint(bst_model_path, save_best_only=True, save_weights_only=True)
#model.load_weights(bst_model_path)
hist = model_attn.fit(data_train, labels_train, \
        validation_data=(data_val, labels_val), \
        epochs=epochs, batch_size=256,verbose=2, shuffle=True, \
         callbacks=[model_checkpoint,lr_decay])
         
model_attn.load_weights(bst_model_path)
bst_val_score = min(hist.history['val_loss'])

#######################################
## make the submission
########################################
print('Start making the submission before fine-tuning')

from sklearn import metrics
valid_preds = model_attn.predict(data_val, verbose=0)
roc = metrics.roc_auc_score(labels_val, valid_preds)
print("ROC:", roc)

STAMP = 'glove_resnet_postag_attn_%.4f'%(roc)

y_test = model_attn.predict([test_data], batch_size=1024, verbose=2)

sample_submission = pd.read_csv("sample_toxic.csv")
sample_submission[list_classes] = y_test

sample_submission.to_csv('%.4f_'%(bst_val_score)+STAMP+'.csv', index=False)


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_9 (InputLayer)         (None, 100)               0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 100, 601)          60100000  
_________________________________________________________________
attention_7 (Attention)      (None, 601)               701       
_________________________________________________________________
dropout_11 (Dropout)         (None, 601)               0         
_________________________________________________________________
dense_12 (Dense)             (None, 64)                38528     
_________________________________________________________________
dropout_12 (Dropout)         (None, 64)                0         
_________________________________________________________________
dense_13 (Dense)             (None, 32)                2080      
__________

In [13]:
!ls -lrt

total 6829700
drwxr-xr-x 1 root root       4096 Jun  4 04:03 datalab
-rw-r--r-- 1 root root    6279782 Jun  4 04:03 sample_toxic.csv
-rw-r--r-- 1 root root   68802655 Jun  4 04:03 train_toxic.csv
-rw-r--r-- 1 root root   60354593 Jun  4 04:04 test_toxic.csv
-rw-r--r-- 1 root root 5646239124 Jun  4 04:05 glove.840B.300d.txt
-rw-r--r-- 1 root root  943625690 Jun  4 04:05 numberbatch-en-17.06.txt
drwxr-xr-x 4 root root       4096 Jun  4 04:06 nltk_data
-rw-r--r-- 1 root root  246073896 Jun  4 05:25 relu.h5
-rw-r--r-- 1 root root   22201332 Jun  4 05:27 0.0455_glove_resnet_postag_concat_0.9861.csv


In [14]:
!gzip 0.0455_glove_resnet_postag_concat_0.9861.csv

In [15]:
from google.colab import files


files.download('0.0455_glove_resnet_postag_concat_0.9861.csv.gz')