In [1]:
# Udacity Machine Learning Capstone

import time
#Data Importing Modules
import pandas as pd
import numpy as np
np.random.seed(42)
import string
import re
from collections import Counter
import pickle
import tensorflow as tf

#Selective Sklearn Libraries
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from keras.models import Model
from keras.layers import Input, Dense, Dropout, Conv1D, Embedding, SpatialDropout1D, concatenate
from keras.layers import GRU, LSTM,Bidirectional, GlobalAveragePooling1D, GlobalMaxPooling1D
from keras.layers import CuDNNLSTM, CuDNNGRU
from keras.preprocessing import text, sequence
from keras.callbacks import Callback
from keras import optimizers
from keras.layers import Lambda
import warnings
warnings.filterwarnings('ignore')
from nltk.corpus import stopwords
import os
os.environ['OMP_NUM_THREADS'] = '4'
import gc
from keras import backend as K
from sklearn.model_selection import KFold

#Text Cleaning Module
from unidecode import unidecode
eng_stopwords = set(stopwords.words("english"))

#Visualization Libraries
import seaborn as sns
from matplotlib import pyplot as plt

import matplotlib.pyplot as plt
from matplotlib_venn import venn2
from matplotlib_venn import venn3

from wordcloud import WordCloud

In [2]:
from keras.models import Sequential
from keras.layers import Dense, Activation

In [3]:
#reading all input files

# train
print("reading train files")
train = pd.read_csv('../input/jigsaw-toxic-comment-classification-challenge/train.csv',encoding='utf-8')

# Layout normalisation: newlines and literal backslashes become single spaces so that
# words on adjacent lines remain separate tokens once clean_text() strips characters.
train = train.replace(r'\n',' ', regex=True)
train = train.replace(r'\\',' ', regex=True)
print (train.head())

print ("Now reading test files")
test=pd.read_csv('../input/jigsaw-toxic-comment-classification-challenge/test.csv',encoding='utf-8')

# Apply the identical normalisation to the test split. Without it clean_text() simply
# deletes the newline, gluing the last word of one line to the first word of the next,
# so test comments would be tokenised differently from train comments.
test = test.replace(r'\n',' ', regex=True)
test = test.replace(r'\\',' ', regex=True)

# Whitelist filter: every character that is not an ASCII letter, a space, or one of
# . - ? ! , # @ % is deleted (digits, quotes, tabs and newlines included).
special_character_removal = re.compile(r'[^A-Za-z\.\-\?\!\,\#\@\% ]', re.IGNORECASE)


def clean_text(x):
    """Pass ``x`` through unidecode, then delete every non-whitelisted character.

    Removed characters are dropped outright (replaced by the empty string), not
    turned into spaces.
    """
    return special_character_removal.sub('', unidecode(x))

# Fill missing comments *before* stringifying: str(NaN) is the literal 'nan', so a
# fillna() applied after clean_text(str(x)) never finds anything left to fill.
train['clean_text'] = train['comment_text'].fillna("something").apply(lambda x: clean_text(str(x)))
test['clean_text'] = test['comment_text'].fillna("something").apply(lambda x: clean_text(str(x)))

# Model inputs (cleaned text) and the six binary target columns.
X_train = train['clean_text'].values
y_train = train[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]].values
X_test = test['clean_text'].values


In [7]:
#adding features
def add_features(df):
    """Add length, capitalisation and vocabulary statistics computed from ``comment_text``.

    The frame is modified in place and also returned. Columns written:

    * ``comment_text``     - every value coerced to ``str``
    * ``total_length``     - number of characters
    * ``capitals``         - number of characters for which ``str.isupper()`` is true
    * ``caps_vs_length``   - ``capitals / total_length``
    * ``num_words``        - number of whitespace-delimited tokens
    * ``num_unique_words`` - number of distinct whitespace-delimited tokens
    * ``words_vs_unique``  - ``num_unique_words / num_words``

    Both ratios are NaN for an empty comment (0 / 0); callers are expected to
    handle that, e.g. with ``fillna``.
    """
    df['comment_text'] = df['comment_text'].apply(str)
    df['total_length'] = df['comment_text'].str.len()
    df['capitals'] = df['comment_text'].apply(lambda comment: sum(1 for c in comment if c.isupper()))
    # Column-wise division: an empty comment yields NaN instead of the ZeroDivisionError
    # that a per-row float division raises, and avoids a slow row-by-row apply.
    df['caps_vs_length'] = df['capitals'] / df['total_length']
    # Raw string so that \S reaches the regex engine as written.
    df['num_words'] = df['comment_text'].str.count(r'\S+')
    df['num_unique_words'] = df['comment_text'].apply(lambda comment: len(set(comment.split())))
    df['words_vs_unique'] = df['num_unique_words'] / df['num_words']

    return df

# Already imported at the top of the notebook; repeated here as in the original cell.
from sklearn.preprocessing import StandardScaler

# Derive the hand-crafted statistics for both splits.
train, test = add_features(train), add_features(test)

# Only the two ratio columns are used as model inputs; undefined ratios (NaN) become 0.
features, test_features = (
    split[['caps_vs_length', 'words_vs_unique']].fillna(0) for split in (train, test)
)

# z-score both splits using statistics fitted on train and test stacked together.
ss = StandardScaler().fit(np.vstack((features, test_features)))
features, test_features = ss.transform(features), ss.transform(test_features)

In [8]:
# Basic EDA: how many training comments carry each label.
COLUMNS = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# Independent copy of the label names, used later by the word-cloud cell.
CATEGORIES = list(COLUMNS)

# Column sums of the 0/1 label columns, stored as a one-column frame sorted ascending.
train_distribution = (
    train[COLUMNS]
    .sum()
    .to_frame(name='count')
    .sort_values(by='count')
)

# Displayed most-frequent first; train_distribution itself stays in ascending order.
train_distribution.sort_values(by='count', ascending=False)

From the data, we see that the three most frequent labels are toxic, obscene and insult.
The other labels occur far less often.

In [9]:
# One row per distinct combination of the COLUMNS values, with the number of comments
# that have exactly that combination, most common combination first.
train_comb = (
    train.groupby(COLUMNS)
         .size()
         .sort_values(ascending=False)
         .reset_index(name='count')
)
train_comb.head(10)

Here we check for class imbalance. The table also shows that a single comment can fall into several categories at once.

In [10]:
# Pairwise correlation between the columns listed in COLUMNS, drawn as an annotated heatmap.
label_corr = train[COLUMNS].corr()
f, ax = plt.subplots(figsize=(9, 6))
f.suptitle('Correlation matrix for categories')
sns.heatmap(data=label_corr, annot=True, linewidths=0.5, ax=ax)

In [5]:
# Every ordered pair of COLUMNS entries ranked by absolute correlation, strongest first
# (each column paired with itself is included).
abs_corr = train[COLUMNS].corr().abs()
abs_corr.unstack().sort_values(ascending=False)

The insult, obscene and toxic labels are highly correlated with each other.

In [11]:
# Correlation matrix of added features

# Extend COLUMNS only with names it does not already contain, so that re-running this
# cell does not append the engineered feature names a second time (which would
# duplicate rows and columns in the heatmap).
FEATURE_COLUMNS = ['total_length','caps_vs_length', 'num_words','num_unique_words','words_vs_unique']
COLUMNS = COLUMNS + [col for col in FEATURE_COLUMNS if col not in COLUMNS]
f, ax = plt.subplots(figsize=(20, 20))
f.suptitle('Correlation matrix for categories and features')
sns.heatmap(train[COLUMNS].corr(), annot=True, linewidths=.5, ax=ax)

In [None]:
#Creating word cloud 
word_counter = {}

# Compiled once instead of on every call. re.escape() is required because
# string.punctuation contains a backslash followed by ']': unescaped, that pair is read
# as an escaped bracket and the backslash itself is never matched.
_PUNCTUATION_RE = re.compile('[{}]'.format(re.escape(string.punctuation)))


def clean_text(text):
    """Lower-case ``text``, replace punctuation with spaces and drop English stopwords.

    Returns the remaining words joined by single spaces.

    NOTE: this rebinds the name ``clean_text``, replacing the whitelist cleaner defined
    in the data-loading cell for every cell executed after this one.
    """
    text = _PUNCTUATION_RE.sub(' ', text.lower())
    return ' '.join(word for word in text.split() if word not in eng_stopwords)

# Per label: count the cleaned words of every comment carrying that label and store the
# counts as a one-column frame sorted from most to least frequent.
for categ in CATEGORIES:
    d = Counter()
    for comment in train.loc[train[categ] == 1, 'comment_text']:
        d.update(clean_text(comment).split())
    word_counter[categ] = (
        pd.DataFrame.from_dict(d, orient='index')
          .rename(columns={0: 'count'})
          .sort_values('count', ascending=False)
    )

# One word cloud per label, sized by word frequency.
for w, wc in word_counter.items():
    cloud_builder = WordCloud(
        background_color='black',
        max_words=200,
        max_font_size=100,
        random_state=4561,
    )
    wordcloud = cloud_builder.generate_from_frequencies(wc['count'].to_dict())

    fig = plt.figure(figsize=(12, 8))
    plt.title(w)
    plt.imshow(wordcloud)
    plt.axis('off')

    plt.show()

In [14]:
from keras.preprocessing import text, sequence

# Reference: https://machinelearningmastery.com/prepare-text-data-deep-learning-keras/
# max_features is handed to the Tokenizer as num_words; maxlen is the padded sequence length.
max_features, maxlen = 20000, 50

# The vocabulary is fitted on the train and test comments together.
tokenizer = text.Tokenizer(num_words=max_features)
tokenizer.fit_on_texts([*X_train, *X_test])

# Comments -> lists of integer word ids.
X_train_sequence, X_test_sequence = (
    tokenizer.texts_to_sequences(corpus) for corpus in (X_train, X_test)
)

# Lists of ids -> arrays with maxlen columns.
x_train, x_test = (
    sequence.pad_sequences(ids, maxlen=maxlen) for ids in (X_train_sequence, X_test_sequence)
)

# tokenizer.word_index now maps every word to an integer id; the embedding layer
# looks those ids up to obtain the corresponding word vectors.


In [None]:
# Repeats the constants assigned in the tokenisation cell.
max_features, maxlen = 20000, 50

In [15]:
import csv

#Reference: https://machinelearningmastery.com/develop-word-embedding-model-predicting-movie-review-sentiment/
word_index = tokenizer.word_index
# Number of embedding rows to keep: max_features, or the vocabulary size if that is smaller.
nb_words = min(max_features, len(word_index))
# No zero-filled placeholder is allocated for embedding_vectors here: it is assigned from
# the loaded embedding file at the end of this cell, and nothing reads it before then.

# load embedding as a dict

def load_embedding(filename):
    """Load a whitespace-separated text embedding file into a ``{token: vector}`` dict.

    Each non-blank line is split on whitespace; the first field is the token and the
    remaining fields are parsed as a ``float32`` numpy array.

    Parameters
    ----------
    filename : str
        Path to the embedding file, read as UTF-8.

    Returns
    -------
    dict
        Token -> 1-D ``float32`` array.

    Raises
    ------
    ValueError
        If a field after the token cannot be parsed as a float.
    """
    embedding = dict()
    # Stream the file line by line inside a context manager: the handle is closed even
    # if parsing fails, and the whole file is never held in memory as a list of lines.
    with open(filename, 'r', encoding='utf-8') as file:
        for line in file:
            parts = line.split()
            if not parts:
                # Blank line: nothing to store (indexing parts[0] would fail).
                continue
            embedding[parts[0]] = np.asarray(parts[1:], dtype='float32')
    return embedding

def get_weight_matrix(embedding, vocab, embed_size=None):
    """Arrange embedding vectors into a matrix indexed by the vocabulary's integer ids.

    Parameters
    ----------
    embedding : dict
        Token -> vector mapping.
    vocab : dict
        Token -> integer index mapping; indices are used directly as row numbers.
    embed_size : int, optional
        Width of the matrix. When omitted it is taken from the length of a vector in
        ``embedding``, falling back to 200 if ``embedding`` is empty.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(vocab) + 1, embed_size)``. Row ``i`` holds the vector of
        the word whose index is ``i``; rows for words missing from ``embedding`` (and
        any row no word maps to) stay all-zero.
    """
    if embed_size is None:
        # Infer the width from the data rather than assuming 200-d vectors.
        sample_vector = next(iter(embedding.values()), None)
        embed_size = 200 if sample_vector is None else len(sample_vector)
    # One extra row because the matrix is indexed by the vocabulary ids themselves.
    vocab_size = len(vocab) + 1
    weight_matrix = np.zeros((vocab_size, embed_size))
    # step vocab, store vectors using the Tokenizer's integer mapping
    for word, i in vocab.items():
        vector = embedding.get(word)
        if vector is not None:
            weight_matrix[i] = vector
    return weight_matrix

# Parse the embedding file into a {token: vector} dict.
raw_embedding = load_embedding('../input/glove-twitter-27b-200d-txt/glove.twitter.27B.200d.txt')

# Build the matrix whose row i is the vector of the word with tokenizer index i, then
# keep only the first nb_words rows (row indices 0 .. nb_words-1).
embedding_vectors = get_weight_matrix(raw_embedding, tokenizer.word_index)[:nb_words,:] #Will be used to create embedding layer

In [16]:
# Peek at the first two keys of each tokenizer dictionary.
# The embedding matrix rows follow word_index, not word_counts or word_docs.
for attr in ('word_counts', 'word_index', 'word_docs'):
    print(list(getattr(tokenizer, attr))[:2])

In [17]:
# Manual sanity checks, left commented out, with the results that were observed:
#   sum(raw_embedding['explanation'] == embedding_vectors[1])  -> 0
#   sum(raw_embedding['the'] == embedding_vectors[1])          -> 200 (same as the 200 dimensions of the GloVe data)
#   sum(raw_embedding['after'] == embedding_vectors[1])        -> 0
# i.e. row 1 of embedding_vectors holds the vector for 'the'.

# Display the raw vector stored for one word.
raw_embedding['hate']

In [None]:
#Base Accuracy - Predicting all labels as non toxic
# Computed over every training comment. train_comb holds one row per distinct label
# combination, so averaging it would ignore how many comments share each combination.
print("Base Accuracy - Predicting all labels as non toxic ")
(1-train[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']].mean())*100.0

Because the classes are heavily imbalanced, labelling every comment as non-toxic already yields the accuracies above. Our neural model needs to outperform this baseline.

In [17]:
# ROC - Boiler Plate Code

class RocAucEvaluation(Callback):
    """Keras callback: score ROC-AUC on held-out data, checkpoint the best weights, stop early.

    Every ``interval`` epochs the attached model predicts on the validation inputs and
    the ROC-AUC against the validation targets is printed. When the score beats the
    best so far, the model weights are saved to ``weights_path``. Once more than
    ``patience`` evaluations have passed without a new best score, training is stopped.

    Parameters
    ----------
    validation_data : pair
        ``(X_val, y_val)``. Anything that does not unpack into exactly two items
        raises ``ValueError`` (this includes the empty default).
    interval : int
        Evaluate on epochs whose zero-based index is a multiple of this value.
    weights_path : str
        File the best weights are written to.
    patience : int
        Number of non-improving evaluations tolerated before stopping.
    """

    def __init__(self, validation_data=(), interval=1, weights_path="best_weights.h5", patience=3):
        # Name this class (not Callback) in super() so that Callback.__init__ itself runs.
        super(RocAucEvaluation, self).__init__()

        self.interval = interval
        self.X_val, self.y_val = validation_data
        self.weights_path = weights_path
        self.patience = patience
        self.max_score = 0
        self.not_better_count = 0

    def on_epoch_end(self, epoch, logs=None):
        if epoch % self.interval == 0:
            y_pred = self.model.predict(self.X_val, verbose=1)
            score = roc_auc_score(self.y_val, y_pred)
            print("\n ROC-AUC - epoch: %d - score: %.6f \n" % (epoch+1, score))
            if (score > self.max_score):
                print("*** New High Score (previous: %.6f) \n" % self.max_score)
                # Save the model this callback is attached to, not whatever object a
                # global variable named `model` happens to reference.
                self.model.save_weights(self.weights_path)
                self.max_score=score
                self.not_better_count = 0
            else:
                self.not_better_count += 1
                if self.not_better_count > self.patience:
                    print("Epoch %05d: early stopping, high score = %.6f" % (epoch,self.max_score))
                    self.model.stop_training = True

In [18]:
def get_model(features,clipvalue=1.,num_filters=40,dropout=0.5,embed_size=200):
    """Build and compile the bidirectional-GRU classifier with an auxiliary feature input.

    The model takes two inputs, in this order: the padded token-id sequence (length
    ``maxlen``) and the hand-crafted feature vector (width ``features.shape[1]``).
    Token ids go through a frozen embedding initialised from the module-level
    ``embedding_vectors``, then a bidirectional GRU. The average-pooled and max-pooled
    GRU outputs, one GRU final state and the feature vector are concatenated and fed
    to a 6-unit sigmoid layer.

    Parameters
    ----------
    features : array-like
        Only ``features.shape[1]`` is read, to size the auxiliary input.
    clipvalue : float
        Gradient clipping value passed to the Adam optimizer.
    num_filters : int
        Number of GRU units per direction.
    dropout : float
        Currently unused; kept in the signature for compatibility.
    embed_size : int
        Embedding width; must match the second dimension of ``embedding_vectors``.

    Returns
    -------
    A compiled Keras ``Model`` (binary cross-entropy loss, accuracy metric).
    """
    features_input = Input(shape=(features.shape[1],))
    inp = Input(shape=(maxlen, ))
    x = Embedding(max_features, embed_size, weights=[embedding_vectors], trainable=False,name='EmbeddingLayer')(inp)
    # With return_state=True a Bidirectional GRU returns the output sequence followed by
    # the forward and then the backward final state (a GRU has no separate cell state).
    # NOTE(review): only the forward state is concatenated below; confirm that leaving
    # out the backward state is intentional.
    x, forward_state, backward_state = Bidirectional(GRU(num_filters, return_sequences=True, return_state = True),name='BidirectionalGRU')(x)
    avg_pool = GlobalAveragePooling1D()(x)
    max_pool = GlobalMaxPooling1D()(x)
    x = concatenate([avg_pool, forward_state, max_pool,features_input])
    outp = Dense(6, activation="sigmoid")(x)
    model = Model(inputs=[inp,features_input], outputs=outp)
    # Use the documented class name; the lowercase `adam` alias is not available in
    # every Keras release.
    adam = optimizers.Adam(clipvalue=clipvalue)
    model.compile(loss='binary_crossentropy',
                  optimizer=adam,
                  metrics=['accuracy'])
    return model

In [None]:
# Build a throwaway model just to print its layer summary; the model object is not kept.
get_model(features).summary()

In [None]:
# K-fold training. No model is built before the loop: each fold clears the Keras session
# and builds its own model, so a model created up front would only be thrown away.
batch_size = 32
epochs = 5
num_folds = 5 
# Test predictions averaged over folds, and out-of-fold predictions for the train set.
predict = np.zeros((test.shape[0],6))
scores = []
oof_predict = np.zeros((train.shape[0],6))
kf = KFold(n_splits=num_folds, shuffle=True, random_state=239)
for train_index, test_index in kf.split(x_train):
    # Split sequences, auxiliary features and targets into this fold's train / validation parts.
    kfold_y_train,kfold_y_test = y_train[train_index], y_train[test_index]
    kfold_X_train = x_train[train_index]
    kfold_X_features = features[train_index]
    kfold_X_valid = x_train[test_index]
    kfold_X_valid_features = features[test_index] 
    # Start every fold from a clean session and a freshly initialised model.
    gc.collect()
    K.clear_session()
    model = get_model(features)
    ra_val = RocAucEvaluation(validation_data=([kfold_X_valid,kfold_X_valid_features], kfold_y_test), interval = 1)
    model.fit([kfold_X_train,kfold_X_features], kfold_y_train, batch_size=batch_size, epochs=epochs, verbose=1,
             callbacks = [ra_val])
    gc.collect()
    # Restore the best-scoring weights saved by the callback before predicting.
    model.load_weights("best_weights.h5")
    predict += model.predict([x_test,test_features], batch_size=batch_size,verbose=1) / num_folds
    gc.collect() #- Running out of kaggle memory
    oof_predict[test_index] = model.predict([kfold_X_valid, kfold_X_valid_features],batch_size=batch_size, verbose=1)
    cv_score = roc_auc_score(kfold_y_test, oof_predict[test_index])
    scores.append(cv_score)
    print('Cross Validation Score: ',cv_score)

# Report the mean of the per-fold scores.
print("Model Completion for Keras DL")
mean_cv_score = np.mean(scores)
print('Total CV score is {}'.format(mean_cv_score))

# Fill the sample submission's label columns with the fold-averaged test predictions.
print("Saving Predictions for offline upload")
class_names = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
sample_submission = pd.read_csv(
    "../input/jigsaw-toxic-comment-classification-challenge/sample_submission.csv"
)
sample_submission[class_names] = predict
sample_submission.to_csv('UdactiyAssignment_v1.csv', index=False)


In [None]:
# Marker printed once every cell above has run.
print("Code Run Completed")