## Text-CNN model

In the case of NLP tasks, i.e., when applied to text instead of images, we have a 1 dimensional array representing the text. Here the architecture of the ConvNets is changed to 1D convolutional-and-pooling operations.

One of the most typical NLP tasks where ConvNets are used is sentence classification, that is, classifying a sentence into one of a set of pre-determined categories by considering n-grams, i.e. its words or sequences of words, or also its characters or sequences of characters.

<h3>1-D Convolutions over text </h3>

Consider a sequence of words $w_{1:n} = w_1, \dots, w_n$, where each word is associated with an embedding vector of dimension $d$. A 1D convolution of width $k$ is the result of moving a sliding window of size $k$ over the sentence and applying the same convolution filter (kernel) to each window in the sequence, i.e., taking a dot product between the concatenation of the embedding vectors in a given window and a weight vector $u$, which is then often followed by a non-linear activation function $g$.

Here, we use a deep CNN model with 4 Conv1D layers, where each convolutional layer is followed by a max-pooling layer.

### Import Libraries

In [None]:
import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

In [None]:
import numpy as np 
import pandas as pd 
import os
import gc
import logging
import datetime
import warnings
import pickle
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()
import time

In [None]:
from tensorflow.compat.v1.keras.callbacks import EarlyStopping, ModelCheckpoint, LearningRateScheduler
from tensorflow.compat.v1.keras.layers import Input, Dense, Embedding, SpatialDropout1D, Dropout, add, concatenate
from tensorflow.compat.v1.keras.layers import CuDNNLSTM, Bidirectional, GlobalMaxPooling1D, GlobalAveragePooling1D, Flatten
from tensorflow.compat.v1.keras.layers import Conv1D, MaxPooling1D
from tensorflow.compat.v1.keras.preprocessing import text, sequence
from tensorflow.compat.v1.keras.losses import binary_crossentropy
from tensorflow.compat.v1.keras import backend as K
import tensorflow.compat.v1.keras.layers as L
from tensorflow.compat.v1.keras import initializers, regularizers, constraints, optimizers, layers
from tensorflow.compat.v1.keras.layers import Layer
from tensorflow.compat.v1.keras.models import Model
from tensorflow.compat.v1.keras.optimizers import Adam
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from tensorflow.compat.v1.keras.preprocessing.text import Tokenizer
from tensorflow.compat.v1.keras.preprocessing.sequence import pad_sequences
from tensorflow.compat.v1.keras.utils import plot_model
from tensorflow.compat.v1.keras.callbacks import TensorBoard

### Data pre-processing function

In [None]:
def get_coefs(word, *arr):
    '''
        Pair a word with its embedding vector.

        word: the token as read from the embedding file.
        *arr: the vector components (strings or numbers).

        Returns (word, vector), where vector is a 1-D float32 numpy array.
    '''
    return word, np.asarray(arr, dtype='float32')


def load_embeddings(path):
    '''
        Load a text-format embedding file into a {word: vector} dict.

        Every line is expected to look like "<word> <v1> <v2> ... <vd>",
        with fields separated by single spaces.

        path: location of the embedding file.

        Returns a dict mapping each word to its float32 vector.
    '''
    embeddings = {}
    # Embedding files are UTF-8 encoded; do not depend on the platform's
    # default encoding when reading them.
    with open(path, encoding='utf-8') as f:
        for line_no, line in enumerate(f):
            stripped = line.strip()
            if not stripped:
                # A blank line would otherwise create an '' -> empty-vector entry.
                continue
            fields = stripped.split(' ')
            # fastText .vec files start with a "<vocab_size> <dim>" header.
            # Loading it as a word yields a 1-element "vector" that would
            # silently broadcast over a whole embedding-matrix row.
            is_header = (
                line_no == 0
                and len(fields) == 2
                and fields[0].isdigit()
                and fields[1].isdigit()
            )
            if is_header:
                continue
            word, vector = get_coefs(*fields)
            embeddings[word] = vector
    return embeddings

In [None]:
def build_embedding_matrix(word_index, path):
    '''
        credits to: https://www.kaggle.com/christofhenkel/keras-baseline-lstm-attention-5-fold
        Build the embedding matrix for the comment-text vocabulary.

        word_index: dict mapping each word to its integer row index.
        path: embedding file passed on to load_embeddings.

        Returns a (len(word_index) + 1, EMB_MAX_FEAT) array. Row i holds the
        vector of the word whose index is i. Words missing from the embedding
        file keep an all-zero row. A word whose stored vector does not have
        EMB_MAX_FEAT components gets the "unknown" vector when one of the
        right size exists, and otherwise keeps an all-zero row.
    '''
    embedding_index = load_embeddings(path)
    embedding_matrix = np.zeros((len(word_index) + 1, EMB_MAX_FEAT))
    expected_shape = (EMB_MAX_FEAT,)

    def _usable(vector):
        # Only vectors with exactly EMB_MAX_FEAT components may fill a row.
        return vector is not None and np.shape(vector) == expected_shape

    # Fallback for malformed entries. The original code meant to use this in
    # a bare `except:` but referenced the undefined name `embeddings_index`,
    # so that branch raised NameError instead.
    fallback = embedding_index.get("unknown")
    if not _usable(fallback):
        fallback = None

    for word, i in word_index.items():
        vector = embedding_index.get(word)
        if vector is None:
            # Out-of-vocabulary word: leave the row at zero.
            continue
        if not _usable(vector):
            vector = fallback
        if vector is not None:
            embedding_matrix[i] = vector

    # The raw embedding dict is large; release it before returning.
    del embedding_index
    gc.collect()
    return embedding_matrix

In [None]:
def tokenizer_text(train, test):
    '''
        credits go to: https://www.kaggle.com/tanreinama/simple-lstm-using-identity-parameters-solution/
        Tokenize the train and test comments, filtering out punctuation and
        special characters, and pad every sequence to MAX_LEN.

        train, test: dataframes with a COMMENT_TEXT_COL column of raw text.

        Returns (X_train, X_test, word_index): the two padded integer
        sequence arrays and the tokenizer's word -> index dictionary, which
        is fitted on the train comments only.
    '''
    # The backslash before the multiplication sign is escaped explicitly:
    # a bare backslash there is an invalid escape sequence. The resulting
    # string is unchanged.
    punct = "/-'?!.,#$%\'()*+-/:;<=>@[\\]^_`{|}~" + '""“”’' + '∞θ÷α•à−β∅³π‘₹´°£€\\×™√²—–&'
    punct += '©^®` <→°€™› ♥←×§″′Â█½à…“★”–●â►−¢²¬░¶↑±¿▾═¦║―¥▓—‹─▒：¼⊕▼▪†■’▀¨▄♫☆é¯♦¤▲è¸¾Ã⋅‘∞∙）↓、│（»，♪╩╚³・╦╣╔╗▬❤ïØ¹≤‡√'
    # Keras' default filters include tab and newline. A custom filter string
    # replaces the defaults, so add them back; otherwise words separated only
    # by a line break or tab end up as one token.
    punct += '\t\n'
    tokenizer = Tokenizer(filters=punct)

    train_texts = list(train[COMMENT_TEXT_COL])
    test_texts = list(test[COMMENT_TEXT_COL])
    tokenizer.fit_on_texts(train_texts)
    # it is wordindex dictionary so every word gets a unique integer value.
    # 0 is reserved for padding. So lower integer means more frequent word
    word_index = tokenizer.word_index
    # it takes each word in the text and
    # replaces it with its corresponding integer value from the word_index dictionary
    X_train = tokenizer.texts_to_sequences(train_texts)
    X_test = tokenizer.texts_to_sequences(test_texts)
    # used to ensure that all sequences in a list have the same length.
    # By default this is done by padding 0 in the beginning of each sequence
    X_train = pad_sequences(X_train, maxlen=MAX_LEN)
    X_test = pad_sequences(X_test, maxlen=MAX_LEN)

    return X_train, X_test, word_index

In [None]:
def build_embeddings(word_index):
    '''
        Build one embedding matrix per file listed in EMB_PATHS and join
        them along the last axis.
    '''
    per_source = []
    for emb_path in EMB_PATHS:
        per_source.append(build_embedding_matrix(word_index, emb_path))
    return np.concatenate(per_source, axis=-1)

In [None]:
def load_data():
    '''
        Read the train/test csv files from JIGSAW_PATH, binarise the target
        at 0.5, tokenize the comments and build the embedding matrix.

        Returns (X_train, y_train, X_test, word_index, embedding_matrix).
    '''
    train_df = pd.read_csv(os.path.join(JIGSAW_PATH, 'train.csv'), index_col='id')
    test_df = pd.read_csv(os.path.join(JIGSAW_PATH, 'test.csv'), index_col='id')

    # 1 where the target is at least 0.5, otherwise 0
    is_positive = train_df['target'] >= 0.5
    labels = np.where(is_positive, True, False) * 1

    train_seq, test_seq, vocab = tokenizer_text(train_df, test_df)
    emb_matrix = build_embeddings(vocab)

    # the raw dataframes are not needed once the sequences exist
    del train_df, test_df
    gc.collect()
    return train_seq, labels, test_seq, vocab, emb_matrix

In [None]:
# declare model parameters and embedding-related variables
# number of columns build_embedding_matrix allocates for one embedding file
EMB_MAX_FEAT = 300
# maxlen passed to pad_sequences, and the length of the model's Input
MAX_LEN = 220
# batch_size passed to model.fit
BATCH_SIZE = 512
# epochs passed to model.fit for each fold
NUM_EPOCHS = 2
# dataframe column that holds the raw comment text
COMMENT_TEXT_COL = 'comment_text'
# embedding files; build_embeddings builds one matrix per path and
# concatenates them along the last axis
EMB_PATHS = [
    '../input/fasttext-crawl-300d-2m/crawl-300d-2M.vec',
    '../input/glove840b300dtxt/glove.840B.300d.txt'
]
# directory load_data reads train.csv and test.csv from
JIGSAW_PATH = '../input/jigsaw-unintended-bias-in-toxicity-classification/'

### CNN Model

In [None]:
def build_model(embedding_matrix, fold_n=0):
    '''
    Define and compile the CNN based text classification model.

    embedding_matrix: 2-D array used as the frozen weights of the
        Embedding layer.
    fold_n: accepted for the caller's convenience; not used in the body.

    Returns the compiled Keras model (binary cross-entropy loss, adam
    optimizer, accuracy metric).
    '''
    vocab_size, emb_dim = embedding_matrix.shape

    tokens = Input(shape=(MAX_LEN,))
    features = Embedding(
        vocab_size, emb_dim, weights=[embedding_matrix], trainable=False
    )(tokens)
    # SpatialDropout1D drops entire 1D feature maps instead of individual elements
    features = SpatialDropout1D(rate=0.2)(features)

    # four conv + max-pooling stages with kernel sizes 2, 3, 4 and 5
    for width in (2, 3, 4, 5):
        features = Conv1D(
            filters=128, kernel_size=width, activation='relu', padding='same'
        )(features)
        features = MaxPooling1D(pool_size=5, padding='same')(features)

    features = Flatten()(features)
    hidden = Dense(units=128, activation='relu')(features)
    hidden = Dropout(rate=0.1)(hidden)
    result = Dense(units=1, activation="sigmoid")(hidden)

    model = Model(inputs=tokens, outputs=result)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=["accuracy"])
    return model

In [None]:
def run_model(X, y, X_test, embedding_matrix, word_index, n_fold=5):
    '''
        Train the CNN text classification model with stratified K-fold
        cross-validation and return the test predictions averaged over
        the folds.

        X, y: training sequences and their binary labels.
        X_test: test sequences to predict.
        embedding_matrix: passed on to build_model.
        word_index: accepted for interface compatibility; not used here.
        n_fold: number of folds (default 5).

        Returns an array of shape (len(X_test), 1) with the mean prediction.
        Side effects: writes best_model_fold_<k>.hdf5 for every fold and
        TensorBoard logs under "logs".
    '''
    predictions = np.zeros((len(X_test), 1))
    # tensor board callback to store logs
    tensorboard_callback = TensorBoard("logs")
    # early stopping criterion
    early_stop = EarlyStopping(monitor="val_loss", mode="min", patience=3)
    # define stratified K fold
    folds = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=11)
    for fold_n, (train_index, valid_index) in enumerate(folds.split(X, y)):
        print('Fold', fold_n, 'started at', time.ctime())
        # separate train and validation data
        X_train, X_valid = X[train_index], X[valid_index]
        y_train, y_valid = y[train_index], y[valid_index]
        # build model
        model = build_model(embedding_matrix, fold_n)
        # model checkpoint to save the best model of each fold
        file_path = f"best_model_fold_{fold_n}.hdf5"
        check_point = ModelCheckpoint(
            file_path, monitor="val_loss", verbose=1,
            save_best_only=True, mode="min",
        )
        # training
        model.fit(
            X_train, y_train,
            batch_size=BATCH_SIZE, epochs=NUM_EPOCHS, verbose=2,
            validation_data=(X_valid, y_valid),
            callbacks=[tensorboard_callback, early_stop, check_point],
        )

        # After fit() the model holds the weights of the LAST epoch, while
        # the checkpoint holds those of the epoch with the lowest val_loss.
        # Predict with the best ones. The file is absent only if no epoch
        # ever produced a checkpoint; then keep the in-memory weights.
        if os.path.exists(file_path):
            model.load_weights(file_path)

        predictions += model.predict(X_test, batch_size=2048)
        print(predictions)
        del model
        gc.collect()
    # average the predictions over the number of folds
    preds = predictions / n_fold
    return preds

In [None]:
def submit(sub_preds):
    '''
        Write the predictions to submission.csv in the format of the
        Jigsaw competition's sample submission.

        sub_preds: predictions in the row order of sample_submission.csv;
            an (n, 1) array such as the one returned by run_model is
            flattened to 1-D before being stored.
    '''
    # Use the shared JIGSAW_PATH constant instead of repeating the directory.
    sample_path = os.path.join(JIGSAW_PATH, 'sample_submission.csv')
    submission = pd.read_csv(sample_path, index_col='id')

    preds = np.asarray(sub_preds)
    if preds.ndim > 1:
        # a DataFrame column takes 1-D data; run_model yields shape (n, 1)
        preds = preds.reshape(-1)
    submission['prediction'] = preds

    # bring 'id' back as a regular column so it is written to the csv
    submission.reset_index(drop=False, inplace=True)
    submission.to_csv('submission.csv', index=False)

In [None]:
# Load the data and embeddings, then build one model only to print a
# summary of its architecture.
X_train, y_train,X_test, word_index,embedding_matrix = load_data()
model = build_model(embedding_matrix)
model.summary()

In [None]:
# Save a diagram of the model (with layer names) to model_plot.png.
plot_model(model, to_file='model_plot.png',show_layer_names=True)

In [None]:
# Drop the preview model; run_model builds a new model for every fold.
del model
gc.collect()

In [None]:
# Train with cross-validation, average the test predictions over the
# folds and write them to submission.csv.
sub_preds = run_model(X_train, y_train,X_test, embedding_matrix, word_index)
submit(sub_preds)

In [None]:
# Load the extension and start TensorBoard
# (IPython magics: this cell only runs inside a notebook; "logs" is the
# directory the TensorBoard callback in run_model writes to)

%load_ext tensorboard
%tensorboard --logdir logs