Importing the Text Processing Libraries

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
import sklearn
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize
import re
from nltk.tokenize import TweetTokenizer
import gensim
from gensim.models import Word2Vec
import re
import random

Loading the Tweets Data

In [None]:
# Directory holding the competition CSV files.
path = '../input/nlp-getting-started'
train = pd.read_csv(f'{path}/train.csv')

Checking the type of training data provided

In [None]:
# Summary of the frame: column names, dtypes and non-null counts.
train.info()

Checking for null values in the train data

In [None]:
# Fraction of missing values per column (mean of the boolean null mask).
train.isnull().mean()

Since the data consists of tweets, the GloVe embeddings trained on Twitter are used. The cell below sets the path to the embeddings file.

In [None]:
# Path of the GloVe Twitter vectors. The file name says 200d, which matches the
# default len_vec=200 of text_processing and the (max_seq, 200) model input.
# NOTE(review): the dataset directory name mentions "100d" while the file is
# "200d" - confirm the file really exists under this directory.
glove_loc='../input/d/fullmetal26/glovetwitter27b100dtxt/glove.twitter.27B.200d.txt'

Designing a custom tokenizer that uses the special tokens 'user', 'number', 'hashtag', 'allcaps' and 'repeat' (plus 'url') to replace tweet-specific items such as usernames, URL links, numbers and the hashtag sign before a word.

In [None]:
def custom_tokenize(seq):
    """Split a tweet into lower-cased tokens, inserting marker tokens.

    The text is first normalised with substitutions, in this order:
    URLs -> ``<URL>``, non-ASCII runs are dropped, digit runs -> ``<NUMBER>``,
    ``@name`` -> ``<USER>`` and a ``#`` in front of a word -> ``<HASHTAG>``
    followed by the word. It is then split on whitespace, and every piece is
    inspected for capitalisation (``<ALLCAPS>``) and punctuation
    (``<REPEAT>`` for repeated punctuation, otherwise the punctuation mark is
    split off as its own token).

    Parameters
    ----------
    seq : str
        Raw tweet text.

    Returns
    -------
    list of str
        Non-empty, lower-cased tokens (marker tokens come out as ``<url>``,
        ``<number>``, ``<user>``, ``<hashtag>``, ``<allcaps>``, ``<repeat>``).
    """
    url_regex = r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))"
    # Character classes shared by the per-token rules below. They are raw
    # strings so that "\-" and "\w" are regex escapes, not (invalid) Python
    # string escapes.
    punct = r"[.,!?:'\-]"
    specials = r"[.,`;_!$%^&*()?/|}{~:'\-]"
    non_specials = r"[^(.,`;_!$%^&*()?/|}{~:)'\-]"
    word_chars = r"[^(.,`;@_!#$%^&*()<>?/|}{~:'\-)]+"

    # "<sp>" is a temporary separator: it is split on (together with
    # whitespace) further down, so each marker becomes its own token.
    seq = re.sub(url_regex, '<URL><sp>', seq)

    # Drops every run of non-ASCII characters together with the single
    # character that follows the run (behaviour kept from the original).
    seq = re.sub(r'[^\x00-\x7F]+.', '', seq)

    seq = re.sub(r'[0-9]+', '<NUMBER><sp>', seq)
    seq = re.sub(r'@\w+', '<USER><sp>', seq)

    # Replace the '#' in front of a word with the HASHTAG marker. This is one
    # literal substitution instead of re-using the matched hashtag text as a
    # regex pattern, which raised re.error for input such as "#foo[bar"
    # (the old "[A-z]" class also matched "[", "\", "]" and "^").
    seq = re.sub(r'#(?=[A-Za-z_])', '<HASHTAG><sp>', seq)

    tokens = re.split(r'\s|<sp>', seq)

    # The rules below are applied in a fixed order and later rules may
    # overwrite the result of earlier ones; each rule inspects the ORIGINAL
    # piece `tok`, not the partially rewritten tokens[idx].
    for idx, tok in enumerate(tokens):
        # Rule 1: a capitalised fragment (e.g. "Fire") that is only part of
        # the piece -> "<text before it> <ALLCAPS> <fragment>".
        capitalised = re.findall(r'[A-Z][a-z]+', tok)
        if capitalised and len(capitalised[0]) < len(tok):
            before = re.split(capitalised[0], tok)[0]
            tokens[idx] = ' '.join([before.lower(), '<ALLCAPS>', capitalised[0].lower()])

        # Rule 2: piece starting with 2+ capitals and containing no
        # lower-case letter -> "<word> <ALLCAPS>".
        if re.findall(r'^([A-Z]{2,}[^a-z]+)$', tok):
            tokens[idx] = re.findall(word_chars, tok)[0].lower() + ' <ALLCAPS>'

        # Rule 3: sentence punctuation. Several marks -> append the first
        # mark plus <REPEAT>; exactly one -> "<text before mark> <mark>".
        marks = re.findall(punct, tok)
        if len(marks) > 1:
            tokens[idx] = ' '.join([tokens[idx], marks[0] + ' <REPEAT>'])
        elif marks:
            tokens[idx] = ' '.join([re.split(punct, tok)[0], marks[0]])

        # Rule 4: two or more special characters in the piece and the last
        # run of specials is longer than one character -> keep the word
        # parts, then the first character of that run, then <REPEAT>.
        word_parts = re.split(specials, tok)
        if len(word_parts) > 2:
            special_runs = [run for run in re.split(non_specials, tok) if run]
            trailing = re.findall(specials, special_runs[-1])
            if len(trailing) > 1:
                kept = [part for part in word_parts if part]
                tokens[idx] = ' '.join(kept + [trailing[0], '<REPEAT>'])

        # Rule 5: exactly one special character -> pad it with spaces so it
        # becomes a separate token.
        found = re.findall(specials, tok)
        if len(found) == 1:
            tokens[idx] = re.sub(specials, ' ' + found[0] + ' ', tok)

    # Rewritten pieces may contain spaces, so flatten once more.
    flat = ' '.join(tokens).split(' ')
    return [piece.lower() for piece in flat if piece]

Function for converting the twitter data into numerical embeddings based on glovetwitter

In [None]:
def text_processing(seq,glove_loc,len_vec=200):
    """Turn a Series of tweets into a Series of lists of GloVe vectors.

    Each tweet is tokenised with ``custom_tokenize``, English stop words are
    removed, and every remaining token that has an entry in the GloVe file is
    replaced by its vector. Tokens without a vector are dropped, so a tweet
    can end up as an empty list.

    Parameters
    ----------
    seq : pandas.Series
        Raw tweet texts.
    glove_loc : str
        Path of a GloVe text file (one ``word v1 v2 ...`` entry per line).
    len_vec : int, optional
        Not used by the function; kept so existing calls keep working.

    Returns
    -------
    pandas.Series
        Same index and name as ``seq``; each element is a list of
        ``float32`` numpy vectors.
    """
    stpwds_set = set(stopwords.words('english'))

    # Tokenise first so that only vectors for words that actually occur in
    # the tweets have to be kept in memory.
    tokenised = seq.apply(custom_tokenize).apply(
        lambda toks: [tok for tok in toks if tok not in stpwds_set]
    )
    vocabulary = {tok for toks in tokenised for tok in toks}

    embeddings_dict = {}
    # Explicit encoding: the platform default may not be UTF-8.
    with open(glove_loc, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            if not values:
                continue  # blank line
            word = values[0]
            if word in vocabulary:
                embeddings_dict[word] = np.asarray(values[1:], "float32")

    # Built with apply instead of `seq[i] = ...`: the latter assigns by index
    # LABEL while `i` was a position, which only agrees for a default
    # RangeIndex.
    return tokenised.apply(
        lambda toks: [embeddings_dict[tok] for tok in toks if tok in embeddings_dict]
    )

Converting the twitter text data into numerical embeddings

In [None]:
# Embed the tweet texts, then attach the labels (joined on the index).
train_embedded = text_processing(train.text, glove_loc)
train_data = train_embedded.to_frame().join(train.target)

Checking the data for any class imbalance

In [None]:
# Share of each target class among all rows.
train_data.target.value_counts().div(len(train_data))

Loading the Test data

In [None]:
# Same preprocessing as for the training tweets (the GloVe file is read again here).
test = pd.read_csv(f'{path}/test.csv')
test_data = text_processing(test.text, glove_loc).to_frame()

Calculating maximum sequence length among all the tweets

In [None]:
# Largest number of embedded tokens in any training tweet.
max_seq = train_data.text.map(len).max()
max_seq

In [None]:
# Smallest number of embedded tokens in any training tweet.
min_seq = train_data.text.map(len).min()
min_seq

Importing the keras modules

In [None]:
import keras
from keras.datasets import imdb
from keras.models import Sequential
from keras.layers import Dense,LSTM,BatchNormalization,Dropout,Conv1D, MaxPooling1D, Activation, Flatten,Bidirectional
from keras.preprocessing import sequence
from keras.callbacks import EarlyStopping
from keras.callbacks import ModelCheckpoint
import tensorflow as tf

In [None]:
def tweet_gen(data,batch_size,max_seq,min_seq,aug):
    """Endless generator of ``(padded_sequences, labels)`` batches.

    ``data`` is reshuffled at the start of every pass. Each pass yields
    ``ceil(len(data) / batch_size)`` batches; the last one holds the
    remainder when the size is not a multiple of ``batch_size``.

    Parameters
    ----------
    data : two-column table (DataFrame or 2-D array)
        Column 0 holds the sequences passed to ``pad_sequences``, column 1
        the labels (cast to ``float32``).
    batch_size : int
        Number of rows per batch.
    max_seq : int
        Length every yielded sequence is padded/truncated to (always 'pre').
    min_seq : int
        Lower bound for the random intermediate length used when ``aug`` is
        true.
    aug : bool
        When true, each batch is first padded/truncated to a random length in
        ``[max(1, min_seq), max_seq]`` with a random side for padding and for
        truncating, and only then brought to ``max_seq``.

    Yields
    ------
    tuple
        ``(x, y)`` with ``x`` the ``float32`` output of ``pad_sequences`` and
        ``y`` a ``float32`` label vector.

    Raises
    ------
    ValueError
        If ``data`` has no rows (the generator would otherwise loop forever
        without yielding anything).
    """
    if len(data) == 0:
        raise ValueError("tweet_gen needs at least one row of data")

    pad = tf.keras.preprocessing.sequence.pad_sequences

    # Bounds of the random intermediate length. It must be at least 1: with a
    # length of 0 every sequence is cut down to nothing. The bounds are
    # inclusive and stay valid when min_seq == max_seq, where the previous
    # random.choice(range(min_seq, max_seq)) raised on an empty range.
    low = max(1, min_seq)
    high = max(low, max_seq)

    while True:
        data = np.random.permutation(data)

        # Stepping by batch_size covers the full batches and the remainder.
        for start in range(0, data.shape[0], batch_size):
            batch = data[start:start + batch_size]
            sequences = batch[:, 0]

            if aug:
                sequences = pad(
                    sequences,
                    maxlen=random.randint(low, high),
                    dtype='float32',
                    padding=random.choice(['pre', 'post']),
                    truncating=random.choice(['pre', 'post']),
                    value=0.0,
                )

            padded = pad(sequences, maxlen=max_seq, dtype='float32',
                         padding='pre', truncating='pre', value=0.0)
            yield padded, batch[:, 1].astype('float32')

In [None]:
batch_size = 8

# Positional 80/20 split: the first 80% of the rows train, the rest validate.
split_at = int(len(train_data) * 0.8)

train_gen = tweet_gen(
    data=train_data.iloc[:split_at, :],
    batch_size=batch_size,
    max_seq=max_seq,
    min_seq=min_seq,
    aug=True,
)
val_gen = tweet_gen(
    data=train_data.iloc[split_at:, :],
    batch_size=batch_size,
    max_seq=max_seq,
    min_seq=min_seq,
    aug=False,
)

Helper function for plotting the loss and accuracy curves, followed by building the ANN model based on a 1D Convolutional Neural Network and a bidirectional LSTM

In [None]:
def plot(history):
    """Plot training vs. validation curves for loss and accuracy, side by side.

    ``history`` must expose a ``history`` mapping with the keys ``loss``,
    ``val_loss``, ``accuracy`` and ``val_accuracy``.
    """
    fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(15,4))
    for ax, metric in zip(axes, ('loss', 'accuracy')):
        val_metric = 'val_' + metric
        ax.plot(history.history[metric])
        ax.plot(history.history[val_metric])
        ax.legend([metric, val_metric])

In [None]:
# Input: max_seq time steps of 200-dimensional word vectors.
model = Sequential()
model.add(keras.layers.InputLayer(input_shape=(max_seq,200)))

# Two convolution blocks: Conv1D -> Dropout -> BatchNorm -> MaxPool.
model.add(Conv1D(64, 5, activation='relu'))
model.add(Dropout(0.3))
model.add(BatchNormalization())
model.add(MaxPooling1D(pool_size=4))

model.add(Conv1D(128, 5, activation='relu'))
model.add(Dropout(0.3))
model.add(BatchNormalization())
model.add(MaxPooling1D(pool_size=4))

# Bidirectional LSTM over the pooled feature sequence. Uses the Bidirectional
# wrapper imported from the same `keras` package as LSTM rather than the
# tf.keras one, so the wrapper and the wrapped layer come from one package.
model.add(Bidirectional(LSTM(256)))
model.add(Dropout(0.3))
model.add(BatchNormalization())

# Dense classifier head.
model.add(Dense(128,activation='relu'))
model.add(Dropout(0.2))
model.add(BatchNormalization())

model.add(Dense(64,activation='relu'))
model.add(Dropout(0.2))
model.add(BatchNormalization())

# Single sigmoid unit for the binary target.
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself; wrapping it in print() added a stray "None".
model.summary()

Training the model with early stopping, then plotting the loss and accuracy curves

In [None]:
# Stop when val_loss has not improved for 20 epochs and roll the model back to
# the best epoch; without restore_best_weights the model used for prediction
# would be the one from 20 epochs after the best validation loss.
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=20,
                   restore_best_weights=True)
num_epochs = 100
num_train_sequences = int(len(train_data) * 0.8)
num_val_sequences = len(train_data) - num_train_sequences

# Ceiling division: one extra step when the last batch is only partly filled,
# matching the number of batches tweet_gen yields per pass.
steps_per_epoch = -(-num_train_sequences // batch_size)
validation_steps = -(-num_val_sequences // batch_size)

# `callbacks` is documented as a list of callbacks, so the single callback is
# wrapped. The arguments that merely repeated the defaults were dropped.
history1 = model.fit(
    train_gen,
    steps_per_epoch=steps_per_epoch,
    epochs=num_epochs,
    verbose=1,
    validation_data=val_gen,
    validation_steps=validation_steps,
    callbacks=[es],
)
plot(history1)

In [None]:
# Pad/truncate the test sequences exactly like the validation batches.
# Note: this rebinds `test_data` from a DataFrame to a padded array, so the
# cell cannot be re-run without re-running the cell that builds the DataFrame.
test_data = tf.keras.preprocessing.sequence.pad_sequences(
    test_data.text,
    maxlen=max_seq,
    dtype='float32',
    padding='pre',
    truncating='pre',
    value=0.0,
)

In [None]:
# Sigmoid outputs of the model for the padded test tweets.
pre_var=model.predict(test_data)

In [None]:
# Reuse `path` (the competition input directory) instead of repeating the literal.
submission = pd.read_csv(path + '/sample_submission.csv')

In [None]:
# The model ends in Dense(1), so predict() returns one column per row; flatten
# it to a 1-D vector. Item assignment is used because attribute assignment
# only updates a column that already exists.
submission['target'] = pre_var.ravel()

In [None]:
# Turn probabilities into class labels: 1 above 0.5, otherwise 0.
submission['target'] = (submission['target'] > 0.5).astype('int64')

In [None]:
# Write the submission file without the DataFrame index column.
submission.to_csv('submission.csv', index=False)