# Step 1: Loading necessary libraries

In [1]:
import pandas as pd
import numpy as np
import re
import sys
import warnings
from gensim.models.wrappers import FastText
from gensim.models import KeyedVectors
from gensim import models
from sklearn.model_selection import KFold # import KFold
from keras.regularizers import l2
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model, Sequential
from keras.callbacks import ModelCheckpoint
from keras.layers import Input, Dense, concatenate, Activation, Dropout
from keras.layers.embeddings import Embedding
from keras.layers import Conv1D, GlobalMaxPooling1D, LSTM, Conv1D, Flatten, MaxPooling1D
from keras.callbacks import Callback
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score, roc_auc_score, accuracy_score
warnings.filterwarnings('ignore')

Using TensorFlow backend.
Using TensorFlow backend.


# Step 2: Loading pretrained word2vec model (fastText `cc.ne.300` binary)

In [2]:
# Load a fastText binary model from disk through gensim's FastText wrapper.
# NOTE(review): the file name suggests 300-dimensional Nepali vectors — confirm
# the dimension matches the `size` used when the embedding matrix is built.
model_ug_cbow = FastText.load_fasttext_format('../cc.ne.300.bin/cc.ne.300.bin')

In [3]:
# Build a plain word -> vector lookup from every word in the model's vocabulary.
word_vectors = model_ug_cbow.wv
embeddings_index = {token: word_vectors[token] for token in word_vectors.vocab.keys()}
print('Found %s word vectors.' % len(embeddings_index))

Found 584436 word vectors.


# Step 3: Loading Trained word2vec model

In [2]:
# Load the two locally trained Word2Vec models from disk.
# NOTE(review): the file names suggest a skip-gram and a CBOW variant — confirm.
# This rebinds `model_ug_cbow`, replacing the fastText model loaded in Step 2,
# so only one of Step 2 / Step 3 is in effect for the rest of the notebook.
model_ug_sg = models.Word2Vec.load('../w2v_model_ug_sg.word2vec')
model_ug_cbow = models.Word2Vec.load('../w2v_model_ug_cbow.word2vec')

In [3]:
# Word -> vector lookup for the currently loaded model; this overwrites any
# `embeddings_index` built by an earlier cell.
trained_vectors = model_ug_cbow.wv
embeddings_index = dict(
    (word, trained_vectors[word]) for word in trained_vectors.vocab.keys()
)
print('Found %s word vectors.' % len(embeddings_index))

Found 42409 word vectors.
Found 42409 word vectors.


# Step 4: Loading dataset

In [4]:
# Read the first sheet of the labelled tweet workbook.
# `sheet_name` is the current spelling of this keyword: pandas deprecated the
# old `sheetname` form in 0.21 and removed it in a later release.
data = pd.read_excel("NepaliEarthquakeTweets_plus_blockade.xlsx", sheet_name=0)

In [5]:
# Build one frame per sentiment class ('p' -> target 0, 'n' -> target 1) and
# the pooled / validation frames derived from them.
#
# pd.DataFrame.from_items and DataFrame.append have been removed from pandas,
# so the frames are built with the DataFrame constructor and pd.concat, which
# behave the same on old and new pandas versions.
total_size = 600   # maximum number of tweets kept per class
size = 600         # rows per class that go into the pooled frame
val_size = 600     # not used in this cell; kept so the name stays defined


def _class_frame(source, rows, target):
    """Return the selected tweets of one class as a freshly indexed frame.

    Parameters
    ----------
    source : pandas.DataFrame
        Frame with 'tweet_text', 'reply', 'retweet' and 'likes' columns.
    rows : array-like of int
        Row positions to keep (as produced by ``np.where``).
    target : int
        Class label written to every row of the 'target' column.

    Returns
    -------
    pandas.DataFrame
        Columns 'text', 'target', 'reply', 'retweet', 'likes', indexed 0..n-1.
    """
    picked = source.iloc[rows]
    frame = pd.DataFrame(
        {
            'text': picked['tweet_text'],
            'target': target,
            'reply': picked['reply'],
            'retweet': picked['retweet'],
            'likes': picked['likes'],
        },
        # Explicit column order: older pandas sorts dict keys alphabetically.
        columns=['text', 'target', 'reply', 'retweet', 'likes'],
    )
    return frame.reset_index(drop=True)


# Row positions of the positive- and negative-labelled tweets.
pos_indexes = np.where(data['bhawna']=='p')
neg_indexes = np.where(data['bhawna']=='n')

df_pos = _class_frame(data, pos_indexes[0][0:total_size], 0)
df_neg = _class_frame(data, neg_indexes[0][0:total_size], 1)

# Aliases of the per-class frames (same objects, not copies).
df_pos_f = df_pos
df_neg_f = df_neg

# Pooled frame: the first `size` positives followed by the first `size` negatives.
df_F = pd.concat([df_pos[0:size], df_neg[0:size]], ignore_index=True)

# Validation frame: rows 500..599 of each class.
# NOTE(review): with size = 600 these rows are also contained in df_F.
validation_df_F = pd.concat([df_pos[500:600], df_neg[500:600]], ignore_index=True)

In [6]:
# Pooled frame: the first `size` positives followed by the first `size`
# negatives, re-indexed from 0. pd.concat replaces DataFrame.append, which
# has been removed from pandas.
df = pd.concat([df_pos[0:size], df_neg[0:size]], ignore_index=True)

# Validation frame: rows 500..599 of each class, re-indexed from 0.
# NOTE(review): `size` is 600 in the cell above, so these rows are also part
# of `df`, and they fall inside the first 600 rows per class that the
# cross-validation cell trains on. Validation scores are therefore not
# computed on unseen data.
validation_df = pd.concat([df_pos[500:600], df_neg[500:600]], ignore_index=True)

# Step 5: Data Preprocessing: Removing punctuation, English text, decimals

In [7]:
# Regular-expression fragments for the spans that get blanked out of each tweet.
pat1 = r'@[A-Za-z0-9]+'            # '@' followed by ASCII letters/digits
pat2 = r'https?://[A-Za-z0-9./]+'  # http / https URLs
pat3 = r'([@#][A-Za-z0-9]+)'       # '@' or '#' followed by ASCII letters/digits
# Any single character followed by a run of ASCII letters, digits, '.' or '/'.
# NOTE(review): the leading '.' is unescaped, so the character in front of the
# ASCII run is removed as well — confirm this is intended.
pat4 = r'.[A-Za-z0-9./]+'
# Punctuation marks and a few individual symbols / invisible joiners.
pat5 = r'[\,۔،۔”–’‘‘_!…।-]|(")|(:)|(%)|(ः)|(\u200d)|(\xa0…)|(\u200c\u200c)'

# One alternation of all fragments, compiled once and reused for every frame.
combined_pat = '|'.join([pat1, pat2, pat3, pat4, pat5])
cleaner = re.compile(combined_pat)

# Replace every match with a single space in the 'text' column of each frame.
for frame in (df, validation_df, df_pos, df_neg):
    frame['text'] = [cleaner.sub(' ', tweet) for tweet in frame['text']]

In [8]:
# 10-fold splitter with default settings; the repr printed below shows that
# shuffling is off, so folds are contiguous blocks of row positions.
kf = KFold(n_splits=10)
print(kf) 

KFold(n_splits=10, random_state=None, shuffle=False)
KFold(n_splits=10, random_state=None, shuffle=False)


# Step 6: Define Necessary functions

In [9]:
def get_train_test(df_pos,df_neg,train_index,test_index):
    """Assemble the train/test texts and labels for one cross-validation fold.

    The same row positions are taken from both class frames; in every returned
    Series the rows from ``df_pos`` come first, followed by those from
    ``df_neg``, and the index is renumbered from 0.

    ``pd.concat`` is used instead of ``Series.append``, which has been removed
    from pandas. Rows are selected with ``.iloc`` because the fold indices
    produced by ``KFold.split`` are row positions, not index labels.

    Parameters
    ----------
    df_pos, df_neg : pandas.DataFrame
        Per-class frames; only their 'text' and 'target' columns are read.
    train_index, test_index : iterable of int
        Row positions selecting the training and test rows of the fold.

    Returns
    -------
    tuple of pandas.Series
        ``(X_train, y_train, X_test, y_test)``.
    """
    def _stack(column, positions):
        # Same positions from both classes, positives first, fresh 0..n-1 index.
        rows = list(positions)
        return pd.concat(
            [df_pos[column].iloc[rows], df_neg[column].iloc[rows]],
            ignore_index=True,
        )

    X_train = _stack('text', train_index)
    y_train = _stack('target', train_index)
    X_test = _stack('text', test_index)
    y_test = _stack('target', test_index)

    return (X_train, y_train, X_test, y_test)

In [10]:
class Metrics(Callback):
    """Keras callback that records validation F1, recall and precision per epoch."""

    def on_train_begin(self, logs={}):
        """Start training with empty score histories."""
        self.val_f1s, self.val_recalls, self.val_precisions = [], [], []

    def on_epoch_end(self, epoch, logs={}):
        """Score the model on its validation data and store the three metrics."""
        # NOTE(review): the validation data is read from the model object; some
        # Keras releases expose it as `self.validation_data` on the callback
        # instead — verify against the installed version.
        val_inputs = self.model.validation_data[0]
        val_targ = self.model.validation_data[1]
        # Round the sigmoid outputs to hard 0/1 predictions.
        val_predict = np.asarray(self.model.predict(val_inputs)).round()
        self.val_f1s.append(f1_score(val_targ, val_predict))
        self.val_recalls.append(recall_score(val_targ, val_predict))
        self.val_precisions.append(precision_score(val_targ, val_predict))


# Shared callback instance (the training cell shown below does not pass it to fit).
metrics = Metrics()

# Step 7: Constructing CNN-LSTM Model

In [14]:
# Set-up for the cross-validated run.
size = 300     # width of each embedding-matrix row (rebinds `size` from Step 4)
counter = 0    # fold number, incremented at the top of every fold

# Only the first 600 rows of each class take part in cross-validation.
df_pos, df_neg = df_pos[0:600], df_neg[0:600]

# Test-split accuracy per fold: best epoch, and after the final epoch.
best_test_accuracy, after_best_test_accuracy = [], []

# Per-fold results on the fold's test split.
predictedPositive, predictedNegative = [], []
accuracy, precision, recall, roc = [], [], [], []

# Per-fold results on the fixed validation frame.
predictedPositive_V, predictedNegative_V = [], []
accuracy_V, precision_V, recall_V, roc_V = [], [], [], []
# Train and evaluate one freshly built CNN-LSTM per fold. The fold indices are
# derived from df_pos and applied to both class frames inside get_train_test.
for train_index, test_index in kf.split(df_pos):
    counter = counter +1
    print("=========================================\n")
    print("Fold "+str(counter)+"\n=========================================\n")

    # Fold data: texts and labels for the train and the test split.
    x_train,y_train,x_test,y_test = get_train_test(df_pos,df_neg,train_index,test_index)
    
    # The validation frame is the same in every fold.
    x_validation = validation_df['text']
    y_validation = validation_df['target']
      
    # The vocabulary is fitted on this fold's training texts only.
    tokenizer = Tokenizer(num_words=100000)
    tokenizer.fit_on_texts(x_train)
    sequences = tokenizer.texts_to_sequences(x_train)

    # Sequence length = longest training text (counted in whitespace-separated
    # tokens) plus a margin of 10.
    length = []
    for x in x_train:
        length.append(len(x.split()))
    ml = max(length)+10
    
    x_train_seq = pad_sequences(sequences, maxlen=ml)
    
    # Validation and test texts go through the tokenizer fitted above.
    sequences_val = tokenizer.texts_to_sequences(x_validation)
    x_val_seq = pad_sequences(sequences_val, maxlen=ml)
    
    sequences_test = tokenizer.texts_to_sequences(x_test)
    x_test_seq = pad_sequences(sequences_test, maxlen=ml)
    
    # Embedding matrix: row i holds the vector of the word with tokenizer index
    # i; words missing from embeddings_index keep an all-zero row.
    num_words = 100000
    embedding_matrix = np.zeros((num_words, size))
    for word, i in tokenizer.word_index.items():
        if i >= num_words:
            continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
    
    
    # Embedding layer initialised from the matrix above and left trainable.
    tweet_encoder = Embedding(100000, size, weights=[embedding_matrix], input_length=ml, trainable=True)
    
    # Model: embedding -> Conv1D -> max-pooling -> dropout -> LSTM -> dropout
    # -> single sigmoid unit, trained with binary cross-entropy.
    lstm_out = 300
    model = Sequential()
    model.add(tweet_encoder)
    model.add(Conv1D(filters=32, kernel_size=4, activation='relu', padding='causal'))
    model.add(MaxPooling1D(pool_size=2))
    model.add(Dropout(0.5))
    model.add(LSTM(units=lstm_out))
    model.add(Dropout(0.5))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    # File-name template for the checkpoint callback created further down.
    filepath="CNN_best_weights.{epoch:02d}-{val_acc:.4f}.hdf5"
    
    # Per-epoch accuracy / loss on this fold's test split, filled by TestCallback.
    test_accuracy = []
    test_loss = []
    class TestCallback(Callback):
        """Evaluate the model on the given (x, y) pair after every epoch."""
        # NOTE(review): __init__ does not call the base-class initialiser —
        # confirm the installed Keras version does not require it.
        def __init__(self, test_data):
            self.test_data = test_data

        def on_epoch_end(self, epoch, logs={}):
            x, y = self.test_data
            loss, acc = self.model.evaluate(x, y, verbose=0)
            # Results are appended to the lists of the enclosing fold.
            test_accuracy.append(acc)
            test_loss.append(loss)
    # `checkpoint` and `callbacks_list` are created but not passed to model.fit
    # below, so only TestCallback runs during training.
    checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max')
    callbacks_list = [checkpoint]
    model.fit(x_train_seq, y_train, batch_size=32, epochs=5,
                         validation_data=(x_val_seq, y_validation), callbacks=[TestCallback((x_test_seq, y_test))])
    # Test-split loss / accuracy after the final epoch.
    loss, acc = model.evaluate(x_test_seq, y_test, verbose=0)
    # ---- Metrics on the fixed validation frame ----
    # Predictions are rounded to hard labels first, so the ROC AUC below is
    # computed from 0/1 predictions rather than from the raw sigmoid outputs.
    predictions = (np.asarray(model.predict(x_val_seq))).round()
    roc_V.append(roc_auc_score(y_validation, predictions))
    accuracy_V.append(accuracy_score(y_validation, predictions, normalize=True))
    precision_V.append(precision_score(y_validation, predictions))
    recall_V.append(recall_score(y_validation, predictions))
    result = pd.DataFrame.from_items([("Actual",y_validation),("Prediction",predictions.tolist())])
    print("Validation Confustion Matrix")
    cm = confusion_matrix(y_validation, predictions)
    print(cm)
    # pp / nn: rows whose prediction equals the label, for label 0 and label 1.
    pp = [result['Actual'][x] for x in range(0,len(result)) if ((result['Actual'][x] ==result['Prediction'][x]) and (result['Actual'][x]==0))]
    nn = [result['Actual'][x] for x in range(0,len(result)) if ((result['Actual'][x] ==result['Prediction'][x]) and (result['Actual'][x]==1))]
    predictedPositive_V.append(len(pp))
    predictedNegative_V.append(len(nn))
    # ---- The same metrics on this fold's test split ----
    predictions = (np.asarray(model.predict(x_test_seq))).round()
    roc.append(roc_auc_score(y_test, predictions))
    accuracy.append(accuracy_score(y_test, predictions, normalize=True))
    precision.append(precision_score(y_test, predictions))
    recall.append(recall_score(y_test, predictions))
    result = pd.DataFrame.from_items([("Actual",y_test),("Prediction",predictions.tolist())])
    print("Test Confustion Matrix")
    cm = confusion_matrix(y_test, predictions)
    print(cm)
    pp = [result['Actual'][x] for x in range(0,len(result)) if ((result['Actual'][x] ==result['Prediction'][x]) and (result['Actual'][x]==0))]
    nn = [result['Actual'][x] for x in range(0,len(result)) if ((result['Actual'][x] ==result['Prediction'][x]) and (result['Actual'][x]==1))]
    predictedPositive.append(len(pp))
    predictedNegative.append(len(nn))
    # Keep the final-epoch test accuracy and the best per-epoch test accuracy.
    after_best_test_accuracy.append(acc)
    best_test_accuracy.append(max(test_accuracy))    
    

# Disabled validation summary: the code below is wrapped in a string literal,
# so it is parsed but never executed and prints nothing.
'''print("##################################################################")
print("Validation Results")
print("##################################################################")
a = sum(accuracy_V)/10
p = sum(precision_V)/10
r = sum(recall_V)/10
ro = sum(roc_V)/10
print("Accuracy: ",a,"\tPrecision: ",p,"\tRecall: ",r,"\tROC: ",ro)
TP = sum(predictedPositive_V)/10
FP = 100-TP
TN = sum(predictedNegative_V)/10
FN = 100-TN
print("Actual Positive: ",len(df_pos),"\tPredictedPositive(TP): ",TP,"\tFP: ",FP)
print("Actual Negative: ",len(df_neg),"\tPredictedNegative(TN): ",TN,"\tFN: ",FN)
F_measure = (2 * p * r)/(p + r)
print("F-measure: "+str(F_measure))'''
# End of the disabled validation summary.
# Summary of the test-split results over all folds.
print("##################################################################")
print("Test Results")
print("##################################################################")
# Average over the number of folds that were actually evaluated instead of a
# hard-coded 10, so the means stay correct if n_splits is changed.
n_folds = len(accuracy)
a = sum(accuracy)/n_folds
p = sum(precision)/n_folds
r = sum(recall)/n_folds
ro = sum(roc)/n_folds
print("Accuracy: ",a,"\tPrecision: ",p,"\tRecall: ",r,"\tROC: ",ro)
# Correctly classified rows of class 0 / class 1, summed over all folds.
# NOTE(review): the value printed as "FP" is the number of class-0 rows that
# were misclassified, and "FN" the number of misclassified class-1 rows —
# confirm these labels are the intended ones.
TP = sum(predictedPositive)
FP = (len(df_pos)-sum(predictedPositive))
TN = sum(predictedNegative)
FN = (len(df_neg)-sum(predictedNegative))
print("Actual Positive: ",len(df_pos),"\tPredictedPositive(TP): ",TP,"\tFP: ",FP)
print("Actual Negative: ",len(df_neg),"\tPredictedNegative(TN): ",TN,"\tFN: ",FN)
# F-measure from the mean precision and recall; report 0.0 instead of raising
# ZeroDivisionError when both means are zero.
F_measure = (2 * p * r)/(p + r) if (p + r) else 0.0
print("F-measure: "+str(F_measure))


Fold 1


Fold 1

Train on 1080 samples, validate on 200 samples
Epoch 1/5
Train on 1080 samples, validate on 200 samples
Epoch 1/5
Epoch 2/5
Epoch 2/5
Epoch 3/5
Epoch 3/5
Epoch 4/5
Epoch 4/5
Epoch 5/5
Epoch 5/5
Validation Confustion Matrix
[[ 97   3]
 [  0 100]]
Test Confustion Matrix
[[30 30]
 [ 9 51]]

Fold 2

Validation Confustion Matrix
[[ 97   3]
 [  0 100]]
Test Confustion Matrix
[[30 30]
 [ 9 51]]

Fold 2

Train on 1080 samples, validate on 200 samples
Epoch 1/5
Train on 1080 samples, validate on 200 samples
Epoch 1/5
Epoch 2/5
Epoch 2/5
Epoch 3/5
Epoch 3/5
Epoch 4/5
Epoch 4/5
Epoch 5/5
Epoch 5/5
Validation Confustion Matrix
[[ 99   1]
 [  0 100]]
Test Confustion Matrix
[[36 24]
 [21 39]]

Fold 3

Validation Confustion Matrix
[[ 99   1]
 [  0 100]]
Test Confustion Matrix
[[36 24]
 [21 39]]

Fold 3

Train on 1080 samples, validate on 200 samples
Epoch 1/5
Train on 1080 samples, validate on 200 samples
Epoch 1/5
Epoch 2/5
Epoch 2/5
Epoch 3/5
Epoch 3/5
Epoch 4/5
Epoch 4/5
Epoch 5/

Epoch 2/5
Epoch 2/5
Epoch 3/5
Epoch 3/5
Epoch 4/5
Epoch 4/5
Epoch 5/5
Epoch 5/5
Validation Confustion Matrix
[[ 95   5]
 [  0 100]]
Test Confustion Matrix
[[39 21]
 [22 38]]

Fold 6

Validation Confustion Matrix
[[ 95   5]
 [  0 100]]
Test Confustion Matrix
[[39 21]
 [22 38]]

Fold 6

Train on 1080 samples, validate on 200 samples
Epoch 1/5
Train on 1080 samples, validate on 200 samples
Epoch 1/5
Epoch 2/5
Epoch 2/5
Epoch 3/5
Epoch 3/5
Epoch 4/5
Epoch 4/5
Epoch 5/5
Epoch 5/5
Validation Confustion Matrix
[[99  1]
 [ 2 98]]
Test Confustion Matrix
[[53  7]
 [22 38]]

Fold 7

Validation Confustion Matrix
[[99  1]
 [ 2 98]]
Test Confustion Matrix
[[53  7]
 [22 38]]

Fold 7

Train on 1080 samples, validate on 200 samples
Epoch 1/5
Train on 1080 samples, validate on 200 samples
Epoch 1/5
Epoch 2/5
Epoch 2/5
Epoch 3/5
Epoch 3/5
Epoch 4/5
Epoch 4/5
Epoch 5/5
Epoch 5/5
Validation Confustion Matrix
[[ 98   2]
 [  0 100]]
Test Confustion Matrix
[[45 15]
 [22 38]]

Fold 8

Validation Confustion Mat

Epoch 4/5
Epoch 4/5
Epoch 5/5
Epoch 5/5
Validation Confustion Matrix
[[82 18]
 [13 87]]
Test Confustion Matrix
[[34 26]
 [16 44]]

Fold 10

Validation Confustion Matrix
[[82 18]
 [13 87]]
Test Confustion Matrix
[[34 26]
 [16 44]]

Fold 10

Train on 1080 samples, validate on 200 samples
Epoch 1/5
Train on 1080 samples, validate on 200 samples
Epoch 1/5
Epoch 2/5
Epoch 2/5
Epoch 3/5
Epoch 3/5
Epoch 4/5
Epoch 4/5
Epoch 5/5
Epoch 5/5
Validation Confustion Matrix
[[65 35]
 [10 90]]
Test Confustion Matrix
[[25 35]
 [10 50]]
##################################################################
Test Results
##################################################################
Accuracy:  0.6799999999999999 	Precision:  0.6787595631869413 	Recall:  0.7150000000000001 	ROC:  0.6799999999999999
Actual Positive:  600 	PredictedPositive(TP):  387 	FP:  213
Actual Negative:  600 	PredictedNegative(TN):  429 	FN:  171
F-measure: 0.696408620966096
Validation Confustion Matrix
[[65 35]
 [10 90]]
Test Confusti

# Result of CNN-LSTM + Trained Word2Vec

# Result of CNN-LSTM + Pretrained word2vec