In [1]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
#from tensorflow.keras.layers import Dense,Flatten,Embedding,Activation, Dropout
from tensorflow.keras.layers import Dense, Dropout, Embedding, SpatialDropout1D
from tensorflow.keras.layers import GlobalMaxPooling1D 
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Bidirectional # new! 
from tensorflow.keras.optimizers import Adam

import numpy as np
from numpy import array
import pandas as pd
from imblearn.metrics import geometric_mean_score
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve,accuracy_score
from sklearn.metrics import auc
import matplotlib.pyplot as plt

from sklearn import model_selection,naive_bayes,svm
from sklearn.linear_model import LogisticRegression
import time

In [2]:
df = pd.read_csv('../dataset/golbeck.csv')


In [3]:
#shuffle the dataset
from sklearn.utils import shuffle
df = shuffle(df)

In [4]:
df.dropna(inplace=True)

In [5]:
contractions = { 
"ain't": "am not",
"aren't": "are not",
"can't": "cannot",
"can't've": "cannot have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he would",
"he'd've": "he would have",
"he'll": "he will",
"he'll've": "he will have",
"he's": "he is",
"how'd": "how did",
"how'd'y": "how do you",
"how'll": "how will",
"how's": "how does",
"i'd": "i would",
"i'd've": "i would have",
"i'll": "i will",
"i'll've": "i will have",
"i'm": "i am",
"i've": "i have",
"isn't": "is not",
"it'd": "it would",
"it'd've": "it would have",
"it'll": "it will",
"it'll've": "it will have",
"it's": "it is",
"let's": "let us",
"ma'am": "madam",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"mightn't've": "might not have",
"must've": "must have",
"mustn't": "must not",
"mustn't've": "must not have",
"needn't": "need not",
"needn't've": "need not have",
"o'clock": "of the clock",
"oughtn't": "ought not",
"oughtn't've": "ought not have",
"shan't": "shall not",
"sha'n't": "shall not",
"shan't've": "shall not have",
"she'd": "she would",
"she'd've": "she would have",
"she'll": "she will",
"she'll've": "she will have",
"she's": "she is",
"should've": "should have",
"shouldn't": "should not",
"shouldn't've": "should not have",
"so've": "so have",
"so's": "so is",
"that'd": "that would",
"that'd've": "that would have",
"that's": "that is",
"there'd": "there would",
"there'd've": "there would have",
"there's": "there is",
"they'd": "they would",
"they'd've": "they would have",
"they'll": "they will",
"they'll've": "they will have",
"they're": "they are",
"they've": "they have",
"to've": "to have",
"wasn't": "was not",
" u ": " you ",
" ur ": " your ",
" n ": " and "}

In [6]:
import re

text = ' '.join(df['text'])
text = text.split()
freq_comm = pd.Series(text).value_counts()
rare = freq_comm[freq_comm.values == 1]

def get_clean_text(x):
    if type(x) is str:
        x = x.lower()
        for key in contractions:
            value = contractions[key]
            x = x.replace(key, value)
        x = re.sub(r'([a-zA-Z0-9+._-]+@[a-zA-Z0-9._-]+\.[a-zA-Z0-9_-]+)', '', x) 
        #regex to remove to emails
        x = re.sub(r'(http|ftp|https)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?', '', x)
        #regex to remove URLs
        x = re.sub('RT', "", x)
        #substitute the 'RT' retweet tags with empty spaces
        x = re.sub('[^A-Z a-z]+', '', x)
        #combining all the text excluding rare words.
        x = ' '.join([t for t in x.split() if t not in rare])
        return x
    else:
        return x
    
df['text'] = df['text'].apply(lambda x: get_clean_text(x)) 

In [7]:
text = df['text'].tolist()

In [8]:
y = df['label']

In [9]:
token = Tokenizer()
token.fit_on_texts(text)

In [10]:
vocab_size  = len(token.word_index) + 1
vocab_size

1212

In [11]:
encoded_text = token.texts_to_sequences(text)

In [12]:
max_length = 120
X = pad_sequences(encoded_text, maxlen=max_length, padding='post')

In [13]:
X.shape

(20096, 120)

In [14]:
glove_vectors = dict()

In [15]:
file = open('../dataset/glove.6B.300d.txt', encoding='utf-8')

for line in file:
    values = line.split()
    word = values[0]
    #storing the word in the variable
    vectors = np.asarray(values[1: ])
    #storing the vector representation of the respective word in the dictionary
    glove_vectors[word] = vectors
file.close()

In [16]:
len(glove_vectors)

400000

In [17]:
keys = glove_vectors.keys()


In [18]:
word_vector_matrix = np.zeros((vocab_size, 300))

for word, index in token.word_index.items():
    vector = glove_vectors.get(word)
    if vector is not None:
        word_vector_matrix[index] = vector


In [19]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 14, test_size = 0.2, stratify = y)

In [20]:
start=time.time()
vec_size = 300

model = Sequential()
model.add(Embedding(vocab_size, vec_size, input_length=max_length, weights = [word_vector_matrix], trainable = False))

#model.add(LSTM(128))
model.add(Bidirectional(LSTM(128)))

model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))

#model.add(GlobalMaxPooling1D())

model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer=Adam(learning_rate = 0.0001), loss = 'binary_crossentropy', metrics = ['accuracy'])

model.fit(np.array(X_train), np.array(y_train), epochs = 5)
end=time.time()
print(end-start)

Train on 16076 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
484.11156249046326


In [21]:
y_predict=model.predict(np.array(X_test))

In [22]:
y_hat=np.where(y_predict > 0.5, 1, 0)

In [23]:
geometric_mean_score(y_test,y_hat)

  return f(**kwargs)


0.5968006701106402

In [24]:
y_pred_keras = model.predict(X_test).ravel()
fpr_keras, tpr_keras, thresholds_keras = roc_curve(y_test, y_pred_keras)

In [25]:
auc_model = auc(fpr_keras, tpr_keras)

In [26]:
f1_score(y_test,y_hat)

0.46696035242290745

In [27]:
accuracy_score(y_test,y_hat)

0.9490096923725242

In [28]:
auc_model

0.8825858285812079

In [29]:
y=y_hat.flatten()

In [30]:
dt=pd.read_csv("founta_label.csv")

In [31]:
result=pd.DataFrame({'text':dt['text'],'label':y})

In [32]:
result.to_csv("results_glove_bilstm.csv")