#Word2Vec+BiLSTM

In [None]:
import re
import pickle
import numpy as np
import pandas as pd

import seaborn as sns
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords

from tensorflow.keras import Sequential
from tensorflow.keras.layers import Bidirectional, GlobalMaxPool1D, Dense, LSTM, Conv1D, Embedding

from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping

In [None]:
# Colab-only cell: opens the browser upload dialog. Presumably this is how
# the .jsonl files read by the following cells reach the runtime — confirm.
import json  # NOTE(review): not referenced anywhere in the visible code
from google.colab import files
# The return value is kept but not used elsewhere in the visible code.
uploaded = files.upload()

In [None]:
# Load the training split. lines=True: the file holds one JSON record per line.
df1 = pd.read_json(path_or_buf='train.jsonl', lines=True)
# Preview the first ten records.
df1.head(n=10)

In [None]:
# Load the dev_seen split (JSON Lines, one record per line).
df2 = pd.read_json(path_or_buf="dev_seen.jsonl", lines=True)

# Stack the train and dev_seen rows into one frame. Row labels are carried
# over from each input as-is (no ignore_index), so they may repeat.
frames = [df1, df2]
concated = pd.concat(frames)
concated.head(n=10)

In [None]:
# Keep only the target and the raw text, then order the rows by label.
wanted_columns = ['label', 'text']
concated_dt = concated[wanted_columns].sort_values(by=['label'])

In [None]:
# Bar chart of how many rows carry each label value.
rows_per_label = concated_dt.groupby('label').count()
ax = rows_per_label.plot(
    kind='bar',
    title='Distribution of data',
    legend=False,
)
# Replace the raw label values on the x axis with readable names.
# Note: `ax` is rebound to the return value of set_xticklabels.
ax = ax.set_xticklabels(['Negative', 'Positive'], rotation=0)

In [None]:
# Show the (rows, columns) shape of the label/text frame.
concated_dt.shape

In [None]:
##### CLEANING THE DATA #####

import re
import string
from functools import lru_cache

# Patterns are compiled once here instead of on every call to cleaning().
_URL_RE = re.compile(r'https?://\S+|www\.\S+')
_TAG_RE = re.compile(r'<.*?>+')
_PUNCT_RE = re.compile('[%s]' % re.escape(string.punctuation))
_NEWLINE_RE = re.compile(r'\n')
_QUOTE_RE = re.compile('[’“”…]')
# NOTE(review): the last range (U+24C2..U+1F251) is very wide and also
# covers non-emoji blocks such as CJK ideographs; kept exactly as it was.
_EMOJI_RE = re.compile("["
                       u"\U0001F600-\U0001F64F"  # emoticons
                       u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                       u"\U0001F680-\U0001F6FF"  # transport & map symbols
                       u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                       u"\U00002702-\U000027B0"
                       u"\U000024C2-\U0001F251"
                       "]+", flags=re.UNICODE)


@lru_cache(maxsize=None)
def _stop_word_set(language):
    """Return NLTK's stop words for *language* as a frozenset (None = every language)."""
    # Cached: the word list is read once per language instead of once per
    # text, and set membership replaces a linear scan over a list.
    return frozenset(stopwords.words(language))


def cleaning(text, language='english'):
    """Normalise one raw text string and drop its stop words.

    Steps, in order: lower-case, remove URLs, remove ``<...>`` tags, remove
    ASCII punctuation, turn newlines into spaces, remove typographic
    quotes/ellipsis, remove emoji, tokenise with ``word_tokenize`` and drop
    stop words.

    Args:
        text: The raw string to clean.
        language: NLTK stop-word list to use. Defaults to ``'english'``.
            Pass ``None`` to use the union of every language's list, which
            is what this function did before; that union also removes
            ordinary English words that happen to be stop words in another
            language (presumably e.g. German "die"/"war" — verify against
            the installed corpus).

    Returns:
        The remaining tokens joined by single spaces.
    """
    text = text.lower()
    text = _URL_RE.sub('', text)
    text = _TAG_RE.sub('', text)
    text = _PUNCT_RE.sub('', text)
    # A space (not the empty string) so the words on either side of a line
    # break are not fused into one token.
    text = _NEWLINE_RE.sub(' ', text)
    text = _QUOTE_RE.sub('', text)
    text = _EMOJI_RE.sub(r'', text)

    stop_words = _stop_word_set(language)
    kept_tokens = [word for word in word_tokenize(text) if word not in stop_words]
    return " ".join(kept_tokens)
# Clean every raw text and store the result next to it in a new column.
cleaned_texts = concated_dt['text'].apply(cleaning)
concated_dt['processed_text'] = cleaned_texts
# Print the stored column (not the temporary) so the Series name matches.
print(concated_dt['processed_text'])

In [None]:
# Show the first 11 rows: raw text followed by its cleaned version.
# itertuples() puts the index at position 0, so with the columns
# label / text / processed_text, positions 2 and 3 are the two text fields.
count = 0
for count, row in enumerate(concated_dt.itertuples(), start=1):
    print("Text:", row[2])
    print("Processed:", row[3])
    if count > 10:
        break

In [None]:
# Features are the cleaned strings; targets are the labels.
X_data = np.array(concated_dt['processed_text'])
y_data = np.array(concated_dt['label'])

# Hold out 5% of the rows for testing; random_state pins the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X_data,
    y_data,
    test_size=0.05,
    random_state=0,
)

In [None]:
from gensim.models import Word2Vec

# Length of each word vector. The name (spelling included) is read by
# later cells, so it is kept as-is.
Embedding_dimenstions = 100

# Word2Vec takes lists of tokens: split each cleaned training text on whitespace.
Word2Vec_train_data = [sentence.split() for sentence in X_train]

In [None]:
# Train the Word2Vec model on the tokenised training texts.
import gensim

# gensim 4 renamed the `size` argument to `vector_size` and replaced
# `wv.vocab` with `wv.key_to_index`; the old names raise there. Pick the
# spelling that matches the installed major version so the cell runs on both.
_gensim_major = int(gensim.__version__.split('.')[0])
_size_kwarg = 'vector_size' if _gensim_major >= 4 else 'size'

# min_count=5: words seen fewer than five times get no vector.
word_model = Word2Vec(
    Word2Vec_train_data,
    workers=8,
    min_count=5,
    **{_size_kwarg: Embedding_dimenstions},
)

_w2v_vocab = word_model.wv.key_to_index if _gensim_major >= 4 else word_model.wv.vocab
print("Vocabulary Length:", len(_w2v_vocab))

In [None]:
# Fixed sequence length: used as `maxlen` when padding the token sequences
# and as `input_length` of the Embedding layer.
input_length = 60

In [None]:
# Word-level tokenizer: no character filtering and no lower-casing here
# (the texts were cleaned earlier); unknown words map to "<oov>".
tokenizer = Tokenizer(
    oov_token="<oov>",
    lower=False,
    filters="",
)
# The word index is built from every row (X_data), not only the training split.
tokenizer.fit_on_texts(X_data)

# One more than the number of indexed words — presumably to leave room for
# index 0, which word_index does not assign to any word (TODO confirm).
vocab_length = 1 + len(tokenizer.word_index)
print("Tokenizer vocab length:", vocab_length)

In [None]:
# Turn each text into a list of word indices ...
train_sequences = tokenizer.texts_to_sequences(X_train)
test_sequences = tokenizer.texts_to_sequences(X_test)

# ... and bring every list to exactly `input_length` entries.
X_train = pad_sequences(train_sequences, maxlen=input_length)
X_test = pad_sequences(test_sequences, maxlen=input_length)

print("X_train.shape:", X_train.shape)
print("X_test.shape :", X_test.shape)

In [None]:
# Build the embedding matrix: row i holds the Word2Vec vector of the word
# whose tokenizer index is i. Rows for words without a vector stay all-zero.
embedding_matrix = np.zeros(shape=(vocab_length, Embedding_dimenstions))

word_vectors = word_model.wv
for word, token in tokenizer.word_index.items():
    if word in word_vectors:
        embedding_matrix[token] = word_vectors[word]

print("Embedding Matrix Shape:", embedding_matrix.shape)

In [None]:
#defining the model
def getModel():
    """Assemble the (uncompiled) classifier.

    Layers, in order: an Embedding initialised from `embedding_matrix` and
    not trainable, two stacked bidirectional LSTMs (100 units each,
    dropout 0.3, full sequences returned), global max pooling over the
    sequence, a 16-unit ReLU layer and a single sigmoid output unit.

    Reads the module-level `vocab_length`, `Embedding_dimenstions`,
    `embedding_matrix` and `input_length`.

    Returns:
        A `Sequential` model named "Sentiment_Model".
    """
    model = Sequential(name="Sentiment_Model")

    model.add(Embedding(input_dim=vocab_length,
                        output_dim=Embedding_dimenstions,
                        weights=[embedding_matrix],
                        input_length=input_length,
                        trainable=False))

    # Two identical recurrent blocks, created one after the other.
    for _ in range(2):
        model.add(Bidirectional(LSTM(100, dropout=0.3, return_sequences=True)))

    model.add(GlobalMaxPool1D())
    model.add(Dense(16, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    return model

In [None]:
# Build the network defined by getModel().
training_model = getModel()

In [None]:
# Print the model's summary.
training_model.summary()

In [None]:
# Lower the learning rate once validation loss has not improved for 5 epochs.
lr_on_plateau = ReduceLROnPlateau(monitor='val_loss', patience=5, cooldown=0)
# Stop once validation accuracy has not improved by at least 1e-4 for 5 epochs.
stop_early = EarlyStopping(monitor='val_accuracy', min_delta=1e-4, patience=5)

callbacks = [lr_on_plateau, stop_early]

In [None]:
# Binary classification setup: cross-entropy on the sigmoid output,
# Adam optimiser, accuracy reported during training.
training_model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Train the model. The last 10% of the training arrays is held out for
# validation, which also supplies the val_loss / val_accuracy values the
# callbacks monitor.
history = training_model.fit(
    X_train, y_train,
    batch_size=500,
    epochs=100,
    validation_split=0.1,
    # The `callbacks` list (LR reduction + early stopping) was built in an
    # earlier cell but never handed to fit(), so it had no effect and all
    # 100 epochs always ran.
    callbacks=callbacks,
    verbose=1,
)

In [None]:
# Training vs. validation curves, one figure for accuracy and one for loss.
acc,  val_acc  = history.history['accuracy'], history.history['val_accuracy']
loss, val_loss = history.history['loss'], history.history['val_loss']
# One x position per recorded epoch.
epochs = range(len(acc))

plt.plot(epochs, acc, label='Training acc')
plt.plot(epochs, val_acc, label='Validation acc')
plt.title('Training and validation accuracy')
plt.legend()

# Start a second figure so the loss curves are not drawn over the accuracy ones.
plt.figure()

# Legend label and title previously read 'Trianing loss' and
# 'trainign and validation loss'; spelling fixed to match the accuracy plot.
plt.plot(epochs, loss, label='Training loss')
plt.plot(epochs, val_loss, label='Validation loss')
plt.title('Training and validation loss')
plt.legend()

plt.show()

In [None]:
def ConfusionMatrix(y_pred, y_test):
    """Draw a 2x2 confusion-matrix heatmap for binary predictions.

    Each cell is annotated with its role (True Neg, False Pos, False Neg,
    True Pos) and its share of all samples as a percentage.

    Args:
        y_pred: Predicted labels (0 or 1). Note the argument order:
            predictions come first, ground truth second.
        y_test: True labels (0 or 1).
    """
    # Imported here because nothing else in the notebook brings
    # confusion_matrix into scope (the call used to raise NameError).
    from sklearn.metrics import confusion_matrix

    # labels=[0, 1] pins the matrix to 2x2 even if one class is missing from
    # both arrays; otherwise the reshape(2, 2) below would fail.
    cf_matrix = confusion_matrix(y_test, y_pred, labels=[0, 1])

    categories = ['Negative', 'Positive']
    # Row-major order of a matrix whose rows are actual and columns predicted.
    group_names = ['True Neg', 'False Pos', 'False Neg', 'True Pos']
    shares = cf_matrix.flatten() / np.sum(cf_matrix)
    group_percentages = [f'{share:.2%}' for share in shares]

    labels = [f'{v1}\n{v2}' for v1, v2 in zip(group_names, group_percentages)]
    labels = np.asarray(labels).reshape(2, 2)

    # fmt='' because the annotations are ready-made strings, not numbers.
    sns.heatmap(cf_matrix, annot=labels, cmap='Blues', fmt='',
                xticklabels=categories, yticklabels=categories)

    plt.xlabel("Predicted values", fontdict={'size': 14}, labelpad=10)
    plt.ylabel("Actual values", fontdict={'size': 14}, labelpad=10)
    plt.title("Confusion Matrix", fontdict={'size': 18}, pad=20)

In [None]:
# Score the held-out test set.
predicted_scores = training_model.predict(X_test)

# Threshold the scores at 0.5 to get hard 0/1 labels.
y_pred = (predicted_scores >= 0.5).astype(int)

# Plot the confusion matrix for these predictions.
ConfusionMatrix(y_pred, y_test)

In [None]:
# classification_report is not imported anywhere else in the notebook, so
# this cell used to fail with NameError; import it where it is used.
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 on the test set.
print(classification_report(y_test, y_pred))