In [None]:
# Download the GloVe 6B archive and unzip it quietly into the working directory.
# NOTE(review): a later cell reads 'glove.6B.300d.txt', presumably extracted from this archive — confirm.
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip -q glove.6B.zip

In [None]:
# Fetch the NLTK 'punkt' resource.
# NOTE(review): word_tokenize is imported in the next cell but is not called in the
# visible cells; presumably this download exists for it — confirm it is still needed.
import nltk
nltk.download('punkt')

In [None]:
import pandas as pd
import numpy as np

# text preprocessing
from nltk.tokenize import word_tokenize
import re

# plots and metrics
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix

# preparing input to our model
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

# keras layers
from keras.models import Sequential
from keras.layers import Embedding, Bidirectional, LSTM, GRU, Dense

In [None]:
# Number of target classes.
# NOTE(review): not referenced by the visible cells — the model below ends in a
# single sigmoid unit rather than a num_classes-wide softmax.
num_classes = 2

# Number of dimensions for word embedding (width of the embedding matrix and
# of the Embedding layer's output)
embed_num_dims = 300

# Max input length (max number of words); used as `maxlen` for pad_sequences
# and as `input_length` for the Embedding layer
max_seq_len = 47

# Human-readable class labels.
# NOTE(review): not referenced by the visible cells.
class_names = ['zero', 'one']

In [None]:
# Load the preprocessed dataset (the CSV is read as ISO-8859-1).
#
# Incomplete rows are dropped BEFORE 'text' is cast to str: casting first would
# turn a missing value into the literal string 'nan', which dropna() can no
# longer detect, so rows without text would silently survive.
total_data = (
    pd.read_csv("preprocessed_data_file.csv", encoding="ISO-8859-1")
    .dropna()
    .assign(text=lambda frame: frame['text'].astype(str))
)
total_data.head()

In [None]:
# Split into train and test sets: 20% of the rows are held out, and the fixed
# random_state makes the split repeatable.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    total_data['final_text'],
    total_data['subjectivity'],
    test_size=0.20,
    random_state=42,
)

In [None]:
# Build the word -> integer-id vocabulary and encode the text as id sequences.
# NOTE(review): the tokenizer is fitted on the full dataset, which includes the
# rows that ended up in the test split.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(total_data['final_text'])

index_of_words = tokenizer.word_index

# vocab size is number of unique words + reserved 0 index for padding
vocab_size = 1 + len(index_of_words)

# Encode each split with the shared vocabulary.
sequence_train, sequence_test = (
    tokenizer.texts_to_sequences(texts) for texts in (X_train, X_test)
)

print('Number of unique words: {}'.format(len(index_of_words)))

In [None]:
import pickle

# Serialize the fitted tokenizer to disk using the highest pickle protocol.
with open('subjectivity_tokenizer.pickle', mode='wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
## For full data
# Encode every row of the dataset (train and test together) and pad/truncate
# each sequence to max_seq_len; `data_pad` is what the prediction cell consumes.
seq = tokenizer.texts_to_sequences(total_data['final_text'])
data_pad = pad_sequences(seq, maxlen = max_seq_len )
data_pad

In [None]:
# Pad (or truncate) every sequence to exactly max_seq_len ids.
# NOTE: no `padding=` argument is passed, so the Keras default applies — per the
# Keras docs that is 'pre' (zeros added at the start), NOT post padding.
X_train_pad = pad_sequences(sequence_train, maxlen = max_seq_len )
X_test_pad = pad_sequences(sequence_test, maxlen = max_seq_len )

In [None]:
# Function to create an embedding matrix which will contain each word and its respective vector representation
def create_embedding_matrix(filepath, word_index, embedding_dim):
    """Build an embedding matrix from a whitespace-separated word-vector file.

    Each non-empty line of the file is expected to look like
    ``<word> <v1> <v2> ... <vn>``.

    Args:
        filepath: Path to the UTF-8 encoded word-vector text file.
        word_index: Mapping from word to its integer row index (indices are
            expected to start at 1; row 0 is left for the padding index).
        embedding_dim: Number of vector components to keep per word.

    Returns:
        A ``(len(word_index) + 1, embedding_dim)`` float array. Rows of words
        that do not occur in the file (and row 0) stay all-zero.

    Raises:
        ValueError: If a vocabulary word's vector in the file has fewer than
            ``embedding_dim`` components, or a component is not a number.
    """
    vocab_size = len(word_index) + 1  # +1 because of the reserved 0 index
    embedding_matrix = np.zeros((vocab_size, embedding_dim))
    # Explicit UTF-8: relying on the platform default encoding breaks on
    # systems whose default is not UTF-8 (the vectors contain non-ASCII words).
    with open(filepath, encoding='utf-8') as f:
        for line in f:
            parts = line.split()
            if not parts:
                # Blank (e.g. trailing) line: nothing to unpack.
                continue
            word, vector = parts[0], parts[1:]
            if word in word_index:
                if len(vector) < embedding_dim:
                    raise ValueError(
                        "Vector for {!r} has {} dimensions, expected at least {}".format(
                            word, len(vector), embedding_dim))
                idx = word_index[word]
                embedding_matrix[idx] = np.array(
                    vector, dtype=np.float32)[:embedding_dim]
    return embedding_matrix

In [None]:
# GloVe file with a vector dimension of 300
fname = 'glove.6B.300d.txt'

In [None]:
# Build the embedding matrix for the tokenizer's vocabulary from the GloVe file
embedd_matrix = create_embedding_matrix(
    filepath=fname,
    word_index=index_of_words,
    embedding_dim=embed_num_dims,
)
embedd_matrix.shape

In [None]:
# Inspect unseen words: a vocabulary word whose embedding row is still entirely
# zero received no vector from the embedding file.
new_words = sum(
    1
    for row_idx in index_of_words.values()
    if not embedd_matrix[row_idx].any()
)

print('Words found in wiki vocab: ' + str(len(index_of_words) - new_words))
print('New words found: ' + str(new_words))

In [None]:
# Embedding layer placed before the actual BLSTM: initialised from the
# pre-built embedding matrix and frozen (trainable=False), so the vectors are
# not updated during training.
embedd_layer = Embedding(
    input_dim=vocab_size,
    output_dim=embed_num_dims,
    weights=[embedd_matrix],
    input_length=max_seq_len,
    trainable=False,
)

In [None]:
# MODEL architecture
# Parameters
lstm_output_size = 128
bidirectional = True

# Embedding layer -> two stacked (Bi)LSTM layers -> single sigmoid unit
model = Sequential()
model.add(embedd_layer)

# The first recurrent layer returns the full sequence so the second one can
# consume it; the second applies input and recurrent dropout and returns only
# its final state. Each layer is wrapped in Bidirectional when the flag is set.
for lstm_kwargs in (
    dict(return_sequences=True),
    dict(dropout=0.2, recurrent_dropout=0.2),
):
    recurrent = LSTM(units=lstm_output_size, **lstm_kwargs)
    model.add(Bidirectional(recurrent) if bidirectional else recurrent)

# One sigmoid output unit (used here instead of a num_classes-wide softmax).
model.add(Dense(1, activation='sigmoid'))

In [None]:
# Model compilation
# The network ends in a single sigmoid unit used as a two-class classifier, so
# binary cross-entropy is the matching loss. Mean squared error is a regression
# loss and yields weak gradients when the sigmoid saturates.
# NOTE(review): assumes the `subjectivity` labels are 0/1 — confirm.
model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])
model.summary()

In [None]:
# Training hyper-parameters
batch_size = 1000
epochs = 20

# Fit on the padded training sequences.
# NOTE(review): the held-out test split doubles as the validation set here.
hist = model.fit(
    x=X_train_pad,
    y=y_train,
    validation_data=(X_test_pad, y_test),
    epochs=epochs,
    batch_size=batch_size,
)

In [None]:
# Predictions for the padded full dataset (`data_pad` covers both the training
# and the test rows, so these are not purely out-of-sample scores).
predictions = model.predict(data_pad)

In [None]:
# Saving the model to 'subjectivity_model.h5'
# NOTE(review): load_model is imported here but not used in this cell.
from keras.models import load_model
model.save('subjectivity_model.h5')