In [1]:
!wget http://nlp.stanford.edu/data/glove.6B.zip

--2021-05-14 10:19:31--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2021-05-14 10:19:31--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2021-05-14 10:19:31--  http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


2021-0

In [2]:
!unzip -q glove.6B.zip

In [3]:
import pandas as pd
import numpy as np

# text preprocessing
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import re

# plots and metrics
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix

# preparing input to our model
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

# keras layers
from keras.models import Sequential
from keras.layers import Embedding, Bidirectional, LSTM, GRU, Dense, Flatten, Conv1D, GlobalMaxPooling1D, MaxPooling1D
from keras.layers.normalization import BatchNormalization

In [4]:
import nltk
# Download NLTK's "punkt" tokenizer data into the local nltk_data directory.
# NOTE(review): presumably needed by the word_tokenize import in the imports cell,
# which is not called in any visible cell - confirm it is still required.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [5]:
# Colab-only step: upload the dataset CSV from the local machine into the runtime.
# `upload` holds whatever files.upload() returns for the selected file(s).
from google.colab import files
upload = files.upload()

Saving preprocessed8k.csv to preprocessed8k.csv


In [6]:
# Sentiment labels and how many of them there are
class_names = ['Negative', 'Neutral', 'Positive']
num_classes = len(class_names)

# Every tokenised text is padded/truncated to this many words
max_seq_len = 47

# Length of each word-embedding vector
embed_num_dims = 300

In [7]:
# Load the preprocessed dataset, force the raw `text` column to str, then drop
# every row that still contains a missing value.
# NOTE(review): the upload cell's recorded output names the file
# "preprocessed8k.csv", while this cell reads "preprocessed_data_file.csv" -
# confirm the intended file is present on a fresh runtime.
total_data = (
    pd.read_csv("preprocessed_data_file.csv", encoding="ISO-8859-1")
    .assign(text=lambda frame: frame['text'].apply(str))
    .dropna()
)
total_data.head()

Unnamed: 0,srno,text,polarity,subjectivity,final_text
0,0,one funny thing will say that steven has made ...,0.0,0.0,funny thing will say that has made lot money p...
1,1,if we are trading lonzo ball steven we need tr...,0.0,0.3,if we are trading we need or nothing less
2,2,they had everything lonzo bi zion future big t...,-0.3,0.6,they had everything future big they went signe...
3,3,no cap lonzo amp steven would be massive upgra...,0.2,0.7,no cap lonzo amp steven would be massive upgra...
4,4,legion hoops scammers stealing my trades but q...,-0.2,0.3,legion hoops scammers stealing my trades but q...


In [8]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split repeatable
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    total_data['final_text'],
    total_data['polarity'],
    test_size=0.20,
    random_state=42,
)

In [9]:
# Build the word -> integer index and encode both splits as integer sequences

tokenizer = Tokenizer()
# the vocabulary is fitted on the complete corpus (train and test rows alike)
tokenizer.fit_on_texts(total_data['final_text'])

sequence_train, sequence_test = (
    tokenizer.texts_to_sequences(split) for split in (X_train, X_test)
)

index_of_words = tokenizer.word_index

# vocab size is the number of unique words plus the reserved 0 index used for padding
vocab_size = 1 + len(index_of_words)

print('Number of unique words: {}'.format(vocab_size - 1))

Number of unique words: 7967


In [27]:
import pickle
# Persist the fitted tokenizer to 'polarity_tokenizer.pickle' so the same
# word -> index mapping can be reused outside this notebook.
# NOTE: pickle files can execute arbitrary code when loaded; only unpickle this
# file from a trusted location.
with open('polarity_tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [10]:
## For the full data set: encode every row of final_text and pad it to
## max_seq_len; data_pad is the input later passed to model.predict.
seq = tokenizer.texts_to_sequences(total_data['final_text'])
# No padding= argument is given, so the default applies: zeros are added at the
# front of each sequence (visible in the displayed array).
data_pad = pad_sequences(seq, maxlen = max_seq_len )
data_pad

array([[   0,    0,    0, ...,  104,  351,   65],
       [   0,    0,    0, ...,   30,  185,  522],
       [   0,    0,    0, ...,   41,  882, 4270],
       ...,
       [   0,    0,    0, ...,   14,   18,  200],
       [   0,    0,    0, ...,   21,   39,  727],
       [   0,    0,    0, ...,  104,  351,   65]], dtype=int32)

In [11]:
# No `padding=` argument is passed, so Keras' default 'pre' padding is used:
# zeros are added in front of each sequence (this is NOT post padding).
X_train_pad = pad_sequences(sequence_train, maxlen = max_seq_len ) ## pre padding (Keras default)
X_test_pad = pad_sequences(sequence_test, maxlen = max_seq_len )

In [12]:
#Function to create an embedding matrix which will contain each word and its respective vector representation
def create_embedding_matrix(filepath, word_index, embedding_dim):
    """Build an embedding matrix from a whitespace-separated word-vector file.

    Each line of the file is expected to hold a word followed by its vector
    components. Row ``word_index[word]`` of the result receives the first
    ``embedding_dim`` components of that word's vector; rows for words that
    never appear in the file (and the reserved row 0) stay all zeros.

    Args:
        filepath: Path to the vector file (e.g. a GloVe ``.txt`` file).
        word_index: Mapping of word -> integer row index (indices start at 1).
        embedding_dim: Number of vector components to keep per word; the file's
            vectors are expected to have at least this many components.

    Returns:
        A ``(len(word_index) + 1, embedding_dim)`` numpy array.
    """
    vocab_size = len(word_index) + 1  # Adding again 1 because of reserved 0 index
    embedding_matrix = np.zeros((vocab_size, embedding_dim))
    # Read explicitly as UTF-8 so non-ASCII tokens decode the same way on every
    # platform instead of depending on the locale's default encoding.
    with open(filepath, encoding="utf-8") as f:
        for line in f:
            fields = line.split()
            if not fields:
                # Blank line (e.g. a stray trailing newline): nothing to parse.
                continue
            word, vector = fields[0], fields[1:]
            if word in word_index:
                idx = word_index[word]
                embedding_matrix[idx] = np.array(
                    vector, dtype=np.float32)[:embedding_dim]
    return embedding_matrix

In [13]:
# GloVe file with vector dimension of 300; the name must match embed_num_dims.
# Expected to come from the glove.6B.zip archive unzipped earlier.
fname = 'glove.6B.300d.txt'

In [15]:
# Create the embedding matrix: one row per tokenizer index (plus the reserved
# row 0), each row holding that word's GloVe vector or zeros if it has none.
embedd_matrix = create_embedding_matrix(fname, index_of_words, embed_num_dims)
# Display the shape as a sanity check: (vocab_size, embed_num_dims).
embedd_matrix.shape

(7968, 300)

In [16]:
# Count vocabulary words that received no pre-trained vector, i.e. whose row in
# the embedding matrix is still entirely zero
new_words = 0

for word, row_idx in index_of_words.items():
    if not embedd_matrix[row_idx].any():
        new_words += 1

print('Words found in wiki vocab: ' + str(len(index_of_words) - new_words))
print('New words found: ' + str(new_words))

Words found in wiki vocab: 7330
New words found: 637


In [18]:
# Frozen embedding layer, initialised from the GloVe matrix, that feeds the BLSTM
embedd_layer = Embedding(
    input_dim=vocab_size,
    output_dim=embed_num_dims,
    weights=[embedd_matrix],
    input_length=max_seq_len,
    trainable=False,
)

In [20]:
# Model architecture: embedding -> two stacked recurrent layers -> one tanh unit
# Parameters
lstm_output_size = 128
bidirectional = True

model = Sequential()
model.add(embedd_layer)

if bidirectional:
    # Two stacked BiLSTMs; the first returns the full sequence so the second
    # can consume it.
    model.add(Bidirectional(LSTM(units=lstm_output_size, return_sequences=True)))
    model.add(Bidirectional(LSTM(units=lstm_output_size)))
else:
    # Plain (unidirectional) variant with dropout and a smaller second layer.
    model.add(LSTM(units=lstm_output_size, return_sequences=True,
                   dropout=0.2, recurrent_dropout=0.2))
    model.add(LSTM(units=32, dropout=0.2, recurrent_dropout=0.2))

# A single tanh unit is used as the output head; the softmax head over
# num_classes is disabled:
#model.add(Dense(num_classes, activation='softmax'))
model.add(Dense(1, activation='tanh'))

In [21]:
# Model compilation: mean squared error loss with the Adam optimizer.
# NOTE(review): 'accuracy' is a classification metric, while this model has a
# single tanh output trained with MSE on the `polarity` column - the reported
# accuracy is unlikely to be meaningful; consider a regression metric such as
# 'mae' after checking that nothing downstream reads the 'accuracy' history.
model.compile(loss = 'mean_squared_error', optimizer = 'adam', metrics = ['accuracy'])
# Print the layer-by-layer summary with parameter counts.
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 47, 300)           2390400   
_________________________________________________________________
bidirectional_2 (Bidirection (None, 47, 256)           439296    
_________________________________________________________________
bidirectional_3 (Bidirection (None, 256)               394240    
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 257       
Total params: 3,224,193
Trainable params: 833,793
Non-trainable params: 2,390,400
_________________________________________________________________


In [None]:
# Fit the model; the held-out test split doubles as the validation set
epochs = 20
batch_size = 1000

hist = model.fit(
    x=X_train_pad,
    y=y_train,
    validation_data=(X_test_pad, y_test),
    epochs=epochs,
    batch_size=batch_size,
)

In [23]:
# Predictions for the complete padded data set (data_pad covers every row of
# total_data, i.e. training and test rows together).
predictions = model.predict(data_pad)

In [25]:
# Save the trained model to 'polarity_model.h5'.
# load_model is imported here but not called in the visible cells.
from keras.models import load_model
model.save('polarity_model.h5')