In [None]:
import numpy as np
import pandas as pd
import codecs
import keras
import h5py
import jax
import jax.numpy as jnp #gonna try use jax since its supposedly faster
import time


# Keras model, layer and preprocessing libraries
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import LSTM
from keras.layers.embeddings import Embedding
from keras.layers import Bidirectional
from keras.layers import Dropout
from keras.models import Sequential
from keras.models import model_from_json
from keras.models import load_model
from keras.preprocessing import sequence

# NLTK imports
from nltk.tokenize import RegexpTokenizer

In [None]:
# Colab only: flush and unmount Google Drive if it is currently mounted.
# When no drive is mounted the call just reports that there is nothing to do.
from google.colab import drive
drive.flush_and_unmount()

Drive not mounted, so nothing to flush and unmount.


In [None]:
from google.colab import drive

# Mount Google Drive at /content/gdrive; the data, weight and output paths
# used by the later cells all start with this mount point.
drive.mount("/content/gdrive")

Mounted at /content/gdrive


In [None]:
import os

# Record the kernel's working directory at the time this cell runs.
# NOTE(review): `cwd` is not referenced by any other cell visible here.
cwd = os.getcwd()  

In [None]:
# Path to the data: a text file with one record per line, made of a token
# followed by its embedding components, all separated by whitespace.
gloveFile = '/content/gdrive/MyDrive/CS4225/ML team/pre-processed data/stanfordSentimentTreebank/glove.6B.100d.txt'

# word -> embedding vector (float32), kept in first-seen order
word_embedding_dict = {}
# word -> position of that word in word_embedding_dict (0-based)
word_index = {}

with codecs.open(gloveFile, 'r', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        # A usable record needs a token plus at least one component. Blank or
        # truncated lines would otherwise raise IndexError on values[0] or
        # store an empty vector.
        if len(values) < 2:
            continue
        # The word that is to be the key
        word = values[0]
        # Assign the position only the first time a word is seen. Re-assigning
        # it for a repeated word would hand out a number that the next new
        # word also receives, because the dict does not grow on a repeat.
        if word not in word_embedding_dict:
            word_index[word] = len(word_embedding_dict)
        # The vector that represents the word (a repeated word keeps its
        # position and takes the latest vector)
        word_embedding_dict[word] = np.asarray(values[1:], "float32")

In [None]:
# Build the (vocabulary size x embedding width) matrix that seeds the
# Embedding layer. Row i holds the vector of the i-th word inserted into
# word_embedding_dict, which is the same numbering word_index uses.

# Read the width from the loaded vectors rather than hard-coding it; fall back
# to 100 (the previous constant) when no vectors were loaded.
embedding_dim = next((len(vector) for vector in word_embedding_dict.values()), 100)
word_embedding_matrix = np.zeros((len(word_embedding_dict), embedding_dim))

# enumerate() cannot run past the matrix because the matrix was sized from the
# same dict, so no manual counter or bounds guard is needed.
for i, word_embedding_vector in enumerate(word_embedding_dict.values()):
    word_embedding_matrix[i] = word_embedding_vector

In [None]:
# Developing the LSTM Model
# Vocabulary size and vector width are read from the embedding matrix so the
# layer always agrees with the vectors that were actually loaded.
vocab_size, embedding_dim = word_embedding_matrix.shape

model = Sequential()
# Input_length of 280 is chosen as 280 characters are allowed in twitter.
# The embedding is frozen (trainable = False) and seeded with the pre-trained vectors.
model.add(Embedding(vocab_size, embedding_dim, weights = [word_embedding_matrix], input_length = 280, trainable = False))
model.add(Bidirectional(LSTM(128, dropout = 0.2, recurrent_dropout = 0.2)))
model.add(Dense(512, activation = 'relu'))
model.add(Dropout(0.50))
# Three output classes with a softmax over them
model.add(Dense(3, activation = 'softmax'))

# Adam's optimiser
model.compile(loss = 'categorical_crossentropy', optimizer = 'adam', metrics = ['accuracy'])

# summary() prints the table itself and returns None, so wrapping it in
# print() only appended a stray "None" line.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 280, 100)          40000000  
                                                                 
 bidirectional (Bidirectiona  (None, 256)              234496    
 l)                                                              
                                                                 
 dense (Dense)               (None, 512)               131584    
                                                                 
 dropout (Dropout)           (None, 512)               0         
                                                                 
 dense_1 (Dense)             (None, 3)                 1539      
                                                                 
Total params: 40,367,619
Trainable params: 367,619
Non-trainable params: 40,000,000
______________________________________

In [None]:
def get_sentiment_sentence(data, word_index, max_len = 280, batch_size = 256):
    """Score every row of ``data['text']`` with the module-level ``model``.

    Each text is tokenised with ``RegexpTokenizer(r'\\w+')``, lower-cased and
    mapped to ids through ``word_index``. Words without an id are skipped, and
    at most ``max_len`` ids are kept per row; unused positions stay 0.

    The score of a row is the probability-weighted mean of the (up to) three
    highest-probability class indices, divided by 10 and rounded to 2 decimals.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``text`` column (not required when the frame is empty).
    word_index : dict
        Maps word -> integer id. Keys are lower-cased before the lookup.
    max_len : int, optional
        Number of ids fed to the model per row. Must equal the ``input_length``
        the model was built with (280 in this notebook).
    batch_size : int, optional
        Batch size forwarded to ``model.predict``.

    Returns
    -------
    dict
        ``str(row index label) -> score``. Rows sharing an index label
        overwrite each other; the last one wins.

    Raises
    ------
    KeyError
        If a non-empty ``data`` has no ``text`` column.

    Notes
    -----
    Weights are reloaded from Drive on every call. Missing text (NaN/None) is
    encoded as an empty sentence instead of being scored as the word "nan".
    """
    # Load the RNN model
    model.load_weights("/content/gdrive/MyDrive/CS4225/ML team/pre-processed data/stanfordSentimentTreebank/RNN_Split_Dataset/RNN.h5")
    print("Model weight loaded successfully")

    sentiment_analysis = {}
    if len(data) == 0:
        return sentiment_analysis

    # Convert keys to lowercase
    word_index_lower = {k.lower(): v for k, v in word_index.items()}
    # The tokenizer does not depend on the row, so build it once
    tokenizer = RegexpTokenizer(r'\w+')

    # One zero-initialised row per text. Giving every text its own row stops
    # ids of a longer earlier text from leaking into a shorter later one,
    # which happened when a single buffer was reused without being cleared.
    encoded = np.zeros((len(data), max_len), dtype = 'int32')
    row_labels = []

    for row, (index, sentence) in enumerate(data['text'].items()):
        row_labels.append(str(index))

        # Missing text: leave the row empty rather than tokenising "nan"
        if pd.api.types.is_scalar(sentence) and pd.isna(sentence):
            continue

        position = 0
        for word in tokenizer.tokenize(str(sentence)):
            # Explicit truncation; previously the overflow raised an
            # IndexError that a broad `except` silently discarded
            if position >= max_len:
                break
            token_id = word_index_lower.get(word.lower())
            if token_id is None:
                # Out-of-vocabulary words are skipped and do not use a position
                continue
            encoded[row, position] = token_id
            position += 1

    # Predict all rows in one call instead of one call per row
    scores = model.predict(encoded, batch_size = batch_size, verbose = 0)

    for label, score in zip(row_labels, scores):
        # weighted score of top 3 bands
        top_3_index = np.argsort(score)[-3:]
        top_3_scores = score[top_3_index]
        top_3_weights = top_3_scores / np.sum(top_3_scores)
        sentiment_analysis[label] = np.round(np.dot(top_3_index, top_3_weights) / 10, decimals = 2)

    return sentiment_analysis

In [None]:
import glob
import os

# Path to the processed tweets 
csv_path = "/content/gdrive/MyDrive/CS4225/ML team/Sentiment Analysis Results/lexicon/twitter/kaggle_part_1/sentiments/*.csv"

# Scores per input file: file name (no directory, no extension) -> result dict
outcomes = {}

# Loop through all csvs in the folder. glob returns matches in arbitrary
# order, so sort them to make the run order reproducible.
for file in sorted(glob.glob(csv_path)):
    print("Currently at: " + file)
    t1 = time.perf_counter()

    # Get the sentiments for the csv file
    data = pd.read_csv(file)
    results = get_sentiment_sentence(data, word_index)

    # Key the results by the bare file name. The previous fixed slice [58:-4]
    # did not match the length of this directory prefix, so every key started
    # part-way through the directory path instead of at the file name.
    file_name = os.path.splitext(os.path.basename(file))[0]
    outcomes[file_name] = results

    t2 = time.perf_counter()
    print('time taken to run:',t2-t1)

In [None]:
# Destination of the combined results
path = '/content/gdrive/MyDrive/CS4225/ML team/pre-processed data/stanfordSentimentTreebank/output_kaggle_1.csv'

# One row per scored file: column 0 is the file key, column 1 is that file's
# result dict.
data_items = outcomes.items()
data_list = list(data_items)

final_df = pd.DataFrame(data_list)

# Let pandas open the target itself. A text handle opened without newline=''
# lets the platform translate line endings a second time, and pandas asks for
# newline='' whenever a text file object is passed. 'utf-8-sig' keeps the BOM
# the previous handle wrote.
final_df.to_csv(path, encoding = 'utf-8-sig')