<a href="https://colab.research.google.com/github/prajwal467/CE888-Assignment2/blob/main/LSTMmodel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import gensim
import nltk
nltk.download('punkt')
import nltk as nl
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score,f1_score

from tensorflow.python.keras.preprocessing.sequence import pad_sequences
import tensorflow.keras as keras
from tensorflow.python.keras.models import Sequential
import tensorflow.python.keras.layers as layers

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
# GitHub "tree" (folder view) address of the emotion dataset directory.
path = (
    "https://github.com/prajwal467/CE888-Assignment2"
    "/tree/main/datasets/emotion"
)

In [None]:
# Function to map words to their respective values
def prepare_text(tokens, embeddings=None):
    """Replace every token with its numeric value from a lookup table.

    Parameters
    ----------
    tokens : iterable of iterables of str
        One inner sequence of word tokens per sentence.
    embeddings : mapping, optional
        Lookup table from word to value. When omitted, the notebook-level
        ``w2v`` dictionary is used.

    Returns
    -------
    (numpy.ndarray, int)
        An object-dtype array holding one encoded sequence per sentence
        (sentences may differ in length, so the result is not forced into a
        rectangular numeric array), and the length of the longest sentence
        (0 when ``tokens`` is empty).

    Words missing from the lookup table are printed and encoded as ``0``.
    """
    if embeddings is None:
        embeddings = w2v
    text = []
    max_len = 0
    for sentence in tokens:
        line = []
        for word in sentence:
            try:
                line.append(embeddings[word])
            # A failed lookup raises KeyError; the previous `except exception`
            # referenced an undefined name and turned every miss into a NameError.
            except KeyError as e:
                print(e)
                print(word)
                line.append(0)
        max_len = max(max_len, len(line))
        text.append(line)
    # dtype=object: current NumPy refuses to build an array from rows of
    # unequal length unless the object dtype is requested explicitly.
    return np.array(text, dtype=object), max_len

In [None]:
#Reading datasets
# A github.com ".../blob/..." address returns the HTML viewer page rather than
# the file itself, so the text files are fetched from the raw-content host.
TEXT_URL = "https://raw.githubusercontent.com/prajwal467/CE888-Assignment2/main/datasets/emotion/{split}_text.txt"

# header=None keeps the first line as data instead of using it as column names
# (assumes one tweet per line with no header row -- TODO confirm against the files).
# quoting=3 is csv.QUOTE_NONE: quote characters inside a tweet stay literal
# instead of merging several lines into one field.
data_train = pd.read_csv(TEXT_URL.format(split="train"), sep="\t", header=None, names=["text"], quoting=3)
data_test = pd.read_csv(TEXT_URL.format(split="test"), sep="\t", header=None, names=["text"], quoting=3)
data_val = pd.read_csv(TEXT_URL.format(split="val"), sep="\t", header=None, names=["text"], quoting=3)

In [None]:
# Labels are fetched from the raw-content host: the github.com ".../blob/..."
# address returns the HTML viewer page, not the label file.
LABEL_URL = "https://raw.githubusercontent.com/prajwal467/CE888-Assignment2/main/datasets/emotion/{split}_labels.txt"

# header=None keeps the first label as data instead of using it as the column
# name (assumes one label per line with no header row -- TODO confirm).
train_label = pd.read_csv(LABEL_URL.format(split="train"), sep="\t", header=None, names=["label"])
test_label = pd.read_csv(LABEL_URL.format(split="test"), sep="\t", header=None, names=["label"])
val_label = pd.read_csv(LABEL_URL.format(split="val"), sep="\t", header=None, names=["label"])

In [None]:
# Show (rows, columns) of the validation texts next to the validation labels.
tuple(frame.shape for frame in (data_val, val_label))

((2416, 1), (2366, 1))

# Word2Vec

In [None]:
# Splitting sentences into words

# Iterating over a DataFrame yields its column labels, not its rows, so the
# sentences are read from the first column explicitly. astype(str) guards the
# tokenizer against non-string cells (e.g. values parsed as NaN).
train_tokens = [nl.word_tokenize(sentence) for sentence in data_train.iloc[:, 0].astype(str)]
test_tokens = [nl.word_tokenize(sentence) for sentence in data_test.iloc[:, 0].astype(str)]
val_tokens = [nl.word_tokenize(sentence) for sentence in data_val.iloc[:, 0].astype(str)]

In [None]:
#training word2vec model on complete vocabulary (train + test + validation tokens)

# Build the combined corpus once instead of re-concatenating it for every call.
w2v_sentences = train_tokens + test_tokens + val_tokens

# workers must be a positive thread count. With workers=-1 no training threads
# are started and train() returns (0, 0), i.e. the vectors are never updated.
# NOTE(review): `size=` is the gensim 3.x spelling; gensim 4 renamed it to `vector_size`.
model = gensim.models.Word2Vec(size=250, min_count=1, workers=3)
model.build_vocab(w2v_sentences)
model.train(w2v_sentences, total_examples=len(w2v_sentences), epochs=2500)

(0, 0)

In [None]:
# Lookup table word -> single number: each word's embedding row (250 values,
# per the Word2Vec `size` above) is collapsed to its mean.
w2v = {
    word: row_mean
    for word, row_mean in zip(model.wv.index2word, np.mean(model.wv.syn0, axis=1))
}

  


In [None]:
# Converting string tokens to float values and getting max_len for padding
X_train, max_len_train = prepare_text(train_tokens)
X_test, max_len_test = prepare_text(test_tokens)
X_val, max_len_val = prepare_text(val_tokens)

# Pad to the longest sentence across all splits. Reusing one `max_len` name for
# the three calls kept only the validation length, which truncates any longer
# train/test sentence during padding.
max_len = max(max_len_train, max_len_test, max_len_val)

In [None]:
# Bring every split to the same sequence length: float32 values, padding
# appended after the sequence, final length max_len.
X_train, X_test, X_val = (
    pad_sequences(split, dtype='float32', padding='post', maxlen=max_len)
    for split in (X_train, X_test, X_val)
)

In [None]:
# converting labels to one-hot-vector and categorical-vector
oe_enc = OneHotEncoder()

# Learn the label -> column mapping once, from the training labels, and reuse it
# for the other splits. Refitting on each split can change the number or order
# of columns whenever a split does not contain exactly the same set of labels.
Y_train = oe_enc.fit_transform(np.array(train_label).reshape(-1,1)).toarray()
Y_test = oe_enc.transform(np.array(test_label).reshape(-1,1)).toarray()
Y_val = oe_enc.transform(np.array(val_label).reshape(-1,1)).toarray()

# Column index of the hot entry in each row, i.e. the integer class id.
true_label_train = np.argmax(Y_train,axis=1)
true_label_test = np.argmax(Y_test,axis=1)
true_label_val = np.argmax(Y_val,axis=1)

In [None]:
# Remember the 2-D shape of every split: row counts per split, plus the shared
# column count (`features` ends up holding the value taken from X_val).
(n_len_train, features), (n_len_test, features), (n_len_val, features) = (
    X_train.shape,
    X_test.shape,
    X_val.shape,
)

# LSTM

In [None]:
# defining optimizer and callback
# `learning_rate` is the supported argument name; `lr` is a deprecated alias
# that newer Keras releases no longer accept.
optimizer = keras.optimizers.Adam(learning_rate=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-8)
# Stop training once validation accuracy has not improved for 4 epochs.
es = keras.callbacks.EarlyStopping(monitor='val_accuracy', mode='max', verbose=1, patience=4)

In [None]:
# Append a trailing axis of size 1 so each split has shape (rows, features, 1).
X_train, X_test = (
    np.reshape(X_train, (n_len_train, features, 1)),
    np.reshape(X_test, (n_len_test, features, 1)),
)
X_val = np.reshape(X_val, (n_len_val, features, 1))

In [None]:
# Three stacked LSTM layers followed by a softmax layer with one unit per
# distinct training label.
model = Sequential(
    [
        layers.LSTM(16, input_shape=(X_train.shape[1], 1), return_sequences=True),
        layers.LSTM(56, return_sequences=True, dropout=0.5),
        layers.LSTM(128),
        layers.Dense(len(np.unique(true_label_train)), activation="softmax"),
    ],
    name="LSTM",
)

In [None]:
# Attach the optimizer, the categorical cross-entropy loss and the accuracy metric.
model.compile(
    loss="categorical_crossentropy",
    optimizer=optimizer,
    metrics=["accuracy"],
)

In [None]:
# Train for at most 50 epochs in batches of 32. The test split is passed as
# validation data, so it is the split the early-stopping callback reacts to.
model.fit(
    X_train,
    Y_train,
    batch_size=32,
    epochs=50,
    verbose=1,
    validation_data=(X_test, Y_test),
    callbacks=[es],
)

In [None]:
# Micro-averaged F1 on the validation split: the predicted class is the index
# of the largest model output in each row.
print(
    "F1-SCORE",
    f1_score(
        true_label_val,
        np.argmax(model.predict(X_val), axis=1),
        average="micro",
    ),
)