In [1]:
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import pandas as pd
import csv
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import Sequential
from tensorflow.keras.layers import InputLayer, LSTM, Dropout, Dense, Embedding ,Bidirectional, Conv1D, GlobalAveragePooling1D
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [2]:
# Look up the visible GPUs once, report how many there are, and ask
# TensorFlow to grow GPU memory on demand for each of them.
gpus = tf.config.experimental.list_physical_devices('GPU')
print("Num GPUs Available: ", len(gpus))
try:
    # With no GPUs the loop body never runs, so no separate emptiness check
    # is needed.
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
except RuntimeError as err:
    # Best effort: report the problem and carry on with default settings.
    print(err)

Num GPUs Available:  1


In [3]:
# Load the training split: column 1 is used as the text, column 2 as the label.
d = pd.read_csv('C:\\Users\\prtkm\\Jupyter_Notebooks\\senti_analysis\\Datasets\\FinalTrainingOnly.tsv', sep="\t", header=None )

train_texts = list(d[1])
train_labels = list(d[2])

# A text cell holding a Python float (presumably pandas' NaN for an empty
# field) is treated as missing.
is_missing = [type(text) is float for text in train_texts]

# Missing texts become a single space and their label is forced to 1.
# NOTE(review): 1 looks like the 'neutral' class judging by the test-label
# encoding used later in this notebook - confirm this relabelling is intended.
x = [" " if missing else text for text, missing in zip(train_texts, is_missing)]
y = np.array([1 if missing else label for label, missing in zip(train_labels, is_missing)])

In [4]:
# Load the validation split: column 1 is used as the text, column 2 as the label.
v = pd.read_csv('C:\\Users\\prtkm\\Jupyter_Notebooks\\senti_analysis\\Datasets\\ValidationOnly.tsv', sep="\t", header=None )

vx = list(v[1])
vy = list(v[2])

# Replace float-typed text entries (presumably NaN from empty fields) with a
# single space and force the matching label to 1.
# NOTE(review): confirm that overriding the label with 1 is intended.
for pos, text in enumerate(vx):
    if type(text) is float:
        vx[pos] = " "
        vy[pos] = 1

vy = np.array(vy)

In [5]:
# Load the test split; only column 1 (the text) is used here.
t = pd.read_csv('C:\\Users\\prtkm\\Jupyter_Notebooks\\senti_analysis\\Datasets\\FinalTest.tsv', sep="\t", header=None )

# Float-typed entries (presumably NaN from empty fields) are replaced by a
# single space so every item handed to the tokenizer is a string.
tx = [" " if type(text) is float else text for text in t[1]]

In [6]:
# Load the test labels (column 1 holds the sentiment name) and encode them as
# integer class ids: negative -> 0, neutral -> 1, positive -> 2.
ty = pd.read_csv('C:\\Users\\prtkm\\Jupyter_Notebooks\\senti_analysis\\Datasets\\Ty.txt', sep=",", header=None )

LABEL_TO_ID = {'negative': 0, 'neutral': 1, 'positive': 2}

sentiment = []
for row, label in enumerate(ty[1]):
    # An unrecognised label used to be skipped silently, which left the label
    # array shorter than (and misaligned with) the test predictions. Fail
    # here instead, pointing at the offending row.
    if label not in LABEL_TO_ID:
        raise ValueError(
            "Unknown sentiment label " + repr(label) + " at row " + str(row)
            + "; expected one of " + str(sorted(LABEL_TO_ID))
        )
    sentiment.append(LABEL_TO_ID[label])

ty = np.array(sentiment)

In [7]:
# Fit the word-level tokenizer on the training texts only, then turn every
# split into fixed-length integer sequences.
# NOTE(review): the tokenizer is capped at num_words=2500, while the models'
# Embedding layers are sized with vocab_size = 20000 - confirm this mismatch
# is intended.
tokenizer = Tokenizer(num_words=2500,split=' ')
tokenizer.fit_on_texts(x)


def _encode(texts):
    """Map texts to token-id sequences and pad/truncate them to length 50."""
    return pad_sequences(tokenizer.texts_to_sequences(texts), maxlen=50)


X = _encode(x)    # training sequences
VX = _encode(vx)  # validation sequences
TX = _encode(tx)  # test sequences

In [8]:
# Embedding hyperparameters shared by all four models:
#   vocab_size - number of rows in each Embedding table
#   embed_size - dimensionality of each embedding vector
# NOTE(review): the tokenizer above is built with num_words=2500, so
# vocab_size looks larger than necessary - confirm before changing.
vocab_size, embed_size = 20000, 128

In [9]:
# Model no 1: Embedding -> bidirectional LSTM -> two ReLU dense layers ->
# 3-way softmax, trained with sparse categorical cross-entropy.

bimodel = Sequential()
# `input_shape` is documented as a shape tuple (batch axis excluded), so the
# sequence length is wrapped in a 1-tuple instead of being passed as a bare int.
bimodel.add(InputLayer(input_shape=(X.shape[1],)))
bimodel.add(Embedding(vocab_size, embed_size))
bimodel.add(Bidirectional(LSTM(128)))
bimodel.add(Dense(64, activation='relu'))
bimodel.add(Dense(32, activation='relu'))
bimodel.add(Dense(3, activation='softmax'))

bimodel.compile(loss='sparse_categorical_crossentropy', optimizer=tf.keras.optimizers.Adam(), metrics=['accuracy'])

bimodel.summary()

# Train on the training split; the validation split is only used for
# per-epoch monitoring.
history1 = bimodel.fit(X, y, epochs=10, batch_size=128,verbose=1,validation_data=(VX,vy))

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 50, 128)           2560000   
_________________________________________________________________
bidirectional (Bidirectional (None, 256)               263168    
_________________________________________________________________
dense (Dense)                (None, 64)                16448     
_________________________________________________________________
dense_1 (Dense)              (None, 32)                2080      
_________________________________________________________________
dense_2 (Dense)              (None, 3)                 99        
Total params: 2,841,795
Trainable params: 2,841,795
Non-trainable params: 0
_________________________________________________________________
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [10]:
# Score the BiLSTM on the test set: take the most probable class per sample
# and compare it with the encoded test labels.
pred = bimodel.predict(TX).argmax(axis=1)
test_acc = accuracy_score(ty, pred)
print(f"Test accuracy of BiLSTM model = {test_acc}")

Test accuracy of BiLSTM model = 0.586


In [11]:
# Model no 2: Embedding -> one Conv1D (kernel size 3) -> global average
# pooling -> ReLU dense layer -> 3-way softmax.

clmodel = Sequential()
# `input_shape` is documented as a shape tuple (batch axis excluded), so the
# sequence length is wrapped in a 1-tuple instead of being passed as a bare int.
clmodel.add(InputLayer(input_shape=(X.shape[1],)))
clmodel.add(Embedding(vocab_size, embed_size))
# NOTE(review): no `activation` is given, so per the Keras default this
# convolution is purely linear - confirm that is intended.
clmodel.add(Conv1D(64, kernel_size=3))
clmodel.add(GlobalAveragePooling1D())
clmodel.add(Dense(64, activation='relu'))
clmodel.add(Dense(3, activation='softmax'))

clmodel.compile(loss='sparse_categorical_crossentropy', optimizer=tf.keras.optimizers.Adam(), metrics=['accuracy'])

clmodel.summary()

# Train on the training split; the validation split is only used for
# per-epoch monitoring.
history2 = clmodel.fit(X, y, epochs=10, batch_size=128,verbose=1,validation_data=(VX,vy))

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 50, 128)           2560000   
_________________________________________________________________
conv1d (Conv1D)              (None, 48, 64)            24640     
_________________________________________________________________
global_average_pooling1d (Gl (None, 64)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 64)                4160      
_________________________________________________________________
dense_4 (Dense)              (None, 3)                 195       
Total params: 2,588,995
Trainable params: 2,588,995
Non-trainable params: 0
_________________________________________________________________
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

In [12]:
# Score the single-convolution model on the test set using the arg-max class
# of each predicted distribution.
pred = clmodel.predict(TX).argmax(axis=1)
test_acc = accuracy_score(ty, pred)
print(f"Test accuracy of singular CNN model = {test_acc}")

Test accuracy of singular CNN model = 0.618


In [13]:
# Model no 3: Embedding -> unidirectional LSTM (264 units) -> tanh dense
# layer -> dropout -> 3-way softmax.

model = Sequential()
# `input_shape` is documented as a shape tuple (batch axis excluded), so the
# sequence length is wrapped in a 1-tuple instead of being passed as a bare int.
model.add(InputLayer(input_shape=(X.shape[1],)))
model.add(Embedding(vocab_size, embed_size))
model.add(LSTM(units=264, activation='tanh'))
model.add(Dense(units=64,activation='tanh'))
model.add(Dropout(0.5))
model.add(Dense(units=3, activation='softmax'))

model.compile(optimizer=tf.keras.optimizers.Adam(), loss="sparse_categorical_crossentropy", metrics = ['accuracy'])

model.summary()

# This model is trained for 7 epochs (the other three use 10); the
# validation split is only used for per-epoch monitoring.
history3 = model.fit(X, y, epochs=7, batch_size=128,verbose=1,validation_data=(VX,vy))

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 50, 128)           2560000   
_________________________________________________________________
lstm_1 (LSTM)                (None, 264)               415008    
_________________________________________________________________
dense_5 (Dense)              (None, 64)                16960     
_________________________________________________________________
dropout (Dropout)            (None, 64)                0         
_________________________________________________________________
dense_6 (Dense)              (None, 3)                 195       
Total params: 2,992,163
Trainable params: 2,992,163
Non-trainable params: 0
_________________________________________________________________
Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7


In [14]:
# Score the unidirectional LSTM on the test set using the arg-max class of
# each predicted distribution.
pred = model.predict(TX).argmax(axis=1)
test_acc = accuracy_score(ty, pred)
print(f"Test accuracy of singular LSTM model = {test_acc}")

Test accuracy of singular LSTM model = 0.606


In [15]:
# Model no 4: Embedding -> three stacked Conv1D layers (64, 32, 16 filters,
# kernel size 3) -> global average pooling -> ReLU dense layer -> 3-way softmax.

cnn = Sequential()
# `input_shape` is documented as a shape tuple (batch axis excluded), so the
# sequence length is wrapped in a 1-tuple instead of being passed as a bare int.
cnn.add(InputLayer(input_shape=(X.shape[1],)))
cnn.add(Embedding(vocab_size, embed_size))
# NOTE(review): none of the convolutions specifies an `activation`, so per the
# Keras default they are linear and stacked without a non-linearity in
# between - confirm that is intended.
cnn.add(Conv1D(64, kernel_size=3))
cnn.add(Conv1D(32, kernel_size=3))
cnn.add(Conv1D(16, kernel_size=3))
cnn.add(GlobalAveragePooling1D())
cnn.add(Dense(units=8, activation='relu'))
cnn.add(Dense(units=3, activation='softmax'))

cnn.compile(optimizer=tf.keras.optimizers.Adam(), loss="sparse_categorical_crossentropy", metrics = ['accuracy'])

cnn.summary()

# Train on the training split; the validation split is only used for
# per-epoch monitoring.
history4 = cnn.fit(X, y, epochs=10, batch_size=128,verbose=1,validation_data=(VX,vy))

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 50, 128)           2560000   
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 48, 64)            24640     
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 46, 32)            6176      
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 44, 16)            1552      
_________________________________________________________________
global_average_pooling1d_1 ( (None, 16)                0         
_________________________________________________________________
dense_7 (Dense)              (None, 8)                 136       
_________________________________________________________________
dense_8 (Dense)              (None, 3)                

In [16]:
# Score the stacked-convolution model on the test set using the arg-max class
# of each predicted distribution.
pred = cnn.predict(TX).argmax(axis=1)
test_acc = accuracy_score(ty, pred)
print(f"Test accuracy of CNN model = {test_acc}")

Test accuracy of CNN model = 0.6226666666666667


In [17]:
# Class-probability predictions of every model on the test sequences, in the
# order BiLSTM, single CNN, LSTM, stacked CNN.
x1, x2, x3, x4 = (net.predict(TX) for net in (bimodel, clmodel, model, cnn))

In [18]:
# Class-probability predictions of every model on the validation sequences,
# in the order BiLSTM, single CNN, LSTM, stacked CNN.
x5, x6, x7, x8 = (net.predict(VX) for net in (bimodel, clmodel, model, cnn))

In [19]:
# Class-probability predictions of every model on the training sequences,
# in the order BiLSTM, single CNN, LSTM, stacked CNN.
x9, x10, x11, x12 = (net.predict(X) for net in (bimodel, clmodel, model, cnn))

In [20]:
# Soft-voting ensemble: average the four models' class probabilities with
# equal weight, then pick the most probable class for each sample.
x_test = np.argmax((x1 + x2 + x3 + x4) / 4, axis=1)      # test split
x_val = np.argmax((x5 + x6 + x7 + x8) / 4, axis=1)       # validation split
x_train = np.argmax((x9 + x10 + x11 + x12) / 4, axis=1)  # training split

In [21]:
# Accuracy of the averaged ensemble on the test labels.
test_acc = accuracy_score(ty, x_test)
print(f"Test accuracy = {test_acc}")

Test accuracy = 0.6236666666666667


In [22]:
# Accuracy of the averaged ensemble on the validation labels.
val_acc = accuracy_score(vy, x_val)
print(f"Validation accuracy = {val_acc}")

Validation accuracy = 0.7943333333333333


In [23]:
# Accuracy of the averaged ensemble on the training labels.
train_acc = accuracy_score(y, x_train)
print(f"Train accuracy = {train_acc}")

Train accuracy = 0.8437028915992874


In [24]:
# Persist each trained model as an HDF5 file in the current working directory.
for filename, net in (
    ("bimodel.h5", bimodel),
    ("clmodel.h5", clmodel),
    ("model.h5", model),
    ("cnn.h5", cnn),
):
    net.save(filename)