In [1]:

import numpy as np 
import pandas as pd 

from sklearn.feature_extraction.text import CountVectorizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,classification_report
import re

In [3]:
# Loading the dataset
data = pd.read_csv('balanced.csv')

data = data[['Review','Sentiment']]

In [7]:
data.head()

Unnamed: 0,Review,Sentiment
0,best candy corn on the planet ill keep this sh...,Positive
1,cat food my cats eat it that is all i can say ...,Positive
2,onions overwhelm otherwise lowkey flavor the o...,Negative
3,yummy tasted good spicy those that dont like s...,Positive
4,good flavor the product is the same as what we...,Positive


In [10]:
data = data.head(10000)

In [11]:
    
max_fatures = 1000
tokenizer = Tokenizer(num_words=max_fatures, split=' ')
tokenizer.fit_on_texts(data['Review'].values)
X = tokenizer.texts_to_sequences(data['Review'].values)
X = pad_sequences(X)
X[:3]

array([[  0,   0,   0, ...,   4, 301, 156],
       [  0,   0,   0, ...,  48, 290, 141],
       [  0,   0,   0, ..., 104, 144,  10]])

In [21]:
embed_dim = 64
lstm_out = 196

model = Sequential()
model.add(Embedding(max_fatures, embed_dim,input_length = X.shape[1]))
model.add(SpatialDropout1D(0.3))
model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(2,activation='sigmoid'))
model.compile(loss = 'binary_crossentropy', optimizer='adam',metrics = ['accuracy'])
print(model.summary())

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 1265, 64)          64000     
_________________________________________________________________
spatial_dropout1d_2 (Spatial (None, 1265, 64)          0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 196)               204624    
_________________________________________________________________
dense_2 (Dense)              (None, 2)                 394       
Total params: 269,018
Trainable params: 269,018
Non-trainable params: 0
_________________________________________________________________
None


In [22]:
Y = pd.get_dummies(data['Sentiment']).values
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.20, random_state = 40)
print(X_train.shape,Y_train.shape)
print(X_test.shape,Y_test.shape)

(8000, 1265) (8000, 2)
(2000, 1265) (2000, 2)


In [23]:
batch_size = 64
model.fit(X_train, Y_train, epochs = 2, batch_size=batch_size, verbose = 1)

Epoch 1/2
Epoch 2/2


<tensorflow.python.keras.callbacks.History at 0x1e1db6db470>

In [24]:
model.save("model.h5")

In [17]:
Y_pred = model.predict_classes(X_test,batch_size = batch_size)



In [19]:
df_test = pd.DataFrame({'true': Y_test.tolist(), 'pred':Y_pred})
df_test['true'] = df_test['true'].apply(lambda x: np.argmax(x))
print(confusion_matrix(df_test.true, df_test.pred))
print(classification_report(df_test.true, df_test.pred))

[[833 159]
 [147 861]]
              precision    recall  f1-score   support

           0       0.85      0.84      0.84       992
           1       0.84      0.85      0.85      1008

    accuracy                           0.85      2000
   macro avg       0.85      0.85      0.85      2000
weighted avg       0.85      0.85      0.85      2000

