In [51]:
import numpy as np
import pandas as pd

In [52]:
df = pd.read_csv("C:/Users/Dell/Desktop/datasets/imdb_labelled.txt", delimiter = '\t', names = ['Reviews','Status'])
df.head()

Unnamed: 0,Reviews,Status
0,"A very, very, very slow-moving, aimless movie ...",0
1,Not sure who was more lost - the flat characte...,0
2,Attempting artiness with black & white and cle...,0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo i...,1


In [53]:
df.shape

(748, 2)

In [54]:
df.size

1496

In [55]:
df['Reviews'][2]

'Attempting artiness with black & white and clever camera angles, the movie disappointed - became even more ridiculous - as the acting was poor and the plot and lines almost non-existent.  '

In [56]:
from collections import Counter
print(Counter(df['Status']))

Counter({1: 386, 0: 362})


In [57]:
df.isnull().sum()

Reviews    0
Status     0
dtype: int64

In [58]:
X = df['Reviews']
Y = df['Status']

In [59]:
X

0      A very, very, very slow-moving, aimless movie ...
1      Not sure who was more lost - the flat characte...
2      Attempting artiness with black & white and cle...
3           Very little music or anything to speak of.  
4      The best scene in the movie was when Gerardo i...
                             ...                        
743    I just got bored watching Jessice Lange take h...
744    Unfortunately, any virtue in this film's produ...
745                     In a word, it is embarrassing.  
746                                 Exceptionally bad!  
747    All in all its an insult to one's intelligence...
Name: Reviews, Length: 748, dtype: object

In [60]:
Y

0      0
1      0
2      0
3      0
4      1
      ..
743    0
744    0
745    0
746    0
747    0
Name: Status, Length: 748, dtype: int64

# Data preprocessing

In [61]:
import re
import nltk
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

In [62]:
ps = PorterStemmer()
wordnet = WordNetLemmatizer()
corpus = []

In [63]:
for i in range(0,len(X)):
    review = re.sub('[^a-zA-Z]', ' ', X[i])
    review = review.lower()
    review = review.split()
    #stemming
    review = [ps.stem(word) for word in review if word not in stopwords.words('english')]
    #lemmatization
    #review = [wordnet.lemmatize(word) for word in review if word not in stopwords.words('english')]
    review = ' '.join(review)
    corpus.append(review)

In [64]:
corpus

['slow move aimless movi distress drift young man',
 'sure lost flat charact audienc nearli half walk',
 'attempt arti black white clever camera angl movi disappoint becam even ridicul act poor plot line almost non exist',
 'littl music anyth speak',
 'best scene movi gerardo tri find song keep run head',
 'rest movi lack art charm mean empti work guess empti',
 'wast two hour',
 'saw movi today thought good effort good messag kid',
 'bit predict',
 'love cast jimmi buffet scienc teacher',
 'babi owl ador',
 'movi show lot florida best made look appeal',
 'song best muppet hilari',
 'cool',
 'right case movi deliv everyth almost right face',
 'averag act main person low budget clearli see',
 'review long overdu sinc consid tale two sister singl greatest film ever made',
 'put gem movi term screenplay cinematographi act post product edit direct aspect film make',
 'practic perfect true masterpiec sea faux masterpiec',
 'structur film easili tightli construct histori cinema think film so

In [65]:
len(corpus)

748

In [66]:
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Flatten, Dropout
from keras.layers import GRU
from keras.layers.embeddings import Embedding

In [67]:
voc_size = 10000

# One Hot Representation

In [68]:
onehot_repr = [one_hot(word, voc_size) for word in corpus]

In [69]:
onehot_repr

[[8107, 31, 5700, 2899, 2847, 6033, 1074, 6487],
 [5630, 9685, 8105, 2241, 1289, 5127, 4693, 5624],
 [9385,
  2951,
  2957,
  6630,
  7961,
  7355,
  5250,
  2899,
  9091,
  742,
  9731,
  8444,
  7759,
  6811,
  8267,
  3221,
  2311,
  8644,
  5946],
 [5940, 3294, 2853, 4790],
 [942, 1045, 2899, 8637, 9736, 2655, 1243, 9820, 939, 9035],
 [2808, 2899, 4142, 8410, 3306, 3842, 5231, 6827, 5595, 5231],
 [1097, 5651, 2716],
 [4804, 2899, 2905, 2266, 5383, 641, 5383, 3917, 6422],
 [8426, 7159],
 [3937, 1611, 3699, 1178, 9590, 2359],
 [8465, 8831, 4928],
 [2899, 522, 5445, 9981, 942, 7139, 1742, 5706],
 [1243, 942, 8008, 8580],
 [1781],
 [1909, 1032, 2899, 9090, 6083, 2311, 1909, 9460],
 [434, 7759, 129, 264, 4665, 3651, 5930, 2485],
 [2573, 3723, 1768, 3024, 2311, 7739, 5651, 1498, 3330, 1185, 1352, 644, 7139],
 [6283,
  933,
  2899,
  3695,
  34,
  9655,
  7759,
  9880,
  4365,
  9420,
  2703,
  3141,
  1352,
  1018],
 [9238, 5573, 376, 8671, 590, 2223, 8671],
 [4695,
  1352,
  2389,
  956

In [70]:
onehot_repr[2]

[9385,
 2951,
 2957,
 6630,
 7961,
 7355,
 5250,
 2899,
 9091,
 742,
 9731,
 8444,
 7759,
 6811,
 8267,
 3221,
 2311,
 8644,
 5946]

# Pad_sequences

In [80]:
max_length = 24

In [81]:
pad_docs = pad_sequences(onehot_repr, padding = 'pre', maxlen = max_length)

In [82]:
pad_docs

array([[   0,    0,    0, ..., 6033, 1074, 6487],
       [   0,    0,    0, ..., 5127, 4693, 5624],
       [   0,    0,    0, ..., 2311, 8644, 5946],
       ...,
       [   0,    0,    0, ...,    0, 3402, 9444],
       [   0,    0,    0, ...,    0, 6904, 6123],
       [   0,    0,    0, ...,  597, 1097, 5790]])

In [83]:
pad_docs[4]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,  942, 1045, 2899, 8637, 9736, 2655, 1243, 9820,
        939, 9035])

# Model Creation

In [84]:
dim = 100

In [85]:
model = Sequential()

In [86]:
#Embedding Layer
model.add(Embedding(voc_size,dim,input_length = max_length))
model.add(Dropout(0.3))
#add 1st GRU layer
model.add(GRU(100,activation="tanh",return_sequences=True))
model.add(Dropout(0.3))
#add 2ND GRU layer
model.add(GRU(100,activation="tanh",return_sequences=True))
model.add(Dropout(0.3))
#add 3rd GRU layer
model.add(GRU(100,activation="tanh",return_sequences=False))
model.add(Dropout(0.3))

In [87]:
#output layer
model.add(Dense(1,activation = 'sigmoid'))

In [88]:
#compile layer
model.compile(optimizer = 'adam',loss = 'binary_crossentropy',metrics = ['accuracy'])

In [89]:
model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 24, 100)           1000000   
_________________________________________________________________
dropout_9 (Dropout)          (None, 24, 100)           0         
_________________________________________________________________
gru_7 (GRU)                  (None, 24, 100)           60300     
_________________________________________________________________
dropout_10 (Dropout)         (None, 24, 100)           0         
_________________________________________________________________
gru_8 (GRU)                  (None, 24, 100)           60300     
_________________________________________________________________
dropout_11 (Dropout)         (None, 24, 100)           0         
_________________________________________________________________
gru_9 (GRU)                  (None, 100)              

In [90]:
inputs = np.array(pad_docs)
Y = np.array(Y)

In [91]:
inputs.shape, Y.shape

((748, 24), (748,))

In [92]:
#spliting the train and test dataset
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(inputs, Y, test_size = 0.24, random_state = 0)

In [93]:
X_train.shape, X_test.shape, Y_train.shape, Y_test.shape

((568, 24), (180, 24), (568,), (180,))

In [94]:
rk = model.fit(X_train, Y_train, validation_data = (X_test, Y_test), epochs = 15, batch_size = 20)

Train on 568 samples, validate on 180 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [95]:
Y_pred = model.predict_classes(X_test)

In [96]:
Y_pred

array([[1],
       [0],
       [1],
       [1],
       [0],
       [0],
       [1],
       [1],
       [1],
       [1],
       [0],
       [0],
       [1],
       [0],
       [0],
       [0],
       [1],
       [1],
       [1],
       [1],
       [1],
       [0],
       [1],
       [0],
       [1],
       [0],
       [1],
       [0],
       [1],
       [1],
       [1],
       [1],
       [0],
       [0],
       [0],
       [1],
       [0],
       [1],
       [0],
       [1],
       [0],
       [1],
       [0],
       [1],
       [0],
       [0],
       [0],
       [1],
       [1],
       [0],
       [1],
       [1],
       [1],
       [1],
       [0],
       [0],
       [1],
       [1],
       [0],
       [0],
       [1],
       [0],
       [1],
       [0],
       [1],
       [1],
       [1],
       [0],
       [1],
       [1],
       [0],
       [0],
       [1],
       [1],
       [0],
       [1],
       [0],
       [0],
       [0],
       [1],
       [1],
       [1],
       [0],
    

In [97]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

In [98]:
print(confusion_matrix(Y_test, Y_pred))

[[57 40]
 [19 64]]


In [99]:
print(classification_report(Y_test, Y_pred))

              precision    recall  f1-score   support

           0       0.75      0.59      0.66        97
           1       0.62      0.77      0.68        83

    accuracy                           0.67       180
   macro avg       0.68      0.68      0.67       180
weighted avg       0.69      0.67      0.67       180



In [100]:
print(accuracy_score(Y_test, Y_pred))

0.6722222222222223
