<a href="https://colab.research.google.com/github/sudevansujit/Movie_Sentiment_Analysis/blob/master/Movie_Sentiment_Deep_NN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Code to read csv file into colaboratory:
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

In [2]:
# Authenticate the current Colab user (google.colab.auth).
auth.authenticate_user()
# Build a PyDrive client backed by the application-default Google credentials.
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
# `drive` is the handle the download cell uses to fetch the file.
drive = GoogleDrive(gauth)

In [3]:
# Google Drive id of the CSV; replace it with the id of the file you want to access.
drive_file_ref = {'id': '1asZyLVRZ3QjWG4Pek-dAYYl9Kq9XZn3n'}
downloaded = drive.CreateFile(drive_file_ref)
# Write the Drive file's content to a local file with this name.
downloaded.GetContentFile('Cleaned_Movie_Review.csv')

In [4]:
import pandas as pd
import numpy as np

# Load the downloaded CSV; its 'Unnamed: 0' column becomes the DataFrame index.
data = pd.read_csv('Cleaned_Movie_Review.csv', index_col = 'Unnamed: 0')
# Preview the first rows.
data.head()

Unnamed: 0,review,sentiment,new_review
0,one of the other reviewers has mentioned that ...,1,one reviewers mentioned watching oz episode ho...
1,a wonderful little production the filming tech...,1,wonderful little production filming technique ...
2,i thought this was a wonderful way to spend ti...,1,thought wonderful way spend time hot summer we...
3,basically there s a family where a little boy ...,0,basically family little boy jake thinks zombie...
4,petter mattei s love in the time of money is a...,1,petter mattei s love time money visually stunn...


In [5]:
# Build the train and test datasets with a positional split:
# the first `n_train_rows` rows train the model, the remainder is held out.
n_train_rows = 35000

X_review = data['new_review'].values
y_sentiment = data['sentiment'].values

X_train, X_test = X_review[:n_train_rows], X_review[n_train_rows:]
y_train, y_test = y_sentiment[:n_train_rows], y_sentiment[n_train_rows:]

# Show the shapes of the training inputs and labels.
X_train.shape, y_train.shape

((35000,), (35000,))

In [8]:
import gensim
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dropout, Activation, Dense
import nltk
# nltk.word_tokenize needs the Punkt sentence models. NLTK >= 3.8.2 loads them
# from the 'punkt_tab' resource, older releases from 'punkt'; fetch both so the
# tokenization cell works on either version.
nltk.download('punkt_tab')
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [9]:
# Split every training review into word tokens.
tokenized_train = list(map(nltk.word_tokenize, X_train))

# Split every test review into word tokens (labels are not modified here).
tokenized_test = list(map(nltk.word_tokenize, X_test))

In [10]:
# Feature engineering with word embeddings
w2v_num_features = 300

# gensim 4.0 renamed Word2Vec's `size` -> `vector_size` and `iter` -> `epochs`.
# Pick the spelling matching the installed release so the cell runs on 3.x and 4.x.
if int(gensim.__version__.split('.')[0]) >= 4:
    w2v_version_kwargs = {'vector_size': w2v_num_features, 'epochs': 5}
else:
    w2v_version_kwargs = {'size': w2v_num_features, 'iter': 5}

# The embeddings are trained on the training reviews only.
w2v_model = gensim.models.Word2Vec(tokenized_train,
                                   window=150,
                                   min_count=10,
                                   workers=4,
                                   **w2v_version_kwargs)

In [11]:
def averaged_word2vec_vectorizer(corpus, model, num_features):
    """Represent each tokenized document by the mean of its word vectors.

    Parameters
    ----------
    corpus : iterable of iterables of str
        Tokenized documents.
    model : object with a ``wv`` attribute
        ``model.wv`` must list its vocabulary (``index_to_key`` on gensim >= 4,
        ``index2word`` on older releases) and return a vector for ``wv[word]``.
    num_features : int
        Dimensionality of the word vectors.

    Returns
    -------
    numpy.ndarray
        float64 array of shape ``(len(corpus), num_features)``. Words missing
        from the vocabulary are ignored; a document with no known word maps
        to the all-zero vector.
    """
    word_vectors = model.wv
    # gensim >= 4.0 exposes the vocabulary as `index_to_key`; accessing the old
    # `index2word` raises there, so probe the new name first and fall back.
    known_words = getattr(word_vectors, 'index_to_key', None)
    if known_words is None:
        known_words = word_vectors.index2word
    vocabulary = set(known_words)

    def average_word_vectors(words):
        # Accumulate in float64 regardless of the dtype of the stored vectors.
        total = np.zeros((num_features,), dtype="float64")
        hits = 0
        for word in words:
            if word in vocabulary:
                total = np.add(total, word_vectors[word])
                hits += 1
        # Avoid dividing by zero when no word of the document is known.
        return np.divide(total, hits) if hits else total

    features = [average_word_vectors(tokenized_sentence)
                for tokenized_sentence in corpus]
    if not features:
        # np.array([]) would be 1-D; keep the 2-D contract for an empty corpus.
        return np.zeros((0, num_features), dtype="float64")
    return np.array(features)

In [12]:
# Turn both tokenized corpora into matrices of averaged Word2Vec vectors,
# using the same trained model and vector size for train and test.
avg_wv_train_features, avg_wv_test_features = [
    averaged_word2vec_vectorizer(corpus=tokenized_docs,
                                 model=w2v_model,
                                 num_features=w2v_num_features)
    for tokenized_docs in (tokenized_train, tokenized_test)
]

print('Word2Vec model:> Train features shape:', avg_wv_train_features.shape, ' Test features shape:', avg_wv_test_features.shape)


Word2Vec model:> Train features shape: (35000, 300)  Test features shape: (15000, 300)


In [13]:
# Modeling with deep neural networks
def construct_deepnn_architecture(num_input_features):
    """Build and compile the fully connected binary classifier.

    The network stacks three Dense -> ReLU -> Dropout(0.2) stages with
    512, 256 and 256 units, followed by a single sigmoid output unit. It is
    compiled with binary cross-entropy loss, the Adam optimizer and the
    accuracy metric.

    Parameters
    ----------
    num_input_features : int
        Length of each input feature vector.

    Returns
    -------
    The compiled Sequential model.
    """
    dnn_model = Sequential()

    for stage, units in enumerate((512, 256, 256)):
        if stage == 0:
            # Only the first layer declares the input shape.
            dnn_model.add(Dense(units, input_shape=(num_input_features,)))
        else:
            dnn_model.add(Dense(units))
        dnn_model.add(Activation('relu'))
        dnn_model.add(Dropout(0.2))

    # Single sigmoid unit for the binary sentiment output.
    dnn_model.add(Dense(1))
    dnn_model.add(Activation('sigmoid'))

    dnn_model.compile(loss='binary_crossentropy',
                      optimizer='adam',
                      metrics=['accuracy'])
    return dnn_model



In [14]:
# Build the classifier sized for the Word2Vec feature vectors.
w2v_dnn = construct_deepnn_architecture(w2v_num_features)

# Print the layer-by-layer summary of the model.
w2v_dnn.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 512)               154112    
_________________________________________________________________
activation (Activation)      (None, 512)               0         
_________________________________________________________________
dropout (Dropout)            (None, 512)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 256)               131328    
_________________________________________________________________
activation_1 (Activation)    (None, 256)               0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 256)               6

In [15]:
# Training configuration: 10 epochs, shuffled mini-batches of `batch_size`,
# with 10% of the training data set aside for validation.
batch_size = 100
fit_options = dict(epochs=10,
                   batch_size=batch_size,
                   shuffle=True,
                   validation_split=0.1,
                   verbose=1)
history = w2v_dnn.fit(avg_wv_train_features, y_train, **fit_options)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [17]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Sequential.predict_classes was removed in TensorFlow 2.6. For a single sigmoid
# output, thresholding the predicted probability at 0.5 is the equivalent;
# ravel() flattens the (n, 1) predictions to the 1-D label vector sklearn expects.
y_pred = (w2v_dnn.predict(avg_wv_test_features) > 0.5).astype("int32").ravel()

print("% Accuracy = ",accuracy_score(y_test, y_pred)*100)

% Accuracy =  88.02666666666667


In [18]:
# Confusion matrix first, then the per-class precision/recall/F1 report.
print(confusion_matrix(y_test, y_pred),
      classification_report(y_test, y_pred),
      sep='\n')

[[6657  833]
 [ 963 6547]]
              precision    recall  f1-score   support

           0       0.87      0.89      0.88      7490
           1       0.89      0.87      0.88      7510

    accuracy                           0.88     15000
   macro avg       0.88      0.88      0.88     15000
weighted avg       0.88      0.88      0.88     15000



In [19]:
# Thanks