In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import os

from IPython import display

In [2]:
# Directory holding the two polarity files, and the full path to each of them.
path = "rt-polaritydata/rt-polaritydata/"

pos_path, neg_path = (
    os.path.join(path, file_name)
    for file_name in ('rt-polarity.pos', 'rt-polarity.neg')
)

def load_review(path, is_pos=True, encoding=None):
    """Read a one-review-per-line text file into a labelled DataFrame.

    Parameters
    ----------
    path : str or path-like
        Location of the text file; every line becomes one row.
    is_pos : bool, default True
        Label applied to every row: 1 when truthy, 0 otherwise.
    encoding : str, optional
        Text encoding handed to ``open``. ``None`` (the default) keeps the
        platform-dependent default, which is the original behaviour; pass an
        explicit value (e.g. ``'latin-1'``) to make decoding reproducible
        across machines.

    Returns
    -------
    pandas.DataFrame
        Columns ``review`` (str) and ``sentiment`` (1 or 0).
    """
    with open(path, encoding=encoding) as f:
        lines = f.read().splitlines()
    review = pd.DataFrame({'review': lines})
    review['sentiment'] = 1 if is_pos else 0
    return review

# Load each polarity file with its label.
pos_review = load_review(pos_path, is_pos=True)
neg_review = load_review(neg_path, is_pos=False)

# ignore_index=True renumbers the rows 0..n-1; without it both halves keep
# their own 0-based index, so label lookups such as `.loc[0]` match two rows.
all_reviews = pd.concat([pos_review, neg_review], ignore_index=True)
all_reviews.head()

Unnamed: 0,review,sentiment
0,the rock is destined to be the 21st century's ...,1
1,"the gorgeously elaborate continuation of "" the...",1
2,effective but too-tepid biopic,1
3,if you sometimes like to go to the movies to h...,1
4,"emerges as something rare , an issue movie tha...",1


In [3]:
from gensim.models import KeyedVectors

# Load the word vectors from the binary word2vec-format file.
word2vec_model = KeyedVectors.load_word2vec_format(
    "GoogleNews-vectors-negative300.bin/GoogleNews-vectors-negative300.bin",
    binary=True,
)

#### Each sentence is represented as the sum of its word vectors

In [34]:
from stop_words import get_stop_words

# Default cap on how many word vectors are summed per review
# (the default of `max_words` in process_review and predict).
MAX_WORDS = 25
# Length of one review feature vector; used as the model's input shape.
EMBEDDING_SIZE = 300

def process_review(review, max_words=MAX_WORDS):
    """Turn a raw review string into one vector by summing its word vectors.

    The text is tokenised with Keras' ``text_to_word_sequence``, English stop
    words are dropped, words missing from ``word2vec_model`` are skipped, and
    the first ``max_words`` remaining vectors are summed element-wise.

    Parameters
    ----------
    review : str
        Raw review text.
    max_words : int, default MAX_WORDS
        Upper bound on the number of word vectors included in the sum.

    Returns
    -------
    numpy.ndarray
        1-D vector of length ``word2vec_model.vector_size``. When no word of
        the review survives filtering, an all-zero vector is returned so every
        review yields the same shape.
    """
    # Build the stop-word set once per call instead of once per token.
    stop_words = set(get_stop_words('english'))
    tokens = tf.keras.preprocessing.text.text_to_word_sequence(review)

    vectors = []
    for word in tokens:
        if word in stop_words:
            continue
        try:
            vectors.append(word2vec_model[word])
        except KeyError:
            continue  # word not found in the model

    vectors = vectors[:max_words]
    if not vectors:
        # Summing an empty list would collapse to a 0-d scalar, which breaks
        # the fixed-width feature matrix built from these results.
        return np.zeros(word2vec_model.vector_size, dtype=np.float32)
    return np.sum(np.asarray(vectors), axis=0)

In [35]:
# One summed word-vector per review, in the same row order as all_reviews.
processed_review = all_reviews['review'].apply(process_review)

In [36]:
from sklearn.model_selection import train_test_split

# Pack the per-review vectors into one dense (n_reviews, n_features) matrix.
X = np.asarray(processed_review).flatten()
n_features = X[0].shape[0]
X_reshaped = np.zeros((len(X), n_features))
for row, vector in enumerate(X):
    X_reshaped[row, :] = vector
y = all_reviews.sentiment.values

# Hold out 10% of the reviews; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_reshaped,
    y,
    test_size=0.1,
    random_state=42,
)

In [37]:
# Sanity-check the dimensions of every split.
tuple(split.shape for split in (X_train, X_test, y_train, y_test))

((9595, 300), (1067, 300), (9595,), (1067,))

In [38]:
# Feed-forward binary classifier over the review feature vectors.
# Only the first layer declares `input_shape`; every later layer infers its
# input width from the layer before it (the second Dense layer receives 256
# features, not EMBEDDING_SIZE).
model = tf.keras.Sequential([
    tf.keras.layers.Dense(256, activation='relu', input_shape=(EMBEDDING_SIZE,)),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(32, activation='relu'),
    # Single sigmoid unit: output is read as the probability of a positive review.
    tf.keras.layers.Dense(1, activation='sigmoid')
])

In [39]:
# Adam optimizer with binary cross-entropy loss; accuracy is reported as the metric.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [40]:
# Print the layer-by-layer output shapes and parameter counts.
model.summary()

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_15 (Dense)             (None, 256)               77056     
_________________________________________________________________
dropout_4 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_16 (Dense)             (None, 128)               32896     
_________________________________________________________________
dropout_5 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_17 (Dense)             (None, 64)                8256      
_________________________________________________________________
dense_18 (Dense)             (None, 32)                2080      
_________________________________________________________________
dense_19 (Dense)             (None, 1)                

In [41]:
# Training hyperparameters.
EPOCH = 20
BATCH_SIZE = 256

# NOTE(review): the held-out (X_test, y_test) split doubles as validation data
# here, so the validation metrics are not an independent test estimate.
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    batch_size=BATCH_SIZE,
    epochs=EPOCH,
)

Train on 9595 samples, validate on 1067 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [47]:
def predict(review, max_words=MAX_WORDS, shape=(EMBEDDING_SIZE,)):
    """Print the trained model's sentiment verdict for one raw review string.

    Parameters
    ----------
    review : str
        Raw review text; it is vectorised with ``process_review``.
    max_words : int, default MAX_WORDS
        Forwarded to ``process_review``.
    shape : tuple, default (EMBEDDING_SIZE,)
        Shape the feature vector is reshaped to before a leading batch axis
        is added.

    Returns
    -------
    None
        The raw score and a formatted verdict are printed. Scores above 0.5
        are reported as positive, everything else as negative.
    """
    features = np.asarray(process_review(review, max_words=max_words), dtype=np.float32)
    if features.ndim == 0:
        # process_review may hand back a 0-d scalar (the sum of an empty list)
        # when no word of the review has an embedding; iterating or reshaping
        # that scalar would fail, so fall back to an explicit zero vector.
        print('No known words found in the review; predicting from an all-zero vector.')
        features = np.zeros(EMBEDDING_SIZE, dtype=np.float32)

    input_ = tf.reshape(tf.cast(features, tf.float32), shape)
    input_ = input_[np.newaxis, ...]  # add the batch dimension expected by model.predict
    prediction = model.predict(input_)[0][0]
    print(prediction)
    if prediction > 0.5:
        print('Positive review with probability: {:.2f}%'.format(prediction * 100))
    else:
        print('Negative review with probability: {:.2f}%'.format(100 - prediction * 100))

In [48]:
predict("this film was quite good")

0.73616076
Positive review with probability: 73.62%
