## <center> Sentiment Analysis on Movie Reviews

In [47]:
#Import packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sb

#Text Processing Packages
import re
import bs4
from bs4 import BeautifulSoup as bs

#NLP packages
import nltk
# nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
# nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer

#Model packages
import keras
from keras.models import Sequential
from keras.layers import Dense, GlobalMaxPooling1D, Bidirectional, LSTM, Embedding, Dropout
from keras.preprocessing.text import Tokenizer
from sklearn.metrics import accuracy_score, f1_score, roc_curve, roc_auc_score
import gensim
from gensim.models import Word2Vec

In [9]:
#Filter Warnings
import warnings
warnings.filterwarnings('ignore')

In [10]:
#Read dataset
df_train = pd.read_csv('data/LabeledTrainData.tsv', delimiter='\t')
df_train.head()

Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...
3,3630_4,0,It must be assumed that those who praised this...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...


In [11]:
#Read test dataset
df_test = pd.read_csv('data/testData.tsv', delimiter='\t')
df_test.head()

Unnamed: 0,id,review
0,12311_10,Naturally in a film who's main themes are of m...
1,8348_2,This movie is a disaster within a disaster fil...
2,5828_4,"All in all, this is a movie for kids. We saw i..."
3,7186_2,Afraid of the Dark left me with the impression...
4,12128_7,A very accurate depiction of small time mob li...


In [13]:
df_train.shape, df_test.shape

((25000, 3), (25000, 2))

In [14]:
#Let's backup both dataset
df_train_copy = df_train.copy()
df_test_copy = df_test.copy()

In [34]:
#Let's create a function to clean the data
def clean_text(text):
    #Steps - convert to lower, remove non-words, remove digits, remove single letters, remove spaces, remove stopwords
    text = bs(text, 'lxml').get_text()
    text = text.lower()
    text = re.sub('\W', ' ', text)
    text = re.sub('\d+', ' ', text)
    text = re.sub('\s+[a-z]\s+', ' ', text)
    text = re.sub('\s+', ' ', text)
    text = text.split(' ')
    text = [word for word in text if word not in stop_words]
    text = [WordNetLemmatizer().lemmatize(word) for word in text] #additional lemmatize the words
    text = ' '.join(text)
    return text

In [28]:
#Let's test the clean text function
temp = df_train.loc[0,'review']
temp

"With all this stuff going down at the moment with MJ i've started listening to his music, watching the odd documentary here and there, watched The Wiz and watched Moonwalker again. Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent. Moonwalker is part biography, part feature film which i remember going to see at the cinema when it was originally released. Some of it has subtle messages about MJ's feeling towards the press and also the obvious message of drugs are bad m'kay.<br /><br />Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyway then you are going to hate this and find it boring. Some may call MJ an egotist for consenting to the making of this movie BUT MJ and most of his fans would say that he made it for the fans which if true is really nice of him.<br /><br />The actual feature film bit when it finally sta

In [35]:
clean_text(temp)

'stuff going moment mj started listening music watching odd documentary watched wiz watched moonwalker maybe want get certain insight guy thought really cool eighty maybe make mind whether guilty innocent moonwalker part biography part feature film remember going see cinema originally released subtle message mj feeling towards press also obvious message drug bad kay visually impressive course michael jackson unless remotely like mj anyway going hate find boring may call mj egotist consenting making movie mj fan would say made fan true really nice actual feature film bit finally start minute excluding smooth criminal sequence joe pesci convincing psychopathic powerful drug lord want mj dead bad beyond mj overheard plan nah joe pesci character ranted wanted people know supplying drug etc dunno maybe hate mj music lot cool thing like mj turning car robot whole speed demon sequence also director must patience saint came filming kiddy bad sequence usually director hate working one kid let a

In [37]:
#Lets process all the review in train
df_train['cleaned_review'] = df_train['review'].apply(lambda x: clean_text(x))

In [38]:
#Clean test dataset reviews
df_test['cleaned_review'] = df_test['review'].apply(lambda x: clean_text(x))
df_test['cleaned_review'].head()

0    naturally film main theme mortality nostalgia ...
1    movie disaster within disaster film full great...
2    movie kid saw tonight child loved one point ki...
3    afraid dark left impression several different ...
4    accurate depiction small time mob life filmed ...
Name: cleaned_review, dtype: object

### Building Word Vector

In [39]:
#Preprocessing data to input into wordvector
corpus_train = [review.split(' ') for review in df_train['cleaned_review']]

In [40]:
WordVector = Word2Vec(sentences=corpus_train, min_count=4, size=300)
# embedding_vector_size = 256
# trigrams_model = Word2Vec(
#     sentences = trigrams[bigrams[all_reviews]],
#     size = embedding_vector_size,
#     min_count=3, window=5, workers=4)

In [41]:
#Let's check the vocabulary length
len(WordVector.wv.vocab)

28627

In [42]:
#Let's check for similar words to see how word vec learnt
WordVector.wv.similar_by_word('good')

[('decent', 0.7355197668075562),
 ('great', 0.6698894500732422),
 ('alright', 0.6664667129516602),
 ('bad', 0.6496267318725586),
 ('okay', 0.643833339214325),
 ('nice', 0.630272626876831),
 ('ok', 0.6258782744407654),
 ('fine', 0.6013451218605042),
 ('cool', 0.6003857851028442),
 ('awesome', 0.5974669456481934)]

In [45]:
WordVector.wv.similarity('film', 'movie')

0.7643779

### Building Neural Network Model

In [49]:
model = Sequential()

model.add(Embedding(input_dim = WordVector.wv.vectors.shape[0], output_dim = WordVector.wv.vectors.shape[1], 
                    weights=[WordVector.wv.vectors]))

model.add(Bidirectional(LSTM(32, recurrent_dropout=0.1, return_sequences=True)))

model.add(GlobalMaxPooling1D())

model.add(Dense(32, activation='relu'))

model.add(Dropout(rate = 0.2))

model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, None, 300)         8588100   
_________________________________________________________________
bidirectional_2 (Bidirection (None, None, 64)          85248     
_________________________________________________________________
global_max_pooling1d_2 (Glob (None, 64)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 32)                2080      
_________________________________________________________________
dropout_2 (Dropout)          (None, 32)                0         
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 33        
Total params: 8,675,461
Trainable params: 8,675,461
Non-trainable params: 0
____________________________________________

In [None]:
#Train the model
model.fit()

## Phrases

In [50]:
from gensim.models import Phrases

In [54]:
bigrams = Phrases(corpus_train)

In [64]:
x = bigrams.vocab
len(x)

1740753

In [63]:
x

defaultdict(int,
            {b'stuff': 1181,
             b'going': 4146,
             b'stuff_going': 6,
             b'moment': 2775,
             b'going_moment': 2,
             b'mj': 35,
             b'moment_mj': 2,
             b'started': 963,
             b'mj_started': 1,
             b'listening': 189,
             b'started_listening': 1,
             b'music': 3061,
             b'listening_music': 7,
             b'watching': 4605,
             b'music_watching': 1,
             b'odd': 582,
             b'watching_odd': 2,
             b'documentary': 1074,
             b'odd_documentary': 1,
             b'watched': 2236,
             b'documentary_watched': 3,
             b'wiz': 11,
             b'watched_wiz': 1,
             b'wiz_watched': 1,
             b'moonwalker': 24,
             b'watched_moonwalker': 1,
             b'maybe': 2340,
             b'moonwalker_maybe': 1,
             b'want': 4989,
             b'maybe_want': 9,
             b'get': 12513,

In [60]:
print(bigrams['space station near the solar system new york'.split()])

['space_station', 'near', 'the', 'solar', 'system', 'new_york']


In [65]:
trigrams = Phrases(sentences=bigrams[corpus_train])

In [66]:
trigrams['space station near the solar system new york'.split()]

['space_station', 'near', 'the', 'solar', 'system', 'new_york']

---

In [68]:
print('Convert sentences to sentences with ngrams...', end='\r')
sents = trigrams[bigrams[corpus_train]]
print('Convert sentences to sentences with ngrams... (done)')

Convert sentences to sentences with ngrams...Convert sentences to sentences with ngrams... (done)


In [67]:
embedding_vector_size = 256
trigrams_model = Word2Vec(
    sentences = sents,
    size = embedding_vector_size,
    min_count=3, window=5, workers=4)

In [79]:
trigrams_model.wv.similar_by_word('great')

[('wonderful', 0.8543416261672974),
 ('amazing', 0.8235388398170471),
 ('excellent', 0.8076362609863281),
 ('fantastic', 0.7923934459686279),
 ('perfect', 0.767772376537323),
 ('fine', 0.7621510624885559),
 ('incredible', 0.7287273406982422),
 ('brilliant', 0.7267520427703857),
 ('awesome', 0.7170404195785522),
 ('outstanding', 0.7048456072807312)]

In [74]:
%%time
def vectorize_data(data, vocab: dict) -> list:
    print('Vectorize sentences...', end='\r')
    keys = list(vocab.keys())
    filter_unknown = lambda word: vocab.get(word, None) is not None
    encode = lambda review: list(map(keys.index, filter(filter_unknown, review)))
    vectorized = list(map(encode, data))
    print('Vectorize sentences... (done)')
    return vectorized

Wall time: 0 ns


In [113]:
vocab = trigrams_model.wv.vocab
keys = list(vocab.keys())
filter_unknown = lambda word: vocab.get(word, None) is not None
encode = lambda review: list(map(keys.index, filter(filter_unknown, review)))
list(map(encode, ['stuff', 'going', 'moment', 'mj', 'started', 'listening', 'music', 'watching', 'odd', 'documentary']))
# list(map(encode, ['stuff', 'going', 'moment']))

[[11065, 22428, 714, 14334, 14334],
 [170, 13654, 170],
 [4738, 4738, 3933, 13654, 22428],
 [4738, 6469],
 [11065, 22428, 4150, 22428, 3933, 21851],
 [7511, 11065, 22428, 3933, 13654, 13654, 170],
 [4738, 714, 11065, 8258],
 [3683, 22428, 8258, 17387, 13654, 170],
 [21851, 21851],
 [21851, 8258, 714, 4738, 3933, 13654, 22428, 4150, 12102]]

In [111]:
x = vocab.get('stuff', None)
x

<gensim.models.keyedvectors.Vocab at 0x1cd85506b70>

In [106]:
list(map(keys.index,'stuff'))

[11065, 22428, 714, 14334, 14334]

In [80]:
sentence = vectorize_data(sents, vocab=trigrams_model.wv.vocab)

Vectorize sentences... (done)


In [None]:
X_pad = pad_sequences(
    sequences=sentence,
    maxlen=150,
    padding='post')
print('Transform sentences to sequences... (done)')

---

In [None]:
%%time
def vectorize_data(data, vocab: dict) -> list:
    print('Vectorize sentences...', end='\r')
    keys = list(vocab.keys())
    filter_unknown = lambda word: vocab.get(word, None) is not None
    encode = lambda review: list(map(keys.index, filter(filter_unknown, review)))
    vectorized = list(map(encode, data))
    print('Vectorize sentences... (done)')
    return vectorized

print('Convert sentences to sentences with ngrams...', end='\r')
X_data = trigrams[bigrams[X_train_data]]
print('Convert sentences to sentences with ngrams... (done)')
input_length = 150
X_pad = pad_sequences(
    sequences=vectorize_data(X_data, vocab=trigrams_model.wv.vocab),
    maxlen=input_length,
    padding='post')
print('Transform sentences to sequences... (done)')