In [1]:
from gensim.models import Word2Vec

In [2]:
model = Word2Vec.load("300features_40minwords_10context")

In [6]:
vocab = set(model.wv.index2word) #index2word is a list that contains the names of the words in the model's vocabulary
print(len(vocab),len(model.wv.vocab))
print(model.wv.index2word.index("good"))
print(type(model.wv.index2word))

16490 16490
48
<class 'list'>


In [7]:
import pandas as pd
import nltk
import os
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
import re

In [9]:
train_data=pd.read_csv("../BagofWords/labeledTrainData.tsv",header=0,delimiter="\t",quoting=3)
test_data=pd.read_csv("../BagofWords/testData.tsv",header=0,delimiter="\t",quoting=3)

In [58]:
from sklearn.model_selection import train_test_split
train,test=train_test_split(train_data,test_size=0.1,random_state=42)

In [59]:
train.shape

(22500, 3)

In [60]:
def review_to_words(raw_review):
    # 1. Remove HTML
    review_text=BeautifulSoup(raw_review).get_text()
    #2. Remove non letters
    letters_only=re.sub("[^a-zA-Z]"," ",review_text)
    #3. Convert to lower_case and split to individual letters(tokenizing)
    words=letters_only.lower().split()
    return words

In [61]:
def sentences_to_integers(review):
    sentences=[]
    if(len(review)>0):
        rv_words=review_to_words(review)
        for word in rv_words:
            if word in vocab:
                sentences.append(model.wv.index2word.index(word))
            else:
                sentences.append(0)
    return sentences
            
        

In [62]:
def build_dataset(data):
    train_reviews=data["review"]
    sentences=[]
    for review in train_reviews:
        sentences.append(sentences_to_integers(review))
    return sentences

In [63]:
train_sentences=build_dataset(train)



 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")

  markup_type=markup_type))


In [64]:
print(sentences_to_integers('hello world'))
print(len(train_sentences),len(train),len(train['sentiment']))

[5097, 174]
22500 22500 22500




 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")

  markup_type=markup_type))


In [65]:
#convert vocab word vectors into numpy matrix
import numpy as np
vector_dim=300
embedding_matrix=np.zeros((len(model.wv.vocab),vector_dim))
for i in range(len(vocab)):
    embedding_vector=model.wv[model.wv.index2word[i]]
    if embedding_vector is not None:
        embedding_matrix[i]=embedding_vector

In [66]:
embedding_matrix.shape

(16490, 300)

<b> Building our Keras Model </b>

In [67]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
# fix random seed for reproducibility
np.random.seed(7)

In [84]:
#feeding our embedding matrix to model
model1=Sequential()
embeddings=Embedding(input_dim=embedding_matrix.shape[0],
                     output_dim=embedding_matrix.shape[1],
                    weights=[embedding_matrix])
model1.add(embeddings)

In [85]:
#creating a conv layer followed by max pooling and then lstm and a dense layer connected to softmax
model1.add(Conv1D(filters=300, kernel_size=3, padding='same', activation='relu'))
model1.add(MaxPooling1D(pool_size=2))
model1.add(LSTM(300))


In [86]:
model1.add(Dense(1, activation='sigmoid'))
model1.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model1.summary())


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_6 (Embedding)      (None, None, 300)         4947000   
_________________________________________________________________
conv1d_9 (Conv1D)            (None, None, 300)         270300    
_________________________________________________________________
max_pooling1d_6 (MaxPooling1 (None, None, 300)         0         
_________________________________________________________________
lstm_6 (LSTM)                (None, 300)               721200    
_________________________________________________________________
dense_6 (Dense)              (None, 1)                 301       
Total params: 5,938,801
Trainable params: 5,938,801
Non-trainable params: 0
_________________________________________________________________
None


In [87]:
max_review_length=200
X_train=sequence.pad_sequences(train_sentences,maxlen=max_review_length)

In [88]:
model1.fit(X_train,train['sentiment'], epochs=3, batch_size=64)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7f6b9269a668>

In [98]:
#testing for epochs 4 
model1.fit(X_train,train['sentiment'], epochs=1, batch_size=64)

Epoch 1/1


<keras.callbacks.History at 0x7f6be0113400>

In [89]:
test_sentences=build_dataset(test)



 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")

  markup_type=markup_type))


In [90]:
X_test = sequence.pad_sequences(test_sentences, maxlen=max_review_length)

In [99]:
scores = model1.evaluate(X_test,test['sentiment'], verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

Accuracy: 86.72%


Now we are going to predict on our test data

In [93]:
print(test_data.columns)

Index(['id', 'review'], dtype='object')


In [94]:
test_data_sentences=build_dataset(test_data)



 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")

  markup_type=markup_type))


In [95]:
X_test_real = sequence.pad_sequences(test_data_sentences, maxlen=max_review_length)

Predicting the outcome

In [100]:
predictions=model1.predict(X_test_real,batch_size=64,verbose=0)

In [101]:
p=predictions.reshape(25000)

output = pd.DataFrame( data={"id":test_data["id"], "sentiment":p} )

# Use pandas to write the comma-separated output file
output.to_csv( "lstm_rnn_epochs4.csv", index=False, quoting=3 )

<p><b> epochs=3 accuracy = 93.152%<b><p>
<p><b> epochs=4 accuracy= 92.883%<b><p>