In [1]:
import numpy as np
import pandas as pd

pos_train_data = pd.read_csv('train_pos.tsv',sep = '\t')
neg_train_data = pd.read_csv('train_neg.tsv',sep = '\t')
pos_test_data = pd.read_csv('test_pos.tsv',sep = '\t')
neg_test_data = pd.read_csv('test_neg.tsv',sep = '\t')

In [2]:
pos_train_data = pos_train_data[['Text','Sentiment']]
neg_train_data = neg_train_data[['Text','Sentiment']]
pos_test_data = pos_test_data[['Text','Sentiment']]
neg_test_data = neg_test_data[['Text','Sentiment']]

In [3]:
data_train = pd.concat([pos_train_data,neg_train_data],ignore_index = True)
data_train = data_train.sample(frac=1).reset_index(drop=True)
data_train.head()

Unnamed: 0,Text,Sentiment
0,I can't believe this is on DVD. Even less it w...,0
1,I admit I had no idea what to expect before vi...,1
2,This is very much not the sort of movie for wh...,1
3,Difficult to call The Grudge a horror movie. A...,0
4,In order to rate this movie fairly you have to...,1


In [4]:
len(data_train)

25000

In [5]:
data_test = pd.concat([pos_test_data,neg_test_data],ignore_index = True)
data_test = data_test.sample(frac=1).reset_index(drop=True)
data_test.head()

Unnamed: 0,Text,Sentiment
0,If a copy of this movie fell into the wrong ha...,0
1,"My Mother Frank begins as a warm, amiable come...",1
2,"An amazing film, I've only just seen it and I ...",1
3,This movie is a perfect example of Barkers cin...,1
4,I do not find this show at all funny. I actual...,0


In [6]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from string import punctuation

stop_words = set(stopwords.words('english'))
table = str.maketrans('', '', punctuation)

def textclean(text):
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word.isalpha()]
    tokens = [w.translate(table) for w in tokens]
    tokens = [word for word in tokens if not word in stop_words]
    tokens = [word for word in tokens if len(word) > 1]
    return tokens

In [7]:
reviews = []

for index,row in data_train.iterrows():
    text = (row['Text'].lower())    
    reviews.append(textclean(text))
reviews[0]

['ca',
 'believe',
 'dvd',
 'even',
 'less',
 'available',
 'local',
 'video',
 'br',
 'br',
 'argue',
 'good',
 'movie',
 'take',
 'consideration',
 'budget',
 'find',
 'funny',
 'would',
 'find',
 'bad',
 'whichever',
 'br',
 'br',
 'still',
 'funny',
 'read',
 'following',
 'another',
 'review',
 'dramatics',
 'aside',
 'love',
 'horror',
 'love',
 'something',
 'along',
 'lines',
 'duel',
 'updated',
 'little',
 'story',
 'pretty',
 'girls',
 'thrown',
 'love',
 'movie',
 'br',
 'br',
 'shame',
 'comparing',
 'two',
 'br',
 'br',
 'give',
 'since',
 'ca',
 'give',
 'see',
 'way',
 'movie',
 'could',
 'entertaining']

In [12]:
import gensim
from gensim.models import Word2Vec

n_dim = 100

w2v_model = Word2Vec(reviews,min_count=5,size=n_dim)

In [14]:
w2v_model.wv['nice']

array([ 0.34266835, -0.48946238,  0.16992585, -1.59589171,  0.20816202,
       -0.58867174,  0.42529151,  1.18220663,  1.0828265 , -0.31273049,
       -0.44492871,  0.37714729,  0.03535983,  1.28016639, -1.69150853,
        0.86141825,  0.74454904,  0.37987161,  1.29428494, -2.6220355 ,
       -1.67638373, -0.55473596, -0.31270647,  0.40307197,  0.74079692,
       -0.90968764,  0.27629927, -0.55407161, -0.76619291,  0.47750685,
       -1.01419616,  0.67485195,  1.66792023, -0.93714583, -0.83445334,
       -1.12781429,  0.07898249,  0.70058185, -0.47937781,  1.26753855,
       -1.76214159, -1.39057183,  0.027617  , -0.2262345 , -0.32025877,
        1.23497283, -0.57548404,  0.26772857,  0.01747596, -0.83560508,
        1.2832607 ,  1.0841378 ,  0.09105023,  0.03972418,  1.04888999,
       -0.07206379,  0.3120141 ,  1.21296299, -0.23066349, -1.21730089,
        0.81291986, -0.49930614, -0.88758504, -0.66035891,  1.05875015,
       -1.22189093, -0.45270762, -0.58284825, -1.01936829,  0.44

In [15]:
import itertools
linked_reviews = list(itertools.chain.from_iterable(reviews))

vocab_freq = dict()

linked_reviews[1]

'believe'

In [16]:
for word in linked_reviews:
    if word not in vocab_freq:
        vocab_freq[word] = 1
    else:
        vocab_freq[word] += 1

In [17]:
vocab_freq

{'preserving': 7,
 'massachusettes': 1,
 'equal': 137,
 'slender': 8,
 'turveydrop': 1,
 'rohauer': 1,
 'alfre': 17,
 'emptiest': 1,
 'lumber': 4,
 'slambang': 1,
 'podges': 1,
 'chalice': 8,
 'goggles': 7,
 'slowed': 16,
 'bodied': 3,
 'mythically': 2,
 'sanitorium': 3,
 'unsettle': 1,
 'gram': 36,
 'olajima': 1,
 'vogel': 4,
 'purchased': 90,
 'knifing': 1,
 'leander': 1,
 'accomplices': 5,
 'ungratifying': 1,
 'networked': 1,
 'approporiately': 1,
 'hocking': 1,
 'lambastes': 1,
 'olosio': 1,
 'aya': 8,
 'unbearably': 29,
 'regroup': 5,
 'entertainers': 15,
 'cowed': 3,
 'unwrapped': 1,
 'westlake': 2,
 'uni': 8,
 'incidentals': 1,
 'dramaticisation': 1,
 'reestablishing': 1,
 'kuala': 1,
 'granite': 2,
 'whisperish': 1,
 'frech': 1,
 'garish': 21,
 'petersson': 2,
 'covent': 1,
 'lakin': 1,
 'despaired': 2,
 'inexistent': 3,
 'ditsy': 12,
 'comig': 1,
 'congratulations': 29,
 'graphics': 164,
 'snickered': 2,
 'coachella': 1,
 'objection': 12,
 'khali': 2,
 'insensitive': 20,
 'lac

In [18]:
import operator

sorted_vocab_freq = list(reversed(sorted(vocab_freq.items(), key=operator.itemgetter(1))))

In [19]:
len(sorted_vocab_freq)

71238

In [30]:
review_lengths = pd.DataFrame([len(review) for review in reviews])
review_lengths.columns = ['Len']

review_lengths

Unnamed: 0,Len
0,63
1,101
2,293
3,59
4,73
5,71
6,63
7,53
8,61
9,109


In [31]:
review_lengths.describe()

Unnamed: 0,Len
count,25000.0
mean,118.36848
std,89.42677
min,4.0
25%,63.0
50%,88.0
75%,144.0
max,1409.0


In [32]:
#Removal of outliers using Tukey's Method
first_q = review_lengths.Len.quantile([0.25])[0.25]
third_q = review_lengths.Len.quantile([0.75])[0.75]

upper_threshold = third_q + 1.5*(third_q-first_q)
lower_threshold = first_q - 1.5*(third_q-first_q)

upper_threshold,lower_threshold

(265.5, -58.5)

In [33]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(analyzer=lambda x: x, min_df=10)
matrix = vectorizer.fit_transform(reviews)
tfidf = dict(zip(vectorizer.get_feature_names(), vectorizer.idf_))

In [34]:
tfidf['try']

3.7163240052293349

In [35]:
def create_word_vector(l,size):
    vector = np.zeros(size).reshape((1,size))
    count = 0.
    for word in l:
        try:
            vector += w2v_model[word].reshape((1, size)) * tfidf[word]
            count+=1
        except KeyError:
            continue
            
    if count!=0:
        vector /= count
    return vector        

In [65]:
X_train = []
y_train = []

for i in range(len(data_train)):
    converted_review = create_word_vector(reviews[i],n_dim)
    X_train.append(converted_review)
    y_train.append(data_train['Sentiment'][i])

  


In [66]:
from sklearn.preprocessing import scale

X_train = np.concatenate(X_train)
X_train = scale(X_train)
y_train = np.array(y_train)


In [67]:
X_train.shape

(25000, 100)

In [68]:
data_test = pd.concat([pos_test_data,neg_test_data],ignore_index = True)
data_test = data_test.sample(frac=0.3).reset_index(drop=True)

validation_reviews = []

for index,row in data_test.iterrows():
    text = (row['Text'].lower())
    validation_reviews.append(textclean(text))
    
X_val = []
y_val = []

for i in range(len(data_test)):
    converted_review = create_word_vector(validation_reviews[i],n_dim)
    X_val.append(converted_review)
    y_val.append(data_test['Sentiment'][i])
        
X_val = np.concatenate(X_val)
X_val = scale(X_val)
y_val = np.array(y_val)

  


In [69]:
from keras.models import Sequential
from keras.layers import Dense,Dropout,Activation

model = Sequential()

model.add(Dense(64,activation = 'relu',input_shape=X_train[0].shape))
model.add(Dropout(0.2))
model.add(Dense(32,activation = 'relu'))
model.add(Dropout(0.2))
model.add(Dense(1,activation = 'sigmoid'))

In [70]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_10 (Dense)             (None, 64)                6464      
_________________________________________________________________
dropout_5 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_11 (Dense)             (None, 32)                2080      
_________________________________________________________________
dropout_6 (Dropout)          (None, 32)                0         
_________________________________________________________________
dense_12 (Dense)             (None, 1)                 33        
Total params: 8,577
Trainable params: 8,577
Non-trainable params: 0
_________________________________________________________________


In [71]:
model.compile(optimizer='rmsprop',loss='binary_crossentropy',metrics=['accuracy'])

In [72]:
model.fit(X_train, y_train,validation_data = (X_val,y_val), epochs=15, batch_size=32, verbose=2)

Train on 25000 samples, validate on 7500 samples
Epoch 1/15
 - 1s - loss: 0.4085 - acc: 0.8223 - val_loss: 0.3682 - val_acc: 0.8352
Epoch 2/15
 - 1s - loss: 0.3708 - acc: 0.8406 - val_loss: 0.3677 - val_acc: 0.8368
Epoch 3/15
 - 1s - loss: 0.3613 - acc: 0.8452 - val_loss: 0.3607 - val_acc: 0.8404
Epoch 4/15
 - 1s - loss: 0.3559 - acc: 0.8487 - val_loss: 0.3557 - val_acc: 0.8441
Epoch 5/15
 - 1s - loss: 0.3522 - acc: 0.8504 - val_loss: 0.3558 - val_acc: 0.8468
Epoch 6/15
 - 1s - loss: 0.3509 - acc: 0.8506 - val_loss: 0.3579 - val_acc: 0.8445
Epoch 7/15
 - 1s - loss: 0.3475 - acc: 0.8513 - val_loss: 0.3534 - val_acc: 0.8480
Epoch 8/15
 - 1s - loss: 0.3430 - acc: 0.8546 - val_loss: 0.3546 - val_acc: 0.8457
Epoch 9/15
 - 1s - loss: 0.3431 - acc: 0.8536 - val_loss: 0.3492 - val_acc: 0.8469
Epoch 10/15
 - 1s - loss: 0.3408 - acc: 0.8567 - val_loss: 0.3523 - val_acc: 0.8451
Epoch 11/15
 - 1s - loss: 0.3387 - acc: 0.8569 - val_loss: 0.3575 - val_acc: 0.8463
Epoch 12/15
 - 1s - loss: 0.3373 - a

<keras.callbacks.History at 0x7f0e2ce11860>