In [1]:
import numpy as np
import pandas as pd

pos_train_data = pd.read_csv('train_pos.tsv',sep = '\t')
neg_train_data = pd.read_csv('train_neg.tsv',sep = '\t')
pos_test_data = pd.read_csv('test_pos.tsv',sep = '\t')
neg_test_data = pd.read_csv('test_neg.tsv',sep = '\t')

In [2]:
pos_train_data = pos_train_data[['Text','Sentiment']]
neg_train_data = neg_train_data[['Text','Sentiment']]
pos_test_data = pos_test_data[['Text','Sentiment']]
neg_test_data = neg_test_data[['Text','Sentiment']]

In [3]:
data_train = pd.concat([pos_train_data,neg_train_data],ignore_index = True)
data_train = data_train.sample(frac=1).reset_index(drop=True)
data_train.head()

Unnamed: 0,Text,Sentiment
0,Im still in doubt if this is just a horrible m...,0
1,I used to watch this on either HBO or Showtime...,1
2,I just watched this for the first time in a lo...,1
3,I found the writing in this movie absolutely t...,0
4,"In all honesty, this series is as much a class...",1


In [4]:
len(data_train)

25000

In [5]:
data_test = pd.concat([pos_test_data,neg_test_data],ignore_index = True)
data_test = data_test.sample(frac=1).reset_index(drop=True)
data_test.head()

Unnamed: 0,Text,Sentiment
0,This movie totally sucked!!! Don't even rent i...,0
1,POSSIBLE SPOILERS --<br /><br />I love Dennis ...,0
2,***SPOILERS*** ***SPOILERS*** What's going on ...,0
3,No one can argue with it. This IS and WILL BE ...,1
4,"A fine young cast is wasted in this empty, maw...",0


In [6]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from string import punctuation

stop_words = set(stopwords.words('english'))
table = str.maketrans('', '', punctuation)

def textclean(text):
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word.isalpha()]
    tokens = [w.translate(table) for w in tokens]
    tokens = [word for word in tokens if not word in stop_words]
    tokens = [word for word in tokens if len(word) > 1]
    return tokens

In [7]:
reviews = []

for index,row in data_train.iterrows():
    text = (row['Text'].lower())    
    reviews.append(textclean(text))
reviews[0]

['im',
 'still',
 'doubt',
 'horrible',
 'movie',
 'worse',
 'movie',
 'ever',
 'saw',
 'actors',
 'painful',
 'impossible',
 'get',
 'br',
 'br',
 'waist',
 'time',
 'movie',
 'submitting',
 'comment',
 'agreeing',
 'terms',
 'laid',
 'copyright',
 'statement',
 'submission',
 'must',
 'original',
 'work',
 'comments',
 'normally',
 'posted',
 'site',
 'within',
 'business',
 'days',
 'comments',
 'meet',
 'guidelines',
 'posted',
 'please',
 'write',
 'english',
 'html',
 'boards',
 'supported',
 'though',
 'paragraph',
 'breaks',
 'inserted',
 'leave',
 'blank',
 'line',
 'submitting',
 'comment',
 'agreeing',
 'terms',
 'laid',
 'copyright',
 'statement',
 'submission',
 'must',
 'original',
 'work',
 'comments',
 'normally',
 'posted',
 'site',
 'within',
 'business',
 'days',
 'comments',
 'meet',
 'guidelines',
 'posted',
 'please',
 'write',
 'english',
 'html',
 'boards',
 'supported',
 'though',
 'paragraph',
 'breaks',
 'inserted',
 'leave',
 'blank',
 'line',
 'paragraph']

In [8]:
import itertools
linked_reviews = list(itertools.chain.from_iterable(reviews))

vocab_freq = dict()

linked_reviews[1]

'still'

In [9]:
for word in linked_reviews:
    if word not in vocab_freq:
        vocab_freq[word] = 1
    else:
        vocab_freq[word] += 1

In [10]:
vocab_freq

{'capture': 281,
 'mathematicians': 2,
 'vidpic': 1,
 'charactures': 1,
 'cinematographers': 8,
 'mystics': 5,
 'blankman': 3,
 'jeayes': 1,
 'spiralled': 1,
 'splattered': 12,
 'stronghold': 3,
 'katch': 2,
 'proud': 181,
 'diwana': 1,
 'oldman': 7,
 'spicing': 2,
 'radioland': 1,
 'kraft': 6,
 'retromedia': 6,
 'dawnfall': 1,
 'baltic': 4,
 'analogies': 9,
 'seeming': 56,
 'overplaying': 4,
 'scorer': 1,
 'kastle': 1,
 'hulbert': 1,
 'casual': 68,
 'sheilah': 3,
 'futuristic': 112,
 'hillsides': 1,
 'massacrenot': 1,
 'inanely': 1,
 'anecdotes': 10,
 'socal': 3,
 'experiments': 85,
 'uselessly': 6,
 'gremlin': 4,
 'graff': 1,
 'buster': 74,
 'guility': 1,
 'intricate': 57,
 'domestication': 5,
 'analyzer': 1,
 'gameboys': 1,
 'biryani': 2,
 'feasible': 5,
 'tampers': 1,
 'discipleship': 1,
 'mileage': 6,
 'transporting': 10,
 'aint': 6,
 'cellophane': 1,
 'scheffer': 2,
 'dwarfs': 23,
 'greenaway': 37,
 'episodes': 919,
 'tatou': 3,
 'estes': 9,
 'veterinarian': 5,
 'gawping': 1,
 'c

In [11]:
import operator

sorted_vocab_freq = list(reversed(sorted(vocab_freq.items(), key=operator.itemgetter(1))))

In [12]:
len(sorted_vocab_freq)

71238

In [13]:
TOTAL_VOCAB = 5000

word_to_id = dict()
id_to_word = dict()
for i in range(TOTAL_VOCAB):
    word_to_id[sorted_vocab_freq[i][0]] = i
    id_to_word[i] = sorted_vocab_freq[i][0]

In [14]:
id_to_word[0]

'br'

In [15]:
review_lengths = pd.DataFrame([len(review) for review in reviews])
review_lengths.columns = ['Len']

review_lengths

Unnamed: 0,Len
0,89
1,226
2,56
3,66
4,76
5,25
6,29
7,237
8,166
9,227


In [16]:
review_lengths.describe()

Unnamed: 0,Len
count,25000.0
mean,118.36848
std,89.42677
min,4.0
25%,63.0
50%,88.0
75%,144.0
max,1409.0


In [17]:
#Removal of outliers using Tukey's Method
first_q = review_lengths.Len.quantile([0.25])[0.25]
third_q = review_lengths.Len.quantile([0.75])[0.75]

upper_threshold = third_q + 1.5*(third_q-first_q)
lower_threshold = first_q - 1.5*(third_q-first_q)

upper_threshold,lower_threshold

(265.5, -58.5)

In [18]:
def convert(l):
    new_l = []
    for word in l:
        if word in word_to_id:
            new_l.append(word_to_id[word])
    return new_l

In [19]:
len(reviews)

25000

In [20]:
len(data_train['Sentiment'])

25000

In [21]:
X_train = []
y_train = []

for i in range(len(data_train)):
    converted_review = convert(reviews[i])
    if len(converted_review) <= upper_threshold:
        X_train.append(converted_review)
        y_train.append(data_train['Sentiment'][i])

In [22]:
X_train = np.array(X_train)
y_train = np.array(y_train)

In [23]:
import keras
from keras.preprocessing import sequence

X_train = sequence.pad_sequences(X_train, maxlen=int(upper_threshold),value = 0)

Using TensorFlow backend.


In [24]:
X_train.shape,y_train.shape

((24011, 265), (24011,))

In [25]:
data_test = pd.concat([pos_test_data,neg_test_data],ignore_index = True)
data_test = data_test.sample(frac=0.3).reset_index(drop=True)

validation_reviews = []

for index,row in data_test.iterrows():
    text = (row['Text'].lower())
    validation_reviews.append(textclean(text))
    
X_val = []
y_val = []

for i in range(len(data_test)):
    converted_review = convert(validation_reviews[i])
    if len(converted_review) <= upper_threshold:
        X_val.append(converted_review)
        y_val.append(data_test['Sentiment'][i])
        
X_val = np.array(X_val)
X_val = sequence.pad_sequences(X_val, maxlen=int(upper_threshold),value = 0)
y_val = np.array(y_val)

In [26]:
X_val.shape

(7238, 265)

In [29]:
from keras.models import Sequential
from keras.layers import Dense,Activation,Dropout,Conv1D,Flatten
from keras.layers import Embedding

EMBEDDING_LEN = 32

model = Sequential()

model.add(Embedding(TOTAL_VOCAB,EMBEDDING_LEN,input_length = int(upper_threshold)))
model.add(Conv1D(128,3,padding = 'same'))
model.add(Conv1D(64,3,padding = 'same'))
model.add(Conv1D(32,2,padding = 'same'))
model.add(Conv1D(16,2,padding = 'same'))
model.add(Flatten())
model.add(Dropout(0.2))
model.add(Dense(100,activation = 'sigmoid'))
model.add(Dropout(0.2))
model.add(Dense(1,activation='sigmoid'))

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 265, 32)           160000    
_________________________________________________________________
conv1d_5 (Conv1D)            (None, 265, 128)          12416     
_________________________________________________________________
conv1d_6 (Conv1D)            (None, 265, 64)           24640     
_________________________________________________________________
conv1d_7 (Conv1D)            (None, 265, 32)           4128      
_________________________________________________________________
conv1d_8 (Conv1D)            (None, 265, 16)           1040      
_________________________________________________________________
flatten_2 (Flatten)          (None, 4240)              0         
_________________________________________________________________
dropout_3 (Dropout)          (None, 4240)              0         
__________

In [30]:
model.compile(loss = 'binary_crossentropy',optimizer = 'adam',metrics = ['accuracy'])

In [31]:
model.fit(X_train,y_train,validation_data = (X_val,y_val),epochs = 3,batch_size = 64)

Train on 24011 samples, validate on 7238 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7ff56798cf98>