# Using LSTM Network

In [1]:
import numpy as np
import pandas as pd

pos_train_data = pd.read_csv('train_pos.tsv',sep = '\t')
neg_train_data = pd.read_csv('train_neg.tsv',sep = '\t')
pos_test_data = pd.read_csv('test_pos.tsv',sep = '\t')
neg_test_data = pd.read_csv('test_neg.tsv',sep = '\t')

In [2]:
pos_train_data = pos_train_data[['Text','Sentiment']]
neg_train_data = neg_train_data[['Text','Sentiment']]
pos_test_data = pos_test_data[['Text','Sentiment']]
neg_test_data = neg_test_data[['Text','Sentiment']]

In [3]:
data_train = pd.concat([pos_train_data,neg_train_data],ignore_index = True)
data_train = data_train.sample(frac=1).reset_index(drop=True)
data_train.head()

Unnamed: 0,Text,Sentiment
0,I just wanna say that amongst all the so-calle...,1
1,A friend once told me that an art-house indepe...,0
2,I saw this movie a couple years back. I could'...,0
3,But this movie was a bore. The history part wa...,0
4,"This guy has no idea of cinema. Okay, it seems...",0


In [4]:
len(data_train)

25000

In [5]:
data_test = pd.concat([pos_test_data,neg_test_data],ignore_index = True)
data_test = data_test.sample(frac=1).reset_index(drop=True)
data_test.head()

Unnamed: 0,Text,Sentiment
0,"This isn't a dreadful film, merely insipid. Th...",0
1,One of master director Alfred Hitchcock's fine...,1
2,The original Female Convict Scorpion is an all...,1
3,"Out of boredom and vast curiosity, I decided t...",0
4,the real plot...<br /><br />A group of post-Ci...,0


In [131]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from string import punctuation

stop_words = set(stopwords.words('english'))
table = str.maketrans('', '', punctuation)

def textclean(text):
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word.isalpha()]
    tokens = [w.translate(table) for w in tokens]
    tokens = [word for word in tokens if not word in stop_words]
    tokens = [word for word in tokens if len(word) > 1]
    return tokens

In [132]:
word_tokenize("What's the matter?")

['What', "'s", 'the', 'matter', '?']

In [133]:
reviews = []

for index,row in data_train.iterrows():
    text = (row['Text'].lower())    
    reviews.append(textclean(text))
reviews[0]

['wan',
 'na',
 'say',
 'amongst',
 'classic',
 'hiphop',
 'films',
 'ive',
 'seen',
 'like',
 'wild',
 'style',
 'krushgroove',
 'breakin',
 'style',
 'wars',
 'etc',
 'imo',
 'beat',
 'street',
 'best',
 'amongst',
 'others',
 'whenever',
 'ask',
 'people',
 'fave',
 'seems',
 'beat',
 'street',
 'pops',
 'still',
 'lowest',
 'ranked',
 'punch',
 'belt',
 'say',
 'points',
 'belt',
 'love',
 'music',
 'performances',
 'breakdancing',
 'makes',
 'wan',
 'na',
 'spin',
 'ramo',
 'makes',
 'wan',
 'na',
 'throw',
 'piece',
 'classic']

In [134]:
import itertools
linked_reviews = list(itertools.chain.from_iterable(reviews))

vocab_freq = dict()

linked_reviews[1]

'na'

In [135]:
for word in linked_reviews:
    if word not in vocab_freq:
        vocab_freq[word] = 1
    else:
        vocab_freq[word] += 1

In [136]:
vocab_freq

{'sites': 38,
 'drea': 2,
 'neelix': 3,
 'psychobilly': 1,
 'sincere': 85,
 'retentive': 2,
 'siodmak': 18,
 'concerning': 115,
 'maître': 1,
 'heller': 4,
 'buttercream': 1,
 'friel': 12,
 'downgrades': 2,
 'ehh': 3,
 'oriental': 32,
 'emery': 6,
 'unreviewed': 1,
 'afficionados': 2,
 'latrine': 3,
 'reins': 7,
 'brahms': 1,
 'touchings': 1,
 'moffat': 4,
 'anesthesia': 8,
 'bombshell': 10,
 'interesting': 3062,
 'getz': 4,
 'symmetric': 1,
 'tykes': 3,
 'favorite': 1221,
 'mulcahy': 4,
 'kravitz': 1,
 'helfer': 1,
 'midriff': 3,
 'fumiya': 2,
 'tenths': 1,
 'perdu': 2,
 'anderson': 218,
 'uninhibited': 14,
 'missions': 36,
 'polnareff': 1,
 'impulses': 14,
 'cindi': 1,
 'sensitivities': 1,
 'campfire': 22,
 'kanpur': 1,
 'anamorphic': 14,
 'bodily': 17,
 'fooled': 92,
 'leeli': 1,
 'expresso': 1,
 'deadhead': 1,
 'tourist': 55,
 'premium': 9,
 'dreads': 4,
 'immobilize': 1,
 'beecham': 1,
 'weighing': 4,
 'assuage': 1,
 'damningly': 1,
 'chile': 21,
 'silvio': 16,
 'poet': 35,
 'sugi

In [137]:
import operator

sorted_vocab_freq = list(reversed(sorted(vocab_freq.items(), key=operator.itemgetter(1))))

In [138]:
len(sorted_vocab_freq)

71238

In [139]:
TOTAL_VOCAB = 5000

word_to_id = dict()
id_to_word = dict()
for i in range(TOTAL_VOCAB):
    word_to_id[sorted_vocab_freq[i][0]] = i
    id_to_word[i] = sorted_vocab_freq[i][0]

In [140]:
id_to_word[0]

'br'

In [141]:
review_lengths = pd.DataFrame([len(review) for review in reviews])
review_lengths.columns = ['Len']

review_lengths

Unnamed: 0,Len
0,54
1,224
2,74
3,62
4,55
5,66
6,78
7,70
8,72
9,60


In [142]:
review_lengths.describe()

Unnamed: 0,Len
count,25000.0
mean,118.36848
std,89.42677
min,4.0
25%,63.0
50%,88.0
75%,144.0
max,1409.0


In [143]:
#Removal of outliers using Tukey's Method
first_q = review_lengths.Len.quantile([0.25])[0.25]
third_q = review_lengths.Len.quantile([0.75])[0.75]

upper_threshold = third_q + 1.5*(third_q-first_q)
lower_threshold = first_q - 1.5*(third_q-first_q)

upper_threshold,lower_threshold

(265.5, -58.5)

In [144]:
def convert(l):
    new_l = []
    for word in l:
        if word in word_to_id:
            new_l.append(word_to_id[word])
    return new_l

In [145]:
len(reviews)

25000

In [146]:
len(data_train['Sentiment'])

25000

In [162]:
X_train = []
y_train = []

for i in range(len(data_train)):
    converted_review = convert(reviews[i])
    if len(converted_review) <= upper_threshold:
        X_train.append(converted_review)
        y_train.append(data_train['Sentiment'][i])

In [163]:
X_train = np.array(X_train)
y_train = np.array(y_train)

In [164]:
import keras
from keras.preprocessing import sequence

X_train = sequence.pad_sequences(X_train, maxlen=int(upper_threshold),value = 0)

In [165]:
X_train.shape,y_train.shape

((24010, 265), (24010,))

In [166]:
data_test = pd.concat([pos_test_data,neg_test_data],ignore_index = True)
data_test = data_test.sample(frac=0.3).reset_index(drop=True)

validation_reviews = []

for index,row in data_test.iterrows():
    text = (row['Text'].lower())
    validation_reviews.append(textclean(text))
    
X_val = []
y_val = []

for i in range(len(data_test)):
    converted_review = convert(validation_reviews[i])
    if len(converted_review) <= upper_threshold:
        X_val.append(converted_review)
        y_val.append(data_test['Sentiment'][i])
        
X_val = np.array(X_val)
X_val = sequence.pad_sequences(X_val, maxlen=int(upper_threshold),value = 0)
y_val = np.array(y_val)

In [167]:
X_val.shape

(7235, 265)

In [172]:
from keras.models import Sequential
from keras.layers import Dense,LSTM,Dropout,Activation
from keras.layers import Embedding

EMBEDDING_LEN = 32

model = Sequential()

model.add(Embedding(TOTAL_VOCAB,EMBEDDING_LEN,input_length = int(upper_threshold)))
model.add(LSTM(100,dropout=0.3,recurrent_dropout=0.2))

model.add(Dense(1,activation = 'sigmoid'))

In [173]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_14 (Embedding)     (None, 265, 32)           160000    
_________________________________________________________________
lstm_14 (LSTM)               (None, 100)               53200     
_________________________________________________________________
dense_14 (Dense)             (None, 1)                 101       
Total params: 213,301
Trainable params: 213,301
Non-trainable params: 0
_________________________________________________________________


In [174]:
model.compile(loss = 'binary_crossentropy',optimizer = 'adam',metrics = ['accuracy'])

In [175]:
model.fit(X_train,y_train,validation_data = (X_val,y_val),epochs = 3,batch_size = 64)

Train on 24010 samples, validate on 7235 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7f988ee05f98>