In [1]:
import numpy as np
import pandas as pd

pos_train_data = pd.read_csv('train_pos.tsv',sep = '\t')
neg_train_data = pd.read_csv('train_neg.tsv',sep = '\t')
pos_test_data = pd.read_csv('test_pos.tsv',sep = '\t')
neg_test_data = pd.read_csv('test_neg.tsv',sep = '\t')

In [2]:
pos_train_data = pos_train_data[['Text','Sentiment']]
neg_train_data = neg_train_data[['Text','Sentiment']]
pos_test_data = pos_test_data[['Text','Sentiment']]
neg_test_data = neg_test_data[['Text','Sentiment']]

In [3]:
data_train = pd.concat([pos_train_data,neg_train_data],ignore_index = True)
data_train = data_train.sample(frac=1).reset_index(drop=True)
data_train.head()

Unnamed: 0,Text,Sentiment
0,Centered in the downtown and out skirts of Det...,1
1,The pilot of Enterprise has one thing that has...,1
2,The competition for the worst Warner Bros Kay ...,0
3,"I don't know much about Tobe Hooper, or why he...",0
4,"Well, not yet, at least.<br /><br />It's not l...",0


In [4]:
len(data_train)

25000

In [5]:
data_test = pd.concat([pos_test_data,neg_test_data],ignore_index = True)
data_test = data_test.sample(frac=1).reset_index(drop=True)
data_test.head()

Unnamed: 0,Text,Sentiment
0,The world now seems to be in an odd stage of d...,0
1,My yardstick for measuring a movie's watch-abi...,1
2,... So some people might argue that this can't...,0
3,Every time I watch this movie blood comes gush...,0
4,John Leguizamo's one man shows are hit or miss...,1


In [6]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from string import punctuation

stop_words = set(stopwords.words('english'))
table = str.maketrans('', '', punctuation)

def textclean(text):
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word.isalpha()]
    tokens = [w.translate(table) for w in tokens]
    tokens = [word for word in tokens if not word in stop_words]
    tokens = [word for word in tokens if len(word) > 1]
    return tokens

In [7]:
reviews = []

for index,row in data_train.iterrows():
    text = (row['Text'].lower())    
    reviews.append(textclean(text))
reviews[0]

['centered',
 'downtown',
 'skirts',
 'detroit',
 'comedy',
 'found',
 'terrific',
 'new',
 'comedic',
 'duo',
 'pat',
 'morita',
 'funny',
 'man',
 'happens',
 'cop',
 'japan',
 'trail',
 'industrial',
 'secrets',
 'thief',
 'stolen',
 'type',
 'turbo',
 'super',
 'charger',
 'reluctantly',
 'goes',
 'united',
 'states',
 'follow',
 'thief',
 'ordered',
 'commander',
 'pat',
 'character',
 'collides',
 'leno',
 'character',
 'fast',
 'talking',
 'type',
 'detroit',
 'cop',
 'cross',
 'paths',
 'though',
 'honorable',
 'japan',
 'meet',
 'old',
 'school',
 'detroit',
 'police',
 'investigative',
 'two',
 'stumble',
 'trip',
 'first',
 'develop',
 'turns',
 'explosive',
 'two',
 'layered',
 'powerhouse',
 'team',
 'solves',
 'case',
 'cold',
 'battling',
 'city',
 'crime',
 'boss',
 'stolen',
 'closing',
 'case',
 'two',
 'go',
 'despising',
 'friends',
 'working',
 'well',
 'together',
 'little',
 'worse',
 'wear',
 'need',
 'top',
 'manage',
 'come',
 'victorious',
 'closing',
 'rated

In [19]:
import gensim
from gensim.models.doc2vec import 

d2v_reviews = []
for i in range(len(reviews)):
    d2v_reviews.append(TaggedDocument(words=reviews[i], tags=['REV_'+str(i)]))

In [21]:
d2v_reviews[25]

TaggedDocument(words=['charming', 'even', 'begin', 'describe', 'saving', 'grace', 'absolutely', 'irresistible', 'anyone', 'ventures', 'movie', 'leave', 'spirits', 'soaring', 'high', 'haha', 'br', 'br', 'grace', 'trevethyn', 'brenda', 'blethyn', 'lost', 'husband', 'problems', 'get', 'whole', 'lot', 'worse', 'dearly', 'departed', 'left', 'money', 'outstanding', 'debts', 'faced', 'losing', 'everything', 'find', 'way', 'get', 'lot', 'cash', 'fast', 'gets', 'idea', 'gardener', 'matthew', 'craig', 'ferguson', 'asks', 'horticulturist', 'give', 'advice', 'plant', 'secretly', 'growing', 'grace', 'immediately', 'realizes', 'plant', 'marijuana', 'decide', 'use', 'gardening', 'skills', 'grow', 'lot', 'weed', 'sell', 'pay', 'outstanding', 'br', 'br', 'notable', 'quality', 'saving', 'grace', 'likability', 'every', 'character', 'extremely', 'sympathetic', 'save', 'first', 'minutes', 'film', 'good', 'cheer', 'everyone', 'wants', 'happy', 'ending', 'everyone', 'even', 'means', 'turning', 'blind', 'eye'

In [22]:
import gensim
from gensim.models import Doc2Vec

n_dim = 30

d2v_model = Doc2Vec(d2v_reviews,size=30)

In [32]:
d2v_model.docvecs['REV_3']

array([ 0.0631503 ,  0.0511961 ,  0.05992578,  0.0918224 ,  0.08400291,
       -0.03525776,  0.04899835,  0.0027545 , -0.03045426,  0.06144393,
       -0.08076708, -0.00165234,  0.07002113,  0.05277919,  0.03682449,
        0.04821666,  0.00528483, -0.03040392,  0.07209196, -0.02712625,
        0.0763872 ,  0.17796735, -0.05163846, -0.04621296, -0.06108721,
       -0.03243967,  0.03553853,  0.11175599,  0.04035437, -0.00036282], dtype=float32)

In [33]:
len(d2v_model.docvecs)

25000

In [34]:
X_train = []
y_train = []

for i in range(len(data_train)):
    X_train.append(d2v_model.docvecs['REV_'+str(i)])
    y_train.append(data_train['Sentiment'][i])

In [36]:
from sklearn.svm import SVC

clf = SVC()
clf = clf.fit(X_train,y_train)

In [37]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(analyzer=lambda x: x, min_df=10)
matrix = vectorizer.fit_transform(reviews)
tfidf = dict(zip(vectorizer.get_feature_names(), vectorizer.idf_))

In [38]:
def create_word_vector(l,size):
    vector = np.zeros(size).reshape((1,size))
    count = 0.
    for word in l:
        try:
            vector += d2v_model.wv[word].reshape((1, size)) * tfidf[word]
            count+=1
        except KeyError:
            continue
            
    if count!=0:
        vector /= count
    return vector  

In [42]:
from sklearn.preprocessing import scale

data_test = pd.concat([pos_test_data,neg_test_data],ignore_index = True)
data_test = data_test.sample(frac=0.3).reset_index(drop=True)

validation_reviews = []

for index,row in data_test.iterrows():
    text = (row['Text'].lower())
    validation_reviews.append(textclean(text))
    
X_val = []
y_val = []

for i in range(len(data_test)):
    converted_review = create_word_vector(validation_reviews[i],n_dim)
    X_val.append(converted_review)
    y_val.append(data_test['Sentiment'][i])
        
X_val = np.concatenate(X_val)
X_val = scale(X_val)
y_val = np.array(y_val)

In [57]:
print(clf.predict(X_val[4].reshape(1,-1)))

[0]


In [60]:
data_test['Text'][4]

"I didn't hate this movie as much as some on my all time black list, but I consider it a total wast of film. Jeremy Irons, Iron Jeremy, Ron Jeremy. Think about it. Scene one is very good, all the rest are crap."

In [62]:
from sklearn.metrics import accuracy_score

print(accuracy_score(y_val,clf.predict(X_val)))

0.795733333333
