In [1]:
import re
import time

from bs4 import BeautifulSoup

import numpy as np
import pandas as pd      

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split, cross_val_score
from gensim.models import Word2Vec

import nltk
from nltk.corpus import stopwords 

import warnings
warnings.filterwarnings('ignore')

In [2]:
data = pd.read_csv("TrainData.tsv", header=0, \
                    delimiter="\t", quoting=3)

In [3]:
data.head()

Unnamed: 0,id,sentiment,review
0,"""5814_8""",1,"""With all this stuff going down at the moment ..."
1,"""2381_9""",1,"""\""The Classic War of the Worlds\"" by Timothy ..."
2,"""7759_3""",0,"""The film starts with a manager (Nicholas Bell..."
3,"""3630_4""",0,"""It must be assumed that those who praised thi..."
4,"""9495_8""",1,"""Superbly trashy and wondrously unpretentious ..."


In [4]:
data.shape

(25000, 3)

In [5]:
train,test = train_test_split(data,test_size = 0.3,random_state = 1)

In [6]:
train.head()

Unnamed: 0,id,sentiment,review
4289,"""4456_1""",0,"""This is possibly the worst film I've ever see..."
19621,"""914_3""",0,"""(Review in English, since Swedish is not allo..."
14965,"""734_10""",1,"""This film is a jolt of punk rock fun, from st..."
12321,"""10061_4""",0,"""'The Curse of Frankenstein' sticks faithfully..."
6269,"""4556_8""",1,"""I watched this film in shire joy.<br /><br />..."


In [7]:
train.reset_index(inplace= True)

In [8]:
train.head()

Unnamed: 0,index,id,sentiment,review
0,4289,"""4456_1""",0,"""This is possibly the worst film I've ever see..."
1,19621,"""914_3""",0,"""(Review in English, since Swedish is not allo..."
2,14965,"""734_10""",1,"""This film is a jolt of punk rock fun, from st..."
3,12321,"""10061_4""",0,"""'The Curse of Frankenstein' sticks faithfully..."
4,6269,"""4556_8""",1,"""I watched this film in shire joy.<br /><br />..."


In [9]:
train.shape, test.shape

((17500, 4), (7500, 3))

In [10]:
letters_only = re.sub("[^a-zA-Z]",           # The pattern to search for
                      " ",                   # The pattern to replace it with
                      BeautifulSoup(train["review"][0])  .get_text() )  # The text to search
print(letters_only)

 This is possibly the worst film I ve ever seen  The fact that it has a flimsy storyline is bad enough  that they ve hooked it around the subject of football violence makes it     times worse I had severe doubts about the premise of this film even before I started watching  but went into it open minded enough even to accept the way that the writers saw fit to introduce Elijah Wood s character Matt into the hooligan scene But the film throws up inaccuracy after inaccuracy  to the point that by the middle of the film each one makes you cringe harder than the time before Let s clear up a few things  Hooligans don t tend to virtually smash up their own pub before a run of the mill league game  they don t set out to kill each other  they don t ONLY wear Stone Island  and others in the crowd  hooligans or not  do   They most certainly don t  when having taken exception to a new firm member  trot off to their rival firms territory for pie and mash  And I d love to meet the hool who would go a

In [11]:
print(stopwords.words("english"))

[u'i', u'me', u'my', u'myself', u'we', u'our', u'ours', u'ourselves', u'you', u'your', u'yours', u'yourself', u'yourselves', u'he', u'him', u'his', u'himself', u'she', u'her', u'hers', u'herself', u'it', u'its', u'itself', u'they', u'them', u'their', u'theirs', u'themselves', u'what', u'which', u'who', u'whom', u'this', u'that', u'these', u'those', u'am', u'is', u'are', u'was', u'were', u'be', u'been', u'being', u'have', u'has', u'had', u'having', u'do', u'does', u'did', u'doing', u'a', u'an', u'the', u'and', u'but', u'if', u'or', u'because', u'as', u'until', u'while', u'of', u'at', u'by', u'for', u'with', u'about', u'against', u'between', u'into', u'through', u'during', u'before', u'after', u'above', u'below', u'to', u'from', u'up', u'down', u'in', u'out', u'on', u'off', u'over', u'under', u'again', u'further', u'then', u'once', u'here', u'there', u'when', u'where', u'why', u'how', u'all', u'any', u'both', u'each', u'few', u'more', u'most', u'other', u'some', u'such', u'no', u'nor', u

In [12]:
def review_to_words( raw_review ):
    # Function to convert a raw review to a string of words
    # The input is a single string (a raw movie review), and 
    # the output is a single string (a preprocessed movie review)
    #
    # 1. Remove HTML
    review_text = BeautifulSoup(raw_review).get_text() 
    #
    # 2. Remove non-letters        
    letters_only = re.sub("[^a-zA-Z]", " ", review_text) 
    #
    # 3. Convert to lower case, split into individual words
    words = letters_only.lower().split()                             
    #
    # 4. In Python, searching a set is much faster than searching
    #   a list, so convert the stop words to a set
    stops = set(stopwords.words("english"))                  
    # 
    # 5. Remove stop words
    meaningful_words = [w for w in words if not w in stops]   
    #
    # 6. Join the words back into one string separated by space, 
    # and return the result.
    return( " ".join( meaningful_words ))   

In [13]:
num_reviews = train["review"].size
print(num_reviews)

17500


In [14]:
clean_train_reviews = []
for i in range( 0, num_reviews ):
    # If the index is evenly divisible by 1000, print a message
    if( (i+1)%1000 == 0 ):
        print("Review %d of %d" % ( i+1, num_reviews ))                                                                 
    clean_train_reviews.append( review_to_words( train["review"][i] ))

Review 1000 of 17500
Review 2000 of 17500
Review 3000 of 17500
Review 4000 of 17500
Review 5000 of 17500
Review 6000 of 17500
Review 7000 of 17500
Review 8000 of 17500
Review 9000 of 17500
Review 10000 of 17500
Review 11000 of 17500
Review 12000 of 17500
Review 13000 of 17500
Review 14000 of 17500
Review 15000 of 17500
Review 16000 of 17500
Review 17000 of 17500


In [16]:
clean_train_reviews[0]

u'possibly worst film ever seen fact flimsy storyline bad enough hooked around subject football violence makes times worse severe doubts premise film even started watching went open minded enough even accept way writers saw fit introduce elijah wood character matt hooligan scene film throws inaccuracy inaccuracy point middle film one makes cringe harder time let clear things hooligans tend virtually smash pub run mill league game set kill wear stone island others crowd hooligans certainly taken exception new firm member trot rival firms territory pie mash love meet hool would go grass firm top boy rival firm although scratch said setting kill one exist get wrong yet see film subject contain fantasy whims par firm cluelessness found ironical wood american nemesis morally condemned character cocaine user part parcel british hooligan scene film chooses challenge wood morals instead steers clear firm using coke could go think made point plot highly unimaginative sure spent entire film bemo

In [17]:
vectorizer = CountVectorizer(analyzer = "word",   \
                             tokenizer = None,    \
                             preprocessor = None, \
                             stop_words = None,   \
                             max_features = 5000) 

# fit_transform() does two functions: First, it fits the model
# and learns the vocabulary; second, it transforms our training data
# into feature vectors. The input to fit_transform should be a list of 
# strings.
train_data_features = vectorizer.fit_transform(clean_train_reviews)

# Numpy arrays are easy to work with, so convert the result to an 
# array
train_data_features = train_data_features.toarray()
print(train_data_features.shape)

(17500, 5000)


In [18]:
train_data_features

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ..., 
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [19]:
# Take a look at the words in the vocabulary
vocab = vectorizer.get_feature_names()
print(vocab[1:100])

[u'abc', u'abilities', u'ability', u'able', u'abraham', u'absence', u'absent', u'absolute', u'absolutely', u'absorbed', u'absurd', u'absurdity', u'abuse', u'abusive', u'abysmal', u'academy', u'accent', u'accents', u'accept', u'acceptable', u'accepted', u'access', u'accident', u'accidentally', u'accompanied', u'accomplished', u'according', u'account', u'accuracy', u'accurate', u'accused', u'achieve', u'achieved', u'achievement', u'acid', u'across', u'act', u'acted', u'acting', u'action', u'actions', u'active', u'activities', u'actor', u'actors', u'actress', u'actresses', u'acts', u'actual', u'actually', u'ad', u'adam', u'adams', u'adaptation', u'adaptations', u'adapted', u'add', u'added', u'adding', u'addition', u'adds', u'adequate', u'admire', u'admit', u'admittedly', u'adolescent', u'adorable', u'adult', u'adults', u'advance', u'advanced', u'advantage', u'adventure', u'adventures', u'advertising', u'advice', u'advise', u'affair', u'affect', u'affected', u'afford', u'aforementioned', u

In [20]:
# Sum up the counts of each vocabulary word
dist = np.sum(train_data_features, axis=0)

# For each, print the vocabulary word and the number of times it 
# appears in the training set
for count, tag in sorted([(count, tag) for tag, count in zip(vocab, dist)], reverse=True)[1:20]:
    print(count, tag)

(28160, u'film')
(18733, u'one')
(14231, u'like')
(10739, u'good')
(8896, u'even')
(8888, u'time')
(8669, u'would')
(8526, u'story')
(8298, u'really')
(7933, u'see')
(7492, u'well')
(6886, u'much')
(6542, u'bad')
(6490, u'get')
(6476, u'also')
(6444, u'first')
(6442, u'people')
(6393, u'great')
(5834, u'made')


In [35]:
%%time
print("Training the random forest...")
# Initialize a Random Forest classifier with 100 trees
forest = RandomForestClassifier(n_estimators = 200, n_jobs=2, verbose =1) 
# Fit the forest to the training set, using the bag of words as 
# features and the sentiment labels as the response variable
#
# This may take a few minutes to run
forest = forest.fit( train_data_features, train['sentiment'] )

Training the random forest...


[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:   16.0s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:  1.1min


CPU times: user 2min 19s, sys: 316 ms, total: 2min 19s
Wall time: 1min 13s


[Parallel(n_jobs=2)]: Done 200 out of 200 | elapsed:  1.2min finished


In [36]:
print(test.shape)
# Create an empty list and append the clean reviews one by one
num_reviews = len(test["review"])
clean_test_reviews = [] 

(7500, 4)


In [37]:
test.reset_index(inplace=True)

In [38]:
test.head()

Unnamed: 0,level_0,index,id,sentiment,review
0,0,21492,"""2161_10""",1,"""How many of us wish that we could throw away ..."
1,1,9488,"""4950_8""",1,"""Knowing when to end a movie is just as import..."
2,2,16933,"""4942_7""",1,"""I have to admit that for the first half hour ..."
3,3,12604,"""668_7""",1,"""I just watched \""The Last Wave\"" in my school..."
4,4,8222,"""8689_7""",1,"""Perfect cast for a few-person drama. Simon is..."


In [39]:
print("Cleaning and parsing the test set movie reviews...")
for i in range(0,num_reviews):
    if( (i+1) % 5000 == 0 ):
        print("Review %d of %d" % (i+1, num_reviews))
    clean_review = review_to_words( test["review"][i] )
    clean_test_reviews.append( clean_review )

Cleaning and parsing the test set movie reviews...
Review 5000 of 7500


In [40]:
# Get a bag of words for the test set, and convert to a numpy array
test_data_features = vectorizer.transform(clean_test_reviews)
test_data_features = test_data_features.toarray()


In [41]:
test_data_features.shape

(7500, 5000)

In [42]:
# Use the random forest to make sentiment label predictions
result = forest.predict(test_data_features)

[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.2s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:    0.9s
[Parallel(n_jobs=2)]: Done 200 out of 200 | elapsed:    0.9s finished


In [43]:
result.shape

(7500,)

In [44]:
# Copy the results to a pandas dataframe with an "id" column and
# a "sentiment" column
output = pd.DataFrame({"id":test["id"], "sentiment":result} )

In [45]:
output.head()

Unnamed: 0,id,sentiment
0,"""2161_10""",1
1,"""4950_8""",1
2,"""4942_7""",0
3,"""668_7""",0
4,"""8689_7""",1


In [46]:
test.head()

Unnamed: 0,level_0,index,id,sentiment,review
0,0,21492,"""2161_10""",1,"""How many of us wish that we could throw away ..."
1,1,9488,"""4950_8""",1,"""Knowing when to end a movie is just as import..."
2,2,16933,"""4942_7""",1,"""I have to admit that for the first half hour ..."
3,3,12604,"""668_7""",1,"""I just watched \""The Last Wave\"" in my school..."
4,4,8222,"""8689_7""",1,"""Perfect cast for a few-person drama. Simon is..."


In [47]:
from sklearn.metrics import accuracy_score

In [48]:
accuracy_score(test['sentiment'],output['sentiment'] )

0.84813333333333329