In [1]:
# read the data for training
import pandas as pd

# header = 0 to use row 0 as header
# delimeter '\t' to indicate fields are seperated by tabs
# quoting=3 to ignore double quotes while reading the file
data = pd.read_csv('labeledTrainData.tsv', header=0, quoting=3, delimiter='\t')
train = data['review']
train_label = data['sentiment']

Exploring the data

it has 50,000 rows with two clumns (id, reviews)

In [2]:
train.head()

0    "With all this stuff going down at the moment ...
1    "\"The Classic War of the Worlds\" by Timothy ...
2    "The film starts with a manager (Nicholas Bell...
3    "It must be assumed that those who praised thi...
4    "Superbly trashy and wondrously unpretentious ...
Name: review, dtype: object

In [3]:
train.shape

(25000L,)

## Data cleansing
we need to clean these reviews (html tags) and make them ready to be used for machine learning.

In [5]:
# CLEANING THE REVIEWS OF MARKUP TAGS

from bs4 import BeautifulSoup
# using BeautifulSoup is considered a more reliable than regex to remove markup tags

# initialize beautifulsoup on a single review text
example1 = BeautifulSoup(train[1])

# compare the data
print 'Modified--------'
print example1.get_text()
print 'Original--------'
print train[1]

Modified--------
"\"The Classic War of the Worlds\" by Timothy Hines is a very entertaining film that obviously goes to great effort and lengths to faithfully recreate H. G. Wells' classic book. Mr. Hines succeeds in doing so. I, and those who watched his film with me, appreciated the fact that it was not the standard, predictable Hollywood fare that comes out every year, e.g. the Spielberg version with Tom Cruise that had only the slightest resemblance to the book. Obviously, everyone looks for different things in a movie. Those who envision themselves as amateur \"critics\" look only to criticize everything they can. Others rate a movie on more important bases,like being entertained, which is why most people never agree with the \"critics\". We enjoyed the effort Mr. Hines put into being faithful to H.G. Wells' classic novel, and we found it to be very entertaining. This made it easy to overlook what the \"critics\" perceive to be its shortcomings."
Original--------
"\"The Classic Wa

## Cleaning text of punctuations, stopwords and numbers

While cleaning a text we should keep in mind the data problem we are trying to solve.
For many  many problems cleaning of punctuation make sense. But for sentiment analysis puctuations can carry sentimens [ :-) !!]. For simplicity we are going to remove punctuations and numbers. Although we can replace numbers with placeholders ( NUM_1).


In [6]:
# to find and replace we'll use regex
import re

only_words = re.sub('[^a-zA-Z]',         # pattern to search for
                    ' ',                 # pattern to replace with
                    example1.get_text()) # text to search

only_words = only_words.lower()          # convert to lowercase

print only_words

words = only_words.split()               # split into words

   the classic war of the worlds   by timothy hines is a very entertaining film that obviously goes to great effort and lengths to faithfully recreate h  g  wells  classic book  mr  hines succeeds in doing so  i  and those who watched his film with me  appreciated the fact that it was not the standard  predictable hollywood fare that comes out every year  e g  the spielberg version with tom cruise that had only the slightest resemblance to the book  obviously  everyone looks for different things in a movie  those who envision themselves as amateur   critics   look only to criticize everything they can  others rate a movie on more important bases like being entertained  which is why most people never agree with the   critics    we enjoyed the effort mr  hines put into being faithful to h g  wells  classic novel  and we found it to be very entertaining  this made it easy to overlook what the   critics   perceive to be its shortcomings  


In [7]:
import nltk
#nltk.download()  # download text datasets including stop words

In [8]:
from nltk.corpus import stopwords
#print stopwords.words('english')

# remove stopwords from words
words = [word for word in words if not word in stopwords.words('english')]
print words

# u is just indicating that python is internally representing each word as unicode string.
# potter stemming and lemmatizing can used, to consider word like message, messaging same.
# they are the part of nltk

[u'classic', u'war', u'worlds', u'timothy', u'hines', u'entertaining', u'film', u'obviously', u'goes', u'great', u'effort', u'lengths', u'faithfully', u'recreate', u'h', u'g', u'wells', u'classic', u'book', u'mr', u'hines', u'succeeds', u'watched', u'film', u'appreciated', u'fact', u'standard', u'predictable', u'hollywood', u'fare', u'comes', u'every', u'year', u'e', u'g', u'spielberg', u'version', u'tom', u'cruise', u'slightest', u'resemblance', u'book', u'obviously', u'everyone', u'looks', u'different', u'things', u'movie', u'envision', u'amateur', u'critics', u'look', u'criticize', u'everything', u'others', u'rate', u'movie', u'important', u'bases', u'like', u'entertained', u'people', u'never', u'agree', u'critics', u'enjoyed', u'effort', u'mr', u'hines', u'put', u'faithful', u'h', u'g', u'wells', u'classic', u'novel', u'found', u'entertaining', u'made', u'easy', u'overlook', u'critics', u'perceive', u'shortcomings']


In [9]:
# wrapping all part of pre-processing in one function
def review_to_words(raw_review):
    '''
    Function to process a raw review and return a pre-processed movie review
    '''
    # 1.Remove html tags
    review_text = BeautifulSoup(raw_review).get_text()
    # 2.Remove punctuations and numbers
    all_words = re.sub('[^a-zA-Z]', ' ', review_text)
    # 3.Make text case-insensitive and split to words
    words = all_words.lower().split()
    # 4.Remove stop words
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word not in stop_words]
    
    return ' '.join(words)

In [10]:
print 'Before processing----------'
print example1.get_text()
print 'After processing-----------'
review_to_words(example1.get_text())

Before processing----------
"\"The Classic War of the Worlds\" by Timothy Hines is a very entertaining film that obviously goes to great effort and lengths to faithfully recreate H. G. Wells' classic book. Mr. Hines succeeds in doing so. I, and those who watched his film with me, appreciated the fact that it was not the standard, predictable Hollywood fare that comes out every year, e.g. the Spielberg version with Tom Cruise that had only the slightest resemblance to the book. Obviously, everyone looks for different things in a movie. Those who envision themselves as amateur \"critics\" look only to criticize everything they can. Others rate a movie on more important bases,like being entertained, which is why most people never agree with the \"critics\". We enjoyed the effort Mr. Hines put into being faithful to H.G. Wells' classic novel, and we found it to be very entertaining. This made it easy to overlook what the \"critics\" perceive to be its shortcomings."
After processing-------

u'classic war worlds timothy hines entertaining film obviously goes great effort lengths faithfully recreate h g wells classic book mr hines succeeds watched film appreciated fact standard predictable hollywood fare comes every year e g spielberg version tom cruise slightest resemblance book obviously everyone looks different things movie envision amateur critics look criticize everything others rate movie important bases like entertained people never agree critics enjoyed effort mr hines put faithful h g wells classic novel found entertaining made easy overlook critics perceive shortcomings'

In [11]:
import time

print "Looping over and cleaning all reviews"

# 1.get the size of review column
review_len = train.size
# 2.initialize an empty list to hold all cleaned reviews
clean_review = []
start_time = time.time()
# 3.loop over all reviews
for i in range(0, review_len, 1):
    # 4.use function review _to_words to get a processed review
    clean_review.append(review_to_words(train[i]))
    # 5.print the status
    if (i+1) % 10000 == 0:
        print '{}/{} reviews processed. Time elapsed: {} sec'.format(i, review_len, (time.time()-start_time))
        start_time = time.time()
# 6.print final status

Looping over and cleaning all reviews
9999/25000 reviews processed. Time elapsed: 51.0209999084 sec
19999/25000 reviews processed. Time elapsed: 50.4140000343 sec


### Creating features from bag of words

In [12]:
# we are going to select 5000 most frequent words as our vacabulary
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(analyzer='word',
                             tokenizer=None,
                             preprocessor=None,
                             stop_words=None,
                             max_features=5000)
# fit_transform does two things:
# fits the model and learn the vocab
# transform text to feature vectors
train_data_features = vectorizer.fit_transform(clean_review)

# easy to deal with numpy array
train_data_features = train_data_features.toarray()

In [13]:
train_data_features.shape

(25000L, 5000L)

In [14]:
# words in the vocabulary
vocab = vectorizer.get_feature_names()
vocab

[u'abandoned',
 u'abc',
 u'abilities',
 u'ability',
 u'able',
 u'abraham',
 u'absence',
 u'absent',
 u'absolute',
 u'absolutely',
 u'absurd',
 u'abuse',
 u'abusive',
 u'abysmal',
 u'academy',
 u'accent',
 u'accents',
 u'accept',
 u'acceptable',
 u'accepted',
 u'access',
 u'accident',
 u'accidentally',
 u'accompanied',
 u'accomplished',
 u'according',
 u'account',
 u'accuracy',
 u'accurate',
 u'accused',
 u'achieve',
 u'achieved',
 u'achievement',
 u'acid',
 u'across',
 u'act',
 u'acted',
 u'acting',
 u'action',
 u'actions',
 u'activities',
 u'actor',
 u'actors',
 u'actress',
 u'actresses',
 u'acts',
 u'actual',
 u'actually',
 u'ad',
 u'adam',
 u'adams',
 u'adaptation',
 u'adaptations',
 u'adapted',
 u'add',
 u'added',
 u'adding',
 u'addition',
 u'adds',
 u'adequate',
 u'admire',
 u'admit',
 u'admittedly',
 u'adorable',
 u'adult',
 u'adults',
 u'advance',
 u'advanced',
 u'advantage',
 u'adventure',
 u'adventures',
 u'advertising',
 u'advice',
 u'advise',
 u'affair',
 u'affect',
 u'affec

In [15]:
# visualize the count of each words in feature list
import numpy as np

dist = np.sum(train_data_features, axis=1)

for word, count in zip(vocab, dist):
    print '{}: {}'.format(word, count)

abandoned: 187
abc: 67
abilities: 169
ability: 122
able: 164
abraham: 41
absence: 51
absent: 55
absolute: 77
absolutely: 21
absurd: 20
abuse: 63
abusive: 145
abysmal: 59
academy: 49
accent: 85
accents: 172
accept: 195
acceptable: 93
accepted: 52
access: 100
accident: 112
accidentally: 98
accompanied: 21
accomplished: 19
according: 57
account: 92
accuracy: 110
accurate: 295
accused: 56
achieve: 23
achieved: 52
achievement: 58
acid: 140
across: 154
act: 140
acted: 56
acting: 58
action: 79
actions: 203
activities: 129
actor: 50
actors: 62
actress: 64
actresses: 78
acts: 50
actual: 30
actually: 47
ad: 103
adam: 89
adams: 70
adaptation: 42
adaptations: 83
adapted: 53
add: 21
added: 60
adding: 65
addition: 66
adds: 28
adequate: 172
admire: 223
admit: 96
admittedly: 52
adorable: 65
adult: 56
adults: 25
advance: 366
advanced: 28
advantage: 83
adventure: 51
adventures: 54
advertising: 179
advice: 77
advise: 85
affair: 58
affect: 63
affected: 107
afford: 40
aforementioned: 92
afraid: 55
africa: 

dogs: 113
doll: 86
dollar: 145
dollars: 170
dolls: 125
dolph: 39
domestic: 175
domino: 69
donald: 197
done: 172
donna: 84
doo: 22
doom: 145
doomed: 88
door: 145
doors: 135
dorothy: 72
double: 304
doubt: 28
doubts: 176
douglas: 50
downey: 57
downhill: 48
downright: 36
dozen: 46
dozens: 96
dr: 61
dracula: 134
drag: 101
dragged: 283
dragon: 86
drags: 83
drake: 55
drama: 138
dramas: 62
dramatic: 74
draw: 182
drawing: 69
drawn: 33
draws: 171
dreadful: 149
dream: 63
dreams: 56
dreary: 74
dreck: 105
dress: 95
dressed: 136
dressing: 52
drew: 203
drink: 148
drinking: 61
drive: 63
drivel: 118
driven: 72
driver: 62
drives: 31
driving: 61
drop: 80
dropped: 69
dropping: 36
drops: 337
drug: 69
drugs: 55
drunk: 44
drunken: 201
dry: 119
dub: 65
dubbed: 351
dubbing: 54
dud: 105
dude: 60
due: 62
duke: 78
dull: 62
dumb: 126
duo: 40
dust: 43
dutch: 135
duty: 98
dvd: 32
dying: 135
dynamic: 50
eager: 35
ear: 66
earl: 113
earlier: 178
early: 60
earned: 54
ears: 197
earth: 56
ease: 57
easier: 56
easily: 63
ea

previously: 100
prey: 74
price: 187
priceless: 310
pride: 55
priest: 47
primarily: 169
primary: 26
prime: 305
prince: 82
princess: 141
principal: 96
print: 222
prior: 56
prison: 114
prisoner: 51
prisoners: 84
private: 135
prize: 39
pro: 136
probably: 51
problem: 269
problems: 68
proceedings: 60
proceeds: 47
process: 49
produce: 33
produced: 59
producer: 119
producers: 53
producing: 77
product: 31
production: 259
productions: 245
professional: 66
professor: 120
profound: 295
program: 93
progress: 74
progresses: 21
project: 90
projects: 162
prom: 108
promise: 26
promised: 99
promises: 44
promising: 152
proof: 88
propaganda: 54
proper: 199
properly: 67
property: 44
props: 66
prostitute: 210
protagonist: 51
protagonists: 48
protect: 70
proud: 63
prove: 33
proved: 133
proves: 62
provide: 52
provided: 139
provides: 248
providing: 52
provoking: 335
pseudo: 116
psychiatrist: 54
psychic: 100
psycho: 153
psychological: 61
psychotic: 105
public: 56
pull: 98
pulled: 84
pulling: 52
pulls: 61
pulp: 

In [16]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(n_estimators=100)

clf.fit(train_data_features, train_label)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

### Testing the model

In [17]:
test = pd.read_csv("testData.tsv", header=0, delimiter="\t", \
                   quoting=3 )

print test.shape

num_reviews = len(test["review"])

print "Looping over and cleaning all reviews"
# 1.get the size of review column
# 2.initialize an empty list to hold all cleaned reviews
clean_test_reviews = []
start_time = time.time()
# 3.loop over all reviews
for i in range(0, num_reviews, 1):
    # 4.use function review _to_words to get a processed review
    clean_test_reviews.append(review_to_words(test['review'][i]))
    # 5.print the status
    if (i+1) % 10000 == 0:
        print '{}/{} reviews processed. Time elapsed: {} sec'.format(i, num_reviews, (time.time()-start_time))
        start_time = time.time()

(20064, 2)
Looping over and cleaning all reviews
9999/20064 reviews processed. Time elapsed: 50.2599999905 sec
19999/20064 reviews processed. Time elapsed: 51.4570000172 sec


In [18]:
test_data_features = vectorizer.transform(clean_test_reviews)
test_data_features = test_data_features.toarray()

result = clf.predict(test_data_features)

output = pd.DataFrame( data={"id":test["id"], "sentiment":result} )

# Use pandas to write the comma-separated output file
output.to_csv( "Bag_of_Words_model.csv", index=False, quoting=3 )