In [9]:
# Import the pandas package, then use the "read_csv" function to read
# the labeled training data
import pandas as pd       
train = pd.read_csv("./Kaggle/Bag of Words/labeledTrainData.tsv", header=0, delimiter="\t", quoting=3)

In [10]:
train.shape

(25000, 3)

In [11]:
train.columns.values

array(['id', 'sentiment', 'review'], dtype=object)

In [12]:
from bs4 import BeautifulSoup
import re

In [13]:
import nltk
from nltk.corpus import stopwords
stopwords = set(stopwords.words("english"))

In [14]:
# defines a tokenizer and stemmer which returns the set of stems in the text that it is passed
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english")
def tokenize_and_stem(text):
    # first tokenize by sentence, then by word to ensure that punctuation is caught as it's own token
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    stems = [stemmer.stem(t) for t in filtered_tokens]
    return stems

In [15]:
def preprocess_review( raw_review ):
    # Function to convert a raw review to a string of words
    # The input is a single string (a raw movie review), and 
    # the output is a single string (a preprocessed movie review)
    #
    # 1. Remove HTML
    review_text = BeautifulSoup(raw_review).get_text() 
    #
    # 2. Convert to lower case
    return review_text.lower()

In [16]:
print "Creating the bag of words...\n"
from sklearn.feature_extraction.text import CountVectorizer

# Initialize the "CountVectorizer" object, which is scikit-learn's
# bag of words tool.  
vectorizer = CountVectorizer(tokenizer = tokenize_and_stem,    \
                             preprocessor = preprocess_review, \
                             stop_words = stopwords,   \
                             max_features = 5000, \
                             ngram_range=(1,3)) 

# fit_transform() does two functions: First, it fits the model
# and learns the vocabulary; second, it transforms our training data
# into feature vectors. The input to fit_transform should be a list of 
# strings.
train_data_features = vectorizer.fit_transform(train['review'])

# Numpy arrays are easy to work with, so convert the result to an 
# array
train_data_features = train_data_features.toarray()

Creating the bag of words...



In [17]:
print train_data_features.shape

(25000, 5000)


In [18]:
# Take a look at the words in the vocabulary
vocab = vectorizer.get_feature_names()
print vocab

[u"'d", u"'d like", u"'d rather", u"'d say", u"'m", u"'m big", u"'m glad", u"'m go", u"'m sorri", u"'m still", u"'m sure", u"'s", u"'s 's", u"'s act", u"'s actual", u"'s almost", u"'s also", u"'s alway", u"'s amaz", u"'s anoth", u"'s bad", u"'s beauti", u"'s becaus", u"'s best", u"'s better", u"'s big", u"'s book", u"'s brother", u"'s career", u"'s charact", u"'s classic", u"'s daughter", u"'s death", u"'s definit", u"'s direct", u"'s done", u"'s easi", u"'s end", u"'s even", u"'s eye", u"'s face", u"'s famili", u"'s father", u"'s film", u"'s final", u"'s first", u"'s friend", u"'s fun", u"'s funni", u"'s get", u"'s go", u"'s good", u"'s got", u"'s great", u"'s greatest", u"'s hard", u"'s head", u"'s hous", u"'s interest", u"'s kind", u"'s last", u"'s life", u"'s like", u"'s littl", u"'s lot", u"'s love", u"'s made", u"'s make", u"'s mother", u"'s movi", u"'s much", u"'s music", u"'s name", u"'s never", u"'s new", u"'s nice", u"'s noth", u"'s novel", u"'s obvious", u"'s one", u"'s onli

In [19]:
import numpy as np

# Sum up the counts of each vocabulary word
dist = np.sum(train_data_features, axis=0)

# For each, print the vocabulary word and the number of times it 
# appears in the training set
for tag, count in zip(vocab, dist):
    print count, tag

2337 'd
201 'd like
93 'd rather
145 'd say
4755 'm
97 'm big
167 'm glad
207 'm go
136 'm sorri
108 'm still
715 'm sure
62067 's
247 's 's
167 's act
164 's actual
192 's almost
473 's also
160 's alway
101 's amaz
126 's anoth
433 's bad
107 's beauti
156 's becaus
410 's best
121 's better
92 's big
113 's book
106 's brother
91 's career
907 's charact
95 's classic
122 's daughter
182 's death
114 's definit
142 's direct
130 's done
115 's easi
103 's end
277 's even
108 's eye
231 's face
114 's famili
195 's father
518 's film
97 's final
285 's first
116 's friend
108 's fun
217 's funni
130 's get
336 's go
518 's good
258 's got
403 's great
96 's greatest
389 's hard
131 's head
109 's hous
157 's interest
172 's kind
126 's last
331 's life
473 's like
162 's littl
190 's lot
179 's love
94 's made
104 's make
130 's mother
429 's movi
255 's much
109 's music
171 's name
128 's never
119 's new
110 's nice
266 's noth
107 's novel
163 's obvious
447 's one
440 's onli
14

In [20]:
print "Training the random forest..."
from sklearn.ensemble import RandomForestClassifier

# Initialize a Random Forest classifier with 100 trees
forest = RandomForestClassifier(n_estimators = 100) 

# Fit the forest to the training set, using the bag of words as 
# features and the sentiment labels as the response variable
#
# This may take a few minutes to run
forest = forest.fit( train_data_features, train["sentiment"] )

Training the random forest...


In [21]:
# Read the test data
test = pd.read_csv("Kaggle/Bag of Words/testData.tsv", header=0, delimiter="\t", \
                   quoting=3 )

# Verify that there are 25,000 rows and 2 columns
print test.shape

# Get a bag of words for the test set, and convert to a numpy array
test_data_features = vectorizer.transform(test["review"])
test_data_features = test_data_features.toarray()

# Use the random forest to make sentiment label predictions
result = forest.predict(test_data_features)

# Copy the results to a pandas dataframe with an "id" column and
# a "sentiment" column
output = pd.DataFrame( data={"id":test["id"], "sentiment":result} )

# Use pandas to write the comma-separated output file
output.to_csv( "Kaggle/Bag of Words/Bag_of_Words_model_submission3.csv", index=False, quoting=3 )

(25000, 2)
