In [1]:
# Import the pandas package, then use the "read_csv" function to read
# the labeled training data
import pandas as pd       
train = pd.read_csv("./Kaggle/Bag of Words/labeledTrainData.tsv", header=0, delimiter="\t", quoting=3)

In [2]:
train.shape

(25000, 3)

In [3]:
train.columns.values

array(['id', 'sentiment', 'review'], dtype=object)

In [5]:
from bs4 import BeautifulSoup
import re

In [14]:
import nltk
from nltk.corpus import stopwords
stopwords = set(stopwords.words("english"))

In [16]:
# defines a tokenizer and stemmer which returns the set of stems in the text that it is passed
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english")
def tokenize_and_stem(text):
    # first tokenize by sentence, then by word to ensure that punctuation is caught as it's own token
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    stems = [stemmer.stem(t) for t in filtered_tokens]
    return stems

In [17]:
def preprocess_review( raw_review ):
    # Function to convert a raw review to a string of words
    # The input is a single string (a raw movie review), and 
    # the output is a single string (a preprocessed movie review)
    #
    # 1. Remove HTML
    review_text = BeautifulSoup(raw_review).get_text() 
    #
    # 2. Remove non-letters        
    letters_only = re.sub("[^a-zA-Z]", " ", review_text) 
    #
    # 3. Convert to lower case
    lower = letters_only.lower()
    
    return lower

In [20]:
print "Creating the bag of words...\n"
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize the "CountVectorizer" object, which is scikit-learn's
# bag of words tool.  
vectorizer = TfidfVectorizer(tokenizer = tokenize_and_stem,    \
                             preprocessor = preprocess_review, \
                             stop_words = stopwords,   \
                             max_features = 5000, \
                             ngram_range=(1,3), \
                             use_idf=True) 

# fit_transform() does two functions: First, it fits the model
# and learns the vocabulary; second, it transforms our training data
# into feature vectors. The input to fit_transform should be a list of 
# strings.
train_data_features = vectorizer.fit_transform(train['review'])

# Numpy arrays are easy to work with, so convert the result to an 
# array
train_data_features = train_data_features.toarray()

Creating the bag of words...



In [21]:
print train_data_features.shape

(25000, 5000)


In [22]:
# Take a look at the words in the vocabulary
vocab = vectorizer.get_feature_names()
print vocab

[u'abandon', u'abc', u'abil', u'abl', u'abort', u'abov', u'abraham', u'abrupt', u'absenc', u'absolut', u'absolut love', u'absolut noth', u'absorb', u'absurd', u'abus', u'abysm', u'academi', u'academi award', u'accent', u'accept', u'access', u'accid', u'accident', u'acclaim', u'accompani', u'accomplish', u'accord', u'account', u'accur', u'accus', u'achiev', u'acid', u'acknowledg', u'acquir', u'across', u'act', u'act bad', u'act direct', u'act film', u'act good', u'act great', u'act like', u'act movi', u'act perform', u'act terribl', u'act veri', u'act well', u'action', u'action film', u'action movi', u'action scene', u'action sequenc', u'activ', u'actor', u'actor actress', u'actor film', u'actor like', u'actor movi', u'actor play', u'actress', u'actress play', u'actual', u'ad', u'adam', u'adapt', u'add', u'addict', u'addit', u'address', u'adequ', u'admir', u'admit', u'adolesc', u'adopt', u'ador', u'adult', u'advanc', u'advantag', u'adventur', u'advertis', u'advic', u'advis', u'affair', 

In [23]:
import numpy as np

# Sum up the counts of each vocabulary word
dist = np.sum(train_data_features, axis=0)

# For each, print the vocabulary word and the number of times it 
# appears in the training set
for tag, count in zip(vocab, dist):
    print count, tag

24.8260285734 abandon
17.5741178157 abc
43.9361853599 abil
89.0038909853 abl
9.7715060042 abort
63.4222016914 abov
10.6019413754 abraham
13.7625364532 abrupt
12.2995154077 absenc
135.627259987 absolut
13.8479434579 absolut love
17.7067677563 absolut noth
16.1156768138 absorb
38.9491976431 absurd
34.9976331623 abus
13.9311278692 abysm
28.6886708593 academi
17.656576289 academi award
59.9104253792 accent
58.9369118065 accept
16.8792854163 access
31.5975716623 accid
22.7607625288 accident
13.5678700527 acclaim
17.6504487339 accompani
25.7544992573 accomplish
27.7759226137 accord
28.0366069056 account
34.2995645287 accur
18.7890970779 accus
46.2582153516 achiev
11.2609708839 acid
11.30107538 acknowledg
9.86057977235 acquir
70.4949241667 across
406.508772857 act
29.6405533933 act bad
13.9795394214 act direct
12.629552382 act film
17.6741802097 act good
14.7386645316 act great
26.2628455714 act like
14.7718196862 act movi
9.72655085041 act perform
14.5824987831 act terribl
16.6968538862 act 

In [24]:
print "Training the random forest..."
from sklearn.ensemble import RandomForestClassifier

# Initialize a Random Forest classifier with 100 trees
forest = RandomForestClassifier(n_estimators = 100) 

# Fit the forest to the training set, using the bag of words as 
# features and the sentiment labels as the response variable
#
# This may take a few minutes to run
forest = forest.fit( train_data_features, train["sentiment"] )

Training the random forest...


In [25]:
# Read the test data
test = pd.read_csv("Kaggle/Bag of Words/testData.tsv", header=0, delimiter="\t", \
                   quoting=3 )

# Verify that there are 25,000 rows and 2 columns
print test.shape

# Get a bag of words for the test set, and convert to a numpy array
test_data_features = vectorizer.transform(test["review"])
test_data_features = test_data_features.toarray()

# Use the random forest to make sentiment label predictions
result = forest.predict(test_data_features)

# Copy the results to a pandas dataframe with an "id" column and
# a "sentiment" column
output = pd.DataFrame( data={"id":test["id"], "sentiment":result} )

# Use pandas to write the comma-separated output file
output.to_csv( "Kaggle/Bag of Words/Bag_of_Words_model_submission2.csv", index=False, quoting=3 )

(25000, 2)
