In [4]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from bs4 import BeautifulSoup
import re
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
train = pd.read_csv('./data/train.csv')
test1 = pd.read_csv('./data/test1.csv')
test2 = pd.read_csv('./data/test2.csv')

In [35]:
# cnt = 0
# for i in xrange( 0, train.review.size ):
#     cnt+=1
#     print train["review"][i] 

In [3]:
print train.shape,test1.shape,test2.shape

(1729, 2) (179, 1) (76, 1)


In [7]:
def review_to_words( raw_review ):
    # Function to convert a raw review to a string of words
    # The input is a single string (a raw movie review), and 
    # the output is a single string (a preprocessed movie review)
    #
    # 1. Remove HTML
    review_text = BeautifulSoup(raw_review,'lxml').get_text() 
    #
    # 2. Remove non-letters        
    letters_only = re.sub("[^a-zA-Z]", " ", review_text) 
    #
    # 3. Convert to lower case, split into individual words
    words = letters_only.lower().split()                             
    #
    # 4. In Python, searching a set is much faster than searching
    #   a list, so convert the stop words to a set
    stops = set(stopwords.words("english"))                  
    # 
    # 5. Remove stop words
    meaningful_words = [w for w in words if not w in stops]   
    #
    # 6. Join the words back into one string separated by space, 
    # and return the result.
    return( " ".join( meaningful_words ))

In [10]:
# Get the number of reviews based on the dataframe column size
num_reviews = train["review"].size

print "Cleaning and parsing the training set movie reviews...\n"
clean_train_reviews = []
for i in xrange( 0, num_reviews ):
    # If the index is evenly divisible by 1000, print a message
    if( (i+1)%100 == 0 ):
        print "Review %d of %d\n" % ( i+1, num_reviews )                                                                    
    clean_train_reviews.append( review_to_words( train["review"][i] ))

Cleaning and parsing the training set movie reviews...

Review 100 of 1729

Review 200 of 1729

Review 300 of 1729

Review 400 of 1729

Review 500 of 1729

Review 600 of 1729

Review 700 of 1729

Review 800 of 1729

Review 900 of 1729

Review 1000 of 1729

Review 1100 of 1729

Review 1200 of 1729

Review 1300 of 1729

Review 1400 of 1729

Review 1500 of 1729

Review 1600 of 1729

Review 1700 of 1729



In [11]:
print "Creating the bag of words...\n"
from sklearn.feature_extraction.text import CountVectorizer

# Initialize the "CountVectorizer" object, which is scikit-learn's
# bag of words tool.  
vectorizer = CountVectorizer(analyzer = "word",   \
                             tokenizer = None,    \
                             preprocessor = None, \
                             stop_words = None,   \
                             max_features = 5000) 

# fit_transform() does two functions: First, it fits the model
# and learns the vocabulary; second, it transforms our training data
# into feature vectors. The input to fit_transform should be a list of 
# strings.
train_data_features = vectorizer.fit_transform(clean_train_reviews)

# Numpy arrays are easy to work with, so convert the result to an 
# array
train_data_features = train_data_features.toarray()

Creating the bag of words...



In [12]:
print train_data_features.shape

(1729, 4800)


In [14]:
print "Training the random forest..."
from sklearn.ensemble import RandomForestClassifier

# Initialize a Random Forest classifier with 100 trees
forest = RandomForestClassifier(n_estimators = 100) 

# Fit the forest to the training set, using the bag of words as 
# features and the sentiment labels as the response variable
#
# This may take a few minutes to run
forest = forest.fit( train_data_features, train["label"] )

Training the random forest...


In [31]:
from sklearn.cross_validation import KFold
from sklearn.metrics import accuracy_score
import numpy as np
print "Training the random forest..."
from sklearn.ensemble import RandomForestClassifier
forest = RandomForestClassifier(n_estimators = 100) 
n_folds = 10
acc = []
x = train_data_features
y = np.array(train['label'])
n = len(x)
kf = KFold(n,n_folds,shuffle=False,random_state=None)
for train_index,test_index in kf:
    x_train,x_test = x[train_index],x[test_index]
    y_train,y_test = y[train_index],y[test_index]
    forest.fit(x_train,y_train)
    y_pred = forest.predict(x_test)
    acc.append(accuracy_score(y_test,y_pred))
print np.mean(np.array(acc))

Training the random forest...
0.900554510015


In [17]:

# Create an empty list and append the clean reviews one by one
num_reviews = len(test1["review"])
clean_test_reviews = [] 

print "Cleaning and parsing the test set movie reviews...\n"
for i in xrange(0,num_reviews):
    if( (i+1) % 100 == 0 ):
        print "Review %d of %d\n" % (i+1, num_reviews)
    clean_review = review_to_words( test1["review"][i] )
    clean_test_reviews.append( clean_review )

# Get a bag of words for the test set, and convert to a numpy array
test_data_features = vectorizer.transform(clean_test_reviews)
test_data_features = test_data_features.toarray()

Cleaning and parsing the test set movie reviews...

Review 100 of 179



In [19]:
# Use the random forest to make sentiment label predictions
result = forest.predict(test_data_features)

In [21]:
print result

[ 0.  1.  1.  0.  0.  0.  0.  1.  0.  1.  1.  1.  1.  1.  1.  0.  1.  1.
  1.  0.  1.  1.  0.  1.  1.  1.  0.  1.  0.  1.  0.  1.  1.  0.  1.  1.
  1.  0.  0.  1.  1.  1.  1.  1.  0.  1.  1.  1.  1.  0.  0.  1.  1.  0.
  0.  1.  0.  0.  1.  1.  1.  0.  1.  0.  1.  0.  1.  0.  1.  1.  1.  1.
  1.  0.  1.  1.  1.  1.  1.  1.  0.  0.  1.  1.  1.  0.  1.  1.  1.  0.
  1.  1.  0.  0.  1.  1.  0.  1.  1.  1.  1.  1.  1.  0.  1.  1.  1.  0.
  1.  0.  1.  1.  1.  1.  1.  1.  0.  1.  0.  1.  1.  1.  1.  1.  1.  1.
  1.  1.  1.  1.  1.  0.  1.  0.  1.  1.  1.  1.  0.  1.  1.  1.  0.  1.
  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  0.  1.  1.  1.  1.  1.  1.  1.
  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.]


In [2]:
import numpy as np
from sklearn.metrics import accuracy_score
y_pred = [0, 2, 1, 3.0]
y_true = [0, 1, 2, 3]
accuracy_score(y_true, y_pred)

0.5