In [1]:
# Import the libraries used throughout this notebook, then build the shared
# English stop-word set and stemmer (the training data is loaded in the next cell)
import pandas as pd  
from bs4 import BeautifulSoup
import re
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from sklearn.ensemble import RandomForestClassifier

# Materialize the English stop words as a set.
# NOTE: this rebinds the name `stopwords` from the object imported from
# nltk.corpus to a plain set; after this line `stopwords.words(...)` is no
# longer available under that name.
stopwords = set(stopwords.words("english"))
# English Snowball stemmer, used by tokenize_and_stem to reduce tokens to stems.
stemmer = SnowballStemmer("english")

In [2]:
# Load the labeled training reviews: tab-separated, column names on the first
# line. quoting=3 is csv.QUOTE_NONE, so double quotes inside a review are kept
# as literal characters rather than being interpreted as field quoting.
train = pd.read_csv("./Kaggle/Bag of Words/labeledTrainData.tsv", header=0, delimiter="\t", quoting=3)
# Single-argument print(...) gives the same output on Python 2 and Python 3,
# whereas the bare `print x` statement is a syntax error on Python 3.
print(train.shape)
print(train.columns.values)

(25000, 3)
['id' 'sentiment' 'review']


In [3]:
def tokenize_and_stem(text):
    """Split `text` into word tokens and return their stems as a list.

    The text is first split into sentences and each sentence is then split
    into words, so that punctuation is emitted as its own token. Every token
    is passed through the module-level `stemmer`; the stems are returned in
    the order the tokens appear (duplicates are kept).
    """
    stems = []
    for sentence in nltk.sent_tokenize(text):
        for token in nltk.word_tokenize(sentence):
            stems.append(stemmer.stem(token))
    return stems

In [4]:
def preprocess_review( raw_review ):
    """Convert a raw movie review into a lower-case, letters-only string.

    Args:
        raw_review: a single review as a string, possibly containing HTML.

    Returns:
        The review text with HTML markup removed, every character outside
        a-z / A-Z replaced by a space, and the result lower-cased.
    """
    # 1. Remove HTML. The parser is named explicitly: without it, Beautiful
    #    Soup picks whichever parser happens to be installed (and warns about
    #    it), so the extracted text could differ between machines.
    review_text = BeautifulSoup(raw_review, "html.parser").get_text()
    #
    # 2. Replace every non-letter with a space (this also drops digits and
    #    punctuation).
    letters_only = re.sub("[^a-zA-Z]", " ", review_text)
    #
    # 3. Convert to lower case
    return letters_only.lower()

In [5]:
def get_tfidf_features(number_features = 5000):
    """Fit a TF-IDF vectorizer on the training reviews and vectorize them.

    Reads the module-level `train` frame (its 'review' column) and the
    module-level `stopwords` collection, and uses `preprocess_review` and
    `tokenize_and_stem` as the vectorizer's preprocessor and tokenizer.

    Args:
        number_features: value passed to the vectorizer as `max_features`.

    Returns:
        A tuple `(features, vectorizer)`: the training feature matrix
        converted to a dense array with `.toarray()`, and the fitted
        TfidfVectorizer so the same vocabulary can be applied to other data.
    """
    print("Creating the bag of words...\n")

    # The vectorizer removes stop words from the tokens produced by the
    # tokenizer, i.e. from *stemmed* tokens. Raw stop words such as "very" or
    # "only" never match their stems ("veri", "onli"), so push the stop words
    # through the same preprocess/tokenize pipeline before handing them over.
    # A sorted list is used because the parameter is documented as a list.
    processed_stop_words = sorted(set(
        token
        for word in stopwords
        for token in tokenize_and_stem(preprocess_review(word))
    ))

    # Initialize the TfidfVectorizer: unigrams to trigrams, limited to the
    # `number_features` highest-ranked terms.
    vectorizer = TfidfVectorizer(tokenizer = tokenize_and_stem,
                                 preprocessor = preprocess_review,
                                 stop_words = processed_stop_words,
                                 max_features = number_features,
                                 ngram_range = (1, 3),
                                 use_idf = True)

    # fit_transform() learns the vocabulary and IDF weights from the training
    # reviews, then turns those same reviews into feature vectors.
    train_data_features = vectorizer.fit_transform(train['review'])

    # Callers receive a dense array, as before.
    return train_data_features.toarray(), vectorizer

In [6]:
def train_random_forest(number_features = 5000, random_state = None):
    """Build TF-IDF features for the training set and fit a random forest.

    Args:
        number_features: forwarded to `get_tfidf_features`.
        random_state: optional seed forwarded to RandomForestClassifier.
            The default (None) keeps the previous unseeded behaviour; pass
            an integer to make the fitted forest reproducible.

    Returns:
        A tuple `(forest, vectorizer)`: the classifier fitted on the training
        features with `train["sentiment"]` as the target, and the fitted
        vectorizer returned by `get_tfidf_features`.
    """
    # Get the features
    train_data_features, vectorizer = get_tfidf_features(number_features=number_features)

    print("Training the random forest...")
    # Initialize a Random Forest classifier with 100 trees
    forest = RandomForestClassifier(n_estimators = 100, random_state = random_state)

    # Fit the forest to the training set, using the TF-IDF features as
    # predictors and the sentiment labels as the response variable.
    # fit() returns the fitted estimator itself.
    #
    # This may take a few minutes to run
    return forest.fit( train_data_features, train["sentiment"] ), vectorizer

In [9]:
def predict_test_data(number_features = 5000):
    """Train the model, label the test reviews and write a submission file.

    Calls `train_random_forest` with `number_features`, reads the test TSV,
    vectorizes its 'review' column with the fitted vectorizer, predicts a
    sentiment for every row and writes an 'id' / 'sentiment' CSV whose file
    name embeds `number_features`. Nothing is returned.
    """
    # Fit the vectorizer and the classifier on the training data.
    forest, vectorizer = train_random_forest(number_features=number_features)

    # Load the test reviews (tab-separated, header row, no quote handling).
    test_reviews = pd.read_csv(
        "Kaggle/Bag of Words/testData.tsv",
        header=0,
        delimiter="\t",
        quoting=3,
    )

    # Apply the already-fitted vocabulary to the test reviews and densify.
    test_matrix = vectorizer.transform(test_reviews["review"]).toarray()

    # Predict a sentiment label for every test review.
    predictions = forest.predict(test_matrix)

    # Pair each review id with its predicted sentiment.
    submission = pd.DataFrame(
        data={"id": test_reviews["id"], "sentiment": predictions}
    )

    # Write the comma-separated submission; the feature count is part of the
    # file name so runs with different settings do not overwrite each other.
    submission_path = "Kaggle/Bag of Words/Bag_of_Words_model_submission5_" + str(number_features) + ".csv"
    submission.to_csv(submission_path, index=False, quoting=3)

In [10]:
# Sweep the vocabulary size from 1000 to 10000 in steps of 1000; every call
# retrains the model and writes its own submission file.
for feature_count in range(1000, 11000, 1000):
    predict_test_data(number_features=feature_count)

Creating the bag of words...

Training the random forest...
Creating the bag of words...

Training the random forest...
Creating the bag of words...

Training the random forest...
Creating the bag of words...

Training the random forest...
Creating the bag of words...

Training the random forest...
Creating the bag of words...

Training the random forest...
Creating the bag of words...

Training the random forest...
Creating the bag of words...

Training the random forest...
Creating the bag of words...

Training the random forest...


In [13]:
# Run the full train/predict/write pipeline once with a 10000-feature
# vocabulary, the same setting as the last step of the sweep above.
# NOTE(review): the recorded sweep output shows only nine runs, so this call
# presumably completes an interrupted tenth run - confirm.
predict_test_data(number_features=10000)

Creating the bag of words...

Training the random forest...
