In [19]:
import os.path as path
import pandas as pd
from bs4 import BeautifulSoup
import re
from nltk.corpus import stopwords

# Data directory (relative to the notebook) and the labeled training file in it.
DIR = 'data'
ltrain = path.join(DIR, 'labeledTrainData.tsv')

# NLTK's English stop words, held in a set for fast membership tests.
stop = set(stopwords.words('english'))

# Tab-separated file with a header row; quoting=3 is csv.QUOTE_NONE, so
# double quotes inside the review text are kept as ordinary characters.
train = pd.read_csv(ltrain, sep='\t', header=0, quoting=3)

def review_to_words(raw_review):
    """Convert one raw review into a space-joined string of cleaned words.

    Steps: strip HTML markup, replace every non-letter character with a
    space, lower-case, split on whitespace, and drop every word found in the
    module-level ``stop`` set.

    Args:
        raw_review: Review text, possibly containing HTML markup.

    Returns:
        The surviving words joined by single spaces (an empty string when
        nothing survives).
    """
    # Name the parser explicitly: without it bs4 picks whichever parser
    # happens to be installed (results can differ between machines) and
    # emits a GuessedAtParserWarning on every call.
    text = BeautifulSoup(raw_review, 'html.parser').get_text()
    letters_only = re.sub(r'[^a-zA-Z]', ' ', text)
    words = letters_only.lower().split()
    return ' '.join(w for w in words if w not in stop)

# Number of training reviews (also read by a later cell).
num_reviews = len(train['review'])

# Clean every training review, preserving row order.
reviews = [review_to_words(raw) for raw in train['review']]

In [23]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words vectorizer over the already-cleaned reviews: no extra
# tokenizer, preprocessor or stop-word filtering, vocabulary capped at 5000.
vectorizer = CountVectorizer(
    analyzer='word',
    tokenizer=None,
    preprocessor=None,
    stop_words=None,
    max_features=5000,
)

# Learn the vocabulary and build the count matrix, then convert it to a
# dense array.
train_features = vectorizer.fit_transform(reviews).toarray()
print(train_features.shape)

(25000, 5000)


In [25]:
# Vocabulary learned by the vectorizer, as a list of strings.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer its replacement when present and fall back on older installs.
if hasattr(vectorizer, 'get_feature_names_out'):
    vocab = vectorizer.get_feature_names_out().tolist()
else:
    vocab = vectorizer.get_feature_names()

from sklearn.ensemble import RandomForestClassifier

# Random forest with 100 trees. random_state is fixed so that a fresh
# Restart & Run All trains the same model and yields the same predictions.
forest = RandomForestClassifier(n_estimators=100, random_state=42)

# Fit on the bag-of-words counts against the sentiment labels.
forest = forest.fit(train_features, train['sentiment'])

In [26]:
# Load the test set with the same parsing options as the training data
# (tab-separated, header row, quoting=3 i.e. csv.QUOTE_NONE).
ltest = path.join(DIR, 'testData.tsv')
test = pd.read_csv(ltest, header=0, delimiter='\t', quoting=3)

# Clean every test review. Iterate over the test column itself rather than a
# range sized by the training set (num_reviews): the old loop only worked
# when both files happened to have the same number of rows.
num_test = len(test['review'])
tests = [review_to_words(raw) for raw in test['review']]

# transform(), not fit_transform(): reuse the vocabulary the vectorizer has
# already learned so the feature columns line up with what the forest saw.
test_features = vectorizer.transform(tests)
test_features = test_features.toarray()

result = forest.predict(test_features)

# Write one row per test review: its id and the predicted sentiment.
output = pd.DataFrame(data={'id': test['id'], 'sentiment': result})
output.to_csv('BoW_Model.csv', index=False, quoting=3)