# Data Exploration

In [1]:
import pandas as pd
import numpy as np

In [3]:
# Load the labelled training reviews (tab-separated, first row is the header).
# quoting=3 is csv.QUOTE_NONE: quote characters are kept as ordinary text,
# which is why the ids and reviews displayed below still carry their '"'.
train = pd.read_csv(
    "labeledTrainData.tsv",
    sep="\t",
    header=0,
    quoting=3,
)

In [4]:
# Preview the first rows to confirm that the id, sentiment and review columns parsed as expected.
train.head()

Unnamed: 0,id,sentiment,review
0,"""5814_8""",1,"""With all this stuff going down at the moment ..."
1,"""2381_9""",1,"""\""The Classic War of the Worlds\"" by Timothy ..."
2,"""7759_3""",0,"""The film starts with a manager (Nicholas Bell..."
3,"""3630_4""",0,"""It must be assumed that those who praised thi..."
4,"""9495_8""",1,"""Superbly trashy and wondrously unpretentious ..."


In [5]:
# (rows, columns) of the training frame.
train.shape

(25000, 3)

In [6]:
# Check the class balance: how many reviews carry each sentiment label.
train['sentiment'].value_counts()

1    12500
0    12500
Name: sentiment, dtype: int64

In [8]:
# Show the raw text of the review stored under index label 0.
train.loc[0, 'review']

'"With all this stuff going down at the moment with MJ i\'ve started listening to his music, watching the odd documentary here and there, watched The Wiz and watched Moonwalker again. Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent. Moonwalker is part biography, part feature film which i remember going to see at the cinema when it was originally released. Some of it has subtle messages about MJ\'s feeling towards the press and also the obvious message of drugs are bad m\'kay.<br /><br />Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyway then you are going to hate this and find it boring. Some may call MJ an egotist for consenting to the making of this movie BUT MJ and most of his fans would say that he made it for the fans which if true is really nice of him.<br /><br />The actual feature film bit when it finally

In [21]:
# Selecting a single column gives back a pandas Series (not a DataFrame).
type(train['review'])

pandas.core.series.Series

# Data cleaning

In [11]:
# Remove the HTML tags from the reviews.
# BeautifulSoup is a Python library for parsing HTML and XML; its get_text() method returns the text with the tags stripped.

In [10]:
from bs4 import BeautifulSoup

In [18]:
# Parse the first review with the lxml parser so its markup can be stripped.
text1 = BeautifulSoup(train['review'][0], features="lxml")

In [17]:
# Text of the parsed review with the markup dropped. Note that no space is
# inserted where a tag was removed: the text on either side of a former
# "<br /><br />" ends up directly adjacent in the result.
text1.get_text()

'"With all this stuff going down at the moment with MJ i\'ve started listening to his music, watching the odd documentary here and there, watched The Wiz and watched Moonwalker again. Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent. Moonwalker is part biography, part feature film which i remember going to see at the cinema when it was originally released. Some of it has subtle messages about MJ\'s feeling towards the press and also the obvious message of drugs are bad m\'kay.Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyway then you are going to hate this and find it boring. Some may call MJ an egotist for consenting to the making of this movie BUT MJ and most of his fans would say that he made it for the fans which if true is really nice of him.The actual feature film bit when it finally starts is only on for 2

In [36]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

In [44]:
# Stop-word set and stemmer shared by every clean_review() call; filled lazily.
_CLEAN_REVIEW_RESOURCES = {}


def _clean_review_resources():
    """Return the (stop-word set, stemmer) pair, building it on first use only."""
    if not _CLEAN_REVIEW_RESOURCES:
        # Searching a set is much faster than searching a list, so the
        # stop words are converted to a (frozen) set once.
        _CLEAN_REVIEW_RESOURCES["stops"] = frozenset(stopwords.words("english"))
        _CLEAN_REVIEW_RESOURCES["stemmer"] = PorterStemmer()
    return _CLEAN_REVIEW_RESOURCES["stops"], _CLEAN_REVIEW_RESOURCES["stemmer"]


def clean_review( raw_review ):
    """Turn one raw review into a string of space-separated word stems.

    Steps, in order: strip the HTML markup, replace every character that is
    not an ASCII letter with a space, lower-case and split into words, drop
    English stop words, and stem the remaining words with PorterStemmer.

    Parameters
    ----------
    raw_review : str
        Review text, possibly containing HTML such as ``<br />``.

    Returns
    -------
    str
        The stemmed, stop-word-free words joined by single spaces
        (an empty string if nothing is left).
    """
    # Remove HTML. A space is used as the separator so that words which were
    # separated only by a tag (e.g. "great<br />movie") are not fused into a
    # single token once the tag is gone.
    review_text = BeautifulSoup(raw_review, "lxml").get_text(" ")

    # Remove non-letters.
    letters_only = re.sub("[^a-zA-Z]", " ", review_text)

    # Convert to lower case, split into individual words.
    words = letters_only.lower().split()

    # The stop-word set and the stemmer do not depend on the review, so they
    # are built once and reused instead of being recreated on every call.
    stops, stemmer = _clean_review_resources()

    # Drop stop words first, then stem what is left, and join the stems back
    # into one string separated by spaces.
    return " ".join(stemmer.stem(word) for word in words if word not in stops)

In [40]:
# Sanity-check the cleaning function on the first training review.
clean_review(train['review'][0])

'stuff go moment mj start listen music watch odd documentari watch wiz watch moonwalk mayb want get certain insight guy thought realli cool eighti mayb make mind whether guilti innoc moonwalk part biographi part featur film rememb go see cinema origin releas subtl messag mj feel toward press also obviou messag drug bad kay visual impress cours michael jackson unless remot like mj anyway go hate find bore may call mj egotist consent make movi mj fan would say made fan true realli nice actual featur film bit final start minut exclud smooth crimin sequenc joe pesci convinc psychopath power drug lord want mj dead bad beyond mj overheard plan nah joe pesci charact rant want peopl know suppli drug etc dunno mayb hate mj music lot cool thing like mj turn car robot whole speed demon sequenc also director must patienc saint came film kiddi bad sequenc usual director hate work one kid let alon whole bunch perform complex danc scene bottom line movi peopl like mj one level anoth think peopl stay 

In [28]:
# Number of training reviews (one per row of the frame).
reviews_count = train.shape[0]

In [45]:
# Clean every training review, keeping the original row order.
reviews_new = [
    clean_review(train['review'][row])
    for row in range(reviews_count)
]
    

In [47]:
# Inspect the cleaned form of the second review.
reviews_new[1]

'classic war world timothi hine entertain film obvious goe great effort length faith recreat h g well classic book mr hine succe watch film appreci fact standard predict hollywood fare come everi year e g spielberg version tom cruis slightest resembl book obvious everyon look differ thing movi envis amateur critic look critic everyth other rate movi import base like entertain peopl never agre critic enjoy effort mr hine put faith h g well classic novel found entertain made easi overlook critic perceiv shortcom'

# Creating features from bag of words

In [48]:
# We need to convert the review text into a numerical format suitable for machine learning.
# We use scikit-learn's feature-extraction module to build a bag-of-words model:
# CountVectorizer converts a collection of text documents into a matrix of token counts.

In [49]:
from sklearn.feature_extraction.text import CountVectorizer

In [50]:
# Word-level bag-of-words vectorizer. No tokenizer, preprocessor or stop-word
# list is supplied here because the reviews are cleaned beforehand;
# max_features caps the vocabulary at 5000 entries.
vectormodel = CountVectorizer(
    analyzer='word',
    tokenizer=None,
    preprocessor=None,
    stop_words=None,
    max_features=5000,
)

In [51]:
# fit_transform() does two things: first, it fits the model and learns the vocabulary;
# second, it transforms our training data into feature vectors. The input to fit_transform should be a list of strings.

In [52]:
# Fit the vectorizer on the cleaned training reviews only; the same fitted
# vocabulary is reused (transform without re-fitting) for the test reviews later.
train_features = vectormodel.fit_transform(reviews_new)

In [53]:
# Convert the feature matrix returned by fit_transform into a NumPy array.
# Gotcha: this rebinds train_features to the array, so running this cell a
# second time fails (a NumPy array has no .toarray()); re-run the
# fit_transform cell first.
# NOTE(review): the dense array has one cell per (review, vocabulary entry),
# which is memory-hungry — presumably the classifier could consume the sparse
# matrix directly; confirm before changing.
train_features = train_features.toarray()

In [54]:
# Expect (number of reviews, max_features).
train_features.shape

(25000, 5000)

In [56]:
# To see the vocabulary.
# get_feature_names() was removed from newer scikit-learn releases in favour
# of get_feature_names_out(), so use whichever one this installation provides.
vocab_getter = getattr(vectormodel, "get_feature_names_out", None) or vectormodel.get_feature_names
# Show only the first 100 terms: the full vocabulary has one entry per
# feature and would flood the notebook output.
list(vocab_getter())[:100]

['abandon',
 'abc',
 'abil',
 'abl',
 'abomin',
 'aborigin',
 'abort',
 'abound',
 'abraham',
 'abrupt',
 'abruptli',
 'absenc',
 'absent',
 'absolut',
 'absorb',
 'absurd',
 'abund',
 'abus',
 'abysm',
 'academi',
 'accent',
 'accept',
 'access',
 'accid',
 'accident',
 'acclaim',
 'accompani',
 'accomplish',
 'accord',
 'account',
 'accur',
 'accuraci',
 'accus',
 'ace',
 'achiev',
 'acid',
 'acknowledg',
 'acquaint',
 'acquir',
 'across',
 'act',
 'action',
 'activ',
 'actor',
 'actress',
 'actual',
 'ad',
 'adam',
 'adapt',
 'add',
 'addict',
 'addit',
 'address',
 'adequ',
 'admir',
 'admit',
 'admittedli',
 'adolesc',
 'adopt',
 'ador',
 'adult',
 'advanc',
 'advantag',
 'adventur',
 'advertis',
 'advic',
 'advis',
 'aesthet',
 'affair',
 'affect',
 'affleck',
 'afford',
 'aforement',
 'afraid',
 'africa',
 'african',
 'afternoon',
 'afterward',
 'age',
 'agenc',
 'agenda',
 'agent',
 'aggress',
 'ago',
 'agre',
 'ah',
 'ahead',
 'aid',
 'aim',
 'air',
 'airplan',
 'airport',
 'a

# Creating model using Random forest

In [57]:
from sklearn.ensemble import RandomForestClassifier

In [58]:
# Random forest with 100 trees. The seed is fixed so that re-running the
# notebook trains the same forest and therefore yields the same predictions;
# without it every run produces a slightly different model.
forest = RandomForestClassifier(n_estimators=100, random_state=42)
# Train on the bag-of-words features with the sentiment column as the target.
# Later cells predict through `forest`; `forestmodel` holds whatever fit()
# returns (per the scikit-learn docs, the fitted estimator itself).
forestmodel = forest.fit(train_features,train['sentiment'])

In [59]:
#RUNNING THE MODEL ON TEST DATA

In [60]:
# Load the test reviews with the same parsing options as the training file
# (tab-separated, header row, quoting=3 i.e. csv.QUOTE_NONE).
test = pd.read_csv(
    "testData.tsv",
    sep="\t",
    header=0,
    quoting=3,
)

In [61]:
# (rows, columns) of the test frame; the cells below use its id and review columns.
test.shape

(25000, 2)

In [62]:
# Clean every test review, keeping the original row order.
# Iterate over the test column itself rather than range(reviews_count):
# reviews_count is the size of the *training* set, so the old loop was only
# correct while both files happened to have the same number of rows (it would
# silently drop rows of a longer test set and fail on a shorter one).
test_new = [clean_review(review) for review in test['review']]

In [64]:
# Bag of words for the test set: encode the cleaned test reviews with the
# vocabulary already fitted on the training data (transform only, no re-fit)
# and convert the result to a NumPy array in one step.
test_features = vectormodel.transform(test_new).toarray()

In [65]:
# Use the fitted random forest to predict a sentiment label for each row of
# the test feature matrix.
result=forest.predict(test_features)

In [66]:
# Build the submission frame: the "id" column comes straight from the test
# data and the "sentiment" column holds the predicted labels.
output = pd.DataFrame({"id": test["id"], "sentiment": result})

# Write the frame as a comma-separated file without the index column.
# quoting=3 (csv.QUOTE_NONE) adds no quoting on output, so the ids are written
# exactly as they were read in.
output.to_csv(
    "Bag_of_Words_model1.csv",
    index=False,
    quoting=3,
)