In [1]:
# !ipython nbconvert --to=python KaggleWord2VecUtility.ipynb

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import os
# This have a fast learning curve
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
# Helper class to clean the data
from KaggleWord2VecUtility import KaggleWord2VecUtility
# To remove unnecessary words from the dataset
import nltk
# importing the stop words list
from nltk.corpus import stopwords
from bs4 import BeautifulSoup
import re

from sklearn.model_selection import train_test_split

In [3]:
# print(os.environ)

In [4]:
combined = pd.read_csv("Combined1.csv")

In [5]:
train = pd.DataFrame(combined)

In [6]:
train.head(5)

Unnamed: 0,Sentences,Labels
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [7]:
# Shape of the trainng dataset
train.shape

(1994, 2)

In [8]:
# looking into the column values
train.columns.values

array(['Sentences', 'Labels'], dtype=object)

In [9]:
# printing out the first sentence 
print(train["Sentences"][0])

Wow... Loved this place.


## Trying out on one sentence

In [10]:
# Initialize the BeautifulSoup object on a single movie sentence     
example1 = BeautifulSoup(train["Sentences"][0], "lxml")  

In [11]:
# Print the raw sentence and then the output of get_text(), for 
# comparison
print(train["Sentences"][0])
print(example1.get_text())

Wow... Loved this place.
Wow... Loved this place.


In [12]:
import re
# Use regular expressions to do a find-and-replace
# IN re.sub -->> The pattern to search for, # The pattern to replace it with, # The text to search
letters_only = re.sub("[^a-zA-Z]", " ", example1.get_text())  
print(letters_only)

Wow    Loved this place 


In [13]:
lower_case = letters_only.lower()        # Convert to lower case
print(lower_case)
words = lower_case.split()               # Split into words
print(words)

wow    loved this place 
['wow', 'loved', 'this', 'place']


## Cleaning the data 

In [14]:
# Downloading text datasets including stopwords
nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [15]:
# stop words
print(stopwords.words("english"))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [16]:
# Remove stop words from "words"
words = [w for w in words if not w in stopwords.words("english")]
print(words)

['wow', 'loved', 'place']


In [17]:
Sentences = train["Sentences"]

In [18]:
def Sentences_to_words(Sentences):
    # Function to convert a raw sentence to a string of words
    # The input is a single string (a raw sentence), and 
    # the output is a single string (a preprocessed sentence)
    #
    # 1. Remove HTML
    review_text = BeautifulSoup(Sentences, "lxml").get_text() 
    #
    # 2. Remove non-letters        
    letters_only = re.sub("[^a-zA-Z]", " ", review_text) 
    #
    # 3. Convert to lower case, split into individual words
    words = letters_only.lower().split()                             
    #
    # 4. In Python, searching a set is much faster than searching
    #   a list, so convert the stop words to a set
    stops = set(stopwords.words("english"))                  
    # 
    # 5. Remove stop words
    meaningful_words = [w for w in words if not w in stops]   
    #
    # 6. Join the words back into one string separated by space, 
    # and return the result.
    return( " ".join( meaningful_words ))   

In [19]:
# checking
clean_sentence = Sentences_to_words(train["Sentences"][0])
print(clean_sentence)

wow loved place


In [20]:
# Get the number of sentence based on the dataframe column size
num_Sentences = train["Sentences"].size

In [21]:
# Initialize an empty list to hold the clean sentence
clean_train_Sentences = []

# Loop over each sentence; create an index i that goes from 0 to the length
# of the sentence list 
for i in range( 0, num_Sentences ):
    # Call our function for each one, and add the result to the list of
    # clean sentence
    clean_train_Sentences.append( Sentences_to_words( train["Sentences"][i] ) )

In [22]:
# print("Cleaning and parsing the training set Sentencess...\n")
# clean_train_Sentences = []
# for i in range( 0, num_Sentences ):
#     # If the index is evenly divisible by 1000, print a message
#     if( (i+1)%1000 == 0 ):
#         print("sentence %d of %d\n" % ( i+1, num_Sentences ))                                                                    
#     clean_train_Sentences.append( Sentences_to_words( train["Sentences"][i] ))

Cleaning and parsing the training set Sentencess...

Review 1000 of 1994



In [23]:
print("Creating the bag of words...\n")
from sklearn.feature_extraction.text import CountVectorizer

# Initialize the "CountVectorizer" object, which is scikit-learn's
# bag of words tool.  
vectorizer = CountVectorizer(analyzer = "word",   \
                             tokenizer = None,    \
                             preprocessor = None, \
                             stop_words = None,   \
                             max_features = 5000) 

# fit_transform() does two functions: First, it fits the model
# and learns the vocabulary; second, it transforms our training data
# into feature vectors. The input to fit_transform should be a list of 
# strings.
train_data_features = vectorizer.fit_transform(clean_train_Sentences)

# Numpy arrays are easy to work with, so convert the result to an 
# array
train_data_features = train_data_features.toarray()

Creating the bag of words...



In [24]:
train_data_features

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [25]:
print(train_data_features.shape)

(1994, 4067)


In [31]:
# Take a look at the words in the vocabulary
vocab = vectorizer.get_feature_names()
print(vocab)



In [30]:
import numpy as np

# Sum up the counts of each vocabulary word
dist = np.sum(train_data_features, axis=0)

# For each, print the vocabulary word and the number of times it 
# appears in the training set
for tag, count in zip(vocab, dist):
    print(count, tag)

1 aailiyah
1 abandoned
3 ability
1 abroad
1 absolute
17 absolutely
1 absolutley
1 abstruse
2 abysmal
1 academy
1 accents
1 accessible
1 accident
1 acclaimed
1 accolades
1 accommodations
1 accomodate
1 accountant
1 accurate
1 accurately
2 accused
1 ache
2 achievement
1 achille
1 ackerman
2 acknowledged
1 across
2 act
3 acted
43 acting
7 action
1 actions
10 actor
19 actors
3 actress
3 actresses
1 actual
12 actually
1 adams
2 adaptation
1 add
3 added
3 addition
1 admins
1 admiration
1 admitted
4 adorable
1 adrift
1 adventure
2 advise
3 aerial
1 aesthetically
1 affected
1 affleck
1 affordable
2 afraid
2 africa
2 afternoon
4 age
1 aged
1 ages
5 ago
1 agree
1 agreed
1 ahead
1 aimless
2 air
1 aired
1 airline
1 airport
1 akasha
1 akin
1 ala
1 albondigas
1 alert
2 alexander
1 alike
1 allergy
1 allison
1 allow
1 allowing
1 almonds
11 almost
1 alone
3 along
1 alongside
4 already
52 also
6 although
19 always
1 amateurish
1 amaze
1 amazed
33 amazing
1 amazingly
7 ambiance
3 ambience
2 america
2 ame

3 delish
2 deliver
1 delivered
2 delivering
3 delivers
1 delivery
1 denny
1 dependant
1 depending
1 depends
1 depicted
2 depicts
2 depressing
4 depth
1 derivative
2 describe
1 describes
1 describing
1 descriptions
1 desert
2 deserved
5 deserves
2 deserving
1 design
1 designed
1 designer
1 desired
1 desperately
1 desperation
1 despicable
1 despised
3 despite
5 dessert
3 desserts
1 destroy
1 detailing
1 details
1 deuchebaggery
1 develop
1 development
1 developments
1 devine
1 di
1 diabetic
3 dialog
1 dialogs
13 dialogue
1 diaper
1 dickens
2 die
2 difference
8 different
1 dignity
1 dime
1 dimensional
2 dine
8 dining
6 dinner
1 dinners
1 dipping
1 direct
3 directed
9 directing
5 direction
12 director
2 directorial
2 directors
2 dirt
4 dirty
1 disagree
2 disappoint
25 disappointed
7 disappointing
6 disappointment
1 disapppointment
3 disaster
2 disbelief
1 discomfort
1 discount
2 discovering
1 discovery
2 disgrace
1 disgraceful
1 disgust
1 disgusted
3 disgusting
8 dish
9 dishes
2 dislike
2 d

94 like
13 liked
3 likes
1 liking
1 lil
1 lilli
1 lilt
1 limitations
2 limited
1 linda
12 line
1 linear
1 lined
7 lines
1 lino
3 lion
5 list
1 listed
5 literally
1 littered
27 little
4 live
2 lived
2 lives
5 living
2 loads
3 lobster
1 local
1 located
7 location
1 locations
1 loewenhielm
1 logic
1 london
2 loneliness
13 long
3 longer
23 look
6 looked
5 looking
4 looks
1 loose
1 loosely
2 lord
1 lordy
2 los
2 losing
6 lost
16 lot
5 lots
1 loudly
2 lousy
1 lovable
41 love
20 loved
6 lovely
1 lover
2 lovers
2 loves
1 loving
7 low
1 lower
1 lox
1 loyal
1 loyalty
1 lucio
1 luck
2 lucy
1 lugosi
1 luke
2 lukewarm
10 lunch
1 lust
1 luv
1 lyrics
1 mac
1 macarons
2 macbeth
2 machine
2 mad
44 made
1 madhouse
1 madison
1 magazine
1 magic
1 magnificent
4 main
1 maine
1 mainly
1 mains
1 maintaining
1 major
30 make
1 maker
1 makers
8 makes
7 making
2 male
1 males
2 mall
1 malta
14 man
2 managed
6 management
4 manager
1 manages
1 mandalay
1 mango
1 manna
1 mansonites
23 many
1 marbles
1 march
2 margari

5 seems
25 seen
10 selection
3 selections
8 self
1 sells
1 semi
1 send
1 senior
4 sense
2 senses
1 sensibility
1 sensitivities
2 sentiment
1 seperate
2 sequel
1 sequels
2 sequence
1 sequences
1 sergeant
8 series
4 serious
10 seriously
1 serivce
2 serve
7 served
13 server
5 servers
2 serves
85 service
2 services
2 serving
5 set
4 sets
5 setting
1 settings
1 seuss
1 sever
5 several
1 sewer
1 sex
1 sexy
2 shakespear
1 shakespears
1 shall
1 shallow
2 shame
1 shameful
3 share
1 sharing
2 sharply
1 shatner
1 shattered
1 shawarrrrrrma
2 shed
1 sheer
1 shelf
1 shell
1 shelves
1 shenanigans
1 shepard
1 shined
1 shirley
1 shirt
1 shocked
1 shocking
1 shoe
1 shooting
1 shoots
1 shop
1 shopping
1 shops
12 short
1 shortlist
5 shot
4 shots
18 show
1 showcasing
4 showed
2 shower
3 shows
6 shrimp
1 shut
1 sibling
7 sick
8 side
1 sidelined
3 sides
2 sign
3 significant
1 signs
5 silent
1 silently
1 silly
2 similar
1 similarly
1 simmering
2 simple
1 simplifying
12 simply
13 since
1 sincere
1 sing
3 singi

In [33]:
print("Training the random forest...")
from sklearn.ensemble import RandomForestClassifier

# Initialize a Random Forest classifier with 100 trees
forest = RandomForestClassifier(n_estimators = 100) 

# Fit the forest to the training set, using the bag of words as 
# features and the sentiment labels as the response variable
#
# This may take a few minutes to run
forest = forest.fit( train_data_features, train["Labels"] )

Training the random forest...


In [34]:
forest.score(train_data_features, train["Labels"])

0.9979939819458375

In [75]:
# # Testing the data

# Read the test data
test = pd.read_csv("rt-polarityneg.csv")

# Verify that there are 25,000 rows and 2 columns
print(test.shape)

# Create an empty list and append the clean sentence one by one
num_sentences = len(test["Sentences"])
clean_test_sentence = [] 

print("Cleaning and parsing the test set sentences...\n")
for i in range(0,num_sentences):
    if( (i+1) % 1000 == 0 ):
        print("Review %d of %d\n" % (i+1, num_sentences))
    clean_sentence = Sentences_to_words( test["Sentences"][i] )
    clean_test_sentence.append( clean_sentence )

# Get a bag of words for the test set, and convert to a numpy array
test_data_features = vectorizer.transform(clean_test_sentence)
test_data_features = test_data_features.toarray()

# Use the random forest to make sentiment label predictions
result = forest.predict(test_data_features)

# Copy the results to a pandas dataframe with an "id" column and
# a "sentiment" column
# output = pd.DataFrame( data={"Sentences":test["Sentences"], "Labels":result} )

# Use pandas to write the comma-separated output file
# output.to_csv( "Bag_of_Words_model_tryout.csv", index=False, quoting=3)

(5332, 1)
Cleaning and parsing the test set sentences...

Review 1000 of 5332

Review 2000 of 5332

Review 3000 of 5332

Review 4000 of 5332

Review 5000 of 5332



In [58]:
df_result_main = pd.DataFrame()
df_result_main["Sentences"] = test["Sentences"]
df_result_main["Labels"] = result

In [60]:
df_result_main.head()

Unnamed: 0,Sentences,Labels
0,"simplistic , silly and tedious .",0
1,"it's so laddish and juvenile , only teenage bo...",1
2,exploitative and largely devoid of the depth o...,0
3,[garbus] discards the potential for pathologic...,0
4,a visually flashy but narratively opaque and e...,0


In [73]:
forest.score(test_data_features, df_result_main["Labels"])

1.0