In [1]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import nltk
%matplotlib inline

In [2]:
data = pd.read_csv("reviews.csv") #import dataset

In [3]:
data.head() #look at the last five reviews

Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments
0,1178162,4724140,2013-05-21,4298113,Olivier,My stay at islam's place was really cool! Good...
1,1178162,4869189,2013-05-29,6452964,Charlotte,Great location for both airport and city - gre...
2,1178162,5003196,2013-06-06,6449554,Sebastian,We really enjoyed our stay at Islams house. Fr...
3,1178162,5150351,2013-06-15,2215611,Marine,The room was nice and clean and so were the co...
4,1178162,5171140,2013-06-16,6848427,Andrew,Great location. Just 5 mins walk from the Airp...


In [4]:
#list all the column headers:
for i in data.columns:
    print(i)

listing_id
id
date
reviewer_id
reviewer_name
comments


In [5]:
len(data) #total number of reviews

68275

In [6]:
len(data.reviewer_name.unique()) #total number of hotels being reviewed in this dataset

14775

Natural Language Processing - Tokenize the reviews and build a bag-of-words model

The first goal is to do sentiment analysis on the positive and negative reviews. To do this, first tokenize the words using nltk, remove the stopwords, and build a bag-of-words model.

In [7]:
nltk.download("punkt")

[nltk_data] Downloading package punkt to C:\Users\Isaac
[nltk_data]     Sadikin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [8]:
reviews=data.comments
print(type(reviews))

<class 'pandas.core.series.Series'>


In [10]:
reviews_words = nltk.word_tokenize(reviews[1]) #word_tokenize only works for text file, not whole series

print(reviews_words) 

['Great', 'location', 'for', 'both', 'airport', 'and', 'city', '-', 'great', 'amenities', 'in', 'the', 'house', ':', 'Plus', 'Islam', 'was', 'always', 'very', 'helpful', 'even', 'though', 'he', 'was', 'away']


In [11]:
print(type(reviews[:5]))

<class 'pandas.core.series.Series'>


In [12]:
len(reviews)

68275

In [14]:
reviews_wordslist = []  
#for i in range(5):
for i in range(68275): 
    reviews_wordslist.append(nltk.word_tokenize(reviews.iloc[i])) 

TypeError: expected string or bytes-like object

In [15]:
print(reviews_wordslist[:5])

[['My', 'stay', 'at', 'islam', "'s", 'place', 'was', 'really', 'cool', '!', 'Good', 'location', ',', '5min', 'away', 'from', 'subway', ',', 'then', '10min', 'from', 'downtown', '.', 'The', 'room', 'was', 'nice', ',', 'all', 'place', 'was', 'clean', '.', 'Islam', 'managed', 'pretty', 'well', 'our', 'arrival', ',', 'even', 'if', 'it', 'was', 'last', 'minute', ';', ')', 'i', 'do', 'recommand', 'this', 'place', 'to', 'any', 'airbnb', 'user', ':', ')'], ['Great', 'location', 'for', 'both', 'airport', 'and', 'city', '-', 'great', 'amenities', 'in', 'the', 'house', ':', 'Plus', 'Islam', 'was', 'always', 'very', 'helpful', 'even', 'though', 'he', 'was', 'away'], ['We', 'really', 'enjoyed', 'our', 'stay', 'at', 'Islams', 'house', '.', 'From', 'the', 'outside', 'the', 'house', 'did', "n't", 'look', 'so', 'inviting', 'but', 'the', 'inside', 'was', 'very', 'nice', '!', 'Even', 'though', 'Islam', 'himself', 'was', 'not', 'there', 'everything', 'was', 'prepared', 'for', 'our', 'arrival', '.', 'The',

In [16]:
len(reviews_wordslist)

1077

In [17]:
type(reviews_wordslist)

list

In [18]:
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to C:\Users\Isaac
[nltk_data]     Sadikin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [19]:
len(nltk.corpus.stopwords.words("english")) #all the reviews in this dataset are in English

179

In [20]:
nltk.corpus.stopwords.words("english")[:10]

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

In [21]:
useless_words = nltk.corpus.stopwords.words("english")
type(useless_words)

list

In [22]:
def build_bag_of_words_filtered(words):
    return {
        #word:1 for word in words
        word:1 for word in words \
        if not word in useless_words}

In [23]:
assert len(build_bag_of_words_filtered(["what", "the"]))==0, "Make sure  filtering out stopwords"

In [24]:
positive_features = None
positive_features = [
    (build_bag_of_words_filtered(review),'pos') \
    for review in reviews_wordslist 
]

In [25]:
positive_features[-1]

({'The': 1,
  'location': 1,
  'apartment': 1,
  'great': 1,
  'short': 1,
  'walk': 1,
  'city': 1,
  '.': 1,
  'comforts': 1,
  'home': 1,
  'Saw': 1,
  'George': 1,
  '2': 1,
  'minutes': 1,
  'Would': 1,
  'go': 1,
  'back': 1,
  'Nice': 1,
  'view': 1,
  'harbor': 1,
  '!': 1},
 'pos')

In [26]:
positive_features[-3]

({'George': 1,
  'great': 1,
  'host': 1,
  'around': 1,
  ',': 1,
  'apartment': 1,
  'wonderful': 1,
  'views': 1,
  '.': 1,
  'Would': 1,
  'highly': 1,
  'recommend': 1,
  '!': 1},
 'pos')

In [27]:
type(positive_features)

list

In [28]:
negative_features = None
negative_features = [
    (build_bag_of_words_filtered(review),'neg') \
    for review in reviews_wordslist 
]

In [29]:
negative_features[-2:]

[({'I': 1,
   'recommend': 1,
   'without': 1,
   'hesitation': 1,
   'one': 1,
   'best': 1,
   'experiences': 1,
   'travel': 1,
   '.': 1,
   "'m": 1,
   'looking': 1,
   'forward': 1,
   'returning': 1,
   'George': 1,
   'made': 1,
   'stay': 1,
   'pleasant': 1,
   'comfortable': 1},
  'neg'),
 ({'The': 1,
   'location': 1,
   'apartment': 1,
   'great': 1,
   'short': 1,
   'walk': 1,
   'city': 1,
   '.': 1,
   'comforts': 1,
   'home': 1,
   'Saw': 1,
   'George': 1,
   '2': 1,
   'minutes': 1,
   'Would': 1,
   'go': 1,
   'back': 1,
   'Nice': 1,
   'view': 1,
   'harbor': 1,
   '!': 1},
  'neg')]

Train a classifier for sentiment analysis

use the Naive Bayes classifier ; train it on 80 percent of the data, and test on the remaining 20 percent

In [30]:
from nltk.classify import NaiveBayesClassifier

In [31]:
#Using 80% of the data for training, the rest for validation:
split = int(len(positive_features) * 0.8)
split

861

In [32]:
classifier = NaiveBayesClassifier.train(positive_features[:split]+negative_features[:split])

#check the accuracy on the training and test sets, turning accuracy into percentage:

In [33]:
training_accuracy = None #check accuracy of training set
training_accuracy = nltk.classify.util.accuracy(classifier, positive_features[:split] + negative_features[:split])*100
training_accuracy

50.0

In [34]:
test_accuracy = None #check accuracy of test set
test_accuracy = nltk.classify.util.accuracy(classifier, positive_features[split:] + negative_features[split:])*100
test_accuracy

50.0

In [35]:
classifier.show_most_informative_features()

Most Informative Features
                received = None              neg : pos    =      1.0 : 1.0
                interest = None              neg : pos    =      1.0 : 1.0
                     sin = None              neg : pos    =      1.0 : 1.0
                       u = None              neg : pos    =      1.0 : 1.0
                  laying = 1                 neg : pos    =      1.0 : 1.0
                reaching = 1                 neg : pos    =      1.0 : 1.0
                  basics = None              neg : pos    =      1.0 : 1.0
                 总体来说挺好的 = 1                 neg : pos    =      1.0 : 1.0
              atenciones = None              neg : pos    =      1.0 : 1.0
               expecting = 1                 neg : pos    =      1.0 : 1.0


In [36]:
classifier.show_most_informative_features(50)

Most Informative Features
                received = None              neg : pos    =      1.0 : 1.0
                interest = None              neg : pos    =      1.0 : 1.0
                     sin = None              neg : pos    =      1.0 : 1.0
                       u = None              neg : pos    =      1.0 : 1.0
                  laying = 1                 neg : pos    =      1.0 : 1.0
                reaching = 1                 neg : pos    =      1.0 : 1.0
                  basics = None              neg : pos    =      1.0 : 1.0
                 总体来说挺好的 = 1                 neg : pos    =      1.0 : 1.0
              atenciones = None              neg : pos    =      1.0 : 1.0
               expecting = 1                 neg : pos    =      1.0 : 1.0
                      Be = 1                 neg : pos    =      1.0 : 1.0
                  Bodton = None              neg : pos    =      1.0 : 1.0
                   Small = None              neg : pos    =      1.0 : 1.0