# Setting up notebook

## Mount Drive

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Change the directory

In [None]:
import os
# Work from the MIS584 folder on the mounted Drive so that the relative file
# names opened in later cells resolve against it.
os.chdir('/content/drive/MyDrive/MIS584/')

## Import necessary libraries

In [None]:
import pandas as pd
import seaborn as sns
# NOTE(review): %pylab star-imports numpy and matplotlib names into the global
# namespace (see the "Populating the interactive namespace" output). Explicit
# imports would be clearer, but confirm that no cell relies on the bare names
# before replacing it.
%pylab inline
# Display options: long sequences, up to 500 rows/columns, 1000-character wide
# output, and floats rendered with four decimal places.
pd.options.display.max_seq_items = 2000
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
pd.set_option('display.float_format', lambda x: '%.4f' % x)
import requests, re
# NOTE(review): pandas and seaborn are already imported at the top of this
# cell; the next two lines are redundant.
import pandas as pd
import seaborn as sns
import string, itertools
from collections import Counter, defaultdict
import json
import csv
# scikit-learn pieces used by the SVM section: split, text features, label
# encoding, classifier, metrics and probability calibration.
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import LinearSVC
from sklearn.metrics import confusion_matrix, accuracy_score, recall_score,\
    precision_score, f1_score, roc_auc_score
from sklearn.calibration import CalibratedClassifierCV

Populating the interactive namespace from numpy and matplotlib


# Load Files

## Read Business json file

In [None]:
# Load the Yelp business file, which holds one JSON object per line.
# The context manager closes the file even if a line fails to parse, and the
# explicit encoding keeps the read independent of the platform default.
with open("yelp_academic_dataset_business.json", encoding="utf-8") as data_file:
    data = [json.loads(line) for line in data_file]
business_df = pd.DataFrame(data)

In [None]:
# Summarise the business table: row count, then column names and the first
# ten rows, each preceded by a spacer line and a heading.
print("Number of observations in yelp biz dataset: ", len(business_df))
biz_sections = (
    ("Attributes of yelp biz dataset: ", business_df.columns),
    ("First 10 observations in yelp biz dataset: ", business_df.head(10)),
)
for section_heading, section_value in biz_sections:
    print(" ")
    print(section_heading)
    print(section_value)

Number of observations in yelp biz dataset:  150346
 
Attributes of yelp biz dataset: 
Index(['business_id', 'name', 'address', 'city', 'state', 'postal_code', 'latitude', 'longitude', 'stars', 'review_count', 'is_open', 'attributes', 'categories', 'hours'], dtype='object')
 
First 10 observations in yelp biz dataset: 
              business_id                      name                                     address            city state postal_code  latitude  longitude  stars  review_count  is_open                                         attributes                                         categories                                              hours
0  Pns2l4eNsfO8kk83dixA6A  Abby Rappoport, LAC, CMQ                      1616 Chapala St, Ste 2   Santa Barbara    CA       93101   34.4267  -119.7112 5.0000             7        0                      {'ByAppointmentOnly': 'True'}  Doctors, Traditional Chinese Medicine, Naturop...                                               None
1  mpf3x-Bj

## Read Reviews json file

Keep only reviews written in 2021.

In [None]:
# Load only the reviews dated within 2021 from the line-delimited JSON file.
# The 'date' values are "YYYY-MM-DD HH:MM:SS" strings, so plain string
# comparison orders them chronologically.
start_date, end_date = "2021-01-01 00:00:00", "2022-01-01 00:00:00"
data = []
with open("yelp_academic_dataset_review.json", encoding="utf-8") as data_file:
    for line in data_file:
        # Parse each line exactly once; the record is reused for both the
        # date test and the append.
        review = json.loads(line)
        if start_date <= review['date'] < end_date:
            data.append(review)
review_df = pd.DataFrame(data)

In [None]:
# Summarise the review table: row count, then column names and the first ten
# rows, each preceded by a spacer line and a heading.
print("Number of observations in yelp review dataset: ", len(review_df))
review_sections = (
    ("Attributes of yelp review dataset: ", review_df.columns),
    ("First 10 observations in review biz dataset: ", review_df.head(10)),
)
for section_heading, section_value in review_sections:
    print(" ")
    print(section_heading)
    print(section_value)

Number of observations in yelp review dataset:  618189
 
Attributes of yelp review dataset: 
Index(['review_id', 'user_id', 'business_id', 'stars', 'useful', 'funny', 'cool', 'text', 'date'], dtype='object')
 
First 10 observations in review biz dataset: 
                review_id                 user_id             business_id  stars  useful  funny  cool                                               text                 date
0  iBUJvIOkToh2ZECVNq5PDg  iAD32p6h32eKDVxsPHSRHA  YB26JvvGS2LgkxEKOObSAw 5.0000       0      0     0  I've been eating at this restaurant for over 5...  2021-01-08 01:49:36
1  HgEofz6qEQqKYPT7YLA34w  rYvWv-Ny16b1lMcw1IP7JQ  jfIwOEXcVRyhZjM4ISOh4g 1.0000       0      0     0  How does a delivery person from here get lost ...  2021-01-02 00:19:00
2  milJ7UH4Od9pBe2gWac9tA  v7i4M7NIx3bMNMChaXjU7Q  raKflkp3CANr8N7qpQ3ZyQ 5.0000       0      0     0  I WISH I was still a Sierra resident. They're ...  2021-02-02 18:14:15
3  Kxo5d6EOnOE-vERwQf2a1w  2ntnbUia9Bna62W0fqNcx

# Cleaning



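1.   Remove quotation marks from the name and address columns (replace `"` with an empty string)
2.   Keep only businesses located in US states (and DC)
3.   Drop records with missing (NA) values
4.   Filter all restaurants in the US (performed in the Filtering section below)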




In [None]:
# Strip double-quote characters from the name and address columns.
for text_column in ('name', 'address'):
    business_df[text_column] = business_df[text_column].str.replace('"','')

# Two-letter codes used to keep US businesses only (50 states plus DC).
states = [
    "AL", "AK", "AZ", "AR", "CA", "CO", "CT", "DC", "DE", "FL", "GA",
    "HI", "ID", "IL", "IN", "IA", "KS", "KY", "LA", "ME", "MD",
    "MA", "MI", "MN", "MS", "MO", "MT", "NE", "NV", "NH", "NJ",
    "NM", "NY", "NC", "ND", "OH", "OK", "OR", "PA", "RI", "SC",
    "SD", "TN", "TX", "UT", "VT", "VA", "WA", "WV", "WI", "WY",
]

# Keep rows whose state is in the list, then drop every row that has a missing
# value in any column.
in_us = business_df['state'].isin(states)
usa = business_df.loc[in_us].dropna()

# Report the size of the table before and after filtering.
for caption, frame in (
    ("Number of observations in yelp biz dataset: ", business_df),
    ("Number of observations in yelp USA biz dataset: ", usa),
):
    print(caption, len(frame))
    print(" ")
print("Attributes of yelp biz dataset: ")
print(business_df.columns)
print(" ")
print("First 10 observations in yelp USA biz dataset: ")
print(usa.head(10))


Number of observations in yelp biz dataset:  150346
 
Number of observations in yelp USA biz dataset:  113506
 
Attributes of yelp biz dataset: 
Index(['business_id', 'name', 'address', 'city', 'state', 'postal_code', 'latitude', 'longitude', 'stars', 'review_count', 'is_open', 'attributes', 'categories', 'hours'], dtype='object')
 
First 10 observations in yelp USA biz dataset: 
               business_id                      name                                     address           city state postal_code  latitude  longitude  stars  review_count  is_open                                         attributes                                         categories                                              hours
1   mpf3x-BjTdTEA3yCZrAYPw             The UPS Store             87 Grasso Plaza Shopping Center         Affton    MO       63123   38.5511   -90.3357 3.0000            15        1             {'BusinessAcceptsCreditCards': 'True'}  Shipping Centers, Local Services, Notaries, Ma... 

# Filtering

1.  Select all restaurants in USA 
2.  Select 16 cuisine types of restaurants and rename the category
3.  Drop null values in category column
4.  Delete original column categories along with attributes and hours and reset the index

In [None]:
# Select all restaurants in the USA. The explicit .copy() makes this an
# independent frame, so adding the 'category' column below no longer triggers
# SettingWithCopyWarning (which the earlier `is_copy = False` assignment did
# not prevent).
us_restaurants = usa[usa['categories'].str.contains('Restaurants')].copy()

# (substring searched for in `categories`, label written to `category`).
# Order matters: a business matching several cuisines keeps the LAST matching
# label, exactly as a sequence of one-by-one assignments would.
# NOTE(review): 'Middle_eastern' is searched for literally, underscore
# included. If the data spells this category "Middle Eastern", the pattern
# never matches and only 15 cuisines are effectively selected. Confirm
# against the data before changing the pattern.
cuisine_patterns = [
    ('American', 'American'),
    ('Mexican', 'Mexican'),
    ('Italian', 'Italian'),
    ('Japanese', 'Japanese'),
    ('Chinese', 'Chinese'),
    ('Thai', 'Thai'),
    ('Mediterranean', 'Mediterranean'),
    ('French', 'French'),
    ('Vietnamese', 'Vietnamese'),
    ('Greek', 'Greek'),
    ('Indian', 'Indian'),
    ('Korean', 'Korean'),
    ('Hawaiian', 'Hawaiian'),
    ('African', 'African'),
    ('Spanish', 'Spanish'),
    ('Middle_eastern', 'Middle_eastern'),
]
for cuisine_pattern, cuisine_label in cuisine_patterns:
    matches_cuisine = us_restaurants['categories'].str.contains(cuisine_pattern, regex=False)
    us_restaurants.loc[matches_cuisine, 'category'] = cuisine_label

# Drop restaurants that matched none of the cuisines, remove the columns that
# are no longer needed, and renumber the rows from zero.
us_restaurants = (
    us_restaurants
    .dropna(subset=['category'])
    .drop(columns=['categories', 'attributes', 'hours'])
    .reset_index(drop=True)
)
print("Number of observations in yelp USA restaurants biz dataset: ",us_restaurants.shape[0])
print(" ")
print("First 10 Observations in yelp us restaurants dataset:\n")
print(us_restaurants.head(10))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


Number of observations in yelp USA restaurants biz dataset:  24909
 
First 10 Observations in yelp us restaurants dataset:

              business_id                                 name             address          city state postal_code  latitude  longitude  stars  review_count  is_open    category
0  eEOYSgkmpB90uNA7lDOMRA                Vietnamese Food Truck                         Tampa Bay    FL       33602   27.9553   -82.4563 4.0000            10        1  Vietnamese
1  il_Ro8jwPlHresjw9EGmBg                              Denny's        8901 US 31 S  Indianapolis    IN       46227   39.6371   -86.1272 2.5000            28        1    American
2  0bPLkL0QhhPO5kt1_EXmNQ                 Zio's Italian Market       2575 E Bay Dr         Largo    FL       33771   27.9161   -82.7605 4.5000           100        0     Italian
3  MUTTqe8uqyMdBl186RmNeA                             Tuna Bar         205 Race St  Philadelphia    PA       19106   39.9539   -75.1432 4.0000           245        

# Sanity Checks

In [None]:
# Check the number of US restaurants (rows) and columns.
us_restaurants.shape

(24909, 12)

In [None]:
# Count duplicated business_id values (0 means every id is unique).
us_restaurants.business_id.duplicated().sum()

0

In [None]:
# Count missing values per column of the restaurant table.
us_restaurants.isnull().sum()

business_id     0
name            0
address         0
city            0
state           0
postal_code     0
latitude        0
longitude       0
stars           0
review_count    0
is_open         0
category        0
dtype: int64

In [None]:
# Check the number of reviews (rows) and columns.
review_df.shape

(618189, 9)

In [None]:
# Count missing values per column of the review table.
review_df.isnull().sum()

review_id      0
user_id        0
business_id    0
stars          0
useful         0
funny          0
cool           0
text           0
date           0
dtype: int64

In [None]:
# Count duplicated review_id values (0 means every id is unique).
review_df.review_id.duplicated().sum()

0

# Merging

1. Merge business dataframe and review dataframe
2. Update column names
3. Add two new columns - number of words in review and label (Positive, Neutral, Negative)




In [None]:
# Merge the business table and the review table on business_id. Both tables
# have a 'stars' column, which pandas suffixes as stars_x (business) and
# stars_y (review); rename them to say what they mean.
restaurants_reviews = pd.merge(us_restaurants, review_df, on='business_id')
restaurants_reviews = restaurants_reviews.rename(
    columns={'stars_x': 'avg_star', 'stars_y': 'review_star'}
)

# Number of words per review, counted after stripping punctuation.
# * The pattern is a raw string with '-', '[' and ']' escaped. It removes the
#   same characters as before (all ASCII punctuation except the apostrophe
#   and the backslash) without relying on an invalid escape sequence.
# * regex=True is explicit because pandas 2.0 changed the default to False,
#   under which the character class would be matched literally and no
#   punctuation would be removed.
# * Newlines become a space rather than being deleted, so the last word of
#   one line is not fused with the first word of the next.
punctuation_pattern = r'[!"#$%&()*+,\-./:;<=>?@\[\]^_`{|}~]'
restaurants_reviews['num_words_review'] = (
    restaurants_reviews['text']
    .str.replace('\n', ' ', regex=False)
    .str.replace(punctuation_pattern, '', regex=True)
    .map(lambda review_text: len(review_text.split()))
)

# Label reviews by star rating: 4-5 Positive, 3 Neutral, 1-2 Negative.
restaurants_reviews['labels'] = ''
restaurants_reviews.loc[restaurants_reviews['review_star'] >= 4, 'labels'] = 'Positive'
restaurants_reviews.loc[restaurants_reviews['review_star'] == 3, 'labels'] = 'Neutral'
restaurants_reviews.loc[restaurants_reviews['review_star'] < 3, 'labels'] = 'Negative'

  if __name__ == '__main__':


In [None]:
# Preview the first five rows of the merged restaurant/review table.
restaurants_reviews.head(5)

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,avg_star,review_count,is_open,category,review_id,user_id,review_star,useful,funny,cool,text,date,num_words_review,labels
0,eEOYSgkmpB90uNA7lDOMRA,Vietnamese Food Truck,,Tampa Bay,FL,33602,27.9553,-82.4563,4.0,10,1,Vietnamese,6znAMW-mwegBF54aXkfxEg,kd6Rt_K3hIikXH5fIhmn_Q,3.0,0,0,0,I really really wanted to like this place. Th...,2021-10-02 01:23:08,108,Neutral
1,eEOYSgkmpB90uNA7lDOMRA,Vietnamese Food Truck,,Tampa Bay,FL,33602,27.9553,-82.4563,4.0,10,1,Vietnamese,CE36Dcxv60ibaHntpm94zg,qjfMBIZpQT9DDtw_BWCopQ,5.0,18,2,14,Vietnamese Food Truck was a super perfect dinn...,2021-12-01 00:06:58,177,Positive
2,il_Ro8jwPlHresjw9EGmBg,Denny's,8901 US 31 S,Indianapolis,IN,46227,39.6371,-86.1272,2.5,28,1,American,i_ErGQkWb9o8Yr59PvJFMw,7ahDVjzGcAcRxdsx4AGUzA,4.0,1,0,1,"Ok I know!! It's DENNY""S!! (""Lenny's! LOL! Sho...",2021-01-29 00:03:33,93,Positive
3,il_Ro8jwPlHresjw9EGmBg,Denny's,8901 US 31 S,Indianapolis,IN,46227,39.6371,-86.1272,2.5,28,1,American,XjYTEIrgqwXdCtc-JMaADQ,WjQBeRnBKgqGIoIsmn37Cg,1.0,0,0,0,If you want bad service and an assistant manag...,2021-04-01 02:45:09,56,Negative
4,il_Ro8jwPlHresjw9EGmBg,Denny's,8901 US 31 S,Indianapolis,IN,46227,39.6371,-86.1272,2.5,28,1,American,hiB10SbNGTG8zgUiwBPoiQ,UBYO9lnBRa6OGRvdekEyZQ,1.0,0,0,0,I went there with my uncle and grandma on Sund...,2021-03-09 09:42:28,80,Negative


# SVM

In [None]:
# Drop neutral reviews so the classification task is binary
# (Positive vs. Negative), then renumber the rows from zero.
is_neutral = restaurants_reviews['labels'] == 'Neutral'
restaurants_reviews = restaurants_reviews.loc[~is_neutral].reset_index(drop=True)

# Convert the review text to lower case.
restaurants_reviews['text'] = restaurants_reviews['text'].str.lower()

# Remove punctuation into a new column.
# * The pattern is a raw string with '-', '[' and ']' escaped. It removes the
#   same characters as before (all ASCII punctuation except the apostrophe
#   and the backslash) without relying on an invalid escape sequence.
# * regex=True is explicit because pandas 2.0 changed the default to False,
#   under which the character class would be matched literally and no
#   punctuation would be removed.
# * Newlines become a space rather than being deleted, so words on either
#   side of a line break are not fused into one unrecognisable token.
punctuation_pattern = r'[!"#$%&()*+,\-./:;<=>?@\[\]^_`{|}~]'
restaurants_reviews['removed_punct_text'] = (
    restaurants_reviews['text']
    .str.replace('\n', ' ', regex=False)
    .str.replace(punctuation_pattern, '', regex=True)
)

  # Remove the CWD from sys.path while we load stuff.


In [None]:
# Load the positive word list (e.g. "good"): the first field of every row in
# positive.txt. The `with` block closes the file, newline='' is the mode the
# csv module asks for, and `if row` skips blank lines, which csv.reader yields
# as empty lists and which would otherwise raise IndexError on row[0].
with open('positive.txt', newline='') as file_positive:
    positive_words = [row[0] for row in csv.reader(file_positive) if row]

# Load the negative word list (e.g. "bad") from negative.txt the same way.
with open('negative.txt', newline='') as file_negative:
    negative_words = [row[0] for row in csv.reader(file_negative) if row]

In [None]:
# Cache for the combined word set. It keeps references to the two source lists
# and their lengths so the set is rebuilt only when a list is replaced or
# changes size.
_lexicon_cache = {}


def _sentiment_lexicon():
    """Return the set of all positive and negative words, rebuilt only when the lists change."""
    sizes = (len(positive_words), len(negative_words))
    cached = _lexicon_cache.get('entry')
    if (cached is None
            or cached['positive'] is not positive_words
            or cached['negative'] is not negative_words
            or cached['sizes'] != sizes):
        cached = {
            'positive': positive_words,
            'negative': negative_words,
            'sizes': sizes,
            'words': frozenset(positive_words).union(negative_words),
        }
        _lexicon_cache['entry'] = cached
    return cached['words']


def filter_words(review):
    """Keep only the words of `review` that appear in the positive or negative word lists.

    Parameters
    ----------
    review : str
        Review text; it is split on whitespace.

    Returns
    -------
    str
        The retained words, in their original order and with repeats kept,
        joined by single spaces. Empty string if nothing matches.

    Notes
    -----
    Membership is tested against a cached set instead of concatenating and
    scanning the two lists for every word, which made each lookup linear in
    the size of the word lists.
    """
    lexicon = _sentiment_lexicon()
    return ' '.join(word for word in review.split() if word in lexicon)

In [None]:
# Build the modelling frame: punctuation-free text plus its label. Chaining
# rename/reset_index returns an independent frame, so the column assignment
# below does not raise SettingWithCopyWarning the way in-place edits on a
# column slice did.
df = (
    restaurants_reviews[['removed_punct_text', 'labels']]
    .rename(columns={'removed_punct_text': 'text'})
    .reset_index(drop=True)
)
print(df.head(2))

# Reduce every review to the words found in the positive/negative word lists.
df['text'] = df['text'].apply(filter_words)


training set size: 194452; test set size: 48613



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [None]:
# Features are the filtered review texts; targets are the sentiment labels.
X, y = df['text'], df['labels']

# Hold out 20% of the reviews for testing; the fixed seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1
)

n_train, n_test = X_train.shape[0], X_test.shape[0]
print('\ntraining set size: {}; test set size: {}\n'.format(n_train, n_test))


training set size: 194452; test set size: 48613



In [None]:
# Plain Python lists of the training and test documents for the vectoriser.
terms_train = X_train.tolist()
terms_test = X_test.tolist()

In [None]:
# Bag-of-words counts: the vocabulary is learned from the training documents
# and then applied unchanged to the test documents.
vectorizer = CountVectorizer()
feature_train_counts = vectorizer.fit_transform(terms_train)
feature_test_counts = vectorizer.transform(terms_test)

# TF-IDF weighting: fitted on the training counts, applied to both splits.
tf_transformer = TfidfTransformer()
tf_transformer.fit(feature_train_counts)
feature_train_transformed = tf_transformer.transform(feature_train_counts)
feature_test_transformed = tf_transformer.transform(feature_test_counts)

# Encode the string labels as integers; the encoder is fitted on the training
# labels only.
labels = LabelEncoder()
y_train_labels_fit = labels.fit(y_train)
y_train_labels_trf = labels.transform(y_train)
y_test_labels_trf = labels.transform(y_test)

print(labels.classes_)
print(y_train_labels_trf)

['Negative' 'Positive']
[0 0 1 ... 1 0 0]


In [None]:
# Fit the linear SVM on the TF-IDF training features. `clf` is the same fitted
# object as `svm`, because fit() returns the estimator itself. The seed makes
# the solver's result reproducible.
svm = LinearSVC(random_state=1)
clf = svm.fit(feature_train_transformed, y_train_labels_trf)

# Calibrate with 5-fold cross-validation: each fold's calibrator is fitted on
# data its SVM clone did not see. The previous cv="prefit" setup calibrated on
# the very data the SVM was trained on, which scikit-learn requires to be
# disjoint. The estimator is passed positionally because its keyword was
# renamed from `base_estimator` to `estimator` in newer scikit-learn releases.
calibrated_svc = CalibratedClassifierCV(svm, cv=5)

calibrated_svc.fit(feature_train_transformed, y_train_labels_trf)

CalibratedClassifierCV(base_estimator=LinearSVC(), cv='prefit')

In [None]:
# Predict the encoded label for every test review with the calibrated SVM.
svm_pred = calibrated_svc.predict(feature_test_transformed)
print(svm_pred)


[1 1 1 ... 1 1 0]


In [None]:
# Score the predictions against the encoded test labels. Recall, precision
# and F1 use scikit-learn's default positive class, label 1.
svm_conf_mat = confusion_matrix(y_test_labels_trf, svm_pred)
svm_acc = accuracy_score(y_test_labels_trf, svm_pred)
svm_recall = recall_score(y_test_labels_trf, svm_pred)
svm_precision = precision_score(y_test_labels_trf, svm_pred)
svm_f1 = f1_score(y_test_labels_trf, svm_pred)

print('SVM Confusion Matrix:')
print(svm_conf_mat)
print()

# Print the four scores in a fixed order, four decimal places each.
score_report = (
    ('Prediction accuracy: {:.4f}', svm_acc),
    ('Prediction recall: {:.4f}', svm_recall),
    ('Prediction precision: {:.4f}', svm_precision),
    ('Prediction F1: {:.4f}', svm_f1),
)
for report_template, score_value in score_report:
    print(report_template.format(score_value))

SVM Confusion Matrix:
[[10296  1727]
 [ 1321 35269]]

Prediction accuracy: 0.9373
Prediction recall: 0.9639
Prediction precision: 0.9533
Prediction F1: 0.9586
