In [1]:
import json
import csv
import pandas as pd
import numpy as np
import scipy as sci
from scipy import sparse
from sklearn.model_selection import train_test_split
import sklearn.feature_extraction.text as sk_text
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn import preprocessing
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import accuracy_score

In [2]:
# Read business.json and write a TSV holding business_id, categories, stars and
# review_count. stars is the label; the other columns feed the features.

# Both files are managed by `with` so they are closed even if a row fails to
# parse. newline='' is what the csv module asks for on files it writes, and the
# explicit encoding keeps non-ASCII category text writable on every platform.
with open('yelp_dataset/yelp_academic_dataset_business.json', encoding="utf8") as f, \
        open("business.tsv", 'w', newline='', encoding="utf8") as outfile:
    sfile = csv.writer(outfile, delimiter="\t", quoting=csv.QUOTE_MINIMAL)
    sfile.writerow(['business_id', 'categories', 'stars', 'review_count'])
    for line in f:
        row = json.loads(line)
        sfile.writerow([row['business_id'], row['categories'], row['stars'], row['review_count']])

# Load the TSV that was just written into a dataframe.
business_df = pd.read_csv('business.tsv', delimiter="\t", encoding="utf8")

In [3]:
# Read review.json and write a TSV holding business_id, stars and the review text.
# text is used for the features and stars as the label.

# The text is written as a plain str. Passing bytes (text.encode('utf-8')) makes
# the csv writer store the bytes repr, i.e. the literal characters b'...' with
# escaped \n and \x.. sequences, which then end up in the review text.
# Special characters are handled by opening the output file as UTF-8 instead.
with open('yelp_dataset/yelp_academic_dataset_review.json', encoding="utf8") as f, \
        open("review_stars.tsv", 'w', newline='', encoding="utf8") as outfile:
    sfile = csv.writer(outfile, delimiter="\t", quoting=csv.QUOTE_MINIMAL)
    sfile.writerow(['business_id', 'stars', 'text'])
    for line in f:
        row = json.loads(line)
        sfile.writerow([row['business_id'], row['stars'], row['text']])

# Load the TSV that was just written into a dataframe.
review_df = pd.read_csv('review_stars.tsv', delimiter="\t", encoding="utf8")

KeyboardInterrupt: 

In [None]:
# Group all reviews by business_id into one text blob per business.
# The reviews are joined with a space: summing strings concatenates them with no
# separator, so the last word of one review fuses with the first word of the next.
# Missing text is treated as an empty string so the join only ever sees strings.
review_agg_df = (
    review_df['text']
    .fillna('')
    .astype(str)
    .groupby(review_df['business_id'])
    .agg(' '.join)
)
# Flatten the grouped Series into a two-column frame keyed by business_id.
df_ready_for_sklearn = pd.DataFrame({'business_id': review_agg_df.index, 'all_reviews': review_agg_df.values})


In [None]:
# Join the per-business review aggregate onto the business dataframe,
# matching rows on business_id.
merge_df = business_df.merge(df_ready_for_sklearn, on='business_id')


In [None]:
# Show the first 5 rows of the merged dataframe.
merge_df.head(n=5)

In [None]:
# Min-max normalise review_count into [0, 1] and store it as 'normalized_count'
# in place of the raw column.
# The guard makes the cell safe to re-run: once review_count has been replaced
# there is nothing left to do, whereas a second insert/drop would raise.
if 'review_count' in merge_df.columns:
    review_counts = merge_df['review_count'].astype(float)
    count_range = review_counts.max() - review_counts.min()
    if count_range == 0:
        # Every row has the same count: avoid 0/0 turning the feature into NaN.
        normalized_count = review_counts * 0.0
    else:
        normalized_count = (review_counts - review_counts.min()) / count_range
    # Position 3 puts the new column where review_count was.
    merge_df.insert(3, 'normalized_count', normalized_count)
    merge_df.drop('review_count', axis=1, inplace=True)

In [None]:
# Show the first 5 rows again, now including the normalized counts.
merge_df.head(n=5)

In [None]:
# TF-IDF vectorizer: lower-cased single-word tokens, English stop words
# removed, vocabulary capped at 1000 features.
tfidf = sk_text.TfidfVectorizer(
    analyzer='word',
    lowercase=True,
    stop_words='english',
    ngram_range=(1, 1),
    max_features=1000,
)

In [None]:
# Fit the TF-IDF vectorizer on the aggregated reviews column and keep the
# result as a dense array.
matrix = tfidf.fit_transform(merge_df['all_reviews']).toarray()

In [None]:
# Append the normalized review count as one extra column to the right of the
# TF-IDF matrix.
normalized_count_column = merge_df['normalized_count'].to_numpy().reshape(-1, 1)
x_matrix = np.hstack((matrix, normalized_count_column))

In [None]:
# Train/test split for linear regression: 80% train, 20% test, fixed seed.
x_train, x_test, y_train, y_test = train_test_split(
    x_matrix,
    merge_df['stars'],
    random_state=42,
    test_size=0.2,
)

In [None]:
# Check the shape of the training features.
np.shape(x_train)

In [None]:
# Check the shape of the test features.
np.shape(x_test)


In [None]:
# Check the shape of the training labels.
np.shape(y_train)

In [None]:
# Check the shape of the test labels.
np.shape(y_test)

In [None]:
# Linear regression: fit on the training split, then predict the test split.
lin_reg_model = LinearRegression().fit(x_train, y_train)
y_pred = lin_reg_model.predict(x_test)

# Display the predictions.
y_pred

In [None]:
# List the first test businesses with their actual stars and the prediction.
# The values are printed as floats: the regression output is continuous (and
# stars may be fractional), so %d would silently truncate e.g. 3.5 to 3.
# The range is capped by the split size so a small test set does not raise.
for i in range(min(10, len(y_test))):
    # y_test keeps merge_df's index labels, so idx addresses the same row in both.
    idx = y_test.index[i]
    print("business id - %s actual stars label - %.1f predicted - %.2f"
          % (merge_df['business_id'][idx], y_test[idx], y_pred[i]))

In [None]:
# Performance of the linear regression model.
# A lower mean squared error means better performance.
linear_mse = mean_squared_error(y_test, y_pred)
linear_r2 = r2_score(y_test, y_pred)
print("Mean squared error: %.2f" % linear_mse)
print('R2 score: %.2f' % linear_r2)

In [None]:
# The classifiers trained below need the stars encoded as class labels:
# fit the encoder on the stars column, then store the encoded values.
le = preprocessing.LabelEncoder()
le.fit(merge_df['stars'])
merge_df['encoded_stars'] = le.transform(merge_df['stars'])

In [None]:
# Show the first 5 rows, now including the encoded_stars column.
merge_df.head(n=5)

In [None]:
# Train/test data for the other models: same features, encoded stars as label.
x_train1, x_test1, y_train1, y_test1 = train_test_split(
    x_matrix,
    merge_df['encoded_stars'],
    random_state=42,
    test_size=0.2,
)

In [None]:
# Logistic regression: fit on the training split, then predict the test split.
Log_reg_model = LogisticRegression().fit(x_train1, y_train1)
y_pred1 = Log_reg_model.predict(x_test1)

# Display the predicted class labels.
y_pred1

In [None]:
# List the first test businesses with their actual stars and the prediction.
# y_test1 and y_pred1 hold LabelEncoder class indices, not stars, so both are
# decoded with the fitted encoder before printing; otherwise the line labelled
# "stars" would show the encoded index.
# The range is capped by the split size so a small test set does not raise.
for i in range(min(10, len(y_test1))):
    # y_test1 keeps merge_df's index labels, so idx addresses the same row in both.
    idx = y_test1.index[i]
    actual_stars, predicted_stars = le.inverse_transform([y_test1[idx], y_pred1[i]])
    print("business id - %s actual stars label - %.1f predicted - %.1f"
          % (merge_df['business_id'][idx], actual_stars, predicted_stars))

In [None]:
# Performance of the logistic regression model.
# MSE and R2 are computed on the encoded class labels (lower MSE is better).
print("Mean squared error: %.2f"
      % mean_squared_error(y_test1, y_pred1))
print('R2 score: %.2f' % r2_score(y_test1, y_pred1))
# Also report accuracy, the metric printed for the other classifiers in this
# notebook, so this model can be compared with them directly.
print("accuracy: %.2f"
      % accuracy_score(y_test1, y_pred1))

In [None]:
# Nearest-neighbour classifier with 9 neighbours.
# KNeighborsClassifier comes from the notebook's top import cell; the duplicate
# mid-notebook import was dropped so all dependencies stay declared in one place.
knn = KNeighborsClassifier(n_neighbors=9)

knn.fit(x_train1, y_train1)

# Predict the encoded star class for the test split.
y_pred1 = knn.predict(x_test1)

y_pred1

In [None]:
# List the first test businesses with their actual stars and the prediction.
# y_test1 and y_pred1 hold LabelEncoder class indices, not stars, so both are
# decoded with the fitted encoder before printing; otherwise the line labelled
# "stars" would show the encoded index.
# The range is capped by the split size so a small test set does not raise.
for i in range(min(10, len(y_test1))):
    # y_test1 keeps merge_df's index labels, so idx addresses the same row in both.
    idx = y_test1.index[i]
    actual_stars, predicted_stars = le.inverse_transform([y_test1[idx], y_pred1[i]])
    print("business id - %s actual stars label - %.1f predicted - %.1f"
          % (merge_df['business_id'][idx], actual_stars, predicted_stars))

In [None]:
# Fraction of test businesses whose encoded star class was predicted exactly.
knn_accuracy = accuracy_score(y_test1, y_pred1)
print("accuracy: %.2f" % knn_accuracy)

In [None]:
# SVM with a linear kernel: fit on the training split, predict the test split.
svm_model = SVC(kernel="linear").fit(x_train1, y_train1)
y_pred1 = svm_model.predict(x_test1)

# Display the predicted class labels.
y_pred1

In [None]:
# List the first test businesses with their actual stars and the prediction.
# y_test1 and y_pred1 hold LabelEncoder class indices, not stars, so both are
# decoded with the fitted encoder before printing; otherwise the line labelled
# "stars" would show the encoded index.
# The range is capped by the split size so a small test set does not raise.
for i in range(min(10, len(y_test1))):
    # y_test1 keeps merge_df's index labels, so idx addresses the same row in both.
    idx = y_test1.index[i]
    actual_stars, predicted_stars = le.inverse_transform([y_test1[idx], y_pred1[i]])
    print("business id - %s actual stars label - %.1f predicted - %.1f"
          % (merge_df['business_id'][idx], actual_stars, predicted_stars))

In [None]:
# Accuracy of the SVM predictions on the test split.
print("accuracy: %.2f" % accuracy_score(y_test1, y_pred1))

In [None]:
# Multinomial Naive Bayes: fit on the training split, predict the test split.
mnb_model = MultinomialNB().fit(x_train1, y_train1)
y_pred1 = mnb_model.predict(x_test1)

# Display the predicted class labels.
y_pred1

In [None]:
# List the first test businesses with their actual stars and the prediction.
# y_test1 and y_pred1 hold LabelEncoder class indices, not stars, so both are
# decoded with the fitted encoder before printing; otherwise the line labelled
# "stars" would show the encoded index.
# The range is capped by the split size so a small test set does not raise.
for i in range(min(10, len(y_test1))):
    # y_test1 keeps merge_df's index labels, so idx addresses the same row in both.
    idx = y_test1.index[i]
    actual_stars, predicted_stars = le.inverse_transform([y_test1[idx], y_pred1[i]])
    print("business id - %s actual stars label - %.1f predicted - %.1f"
          % (merge_df['business_id'][idx], actual_stars, predicted_stars))

In [None]:
# Accuracy of the Multinomial Naive Bayes predictions on the test split.
mnb_accuracy = accuracy_score(y_test1, y_pred1)
print("accuracy: %.2f" % mnb_accuracy)

**Additional Features**

**POSTAL CODE**

In [None]:
# Create a new TSV for the business dataset that adds postal code as an
# additional feature column.

# The JSON is read explicitly as UTF-8 rather than with the platform default.
# Both files are managed by `with` so they are closed even if a row fails, and
# newline='' is what the csv module asks for on files it writes.
with open('yelp_dataset/yelp_academic_dataset_business.json', encoding="utf8") as f, \
        open("business_postal.tsv", 'w', newline='', encoding="utf8") as outfile:
    sfile = csv.writer(outfile, delimiter="\t", quoting=csv.QUOTE_MINIMAL)
    # The header uses 'postal code' (with a space); later cells select the column by that name.
    sfile.writerow(['business_id', 'categories', 'stars', 'review_count', 'postal code'])
    for line in f:
        row = json.loads(line)
        sfile.writerow([row['business_id'], row['categories'], row['stars'], row['review_count'], row['postal_code']])

In [None]:
# Create the postal-code dataframe from the TSV written by the export cell
# above. That cell writes "business_postal.tsv"; nothing in this notebook
# produces "business_nei.tsv".
# Postal codes are identifiers, not quantities, so they are read as strings:
# leading zeros survive and every code one-hot encodes as the same type.
business_postal_df = pd.read_csv('business_postal.tsv', delimiter="\t", dtype={'postal code': str})

In [None]:
# Join the postal-code business dataframe with the aggregated reviews,
# matching rows on business_id.
merge_postal_df = business_postal_df.merge(df_ready_for_sklearn, on='business_id')

In [None]:
# One-hot encoding of the postal codes, stored with sparse columns.
# `sparse` is a boolean flag. The string 'true' only worked because any
# non-empty string is truthy ('false' would have enabled it too).
hotcoded_df = pd.get_dummies(merge_postal_df['postal code'], sparse=True)

In [None]:
# Show the first 5 rows of the one-hot encoded postal codes.
hotcoded_df.head(n=5)

In [None]:
# Place the one-hot encoded postal code columns side by side with the main
# postal dataframe.
result_df = pd.concat((merge_postal_df, hotcoded_df), axis='columns', sort=False)

In [None]:
# Combine the TF-IDF matrix, the normalized review count and the one-hot
# encoded postal codes into the feature matrix.
# The matrix is rebuilt from its parts instead of stacking onto x_matrix itself,
# so re-running this cell does not append the postal columns a second time.
# On a top-to-bottom run the result is the same as extending x_matrix.
# NOTE(review): this assumes merge_postal_df has the same rows, in the same
# order, as merge_df -- confirm, since the columns are stacked by position.
x_matrix = np.column_stack((matrix, merge_df['normalized_count'], hotcoded_df))

In [None]:
# Split the postal-code feature matrix: 80% train, 20% test, fixed seed.
x_train2, x_test2, y_train2, y_test2 = train_test_split(
    x_matrix,
    merge_df['encoded_stars'],
    random_state=42,
    test_size=0.2,
)

In [None]:
# Multinomial Naive Bayes on the features that include postal codes.
mnb_model = MultinomialNB().fit(x_train2, y_train2)
y_pred2 = mnb_model.predict(x_test2)

# Display the predicted class labels.
y_pred2

In [None]:
# Accuracy of the postal-code model on its test split.
print("accuracy: %.2f" % accuracy_score(y_test2, y_pred2))

In [None]:
# Prints the same accuracy as the cell directly above (same inputs, same metric).
postal_accuracy = accuracy_score(y_test2, y_pred2)
print("accuracy: %.2f" % postal_accuracy)

**CATEGORIES**

In [None]:
# Preview the category field split on commas, ahead of one-hot encoding.
# The result is only displayed; merge_df itself is not modified here.
merge_df['categories'].str.split(pat=',')

In [None]:
# Pick a single entry out of a sequence by its position.

def get_element(my_list, position):
    """Return the item of ``my_list`` found at index ``position``."""
    selected = my_list[position]
    return selected

In [None]:
# Reduce each row's comma-separated categories to its first category.
# The column holds the raw string here (the split preview cell does not assign
# its result), so indexing it directly with get_element would return the first
# *character*. The string is split on commas first, so position 0 is the first
# category.
# Missing categories become 'Unknown' so get_element is never handed a NaN,
# and surrounding whitespace is stripped so ' Bars' and 'Bars' encode alike.
# Re-running is harmless: a single category splits into a one-item list.
merge_df['categories'] = (
    merge_df['categories']
    .fillna('Unknown')
    .str.split(',')
    .apply(get_element, position=0)
    .str.strip()
)

In [None]:
# One-hot encoding of the categories column, stored with sparse columns.
# `sparse` is a boolean flag. The string 'true' only worked because any
# non-empty string is truthy ('false' would have enabled it too).
hotcoded_df = pd.get_dummies(merge_df['categories'], sparse=True)

In [None]:
# Append the one-hot encoded category columns to the right of the current
# feature matrix.
# NOTE(review): x_matrix is extended in place of itself, so running this cell
# again appends the category columns again -- run it once per kernel session.
category_columns = np.asarray(hotcoded_df)
x_matrix = np.concatenate([x_matrix, category_columns], axis=1)

In [None]:
# Split the category-extended feature matrix: 80% train, 20% test, fixed seed.
x_train3, x_test3, y_train3, y_test3 = train_test_split(
    x_matrix,
    merge_df['encoded_stars'],
    random_state=42,
    test_size=0.2,
)

In [None]:
# Multinomial Naive Bayes on the features that include the category columns.
mnb_model = MultinomialNB().fit(x_train3, y_train3)
y_pred3 = mnb_model.predict(x_test3)

# Display the predicted class labels.
y_pred3

In [None]:
# Accuracy of the category-feature model on its test split.
print("accuracy: %.2f" % accuracy_score(y_test3, y_pred3))