In [1]:
#Random Forest
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import pandas as pd
import tensorflow as tf
from sqlalchemy import create_engine
import numpy as np

In [2]:
# The cleaned data is loaded into postgres database. It is also formatted in the format required for ML during transformation process.
# We are trying random forest classifier as the data will be divided into smaller sets and prediction could be near to accuracy
# Based on the line identified, the output variable will be predicted for the input vairable
# Currently the accuracy rate is coming out to be 25%. We will need to adjust our input parameters in next session to improve on this.

In [3]:
#Pull data from busiensses table from postgres
engine = create_engine('postgresql+psycopg2://postgres:postgres@localhost/Yelp_db')

In [4]:
from sqlalchemy.orm import Session
session = Session(engine)

In [5]:
#reviewsDF = pd.read_sql('select stars,city,state,postal_code,category from reviews r, businesses b where b.business_id = r.business_id and length(postal_code)>0',engine)
reviewsDF = pd.read_sql ('select count(r.review_star) review_count, r.review_star stars, b.city, b.postal_code, r.ethnic_type from business_reviews r, business_info b where b.business_id = r.business_id and length(b.postal_code)>0 group by b.city, b.postal_code, r.ethnic_type, r.review_star order by b.postal_code',engine)

In [6]:
XInput =reviewsDF
y = reviewsDF.stars
yDF=round(pd.DataFrame(y))
X = reviewsDF

In [7]:
categoryCountsX=X.ethnic_type.value_counts()
categoryCountsX

American         467
Mexican          437
Italian          377
Chinese          360
Japanese         284
Asian_Fusion     263
Thai             226
Mediterranean    217
Greek            182
Hawaiian         176
Vietnamese       150
Indian           124
French            85
Korean            84
Spanish           57
British           50
African           29
Name: ethnic_type, dtype: int64

In [8]:
replace_type=list(categoryCountsX[categoryCountsX<50].index)

In [9]:
for application in replace_type:
    X.ethnic_type =  X.ethnic_type.replace(application,"Others")
X.head()   

Unnamed: 0,review_count,stars,city,postal_code,ethnic_type
0,1,3,Phoenix,85001,American
1,1,4,Phoenix,85001,American
2,3,5,Phoenix,85001,American
3,1,3,Phoenix,85001,Italian
4,3,4,Phoenix,85001,Italian


In [10]:
# Generate our categorical variable list
reviewCatX = X.dtypes[X.dtypes == "object"].index.tolist()
X[reviewCatX].nunique()

city             2
postal_code    108
ethnic_type     17
dtype: int64

In [11]:
yDF['stars'] = yDF['stars'].astype(str).replace('\.0', '', regex=True)

In [12]:
# Generate our categorical variable list
reviewCaty = yDF.dtypes[yDF.dtypes == "object"].index.tolist()
yDF[reviewCaty].nunique()

stars    5
dtype: int64

In [13]:
X

Unnamed: 0,review_count,stars,city,postal_code,ethnic_type
0,1,3,Phoenix,85001,American
1,1,4,Phoenix,85001,American
2,3,5,Phoenix,85001,American
3,1,3,Phoenix,85001,Italian
4,3,4,Phoenix,85001,Italian
...,...,...,...,...,...
3563,239,4,Las Vegas,89183,Thai
3564,488,5,Las Vegas,89183,Thai
3565,3,1,Las Vegas,93013,American
3566,3,3,Las Vegas,93013,American


In [14]:
predictInputDF = pd.DataFrame(X.groupby(['stars','postal_code','city','ethnic_type']).sum()).reset_index()
predictInputDF['stars'] = round(predictInputDF['stars'])

In [15]:
reviewsForOutput = pd.DataFrame(X.groupby(['postal_code','city','ethnic_type'],as_index=False)['review_count'].sum())


In [16]:
XInput=predictInputDF.drop(columns=['stars'])

In [17]:
dummyCategories = pd.get_dummies(XInput.ethnic_type)

In [18]:
dummyCity = pd.get_dummies(XInput.city)

In [19]:
new_review_all = pd.concat([XInput, dummyCategories], axis = 'columns')
new_review_all = pd.concat([new_review_all, dummyCity], axis = 'columns')

In [20]:
final_PC = new_review_all.drop(['city', 'ethnic_type'], axis = 'columns')
X=final_PC
X_test_data= X

In [21]:
dummyCategories1 = pd.get_dummies(reviewsForOutput.ethnic_type)
dummyCity1 = pd.get_dummies(reviewsForOutput.city)
finalOutputX = pd.concat([reviewsForOutput, dummyCategories1], axis = 'columns')
finalOutputX = pd.concat([finalOutputX, dummyCity1], axis = 'columns')
finalOutputX = finalOutputX.drop(['city', 'ethnic_type'], axis = 'columns')
reviewsForOutput=reviewsForOutput.drop(columns=['review_count'])

In [22]:
# Split training/test datasets
X_train, X_test, y_train, y_test = train_test_split(X, yDF, random_state=1, stratify=y)

In [23]:
X_train

Unnamed: 0,postal_code,review_count,American,Asian_Fusion,British,Chinese,French,Greek,Hawaiian,Indian,...,Japanese,Korean,Mediterranean,Mexican,Others,Spanish,Thai,Vietnamese,Las Vegas,Phoenix
3130,85053,258,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
2821,89169,370,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0
3124,85051,31,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
706,89183,14,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0
1652,85044,3,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2876,85006,21,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,1
2407,85054,659,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3033,85031,99,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1
3046,85033,84,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [24]:
#Scale and transform
X_train, X_test, y_train, y_test = train_test_split(X, yDF, test_size=0.33, random_state=1)
scaler = StandardScaler()
x_scaler=scaler.fit(X_train)
X_train_scaled = x_scaler.transform(X_train)
X_test_scaled = x_scaler.transform(X_test)
x_test_data_scaled = x_scaler.transform(finalOutputX)                     

In [25]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [26]:
rf_model = RandomForestClassifier(n_estimators=500, random_state=10)

In [27]:
rf_model = rf_model.fit(X_train_scaled, y_train)

  """Entry point for launching an IPython kernel.


In [28]:
y_pred = rf_model.predict(X_test_scaled)
print(f" Random forest predictive accuracy: {accuracy_score(y_test,y_pred):.3f}")

 Random forest predictive accuracy: 0.235


In [29]:
y_pred = rf_model.predict(X_test_scaled)

In [30]:
ynew = rf_model.predict(x_test_data_scaled)

In [31]:
print(f" Random forest predictive accuracy: {accuracy_score(y_test,y_pred):.3f}")

 Random forest predictive accuracy: 0.235


In [32]:
reviewsForOutput

Unnamed: 0,postal_code,city,ethnic_type
0,85001,Phoenix,American
1,85001,Phoenix,Italian
2,85001,Phoenix,Mexican
3,85001,Phoenix,Spanish
4,85003,Phoenix,American
...,...,...,...
730,89183,Las Vegas,Japanese
731,89183,Las Vegas,Mediterranean
732,89183,Las Vegas,Mexican
733,89183,Las Vegas,Thai


In [33]:
reviewsForOutput['prediction']=ynew


In [34]:
reviewsForOutput.to_sql(name='review_prediction', con=engine, if_exists='replace' ,index=True)

In [35]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           1       0.27      0.25      0.26       238
           2       0.22      0.21      0.21       233
           3       0.22      0.23      0.22       241
           4       0.22      0.25      0.23       232
           5       0.25      0.25      0.25       234

    accuracy                           0.24      1178
   macro avg       0.24      0.24      0.24      1178
weighted avg       0.24      0.24      0.24      1178

