In [1]:
#Random forest and DeepLearning
# Import our dependencies
import os

import pandas as pd
import tensorflow as tf
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sqlalchemy import create_engine

In [2]:
# The cleaned data is loaded into a Postgres database. It was also put into the format required for ML during the transformation process.
# We are trying a random forest classifier because the data will be divided into smaller subsets, which should bring the predictions close to the actual values.
# We are also adding a deep learning model to get neural-network predictions.
# Based on the fit that is identified, the output variable will be predicted from the input variables.
# Once the complete dataset is loaded and the accuracy is identified, we will pick the best approach. This should be sometime in the next session.


In [3]:
# Pull data from the businesses table in Postgres.
# The connection URL embeds credentials, so it can be supplied through the
# YELP_DATABASE_URL environment variable instead of being edited in the
# notebook; the previous local URL stays as the fallback so existing setups
# keep working unchanged.
engine = create_engine(
    os.environ.get(
        "YELP_DATABASE_URL",
        'postgresql+psycopg2://postgres:postgres@localhost/Yelp',
    )
)


In [4]:
# Load every review joined to the business it refers to (matched on business_id).
ReviewsDF = pd.read_sql(
    sql='select * from reviews r, businesses b where b.business_id = r.business_id',
    con=engine,
)


In [5]:
# Remove the identifier / metadata columns in place, then preview what is left.
ReviewsDF.drop(
    columns=['business_id', 'review_id', 'user_id', 'date', 'name', 'review_count'],
    inplace=True,
)
ReviewsDF.head()


Unnamed: 0,stars,useful,city,state,postal_code,category
0,1,3,Phoenix,AZ,85022,American
1,2,6,Phoenix,AZ,85022,American
2,1,2,Phoenix,AZ,85022,American
3,1,1,Phoenix,AZ,85022,American
4,1,0,Phoenix,AZ,85022,American


In [6]:
# The 'useful' column is removed as well (in place) before previewing the frame.
ReviewsDF.drop(columns=['useful'], inplace=True)
ReviewsDF.head()

Unnamed: 0,stars,city,state,postal_code,category
0,1,Phoenix,AZ,85022,American
1,2,Phoenix,AZ,85022,American
2,1,Phoenix,AZ,85022,American
3,1,Phoenix,AZ,85022,American
4,1,Phoenix,AZ,85022,American


In [7]:
# Build the list of categorical (object-dtype) column names and show how many
# distinct values each of those columns holds.
reviewCat = [
    column for column, dtype in ReviewsDF.dtypes.items() if dtype == "object"
]
ReviewsDF[reviewCat].nunique()


city             1
state            1
postal_code     65
category       511
dtype: int64

In [8]:
# Number of rows per category label.
categoryCounts = ReviewsDF["category"].value_counts()
categoryCounts

American                                          142196
Mexican                                            48188
Italian                                            34872
Chinese                                            12051
Japanese                                           12048
                                                   ...  
Restaurants,Coffee & Tea,Food,Sandwiches,Cafes         3
Delis,Restaurants,Diners                               3
Gluten-Free,Food,Bakeries,Desserts,Restaurants         3
Delis,Food,Restaurants,Bagels                          3
Restaurants,Middle Eastern,Halal                       3
Name: category, Length: 511, dtype: int64

In [9]:
# Category labels that occur fewer than 2000 times; these get bucketed later.
replace_type = categoryCounts[categoryCounts.lt(2000)].index.tolist()

In [10]:
# Collapse every rare category (the labels listed in replace_type) into a
# single "Others" bucket. One vectorised isin/mask pass replaces the previous
# per-label loop, which rescanned the whole column once for each rare label.
ReviewsDF["category"] = ReviewsDF["category"].mask(
    ReviewsDF["category"].isin(replace_type), "Others"
)
ReviewsDF.head()


Unnamed: 0,stars,city,state,postal_code,category
0,1,Phoenix,AZ,85022,American
1,2,Phoenix,AZ,85022,American
2,1,Phoenix,AZ,85022,American
3,1,Phoenix,AZ,85022,American
4,1,Phoenix,AZ,85022,American


In [59]:
#singleInstanceReviewsDF = ReviewsDF[:0,]
# Keep the first row of ReviewsDF as a sample record. `.iloc[0]` yields that
# row as a Series (the output below is labelled "Name: 0"), not a one-row frame.
# NOTE(review): this cell's execution count (59) and its output show one-hot
# columns, so it was presumably run after the encoding cells further down —
# confirm where it belongs in a top-to-bottom run.
singleInstanceReviewsDF = ReviewsDF.iloc[0]
singleInstanceReviewsDF.head()

stars                1.0
city_Phoenix         1.0
state_AZ             1.0
postal_code_         0.0
postal_code_85001    0.0
Name: 0, dtype: float64

In [11]:
# Re-check the category distribution after the rare labels were bucketed.
ReviewsDF["category"].value_counts()


American         142196
Others            53677
Mexican           48188
Italian           34872
Chinese           12051
Japanese          12048
Mediterranean      7434
Thai               6332
SeaFood            4714
Vietnamese         4213
Indian             3447
Hawaiian           2228
Name: category, dtype: int64

In [12]:
# Create a OneHotEncoder instance that returns a dense array.
# scikit-learn renamed the `sparse` keyword to `sparse_output` and later
# removed the old name, so try the new spelling first and fall back to the
# old one on releases that do not know it yet.
try:
    enc = OneHotEncoder(sparse_output=False)
except TypeError:
    enc = OneHotEncoder(sparse=False)

In [13]:
# Fit and transform the OneHotEncoder using the categorical variable list.
# The source frame's index is reused so that the later index-based merge pairs
# every encoded row with the review it came from, whatever that index is.
encodeDF = pd.DataFrame(
    enc.fit_transform(ReviewsDF[reviewCat]),
    index=ReviewsDF.index,
)

In [None]:
def PredictUsingModel(inputDF, encodeDF, Xscaler, classif):
    """Join the encoded columns onto the input, scale it and predict.

    Args:
        inputDF: frame whose rows are to be scored.
        encodeDF: frame joined onto ``inputDF`` by row index.
        Xscaler: object exposing ``transform``; applied to the joined frame.
        classif: object exposing ``predict``; receives the scaled data.

    Returns:
        Whatever ``classif.predict`` returns for the scaled, joined data.

    NOTE(review): a later cell defines this function again (its last parameter
    is called ``model``); in a top-to-bottom run the later definition wins.
    """
    joined = inputDF.merge(encodeDF, left_index=True, right_index=True)
    return classif.predict(Xscaler.transform(joined))

In [14]:
# Add the encoded variable names to the DataFrame.
# `get_feature_names` was removed from newer scikit-learn releases in favour
# of `get_feature_names_out`; use the new method when the encoder has it and
# fall back to the old one otherwise.
encodeDF.columns = (
    enc.get_feature_names_out(reviewCat)
    if hasattr(enc, "get_feature_names_out")
    else enc.get_feature_names(reviewCat)
)
encodeDF.head()


Unnamed: 0,city_Phoenix,state_AZ,postal_code_,postal_code_85001,postal_code_85003,postal_code_85004,postal_code_85005,postal_code_85006,postal_code_85007,postal_code_85008,...,category_Hawaiian,category_Indian,category_Italian,category_Japanese,category_Mediterranean,category_Mexican,category_Others,category_SeaFood,category_Thai,category_Vietnamese
0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
# Merge the one-hot encoded features (aligned on the row index) and drop the
# original categorical columns. `columns=` is spelled out because the
# positional axis argument of DataFrame.drop is rejected by recent pandas.
ReviewsDF = ReviewsDF.merge(encodeDF, left_index=True, right_index=True)
ReviewsDF = ReviewsDF.drop(columns=reviewCat)
ReviewsDF.head()


Unnamed: 0,stars,city_Phoenix,state_AZ,postal_code_,postal_code_85001,postal_code_85003,postal_code_85004,postal_code_85005,postal_code_85006,postal_code_85007,...,category_Hawaiian,category_Indian,category_Italian,category_Japanese,category_Mediterranean,category_Mexican,category_Others,category_SeaFood,category_Thai,category_Vietnamese
0,1,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
# Target vector: the 'stars' column.
y = ReviewsDF["stars"]


In [17]:
# Feature matrix: every column except the 'stars' target.
X = ReviewsDF.drop(columns="stars")


In [18]:
# Split into training and test sets. The split is stratified on y and uses a
# fixed random_state so it can be repeated.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
)


In [19]:
# Create a StandardScaler instance with its default settings; it is fitted
# in the next cell.
scaler = StandardScaler()


In [20]:
# Fit the StandardScaler on the training features only (X_test is not passed).
# NOTE(review): fit() presumably returns the estimator itself, in which case
# X_scaler and scaler name the same object — confirm against scikit-learn docs.
X_scaler = scaler.fit(X_train)


In [25]:
# Apply the fitted scaler to the training partition, then to the test partition.
X_train_scaled, X_test_scaled = (
    X_scaler.transform(features) for features in (X_train, X_test)
)


In [26]:
# Display the scaled test features as a quick sanity check.
X_test_scaled

array([[ 0.        ,  0.        , -0.02065564, ..., -0.12021868,
        -0.13895173, -0.11389646],
       [ 0.        ,  0.        , -0.02065564, ..., -0.12021868,
        -0.13895173, -0.11389646],
       [ 0.        ,  0.        , -0.02065564, ..., -0.12021868,
        -0.13895173, -0.11389646],
       ...,
       [ 0.        ,  0.        , -0.02065564, ..., -0.12021868,
        -0.13895173, -0.11389646],
       [ 0.        ,  0.        , -0.02065564, ..., -0.12021868,
        -0.13895173, -0.11389646],
       [ 0.        ,  0.        , -0.02065564, ...,  8.31817505,
        -0.13895173, -0.11389646]])

In [48]:
# Create a random forest classifier (128 trees, fixed seed for repeatability).
rf_model = RandomForestClassifier(n_estimators=128, random_state=30)

# Fit the model on the scaled training features.
rf_model = rf_model.fit(X_train_scaled, y_train)

# Predict the held-out test set; y_pred is reused by the export cells below.
y_pred = rf_model.predict(X_test_scaled)

# Evaluate the model: share of test rows whose predicted value equals y_test.
# (Previously this step only predicted and never computed a score.)
accuracy_score(y_test, y_pred)


In [60]:
def PredictUsingModel(inputDF, encodeDF, Xscaler, model):
    """Join the encoded columns onto the input, scale it and predict.

    Args:
        inputDF: frame whose rows are to be scored.
        encodeDF: frame joined onto ``inputDF`` by row index.
        Xscaler: object exposing ``transform``; applied to the joined frame.
        model: object exposing ``predict``; receives the scaled data.

    Returns:
        Whatever ``model.predict`` returns for the scaled, joined data.

    NOTE(review): an earlier cell defines a function with this same name
    (last parameter ``classif``); this definition replaces it when run.
    """
    joined = inputDF.merge(encodeDF, left_index=True, right_index=True)
    return model.predict(Xscaler.transform(joined))

In [None]:
# Hand-built single record for trying out a prediction.
# `pd.DataFrame[...]` subscripted the class and raised TypeError; the
# constructor is now called with one row holding the same five values.
# NOTE(review): the values presumably follow the stars, city, state,
# postal_code, category order of ReviewsDF — confirm, in particular the
# last value ('85001'), and add column names once the order is settled.
singleInstanceReviewsDF = pd.DataFrame([['1', 'Phoenix', 'AZ', '85001', '85001']])

In [46]:
# inversed_train=scaler.inverse_transform(X_train_scaled)
# inversed_test= scaler.inverse_transform(X_test_scaled)

In [47]:
#df = pd.DataFrame(data=inversed_train, index=["row1", "row2"], columns=["column1", "column2"])

ValueError: Shape of passed values is (248550, 79), indices imply (2, 2)

In [49]:
# Get the predictions ready for the database load: a single column named 'Predition'.
Output_df = pd.DataFrame(y_pred, columns=['Predition'])

In [50]:
 # Wrap the scaled test features in a DataFrame; only the first column
 # (label 0) is renamed, to 'Input'.
 X_df = pd.DataFrame(X_test_scaled).rename(columns={0: 'Input'})
 X_df.head()

Unnamed: 0,Input,1,2,3,4,5,6,7,8,9,...,69,70,71,72,73,74,75,76,77,78
0,0.0,0.0,-0.020656,-0.012036,-0.215822,-0.346014,-0.028093,4.858148,-0.091037,-0.118397,...,-0.082518,-0.103136,-0.34216,-0.194511,-0.151127,-0.413024,-0.43887,-0.120219,-0.138952,-0.113896
1,0.0,0.0,-0.020656,-0.012036,-0.215822,-0.346014,-0.028093,-0.20584,-0.091037,-0.118397,...,-0.082518,-0.103136,-0.34216,-0.194511,-0.151127,2.421165,-0.43887,-0.120219,-0.138952,-0.113896
2,0.0,0.0,-0.020656,-0.012036,-0.215822,-0.346014,-0.028093,-0.20584,-0.091037,-0.118397,...,-0.082518,-0.103136,2.922607,-0.194511,-0.151127,-0.413024,-0.43887,-0.120219,-0.138952,-0.113896
3,0.0,0.0,-0.020656,-0.012036,-0.215822,-0.346014,-0.028093,-0.20584,-0.091037,-0.118397,...,-0.082518,-0.103136,-0.34216,-0.194511,-0.151127,-0.413024,2.278579,-0.120219,-0.138952,-0.113896
4,0.0,0.0,-0.020656,-0.012036,-0.215822,-0.346014,-0.028093,-0.20584,-0.091037,-0.118397,...,-0.082518,-0.103136,-0.34216,-0.194511,-0.151127,2.421165,-0.43887,-0.120219,-0.138952,-0.113896


In [51]:
 # Append the random-forest predictions as a 'Predition' column.
 # NOTE(review): pandas aligns this assignment on the index; both frames were
 # built without an explicit index, so rows presumably pair up positionally — confirm.
 X_df['Predition']=Output_df['Predition']
 X_df.head()

Unnamed: 0,Input,1,2,3,4,5,6,7,8,9,...,70,71,72,73,74,75,76,77,78,Predition
0,0.0,0.0,-0.020656,-0.012036,-0.215822,-0.346014,-0.028093,4.858148,-0.091037,-0.118397,...,-0.103136,-0.34216,-0.194511,-0.151127,-0.413024,-0.43887,-0.120219,-0.138952,-0.113896,5
1,0.0,0.0,-0.020656,-0.012036,-0.215822,-0.346014,-0.028093,-0.20584,-0.091037,-0.118397,...,-0.103136,-0.34216,-0.194511,-0.151127,2.421165,-0.43887,-0.120219,-0.138952,-0.113896,5
2,0.0,0.0,-0.020656,-0.012036,-0.215822,-0.346014,-0.028093,-0.20584,-0.091037,-0.118397,...,-0.103136,2.922607,-0.194511,-0.151127,-0.413024,-0.43887,-0.120219,-0.138952,-0.113896,5
3,0.0,0.0,-0.020656,-0.012036,-0.215822,-0.346014,-0.028093,-0.20584,-0.091037,-0.118397,...,-0.103136,-0.34216,-0.194511,-0.151127,-0.413024,2.278579,-0.120219,-0.138952,-0.113896,5
4,0.0,0.0,-0.020656,-0.012036,-0.215822,-0.346014,-0.028093,-0.20584,-0.091037,-0.118397,...,-0.103136,-0.34216,-0.194511,-0.151127,2.421165,-0.43887,-0.120219,-0.138952,-0.113896,1


In [52]:
# Open a SQLAlchemy ORM session bound to the Postgres engine.
# NOTE(review): `session` is not referenced by any other visible cell (the
# export below passes `engine` directly) and is never closed here — confirm
# that it is still needed.
from sqlalchemy.orm import Session
session = Session(engine)


In [55]:
# Write the test features plus predictions to Postgres; an existing
# output_table is replaced and the frame's index is not stored.
X_df.to_sql(
    name='output_table',
    con=engine,
    index=False,
    if_exists='replace',
)

In [30]:
# # Define the model - deep neural net
# number_input_features = len(X_train_scaled[0])
# hidden_nodes_layer1 =  10
# hidden_nodes_layer2 = 3

# nn = tf.keras.models.Sequential()
# # First hidden layer
# nn.add(
#     tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation="relu")
# )
# # Second hidden layer
# nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="relu"))
# # Output layer
# nn.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))


In [31]:
# # Compile the Sequential model together and customize metrics
# nn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [32]:
# # Train the model
# fit_model = nn.fit(X_train_scaled, y_train, epochs=50)


Train on 248550 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [None]:
# # make a prediction
# ynew = nn.predict_classes(X_test_scaled)


In [None]:
# #Get dataset ready for Database load
# Output_deep_df= pd.DataFrame(ynew)
# Output_deep_df = Output_deep_df.rename(columns={0:'RunPredition'})


In [None]:
# XScaled_df = pd.DataFrame(X_test_scaled)
# XScaled_df = XScaled_df.rename(columns={0:'Input'})
# XScaled_df.head()



In [None]:
# XScaled_df['RunPredition']=Output_deep_df['RunPredition']
# XScaled_df.head()


In [None]:
# #Import data into postgres
# XScaled_df.to_sql(name='output_deep', con=engine, if_exists='replace' ,index=False)

In [34]:
# Evaluate the neural network using the test data.
# The cells that define and train `nn` are commented out, so on a fresh
# kernel this cell used to fail with NameError; the evaluation now runs only
# when `nn` actually exists and otherwise reports that it was skipped.
if "nn" in globals():
    model_loss, model_accuracy = nn.evaluate(X_test_scaled,y_test,verbose=2)
    print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")
else:
    print("Skipping neural-network evaluation: `nn` is not defined (its cells are commented out).")


82850/82850 - 1s - loss: -9.5236e+09 - accuracy: 0.1074
Loss: -9523580375.880459, Accuracy: 0.10736270248889923
