In [1]:
import pandas as pd
import numpy as np
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

In [2]:
# Build a cluster resolver for the TPU; an empty `tpu` string is passed here.
resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu='')
# Connect this runtime to the resolved cluster before any TPU device is used.
tf.config.experimental_connect_to_cluster(resolver)
# This is the TPU initialization code that has to be at the beginning.
tf.tpu.experimental.initialize_tpu_system(resolver)
# Sanity check: list the logical TPU devices TensorFlow can see.
# NOTE(review): no tf.distribute.TPUStrategy scope is created in the visible
# cells, so it is unclear whether the models below actually run on the TPU — confirm.
print("All devices: ", tf.config.list_logical_devices('TPU'))

All devices:  [LogicalDevice(name='/job:worker/replica:0/task:0/device:TPU:0', device_type='TPU'), LogicalDevice(name='/job:worker/replica:0/task:0/device:TPU:1', device_type='TPU'), LogicalDevice(name='/job:worker/replica:0/task:0/device:TPU:2', device_type='TPU'), LogicalDevice(name='/job:worker/replica:0/task:0/device:TPU:3', device_type='TPU'), LogicalDevice(name='/job:worker/replica:0/task:0/device:TPU:4', device_type='TPU'), LogicalDevice(name='/job:worker/replica:0/task:0/device:TPU:5', device_type='TPU'), LogicalDevice(name='/job:worker/replica:0/task:0/device:TPU:6', device_type='TPU'), LogicalDevice(name='/job:worker/replica:0/task:0/device:TPU:7', device_type='TPU')]


In [2]:
from google.colab import drive
# Mount Google Drive so the dataset path below is reachable from this runtime.
drive.mount('/content/gdrive/')
# NOTE(review): the dataset location is a hard-coded Drive path; anyone re-running
# the notebook needs the same folder layout in their own Drive.
data = pd.read_csv("/content/gdrive/MyDrive/NYC-sale-project/dataFinal.csv")
# Work on a copy so `data` keeps the values exactly as loaded.
df = data.copy()

Mounted at /content/gdrive/


In [3]:
def normalization(df):
  """Return a transformed copy of ``df`` with compressed price/size/unit columns.

  The natural log is applied to 'SALE PRICE', 'GROSS SQUARE FEET' and
  'LAND SQUARE FEET'; the three unit-count columns are mapped to
  ``1 / (count + 1)``.

  Args:
    df: DataFrame containing the six columns named above.

  Returns:
    A new DataFrame; the argument is left unmodified, so calling this
    function repeatedly on the same raw frame always gives the same result.

  Note:
    ``np.log`` yields -inf / NaN for values <= 0. This presumably relies on
    such rows having been removed upstream — TODO confirm against the data.
  """
  # Copy first: the previous version wrote into the caller's frame.
  out = df.copy()

  for col in ('SALE PRICE', 'GROSS SQUARE FEET', 'LAND SQUARE FEET'):
    out[col] = np.log(out[col])

  # +1 keeps a zero unit count finite (0 units -> 1.0).
  for col in ('RESIDENTIAL UNITS', 'COMMERCIAL UNITS', 'TOTAL UNITS'):
    out[col] = 1/(out[col]+1)

  return out

In [4]:
# Always start from an untouched copy of the loaded `data`, so re-running this
# cell does not apply the log / reciprocal transforms a second time.
df = normalization(data.copy())

In [5]:
# Numeric predictors, taken from the normalized frame as they are.
features = df.loc[:, [
    'RESIDENTIAL UNITS', 'COMMERCIAL UNITS', 'TOTAL UNITS',
    'LAND SQUARE FEET', 'GROSS SQUARE FEET',
    'YEAR BUILT', 'LATITUDE', 'LONGITUDE',
]]

# Categorical predictors, expanded into one indicator column per category.
dummy = df.loc[:, ['BUILDING CLASS AT TIME OF SALE', 'TAX CLASS AT TIME OF SALE']]
df_dummies = pd.get_dummies(dummy, columns=list(dummy.columns))

# Full design matrix: numeric columns first, indicator columns after.
X = pd.concat([features, df_dummies], axis=1)

In [6]:
# Target as a single-column DataFrame; 'SALE PRICE' was log-transformed by normalization().
Y = df[['SALE PRICE']]

In [7]:
# Columns withheld from the tree model's input.
not_tree_feature = ['RESIDENTIAL UNITS', 'COMMERCIAL UNITS','TOTAL UNITS', 'YEAR BUILT', 'LATITUDE', 'LONGITUDE']

# Select every remaining column in one step (original column order is kept).
# Growing a frame with pd.concat once per column re-copies it on each iteration.
X1 = X.loc[:, ~X.columns.isin(not_tree_feature)]


In [9]:
# Inspect which columns ended up in the tree model's feature set.
X1.columns

Index(['LAND SQUARE FEET', 'GROSS SQUARE FEET',
       'BUILDING CLASS AT TIME OF SALE_A0',
       'BUILDING CLASS AT TIME OF SALE_A1',
       'BUILDING CLASS AT TIME OF SALE_A2',
       'BUILDING CLASS AT TIME OF SALE_A3',
       'BUILDING CLASS AT TIME OF SALE_A4',
       'BUILDING CLASS AT TIME OF SALE_A5',
       'BUILDING CLASS AT TIME OF SALE_A6',
       'BUILDING CLASS AT TIME OF SALE_A7',
       ...
       'BUILDING CLASS AT TIME OF SALE_W3',
       'BUILDING CLASS AT TIME OF SALE_W4',
       'BUILDING CLASS AT TIME OF SALE_W8',
       'BUILDING CLASS AT TIME OF SALE_W9',
       'BUILDING CLASS AT TIME OF SALE_Y3',
       'BUILDING CLASS AT TIME OF SALE_Z0',
       'BUILDING CLASS AT TIME OF SALE_Z9', 'TAX CLASS AT TIME OF SALE_1',
       'TAX CLASS AT TIME OF SALE_2', 'TAX CLASS AT TIME OF SALE_4'],
      dtype='object', length=135)

In [8]:
# Numeric columns fed to the polynomial-regression base model.
reg_feature = ['RESIDENTIAL UNITS', 'COMMERCIAL UNITS','TOTAL UNITS', 'YEAR BUILT', 'LATITUDE', 'LONGITUDE','LAND SQUARE FEET', 'GROSS SQUARE FEET']

# Single boolean-mask selection; keeps the column order of X, exactly as the
# previous column-by-column pd.concat loop did, without re-copying the frame.
X2 = X.loc[:, X.columns.isin(reg_feature)]


In [9]:
# Preprocessing allows us to standardize our data
from sklearn import preprocessing
# Allows us to split our data into training and testing data
from sklearn.model_selection import train_test_split

In [10]:
# One scaler per base-model feature subset (X1 for the tree, X2 for the regression).
tree_scale = preprocessing.StandardScaler()
reg_scale = preprocessing.StandardScaler()

# NOTE(review): both scalers are fitted on the full X1 / X2, i.e. before the
# train/test split done in a later cell, so held-out rows contribute to the
# fitted statistics. Consider fitting on the training portion only.
tree_scale.fit(X1)
reg_scale.fit(X2)

In [11]:
# Scaler for the complete feature matrix (used for the neural-network input).
# NOTE(review): fitted on all rows of X before the train/test split below, so
# test rows influence the scaling statistics.
scaler = preprocessing.StandardScaler()
scaler.fit(X)


In [12]:
# Hold out 10% of the rows as the final test set.
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.1,random_state=42)
# Split the remainder again: 70% (BTrain) trains the base models, 30% (STrain)
# is predicted by them later to build the meta-learner's training features.
X_BTrain,X_STrain,Y_BTrain,Y_STrain = train_test_split(X_train,Y_train,test_size=0.3,random_state=42)

In [13]:
# Columns withheld from the tree model's input.
not_tree_feature = ['RESIDENTIAL UNITS', 'COMMERCIAL UNITS','TOTAL UNITS', 'YEAR BUILT', 'LATITUDE', 'LONGITUDE']

# Tree-model view of the base-training rows: one boolean-mask selection that
# keeps the original column order, instead of a pd.concat per column.
X1_BTrain = X_BTrain.loc[:, ~X_BTrain.columns.isin(not_tree_feature)]

In [14]:
# Numeric columns fed to the polynomial-regression base model.
reg_feature = ['RESIDENTIAL UNITS', 'COMMERCIAL UNITS','TOTAL UNITS', 'YEAR BUILT', 'LATITUDE', 'LONGITUDE','LAND SQUARE FEET', 'GROSS SQUARE FEET']

# Regression view of the base-training rows: one boolean-mask selection that
# keeps the original column order, instead of a pd.concat per column.
X2_BTrain = X_BTrain.loc[:, X_BTrain.columns.isin(reg_feature)]


In [15]:
# Apply the previously fitted scalers to the base-training subsets.
# Each name is rebound to the scaler's output, so this cell is not meant to be
# run twice without re-creating the unscaled frames first.
X1_BTrain = tree_scale.transform(X1_BTrain)
X2_BTrain = reg_scale.transform(X2_BTrain)
X_BTrain = scaler.transform(X_BTrain)

In [16]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV


In [17]:
# Replace the single-column target DataFrame with its underlying array.
Y_BTrain = Y_BTrain.values

In [18]:
# Check the target's shape before flattening it.
Y_BTrain.shape

(22691, 1)

In [19]:
# Flatten the target to one dimension, then confirm the new shape.
Y_BTrain = Y_BTrain.ravel()
Y_BTrain.shape

(22691,)

In [20]:
# Random-forest hyper-parameter grid: 3 * 2 * 4 * 3 * 3 = 216 combinations.
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_features': ['sqrt', 'log2'],
    'max_depth': [5, 10, 15, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Initialize Random Forest regressor.
# A fixed seed (the same 42 used for the data splits) makes the search and the
# selected model reproducible between runs; an unseeded forest is not.
rfr = RandomForestRegressor(random_state=42)

# Initialize GridSearchCV.
# n_jobs=-1 spreads the 216 x 3 fold fits over all available cores.
grid_search_tree = GridSearchCV(estimator=rfr, param_grid=param_grid, cv=3, n_jobs=-1)

# Fit GridSearchCV to the tree-model features of the base-training rows
grid_search_tree.fit(X1_BTrain, Y_BTrain)

# Print best parameters and score
print("Best Parameters: ", grid_search_tree.best_params_)
print("Best Score: ", grid_search_tree.best_score_)

Best Parameters:  {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 200}
Best Score:  0.5525372061985772


In [21]:
import pickle
filename = 'tree_model.sav'

# Persist the fitted grid search. The context manager guarantees the file is
# flushed and closed; the previous bare open() never closed its handle.
with open(filename, 'wb') as model_file:
  pickle.dump(grid_search_tree, model_file)

In [22]:
# Columns withheld from the tree model's input.
not_tree_feature = ['RESIDENTIAL UNITS', 'COMMERCIAL UNITS','TOTAL UNITS', 'YEAR BUILT', 'LATITUDE', 'LONGITUDE']

# Tree-model view of the stacking rows: one boolean-mask selection that keeps
# the original column order, instead of a pd.concat per column.
X1_STrain = X_STrain.loc[:, ~X_STrain.columns.isin(not_tree_feature)]


In [23]:
# Scale the stacking rows with the scaler fitted for the tree-model features.
X1_STrain = tree_scale.transform(X1_STrain)

In [87]:
# Reload the saved grid search; `with` closes the file handle, which the
# previous bare open() left open.
# Only unpickle files written by this notebook: pickle can execute arbitrary code.
with open('tree_model.sav', 'rb') as model_file:
  pickled_tree_model = pickle.load(model_file)
# Tree-model predictions on the stacking rows (first meta feature).
tree_pred = pickled_tree_model.predict(X1_STrain)
tree_pred

array([14.4033637 , 13.19993428, 13.33800451, ..., 13.16671444,
       14.4033637 , 14.0287349 ])

In [29]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
import numpy as np

In [31]:
# Base model 2: polynomial feature expansion followed by ordinary least squares.
poly_reg = Pipeline(steps=[
    ('poly', PolynomialFeatures()),
    ('lin', LinearRegression()),
])

# Search over polynomial degrees 1 through 10.
param_grid = {'poly__degree': list(range(1, 11))}

# 3-fold cross-validated search
grid_search_reg = GridSearchCV(poly_reg, param_grid=param_grid, cv=3)

# Fit on the regression features of the base-training rows
grid_search_reg.fit(X2_BTrain, Y_BTrain)

# Report the winning degree and its mean CV score
print("Best Parameters: ", grid_search_reg.best_params_)
print("Best Score: ", grid_search_reg.best_score_)

Best Parameters:  {'poly__degree': 3}
Best Score:  0.6516872654675679


In [32]:
import pickle
filename = 'reg_model.sav'

# Persist the fitted polynomial-regression search. The context manager closes
# the file; the previous bare open() never closed its handle.
with open(filename, 'wb') as model_file:
  pickle.dump(grid_search_reg, model_file)

In [34]:
# Numeric columns fed to the polynomial-regression base model.
reg_feature = ['RESIDENTIAL UNITS', 'COMMERCIAL UNITS','TOTAL UNITS', 'YEAR BUILT', 'LATITUDE', 'LONGITUDE','LAND SQUARE FEET', 'GROSS SQUARE FEET']

# Regression view of the stacking rows: one boolean-mask selection that keeps
# the original column order, instead of a pd.concat per column.
X2_STrain = X_STrain.loc[:, X_STrain.columns.isin(reg_feature)]


In [35]:
# Scale the stacking rows with the scaler fitted for the regression features.
X2_STrain = reg_scale.transform(X2_STrain)

In [36]:
# Reload the saved regression search; `with` closes the file handle, which the
# previous bare open() left open.
# Only unpickle files written by this notebook: pickle can execute arbitrary code.
with open('reg_model.sav', 'rb') as model_file:
  pickled_reg_model = pickle.load(model_file)

In [37]:
# Polynomial-regression predictions on the stacking rows (second meta feature).
reg_pred = pickled_reg_model.predict(X2_STrain)

In [38]:
# Base model 3: fully connected network, 50 -> 25 -> 50 ReLU units, with a
# single linear output unit for the regression target.
NN_model = keras.Sequential(layers=[
    layers.Dense(units=50, activation='relu'),
    layers.Dense(units=25, activation='relu'),
    layers.Dense(units=50, activation='relu'),
    layers.Dense(1),
])


In [39]:
# Training configuration for the network: mean absolute error as the loss
# (the trailing comment records MeanSquaredError as the alternative considered),
# Adam with learning rate 0.01, and early stopping on `val_loss` with patience 10.
loss = keras.losses.MeanAbsoluteError() # MeanSquaredError
optim = keras.optimizers.Adam(learning_rate=0.01)
early_stop=keras.callbacks.EarlyStopping(monitor='val_loss',patience=10)

In [40]:
# Bind the optimizer and loss configured in the previous cell to the network.
NN_model.compile(loss=loss, optimizer=optim)

In [43]:
# Convert the scaled base-training features and target to float32 tensors for Keras.
# Both names are rebound, so later cells see tensors under these names.
X_BTrain = tf.convert_to_tensor(X_BTrain, dtype=tf.float32)
Y_BTrain = tf.convert_to_tensor(Y_BTrain, dtype=tf.float32)

In [45]:
# Train the base network on the scaled base-training rows.
# `callbacks` is documented as a list of Callback instances, so the single
# early-stopping callback is wrapped in a list rather than passed bare.
NN_model.fit(
    X_BTrain,Y_BTrain,
    epochs=1000,  # upper bound; early stopping normally ends training sooner
    verbose=1,
    # Calculate validation results on 20% of the training data
    validation_split = 0.2,
    callbacks=[early_stop])

Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000
Epoch 36/1000


<keras.callbacks.History at 0x7f17554d9f40>

In [46]:
import pickle
filename = 'nn_model.sav'

# Persist the trained network. The context manager closes the file; the
# previous bare open() never closed its handle.
# NOTE(review): pickling a Keras model depends on the installed TF/Keras
# version supporting it; the native model.save() API is the more portable
# option, but the matching load cell would have to change with it.
with open(filename, 'wb') as model_file:
  pickle.dump(NN_model, model_file)

In [47]:
# Scale the full-feature stacking rows for the network. X_STrain is rebound to
# the scaled output, so column-based selections on it must happen before this cell.
X_STrain = scaler.transform(X_STrain)

In [48]:
# Reload the saved network; `with` closes the file handle, which the previous
# bare open() left open.
# Only unpickle files written by this notebook: pickle can execute arbitrary code.
with open('nn_model.sav', 'rb') as model_file:
  pickled_nn_model = pickle.load(model_file)

In [49]:
# Network predictions on the stacking rows (third meta feature), flattened to 1-D.
# Uses the in-memory NN_model; the unpickled copy loaded just above is not used here.
nn_pred = NN_model.predict(X_STrain).flatten()



In [50]:
# Base-model predictions on the stacking rows, side by side with the true
# target, for visual comparison.
metadata = pd.DataFrame().assign(
    Tree=tree_pred,
    Reg=reg_pred,
    NN=nn_pred,
    target=Y_STrain.values,
)

In [51]:
# Display the base-model predictions next to the target.
metadata

Unnamed: 0,Tree,Reg,NN,target
0,14.403364,14.438041,13.749800,14.184957
1,13.199934,12.938119,13.039120,12.542545
2,13.338005,13.430223,13.039120,14.430696
3,14.028735,14.156516,13.820391,13.387309
4,14.028735,14.185021,13.872414,14.010255
...,...,...,...,...
9721,13.346292,13.271700,13.039120,13.262125
9722,13.112114,13.220320,13.449418,13.567049
9723,13.166714,13.129260,13.039120,13.142166
9724,14.403364,14.298546,14.021898,13.946926


In [54]:
# Inputs for the meta-learner: one column per base model's prediction.
meta_features = pd.DataFrame().assign(Tree=tree_pred, Reg=reg_pred, NN=nn_pred)
# Matching targets for the stacking rows, kept as a separate array.
meta_target = Y_STrain.values

In [55]:
# Display the meta-learner inputs.
meta_features

Unnamed: 0,Tree,Reg,NN
0,14.403364,14.438041,13.749800
1,13.199934,12.938119,13.039120
2,13.338005,13.430223,13.039120
3,14.028735,14.156516,13.820391
4,14.028735,14.185021,13.872414
...,...,...,...
9721,13.346292,13.271700,13.039120
9722,13.112114,13.220320,13.449418
9723,13.166714,13.129260,13.039120
9724,14.403364,14.298546,14.021898


In [64]:
# Scaler for the three meta features; fitted on the stacking rows' predictions only.
meta_scale = preprocessing.StandardScaler()
meta_scale.fit(meta_features)

In [66]:
# Standardize the meta features; the name is rebound to the scaler's output.
meta_features = meta_scale.transform(meta_features)

In [73]:
# Meta-learner candidate 1: polynomial expansion + ordinary least squares on
# the scaled base-model predictions.
poly_reg = Pipeline(steps=[
    ('poly', PolynomialFeatures()),
    ('lin', LinearRegression()),
])

# Degrees 1 through 10, each with and without the constant (bias) column.
param_grid = {
    'poly__degree': list(range(1, 11)),
    'poly__include_bias': [True, False],
}

# 5-fold cross-validated search (this rebinds `grid_search_reg`)
grid_search_reg = GridSearchCV(poly_reg, param_grid=param_grid, cv=5)

# Fit on the stacking rows
grid_search_reg.fit(meta_features, Y_STrain)

# Report the winning configuration and its mean CV score
print("Best Parameters: ", grid_search_reg.best_params_)
print("Best Score: ", grid_search_reg.best_score_)

Best Parameters:  {'poly__degree': 3, 'poly__include_bias': False}
Best Score:  0.6895747548045718


In [74]:
from sklearn.linear_model import Ridge

# Meta-learner candidate 2: polynomial expansion + ridge (L2) regression.
ridge_poly_reg = Pipeline(steps=[
    ('poly', PolynomialFeatures()),
    ('ridge', Ridge()),
])

# Degrees 1 through 10, with/without bias column, four regularization strengths.
param_grid = {
    'poly__degree': list(range(1, 11)),
    'poly__include_bias': [True, False],
    'ridge__alpha': [1, 10, 100, 1000],
}

# 5-fold cross-validated search (this rebinds `grid_search_reg`)
grid_search_reg = GridSearchCV(ridge_poly_reg, param_grid=param_grid, cv=5)

# Fit on the stacking rows
grid_search_reg.fit(meta_features, Y_STrain)

# Report the winning configuration and its mean CV score
print("Best Parameters: ", grid_search_reg.best_params_)
print("Best Score: ", grid_search_reg.best_score_)

  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, 

Best Parameters:  {'poly__degree': 3, 'poly__include_bias': False, 'ridge__alpha': 10}
Best Score:  0.6897465519336523


  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T


In [78]:
from sklearn.linear_model import Lasso

# Meta-learner candidate 3: polynomial expansion + lasso (L1) regression.
lasso_poly_reg = Pipeline(steps=[
    ('poly', PolynomialFeatures()),
    ('lasso', Lasso()),
])

# Degrees 3 through 7 and five regularization strengths.
# 'poly__include_bias' is deliberately not searched in this grid.
param_grid = {
    'poly__degree': list(range(3, 8)),
    'lasso__alpha': [1000, 100, 10, 1, 0.001],
}

# 5-fold cross-validated search (this rebinds `grid_search_reg`)
grid_search_reg = GridSearchCV(lasso_poly_reg, param_grid=param_grid, cv=5)

# Fit on the stacking rows
grid_search_reg.fit(meta_features, Y_STrain)

# Report the winning configuration and its mean CV score
print("Best Parameters: ", grid_search_reg.best_params_)
print("Best Score: ", grid_search_reg.best_score_)

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

Best Parameters:  {'lasso__alpha': 0.001, 'poly__degree': 4}
Best Score:  0.6904506207742385


  model = cd_fast.enet_coordinate_descent(


In [77]:
from sklearn.linear_model import ElasticNet

# Meta-learner candidate 4: polynomial expansion + elastic-net regression.
elastic_poly_reg = Pipeline(steps=[
    ('poly', PolynomialFeatures()),
    ('elastic', ElasticNet()),
])

# Degrees 3 through 7, with/without bias column, four regularization strengths
# and eleven L1/L2 mixing ratios from 0 to 1.
param_grid = {
    'poly__degree': list(range(3, 8)),
    'poly__include_bias': [True, False],
    'elastic__alpha': [1, 0.1, 0.001, 0.0001],
    'elastic__l1_ratio': [0, .1, .2, .3, .4, .5, .6, .7, .8, .9, 1],
}

# 5-fold cross-validated search (this rebinds `grid_search_reg`)
grid_search_reg = GridSearchCV(elastic_poly_reg, param_grid=param_grid, cv=5)

# Fit on the stacking rows
grid_search_reg.fit(meta_features, Y_STrain)

# Report the winning configuration and its mean CV score
print("Best Parameters: ", grid_search_reg.best_params_)
print("Best Score: ", grid_search_reg.best_score_)

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

Best Parameters:  {'elastic__alpha': 0.001, 'elastic__l1_ratio': 1, 'poly__degree': 4, 'poly__include_bias': True}
Best Score:  0.6904506207742385


  model = cd_fast.enet_coordinate_descent(


In [79]:
# Meta-learner network. This rebinds NN_model to a fresh, untrained model with
# the same 50 -> 25 -> 50 ReLU layout and a single linear output unit; the
# previously trained base network is no longer reachable under this name.
NN_model = keras.Sequential(layers=[
    layers.Dense(units=50, activation='relu'),
    layers.Dense(units=25, activation='relu'),
    layers.Dense(units=50, activation='relu'),
    layers.Dense(1),
])


In [80]:
# Fresh loss / optimizer / early-stopping objects for the meta-learner network:
# mean absolute error, Adam with learning rate 0.01, and early stopping on
# `val_loss` with patience 10 (same settings as for the base network).
loss = keras.losses.MeanAbsoluteError() # MeanSquaredError
optim = keras.optimizers.Adam(learning_rate=0.01)
early_stop=keras.callbacks.EarlyStopping(monitor='val_loss',patience=10)

In [81]:
# Bind the optimizer and loss configured in the previous cell to the meta network.
NN_model.compile(loss=loss, optimizer=optim)

In [82]:
# Train the meta-learner network on the scaled base-model predictions.
# `callbacks` is documented as a list of Callback instances, so the single
# early-stopping callback is wrapped in a list rather than passed bare.
NN_model.fit(
    meta_features, Y_STrain,
    epochs=1000,  # upper bound; early stopping normally ends training sooner
    verbose=1,
    # Calculate validation results on 20% of the training data
    validation_split = 0.2,
    callbacks=[early_stop])

Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000
Epoch 36/1000
Epoch 37/1000
Epoch 38/1000
Epoch 39/1000
Epoch 40/1000
Epoch 41/1000
Epoch 42/1000
Epoch 43/1000
Epoch 44/1000
Epoch 45/1000
Epoch 46/1000
Epoch 47/1000
Epoch 48/1000
Epoch 49/1000
Epoch 50/1000
Epoch 51/1000
Epoch 52/1000
Epoch 53/1000


<keras.callbacks.History at 0x7f17541fd4f0>

In [83]:
# Columns withheld from the tree model's input.
not_tree_feature = ['RESIDENTIAL UNITS', 'COMMERCIAL UNITS','TOTAL UNITS', 'YEAR BUILT', 'LATITUDE', 'LONGITUDE']

# Tree-model view of the test rows: one boolean-mask selection that keeps the
# original column order, instead of a pd.concat per column.
X1_test = X_test.loc[:, ~X_test.columns.isin(not_tree_feature)]


In [84]:
# Numeric columns fed to the polynomial-regression base model.
reg_feature = ['RESIDENTIAL UNITS', 'COMMERCIAL UNITS','TOTAL UNITS', 'YEAR BUILT', 'LATITUDE', 'LONGITUDE','LAND SQUARE FEET', 'GROSS SQUARE FEET']

# Regression view of the test rows: one boolean-mask selection that keeps the
# original column order, instead of a pd.concat per column.
X2_test = X_test.loc[:, X_test.columns.isin(reg_feature)]

In [85]:
# Apply the same fitted scalers to the held-out test rows.
# Each name is rebound to the scaler's output.
X1_test = tree_scale.transform(X1_test)
X2_test = reg_scale.transform(X2_test)
X_test = scaler.transform(X_test)

In [89]:
# Base-model predictions on the held-out test rows, using the reloaded models.
tree_test = pickled_tree_model.predict(X1_test)
reg_test = pickled_reg_model.predict(X2_test)
# Flatten the network output to 1-D, as is done for `nn_pred` on the stacking
# rows, so all three meta features share the same shape.
nn_test = pickled_nn_model.predict(X_test).flatten()



In [92]:
# Meta-learner inputs for the test rows; same column names and order as the
# frame the meta scaler was fitted on.
testFeatures = pd.DataFrame().assign(Tree=tree_test, Reg=reg_test, NN=nn_test)

In [93]:
# Standardize the test meta features with the scaler fitted on the stacking rows.
testFeatures = meta_scale.transform(testFeatures)

In [94]:
# Final stacked prediction: NN_model here is the meta-learner network trained
# on `meta_features`, not the base network defined earlier under the same name.
test_pred = NN_model.predict(testFeatures)



In [98]:
# Collapse the prediction array to 1-D so it can sit in a DataFrame column.
test_pred = test_pred.flatten()

In [99]:
# Replace the test-target DataFrame with the values of its 'SALE PRICE' column.
# The name is rebound, so this cell cannot be re-run without re-creating Y_test.
Y_test = Y_test['SALE PRICE'].values

In [100]:


# Actual vs. predicted values for the held-out rows, in the log-price scale
# produced by normalization().
test_result = pd.DataFrame(dict(Actual=Y_test, Predicted=test_pred))
test_result

Unnamed: 0,Actual,Predicted
0,15.868302,13.927697
1,12.971540,12.855931
2,14.210925,13.889179
3,14.422555,14.957450
4,12.691580,12.970245
...,...,...
3597,14.151983,14.874350
3598,16.066802,13.842439
3599,13.670485,13.167841
3600,14.790070,14.458702


In [102]:
from sklearn.metrics import mean_squared_error
import numpy as np

# Root-mean-squared error of the stacked model on the test set, computed on
# the log-transformed sale price.
y_test = Y_test
y_hat = test_pred
rmse = float(np.sqrt(mean_squared_error(y_test, y_hat)))
print(rmse)

0.5050419652177942


In [104]:
import plotly.express as px

# Scatter of predicted against actual values for the test rows.
fig = px.scatter(test_result, x='Actual', y='Predicted', title='Predicted vs Actual')

# One layout update sets both the identity line (dashed gray diagonal across
# the range of the actual values, where a perfect prediction would fall) and
# the axis titles / figure size.
fig.update_layout(
    shapes=[
        dict(
            type='line',
            xref='x', yref='y',
            x0=test_result['Actual'].min(), y0=test_result['Actual'].min(),
            x1=test_result['Actual'].max(), y1=test_result['Actual'].max(),
            line=dict(color='gray', width=2, dash='dash'),
        )
    ],
    xaxis_title='Actual Values',
    yaxis_title='Predicted Values',
    height=800,
    width=800,
)

fig.show()