In [1]:
# Load the required dependencies
import hvplot.pandas
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from pathlib import Path
from sklearn.ensemble import RandomForestRegressor

from scipy import stats
from scipy.stats import linregress
from scipy.stats import f_oneway

import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import MeanSquaredError


from pandas.plotting import scatter_matrix

pd.set_option('display.max_columns', 150, 'display.max_rows', 255)
pd.options.display.float_format = '{:,.2f}'.format

In [2]:
import psycopg2
DATABASE_URI = 'postgresql://postgres:Masia0502@localhost:5432/home_price_post_db'
connection = psycopg2.connect(DATABASE_URI)

In [3]:
# Replace 'table_name' with the name of the table you want to import
table_name = 'post_home_prices_15column'

# Use the 'pandas.read_sql()' function to import the table into a DataFrame
df = pd.read_sql(f"SELECT * FROM {table_name}", connection)

# Close the database connection
connection.close()

  df = pd.read_sql(f"SELECT * FROM {table_name}", connection)


In [4]:
# # Read in the File and preview it.
# file_path = Path("housingdata_15columns.csv")
# df = pd.read_csv(file_path)
# df.head()

In [5]:
# Create a copy of the dataframe to use in neural networks
nn_df = df.copy()
# nn_df = nn_df.drop(columns=["LotAreaBin","GrLivAreaBin","LotAreaCode"])
nn_df.head()

Unnamed: 0,MSSubClass,LotArea,Utilities,OverallCond,RoofMatl,Foundation,CentralAir,GrLivArea,KitchenAbvGr,TotRmsAbvGrd,GarageType,GarageArea,PavedDrive,SalePrice,Age
0,60,8450,AllPub,5,CompShg,PConc,Y,1710,1,8,Attchd,548,Y,208500,5
1,20,9600,AllPub,8,CompShg,CBlock,Y,1262,1,6,Attchd,460,Y,181500,31
2,60,11250,AllPub,5,CompShg,PConc,Y,1786,1,6,Attchd,608,Y,223500,7
3,70,9550,AllPub,5,CompShg,BrkTil,Y,1717,1,7,Detchd,642,Y,140000,91
4,60,14260,AllPub,5,CompShg,PConc,Y,2198,1,9,Attchd,836,Y,250000,8


In [6]:
# Convert categorical data to numeric with `pd.get_dummies`
dummy =  nn_df.dtypes[nn_df.dtypes == "object"].index.tolist()
df_dummies = pd.get_dummies(nn_df, columns=dummy)

df_dummies.head()

Unnamed: 0,MSSubClass,LotArea,OverallCond,GrLivArea,KitchenAbvGr,TotRmsAbvGrd,GarageArea,SalePrice,Age,Utilities_AllPub,Utilities_NoSeWa,RoofMatl_ClyTile,RoofMatl_CompShg,RoofMatl_Membran,RoofMatl_Metal,RoofMatl_Roll,RoofMatl_Tar&Grv,RoofMatl_WdShake,RoofMatl_WdShngl,Foundation_BrkTil,Foundation_CBlock,Foundation_PConc,Foundation_Slab,Foundation_Stone,Foundation_Wood,CentralAir_N,CentralAir_Y,GarageType_2Types,GarageType_Attchd,GarageType_Basment,GarageType_BuiltIn,GarageType_CarPort,GarageType_Detchd,PavedDrive_N,PavedDrive_P,PavedDrive_Y
0,60,8450,5,1710,1,8,548,208500,5,1,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0,0,0,0,0,1
1,20,9600,8,1262,1,6,460,181500,31,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,1
2,60,11250,5,1786,1,6,608,223500,7,1,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0,0,0,0,0,1
3,70,9550,5,1717,1,7,642,140000,91,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1
4,60,14260,5,2198,1,9,836,250000,8,1,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0,0,0,0,0,1


In [7]:
# Calculate Z-values
df_dummies['price_scaled'] = np.abs(stats.zscore(df_dummies['SalePrice']))

# Filter out outliers
df_dummies = df_dummies[df_dummies['price_scaled'] <= 2.5]

# Drop the column
df_dummies = df_dummies.drop('price_scaled', axis=1)

In [8]:
df_dummies.describe()

Unnamed: 0,MSSubClass,LotArea,OverallCond,GrLivArea,KitchenAbvGr,TotRmsAbvGrd,GarageArea,SalePrice,Age,Utilities_AllPub,Utilities_NoSeWa,RoofMatl_ClyTile,RoofMatl_CompShg,RoofMatl_Membran,RoofMatl_Metal,RoofMatl_Roll,RoofMatl_Tar&Grv,RoofMatl_WdShake,RoofMatl_WdShngl,Foundation_BrkTil,Foundation_CBlock,Foundation_PConc,Foundation_Slab,Foundation_Stone,Foundation_Wood,CentralAir_N,CentralAir_Y,GarageType_2Types,GarageType_Attchd,GarageType_Basment,GarageType_BuiltIn,GarageType_CarPort,GarageType_Detchd,PavedDrive_N,PavedDrive_P,PavedDrive_Y
count,1420.0,1420.0,1420.0,1420.0,1420.0,1420.0,1420.0,1420.0,1420.0,1420.0,1420.0,1420.0,1420.0,1420.0,1420.0,1420.0,1420.0,1420.0,1420.0,1420.0,1420.0,1420.0,1420.0,1420.0,1420.0,1420.0,1420.0,1420.0,1420.0,1420.0,1420.0,1420.0,1420.0,1420.0,1420.0,1420.0
mean,57.28,10325.43,5.59,1484.81,1.05,6.44,463.19,173066.06,37.33,1.0,0.0,0.0,0.98,0.0,0.0,0.0,0.01,0.0,0.0,0.1,0.44,0.43,0.02,0.0,0.0,0.07,0.93,0.0,0.59,0.01,0.05,0.01,0.27,0.06,0.02,0.92
std,42.65,9899.16,1.11,486.66,0.22,1.56,206.86,63183.89,30.1,0.03,0.03,0.03,0.13,0.03,0.03,0.03,0.09,0.06,0.05,0.3,0.5,0.5,0.13,0.06,0.05,0.25,0.25,0.06,0.49,0.11,0.23,0.08,0.45,0.24,0.14,0.28
min,20.0,1300.0,1.0,334.0,0.0,2.0,0.0,34900.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,20.0,7500.0,5.0,1122.75,1.0,5.0,318.75,129000.0,8.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
50%,50.0,9362.0,5.0,1443.5,1.0,6.0,473.0,160000.0,36.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
75%,70.0,11376.25,6.0,1740.5,1.0,7.0,576.0,207125.0,55.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
max,190.0,215245.0,9.0,5642.0,3.0,14.0,1418.0,378500.0,136.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [9]:
# Split our preprocessed data into our features and target arrays
y = df_dummies['SalePrice'].values
X = df_dummies.drop(['SalePrice'],axis=1).values

# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [10]:
y_train.min()

34900

In [11]:
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [12]:
X_test_scaled[:5]

array([[-0.15071694, -0.39846324,  0.37015124, -0.63461758, -0.211409  ,
        -0.27565917,  0.29911574,  1.15236324,  0.02968261, -0.02968261,
        -0.02968261,  0.13386989, -0.02968261, -0.02968261, -0.02968261,
        -0.09423903, -0.05944383, -0.04199605, -0.32907854, -0.89301084,
         1.14204187, -0.13386989, -0.0664896 , -0.05145714, -0.26396941,
         0.26396941, -0.0664896 , -1.22789506, -0.11170371, -0.24433889,
        -0.08421519,  1.66180306, -0.25819889, -0.1405299 ,  0.29860654],
       [ 0.08741164,  0.11636039,  0.37015124,  0.55773183, -0.211409  ,
         0.37946388,  0.35331667,  0.05647808,  0.02968261, -0.02968261,
        -0.02968261,  0.13386989, -0.02968261, -0.02968261, -0.02968261,
        -0.09423903, -0.05944383, -0.04199605, -0.32907854,  1.11980724,
        -0.87562464, -0.13386989, -0.0664896 , -0.05145714, -0.26396941,
         0.26396941, -0.0664896 ,  0.81440185, -0.11170371, -0.24433889,
        -0.08421519, -0.60175602, -0.25819889, -0.

In [13]:
from tensorflow.keras import backend as K
def rmse(y_true, y_pred):
    return K.sqrt(K.mean(K.square(y_pred - y_true)))

In [14]:
# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.
number_input_features = len(X_train[0])
hidden_nodes_layer1 =  50
hidden_nodes_layer2 = 50
hidden_nodes_layer3 = 35

nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation="relu"))

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="relu"))

# third hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer3, activation="relu"))

# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation="relu"))

# Check the structure of the model
nn.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 50)                1800      
                                                                 
 dense_1 (Dense)             (None, 50)                2550      
                                                                 
 dense_2 (Dense)             (None, 35)                1785      
                                                                 
 dense_3 (Dense)             (None, 1)                 36        
                                                                 
Total params: 6171 (24.11 KB)
Trainable params: 6171 (24.11 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [15]:
# Compile the model
nn.compile(loss='mse', optimizer="adam", metrics=['accuracy', rmse])

In [16]:
# Train the model
fit_model = nn.fit(X_train_scaled, y_train, epochs=100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [17]:
# Evaluate the model using the test data
model_loss= nn.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}")

9/9 - 0s - loss: 1289234304.0000 - accuracy: 0.0000e+00 - rmse: 33507.4219 - 133ms/epoch - 15ms/step
Loss: [1289234304.0, 0.0, 33507.421875]


In [18]:
y_pred = nn.predict(X_test)
# print(MeanSquaredError(y_test, y_pred).numpy())

# print(f'y_actual: {y_test}, y_pred:{y_pred}')
y_pred[:5]



array([[1.1139787e+08],
       [1.8563227e+08],
       [2.1738264e+08],
       [1.4939536e+08],
       [2.0984285e+08]], dtype=float32)