In [25]:
# Load the required dependencies
import hvplot.pandas
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from pathlib import Path
from sklearn.ensemble import RandomForestRegressor

from scipy import stats
from scipy.stats import linregress
from scipy.stats import f_oneway

import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import MeanSquaredError


from pandas.plotting import scatter_matrix

pd.set_option('display.max_columns', 150, 'display.max_rows', 255)
pd.options.display.float_format = '{:,.2f}'.format

In [2]:
# Read in the File and preview it.
file_path = Path("secondary_housing_data.csv")
df = pd.read_csv(file_path)
df.head()

Unnamed: 0,MSSubClass,LotArea,OverallCond,Exterior1st,TotalBsmtSF,CentralAir,GrLivArea,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,TotRmsAbvGrd,GarageArea,Neighborhood,Age,SalePrice
0,60,8450,5,VinylSd,856,Y,1710,2,1,3,1,8,548,CollgCr,5,208500
1,20,9600,8,MetalSd,1262,Y,1262,2,0,3,1,6,460,Veenker,31,181500
2,60,11250,5,VinylSd,920,Y,1786,2,1,3,1,6,608,CollgCr,7,223500
3,70,9550,5,Wd Sdng,756,Y,1717,1,0,3,1,7,642,Crawfor,91,140000
4,60,14260,5,VinylSd,1145,Y,2198,2,1,4,1,9,836,NoRidge,8,250000


In [3]:
# Create a copy of the dataframe to use in neural networks
nn_df = df.copy()
# nn_df = nn_df.drop(columns=['Neighborhood'])
nn_df.head()

Unnamed: 0,MSSubClass,LotArea,OverallCond,Exterior1st,TotalBsmtSF,CentralAir,GrLivArea,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,TotRmsAbvGrd,GarageArea,Neighborhood,Age,SalePrice
0,60,8450,5,VinylSd,856,Y,1710,2,1,3,1,8,548,CollgCr,5,208500
1,20,9600,8,MetalSd,1262,Y,1262,2,0,3,1,6,460,Veenker,31,181500
2,60,11250,5,VinylSd,920,Y,1786,2,1,3,1,6,608,CollgCr,7,223500
3,70,9550,5,Wd Sdng,756,Y,1717,1,0,3,1,7,642,Crawfor,91,140000
4,60,14260,5,VinylSd,1145,Y,2198,2,1,4,1,9,836,NoRidge,8,250000


In [4]:
# nn_df = nn_df.drop(columns='Id', axis=1)
nn_df.columns

Index(['MSSubClass', 'LotArea', 'OverallCond', 'Exterior1st', 'TotalBsmtSF',
       'CentralAir', 'GrLivArea', 'FullBath', 'HalfBath', 'BedroomAbvGr',
       'KitchenAbvGr', 'TotRmsAbvGrd', 'GarageArea', 'Neighborhood', 'Age',
       'SalePrice'],
      dtype='object')

In [5]:
# Convert categorical data to numeric with `pd.get_dummies`
dummy =  nn_df.dtypes[nn_df.dtypes == "object"].index.tolist()
df_dummies = pd.get_dummies(nn_df, columns=dummy)

df_dummies.head()

Unnamed: 0,MSSubClass,LotArea,OverallCond,TotalBsmtSF,GrLivArea,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,TotRmsAbvGrd,GarageArea,Age,SalePrice,Exterior1st_AsbShng,Exterior1st_AsphShn,Exterior1st_BrkComm,Exterior1st_BrkFace,Exterior1st_CBlock,Exterior1st_CemntBd,Exterior1st_HdBoard,Exterior1st_ImStucc,Exterior1st_MetalSd,Exterior1st_Plywood,Exterior1st_Stone,Exterior1st_Stucco,Exterior1st_VinylSd,Exterior1st_Wd Sdng,Exterior1st_WdShing,CentralAir_N,CentralAir_Y,Neighborhood_Blmngtn,Neighborhood_Blueste,Neighborhood_BrDale,Neighborhood_BrkSide,Neighborhood_ClearCr,Neighborhood_CollgCr,Neighborhood_Crawfor,Neighborhood_Edwards,Neighborhood_Gilbert,Neighborhood_IDOTRR,Neighborhood_MeadowV,Neighborhood_Mitchel,Neighborhood_NAmes,Neighborhood_NPkVill,Neighborhood_NWAmes,Neighborhood_NoRidge,Neighborhood_NridgHt,Neighborhood_OldTown,Neighborhood_SWISU,Neighborhood_Sawyer,Neighborhood_SawyerW,Neighborhood_Somerst,Neighborhood_StoneBr,Neighborhood_Timber,Neighborhood_Veenker
0,60,8450,5,856,1710,2,1,3,1,8,548,5,208500,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,20,9600,8,1262,1262,2,0,3,1,6,460,31,181500,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
2,60,11250,5,920,1786,2,1,3,1,6,608,7,223500,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,70,9550,5,756,1717,1,0,3,1,7,642,91,140000,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,60,14260,5,1145,2198,2,1,4,1,9,836,8,250000,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0


In [6]:
# Calculate Z-values
df_dummies['price_scaled'] = np.abs(stats.zscore(df_dummies['SalePrice']))

# Filter out outliers
df_dummies = df_dummies[df_dummies['price_scaled'] <= 2.5]

# Drop the column
df_dummies = df_dummies.drop('price_scaled', axis=1)

In [7]:
df_dummies.describe()

Unnamed: 0,MSSubClass,LotArea,OverallCond,TotalBsmtSF,GrLivArea,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,TotRmsAbvGrd,GarageArea,Age,SalePrice,Exterior1st_AsbShng,Exterior1st_AsphShn,Exterior1st_BrkComm,Exterior1st_BrkFace,Exterior1st_CBlock,Exterior1st_CemntBd,Exterior1st_HdBoard,Exterior1st_ImStucc,Exterior1st_MetalSd,Exterior1st_Plywood,Exterior1st_Stone,Exterior1st_Stucco,Exterior1st_VinylSd,Exterior1st_Wd Sdng,Exterior1st_WdShing,CentralAir_N,CentralAir_Y,Neighborhood_Blmngtn,Neighborhood_Blueste,Neighborhood_BrDale,Neighborhood_BrkSide,Neighborhood_ClearCr,Neighborhood_CollgCr,Neighborhood_Crawfor,Neighborhood_Edwards,Neighborhood_Gilbert,Neighborhood_IDOTRR,Neighborhood_MeadowV,Neighborhood_Mitchel,Neighborhood_NAmes,Neighborhood_NPkVill,Neighborhood_NWAmes,Neighborhood_NoRidge,Neighborhood_NridgHt,Neighborhood_OldTown,Neighborhood_SWISU,Neighborhood_Sawyer,Neighborhood_SawyerW,Neighborhood_Somerst,Neighborhood_StoneBr,Neighborhood_Timber,Neighborhood_Veenker
count,1420.0,1420.0,1420.0,1420.0,1420.0,1420.0,1420.0,1420.0,1420.0,1420.0,1420.0,1420.0,1420.0,1420.0,1420.0,1420.0,1420.0,1420.0,1420.0,1420.0,1420.0,1420.0,1420.0,1420.0,1420.0,1420.0,1420.0,1420.0,1420.0,1420.0,1420.0,1420.0,1420.0,1420.0,1420.0,1420.0,1420.0,1420.0,1420.0,1420.0,1420.0,1420.0,1420.0,1420.0,1420.0,1420.0,1420.0,1420.0,1420.0,1420.0,1420.0,1420.0,1420.0,1420.0,1420.0
mean,57.28,10325.43,5.59,1033.47,1484.81,1.55,0.37,2.86,1.05,6.44,463.19,37.33,173066.06,0.01,0.0,0.0,0.03,0.0,0.04,0.15,0.0,0.15,0.08,0.0,0.02,0.35,0.14,0.02,0.07,0.93,0.01,0.0,0.01,0.04,0.02,0.1,0.03,0.07,0.06,0.03,0.01,0.03,0.16,0.01,0.05,0.02,0.04,0.08,0.02,0.05,0.04,0.06,0.01,0.03,0.01
std,42.65,9899.16,1.11,413.12,486.66,0.54,0.5,0.81,0.22,1.56,206.86,30.1,63183.89,0.12,0.03,0.04,0.18,0.03,0.19,0.36,0.03,0.36,0.27,0.04,0.13,0.48,0.35,0.13,0.25,0.25,0.11,0.04,0.11,0.2,0.14,0.31,0.18,0.26,0.23,0.16,0.11,0.18,0.37,0.08,0.22,0.15,0.2,0.27,0.13,0.22,0.2,0.24,0.11,0.16,0.08
min,20.0,1300.0,1.0,0.0,334.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,34900.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,20.0,7500.0,5.0,792.75,1122.75,1.0,0.0,2.0,1.0,5.0,318.75,8.0,129000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,50.0,9362.0,5.0,978.5,1443.5,2.0,0.0,3.0,1.0,6.0,473.0,36.0,160000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,70.0,11376.25,6.0,1261.25,1740.5,2.0,1.0,3.0,1.0,7.0,576.0,55.0,207125.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,190.0,215245.0,9.0,6110.0,5642.0,3.0,2.0,8.0,3.0,14.0,1418.0,136.0,378500.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [8]:
# Split our preprocessed data into our features and target arrays
y = df_dummies['SalePrice'].values
X = df_dummies.drop(['SalePrice'],axis=1).values

# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [9]:
y_train.min()

34900

In [10]:
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [11]:
X_test_scaled[:5]

array([[-0.15071694, -0.39846324,  0.37015124, -0.5320186 , -0.63461758,
        -1.01896991, -0.74640987,  0.18408663, -0.211409  , -0.27565917,
         0.29911574,  1.15236324, -0.11952286, -0.02968261, -0.04199605,
        -0.18855106, -0.02968261, -0.19104018, -0.43963315, -0.02968261,
        -0.42529188, -0.28438593, -0.02968261, -0.13042181, -0.72869562,
         2.46981781, -0.13042181, -0.26396941,  0.26396941, -0.09423903,
        -0.04199605, -0.10332549,  4.86781801, -0.13723732, -0.3404608 ,
        -0.19350174, -0.26776503, -0.25232997, -0.16185465, -0.09423903,
        -0.19593679, -0.45381141, -0.0664896 , -0.22771002, -0.15603287,
        -0.21004201, -0.29332928, -0.11952286, -0.23195547, -0.20774711,
        -0.25035055, -0.11952286, -0.16185465, -0.08421519],
       [ 0.08741164,  0.11636039,  0.37015124, -0.80572745,  0.55773183,
        -1.01896991,  3.2913121 ,  0.18408663, -0.211409  ,  0.37946388,
         0.35331667,  0.05647808, -0.11952286, -0.02968261, -0.

In [12]:
from tensorflow.keras import backend as K
def rmse(y_true, y_pred):
    return K.sqrt(K.mean(K.square(y_pred - y_true)))

In [13]:
# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.
number_input_features = len(X_train[0])
hidden_nodes_layer1 =  50
hidden_nodes_layer2 = 55
hidden_nodes_layer3 = 52

nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation="relu"))

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="relu"))

# third hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer3, activation="relu"))

# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation="relu"))

# Check the structure of the model
nn.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 50)                2750      
                                                                 
 dense_1 (Dense)             (None, 55)                2805      
                                                                 
 dense_2 (Dense)             (None, 52)                2912      
                                                                 
 dense_3 (Dense)             (None, 1)                 53        
                                                                 
Total params: 8520 (33.28 KB)
Trainable params: 8520 (33.28 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [14]:
# Compile the model
nn.compile(loss='mse', optimizer="adam", metrics=['accuracy', rmse])

In [15]:
# Train the model
fit_model = nn.fit(X_train_scaled, y_train, epochs=100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [16]:
# Evaluate the model using the test data
model_loss = nn.evaluate(X_test_scaled, y_test, verbose=2)
print(f"Evaluation: {model_loss}")

9/9 - 0s - loss: 945949888.0000 - accuracy: 0.0000e+00 - rmse: 29156.2500 - 234ms/epoch - 26ms/step
Evaluation: [945949888.0, 0.0, 29156.25]


In [17]:
y_pred = nn.predict(X_test)
# print(MeanSquaredError(y_test, y_pred).numpy())
# print(f'y_actual: {y_test}, y_pred:{y_pred}')
y_pred[:10]



array([[8.2781712e+07],
       [1.3653701e+08],
       [1.7279056e+08],
       [1.1250097e+08],
       [1.5405232e+08],
       [5.9047716e+07],
       [8.6634072e+07],
       [1.4596797e+08],
       [1.0861210e+08],
       [1.4573250e+08]], dtype=float32)

In [28]:
# # Step 1: Connect to MongoDB
# import pymongo
# client = pymongo.MongoClient("mongodb://localhost:27017/")  # Replace with your MongoDB connection string
# db = client["home_price_prediction"]  # Replace with your database name

In [29]:
# # Step 2: Insert data into MongoDB collection
# collection = db["df"]  # Replace with your collection name
# data_records = df.to_dict(orient='records')
# collection.insert_many(data_records)

<pymongo.results.InsertManyResult at 0x237e97acc70>