In [115]:
# Import the modules
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import balanced_accuracy_score, confusion_matrix, classification_report

In [116]:
# Load the sold-property / inflation dataset; the path is relative, so it is
# resolved against the current working directory.
# The 6500 S Kildare Ave property was removed from the file after a previous
# analysis showed its LandArea was likely a typo.
file_path = Path("Sold_Inflation_M1_12.csv")
df_Sold_Inflation = pd.read_csv(filepath_or_buffer=file_path)

# Preview the first five rows
df_Sold_Inflation.head(5)

Unnamed: 0,Address,City,Zip,SoldDate,Bathrooms,Bedrooms,LivingArea,LandArea,PropPrice,InflationRate,FederalInt,MonthID
0,244 N Delaware St,Chandler,85225,2/19/24,3.0,4,1272,7532.0,275000,3.1,5.33,2
1,923 W Mesquite St,Chandler,85225,2/16/24,2.0,3,1480,7492.0,428000,3.1,5.33,2
2,304 W El Prado Rd,Chandler,85225,2/16/24,2.0,3,1300,8024.0,419000,3.1,5.33,2
3,1412 E Ironwood Dr,Chandler,85225,2/16/24,2.0,3,1308,7823.0,435300,3.1,5.33,2
4,663 E Manor Dr,Chandler,85225,2/16/24,2.0,3,1415,8407.0,494000,3.1,5.33,2


In [117]:
# Inspect the dtype pandas inferred for each column of the loaded frame
df_Sold_Inflation.dtypes

Address           object
City              object
Zip                int64
SoldDate          object
Bathrooms        float64
Bedrooms           int64
LivingArea         int64
LandArea         float64
PropPrice          int64
InflationRate    float64
FederalInt       float64
MonthID            int64
dtype: object

In [118]:
# Convert LandArea values reported in acres to square feet.
# Heuristic carried over from the original analysis: any LandArea below 50 is
# assumed to be expressed in acres rather than sqft.
ACRE_CUTOFF = 50
SQFT_PER_ACRE = 43560

df_Sold_Inflation['LandArea'] = df_Sold_Inflation['LandArea'].astype(float)

# Vectorised update (replaces a row-by-row iterrows() loop). Rows with a
# missing LandArea compare False against the cutoff and are left untouched.
reported_in_acres = df_Sold_Inflation['LandArea'] < ACRE_CUTOFF
df_Sold_Inflation.loc[reported_in_acres, 'LandArea'] *= SQFT_PER_ACRE

# NOTE: this binds a second name to the SAME DataFrame (no copy is made), so
# later changes made through df_Sold_Inflation_new also show up in
# df_Sold_Inflation.
df_Sold_Inflation_new = df_Sold_Inflation
df_Sold_Inflation_new

Unnamed: 0,Address,City,Zip,SoldDate,Bathrooms,Bedrooms,LivingArea,LandArea,PropPrice,InflationRate,FederalInt,MonthID
0,244 N Delaware St,Chandler,85225,2/19/24,3.0,4,1272,7532.0,275000,3.10,5.33,2
1,923 W Mesquite St,Chandler,85225,2/16/24,2.0,3,1480,7492.0,428000,3.10,5.33,2
2,304 W El Prado Rd,Chandler,85225,2/16/24,2.0,3,1300,8024.0,419000,3.10,5.33,2
3,1412 E Ironwood Dr,Chandler,85225,2/16/24,2.0,3,1308,7823.0,435300,3.10,5.33,2
4,663 E Manor Dr,Chandler,85225,2/16/24,2.0,3,1415,8407.0,494000,3.10,5.33,2
...,...,...,...,...,...,...,...,...,...,...,...,...
2147,5469 NW Meadowlands Ter,Portland,97229,1/2/24,3.0,4,2592,6969.6,765000,3.09,5.33,1
2148,1837 NW Caitlin Ter,Portland,97229,12/29/23,3.0,4,2146,8712.0,700000,3.35,5.33,12
2149,11060 NW Cornell Rd,Portland,97229,12/29/23,2.0,3,2048,16117.2,574000,3.35,5.33,12
2150,200 NW 101st Ave,Portland,97229,12/29/23,3.0,3,2520,17859.6,600000,3.35,5.33,12


In [119]:
# Treat Zip as a categorical label rather than a number: store it as text so
# later cells can compare it against string literals such as '60629'.
df_Sold_Inflation_new['Zip'] = df_Sold_Inflation_new.Zip.astype(str)
df_Sold_Inflation_new

Unnamed: 0,Address,City,Zip,SoldDate,Bathrooms,Bedrooms,LivingArea,LandArea,PropPrice,InflationRate,FederalInt,MonthID
0,244 N Delaware St,Chandler,85225,2/19/24,3.0,4,1272,7532.0,275000,3.10,5.33,2
1,923 W Mesquite St,Chandler,85225,2/16/24,2.0,3,1480,7492.0,428000,3.10,5.33,2
2,304 W El Prado Rd,Chandler,85225,2/16/24,2.0,3,1300,8024.0,419000,3.10,5.33,2
3,1412 E Ironwood Dr,Chandler,85225,2/16/24,2.0,3,1308,7823.0,435300,3.10,5.33,2
4,663 E Manor Dr,Chandler,85225,2/16/24,2.0,3,1415,8407.0,494000,3.10,5.33,2
...,...,...,...,...,...,...,...,...,...,...,...,...
2147,5469 NW Meadowlands Ter,Portland,97229,1/2/24,3.0,4,2592,6969.6,765000,3.09,5.33,1
2148,1837 NW Caitlin Ter,Portland,97229,12/29/23,3.0,4,2146,8712.0,700000,3.35,5.33,12
2149,11060 NW Cornell Rd,Portland,97229,12/29/23,2.0,3,2048,16117.2,574000,3.35,5.33,12
2150,200 NW 101st Ave,Portland,97229,12/29/23,3.0,3,2520,17859.6,600000,3.35,5.33,12


In [120]:
# One-hot encode the two categorical columns, Zip and MonthID, into separate
# dummy-variable frames.
# NOTE(review): neither frame is referenced again in the visible part of the
# notebook — confirm whether they are still needed.
df_zip_dummies, df_month_dummies = (
    pd.get_dummies(df_Sold_Inflation_new[column]) for column in ("Zip", "MonthID")
)

In [121]:
# Standardise the numeric property features (zero mean, unit variance).
# The scaler is fitted on ALL rows, i.e. across every zip code at once.
numeric_columns = ["Bathrooms", "Bedrooms", "LivingArea", "LandArea", "PropPrice"]
df_Sold_Inflation_scaled = StandardScaler().fit_transform(df_Sold_Inflation_new[numeric_columns])

# Wrap the scaled array in a DataFrame. The source index is carried over
# explicitly so that the later index-aligned concat with the Zip column pairs
# each scaled row with its own zip code even if the source index is not 0..n-1.
df_sold_transformed = pd.DataFrame(
    df_Sold_Inflation_scaled,
    columns=numeric_columns,
    index=df_Sold_Inflation_new.index,
)

# Display the scaled data
df_sold_transformed

Unnamed: 0,Bathrooms,Bedrooms,LivingArea,LandArea,PropPrice
0,-0.104313,0.014305,-0.255728,-0.132186,-0.406954
1,-0.540413,-0.613967,-0.222953,-0.133260,-0.382384
2,-0.540413,-0.613967,-0.251316,-0.118984,-0.383830
3,-0.540413,-0.613967,-0.250055,-0.124378,-0.381212
4,-0.540413,-0.613967,-0.233195,-0.108707,-0.371785
...,...,...,...,...,...
2147,-0.104313,0.014305,-0.047730,-0.147278,-0.328266
2148,-0.104313,0.014305,-0.118008,-0.100523,-0.338704
2149,-0.540413,-0.613967,-0.133451,0.098184,-0.358938
2150,-0.104313,-0.613967,-0.059076,0.144938,-0.354763


In [122]:
# Attach the (unscaled, string) Zip column to the scaled numeric features so
# the rows can be split per zip code in the following cells.
# Any Zip column left over from a previous run of this cell is dropped first:
# without that, re-running the cell would add a duplicate 'Zip' column and
# df_sold_transformed['Zip'] would no longer be a single Series.
df_sold_transformed = pd.concat(
    [
        df_sold_transformed.drop(columns="Zip", errors="ignore"),
        df_Sold_Inflation_new["Zip"],
    ],
    axis=1,
)

# Display the combined data
df_sold_transformed

Unnamed: 0,Bathrooms,Bedrooms,LivingArea,LandArea,PropPrice,Zip
0,-0.104313,0.014305,-0.255728,-0.132186,-0.406954,85225
1,-0.540413,-0.613967,-0.222953,-0.133260,-0.382384,85225
2,-0.540413,-0.613967,-0.251316,-0.118984,-0.383830,85225
3,-0.540413,-0.613967,-0.250055,-0.124378,-0.381212,85225
4,-0.540413,-0.613967,-0.233195,-0.108707,-0.371785,85225
...,...,...,...,...,...,...
2147,-0.104313,0.014305,-0.047730,-0.147278,-0.328266,97229
2148,-0.104313,0.014305,-0.118008,-0.100523,-0.338704,97229
2149,-0.540413,-0.613967,-0.133451,0.098184,-0.358938,97229
2150,-0.104313,-0.613967,-0.059076,0.144938,-0.354763,97229


In [123]:
# Split the scaled frame into one frame per zip code.
# The Zip key is normalised to text once, so the string comparisons below
# still match if the column happens to hold integers (otherwise every filter
# would silently return an empty frame).
scaled_zip_key = df_sold_transformed['Zip'].astype(str)

df_Az = df_sold_transformed[scaled_zip_key == '85225']
df_Chi = df_sold_transformed[scaled_zip_key == '60629']
df_LA = df_sold_transformed[scaled_zip_key == '90210']
df_Fl = df_sold_transformed[scaled_zip_key == '33186']
df_NY = df_sold_transformed[scaled_zip_key == '11368']
df_OR = df_sold_transformed[scaled_zip_key == '97229']


In [124]:
# Split the original (unscaled) frame into one frame per zip code.
# The Zip key is normalised to text once, so the string comparisons below
# still match if the column happens to hold integers (otherwise every filter
# would silently return an empty frame).
original_zip_key = df_Sold_Inflation_new['Zip'].astype(str)

df_Az_Orig = df_Sold_Inflation_new[original_zip_key == '85225']
df_Chi_Orig = df_Sold_Inflation_new[original_zip_key == '60629']
df_LA_Orig = df_Sold_Inflation_new[original_zip_key == '90210']
df_Fl_Orig = df_Sold_Inflation_new[original_zip_key == '33186']
df_NY_Orig = df_Sold_Inflation_new[original_zip_key == '11368']
df_OR_Orig = df_Sold_Inflation_new[original_zip_key == '97229']

df_Chi_Orig

Unnamed: 0,Address,City,Zip,SoldDate,Bathrooms,Bedrooms,LivingArea,LandArea,PropPrice,InflationRate,FederalInt,MonthID
138,3839 W 65th St,Chicago,60629,2/14/24,1.0,2,1053,4092.00,225000,3.10,5.33,2
139,5830 S Tripp Ave,Chicago,60629,2/12/24,3.0,6,1356,3750.00,249900,3.10,5.33,2
140,3708 W 56th Pl,Chicago,60629,2/9/24,2.0,4,934,4356.00,309000,3.10,5.33,2
141,6847 S Kolin Ave,Chicago,60629,2/9/24,1.0,3,1090,4687.06,270000,3.10,5.33,2
142,3854 W 59th St,Chicago,60629,2/7/24,3.0,3,1156,3124.99,318000,3.10,5.33,2
...,...,...,...,...,...,...,...,...,...,...,...,...
732,6222 S California Ave,Chicago,60629,8/19/22,1.0,3,950,4375.00,192500,8.26,2.33,8
733,3814 W 69th St,Chicago,60629,8/19/22,1.0,2,864,4125.00,20000,8.26,2.33,8
734,6442 S Francisco Ave,Chicago,60629,8/19/22,1.5,4,1215,3659.00,130000,8.26,2.33,8
735,5514 S Sacramento Ave,Chicago,60629,8/18/22,3.0,7,3200,4142.56,359000,8.26,2.33,8


In [125]:
# Import the PCA module
# Import the modules
import hvplot.pandas
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

In [126]:
# PCA reducer that keeps the first three principal components
pca = PCA(n_components=3)

In [127]:
# Fit the PCA model on the scaled Chicago rows and project them onto the
# retained components.
# df_Chi still carries the string 'Zip' identifier column; it is constant for a
# single zip code and is not a property feature, so it is removed before
# fitting instead of being passed to scikit-learn as text.
chi_pca_features = df_Chi.drop(columns="Zip", errors="ignore")
propertyChi_pca = pca.fit_transform(chi_pca_features)

# Review the first 5 rows of the projected array
propertyChi_pca[:5]

array([[-1.18526090e+00, -8.88556674e-02,  1.10534458e-02],
       [ 1.46631975e+00,  9.42299353e-02, -1.07771740e-01],
       [ 1.38168656e-01, -8.90827640e-04, -9.02784129e-02],
       [-5.78988349e-01, -2.52990552e-01, -2.35308875e-03],
       [-3.53157918e-01,  5.85880907e-01, -8.02741750e-02]])

In [128]:
# Total variance retained by the kept components, followed by the
# per-component explained variance ratios
variance_ratios = pca.explained_variance_ratio_
sum(variance_ratios), variance_ratios

(0.9979865886405148, array([0.89581915, 0.0890509 , 0.01311654]))

In [129]:
# Wrap the projected array in a DataFrame with one named column per component
pca_column_names = [f"PCA{component}" for component in (1, 2, 3)]
propertyChi_pca_df = pd.DataFrame(data=propertyChi_pca, columns=pca_column_names)

# Preview the PCA DataFrame
propertyChi_pca_df.head()

Unnamed: 0,PCA1,PCA2,PCA3
0,-1.185261,-0.088856,0.011053
1,1.46632,0.09423,-0.107772
2,0.138169,-0.000891,-0.090278
3,-0.578988,-0.252991,-0.002353
4,-0.353158,0.585881,-0.080274


In [130]:
# Candidate cluster counts (1 through 10) and an empty list that will collect
# the inertia measured for each of them
k = [*range(1, 11)]
inertia = list()

In [131]:
# Fit one K-Means model per candidate k on the Chicago PCA DataFrame and record
# its inertia (read from the fitted model's `inertia_` attribute).
# The list is reset here so that re-running this cell does not keep appending
# to the previous results, which would leave `inertia` longer than `k`.
# NOTE(review): n_init is left at the library default, which differs between
# scikit-learn releases — consider pinning it for reproducible inertia values.
inertia = []
for i in k:
    k_model = KMeans(n_clusters=i, random_state=0)
    k_model.fit(propertyChi_pca_df)
    inertia.append(k_model.inertia_)

In [132]:
# Pair every candidate k with the inertia measured for it
elbow_data = dict(k=k, inertia=inertia)

# Tabulate the elbow data
df_elbow = pd.DataFrame(data=elbow_data)

# Preview the first rows
df_elbow.head()

Unnamed: 0,k,inertia
0,1,441.039936
1,2,179.095793
2,3,106.436069
3,4,79.928699
4,5,54.778131


In [133]:
# Elbow curve: inertia as a function of the number of clusters k,
# with one x tick per candidate k
df_elbow.hvplot.line(x="k", y="inertia", xticks=k, title="Elbow Curve")

In [135]:
# K-Means with k=3 (the value taken as optimal) fitted on the Chicago PCA
# projection; fit() returns the fitted estimator itself.
model = KMeans(n_clusters=3, random_state=0).fit(propertyChi_pca_df)

# Cluster label for every row of the projection
k_3 = model.predict(propertyChi_pca_df)

# New frame = PCA columns plus the segment label; propertyChi_pca_df itself is
# left unchanged.
propertyChi_pca_predictions_df = propertyChi_pca_df.assign(property_segments=k_3)

In [136]:
# 3-D scatter of the Chicago PCA projection, coloured by K-Means segment.
import plotly.express as px

# Cluster ids are categories, not quantities: cast them to strings so plotly
# draws one discrete colour per segment (with a legend) instead of mapping the
# integers onto a continuous colour scale.
pca_segments_for_plot = propertyChi_pca_predictions_df.assign(
    property_segments=propertyChi_pca_predictions_df["property_segments"].astype(str)
)

fig = px.scatter_3d(pca_segments_for_plot, x='PCA1', y='PCA2', z='PCA3',
                    color='property_segments')
fig.show()

In [137]:
# Cluster the scaled Chicago features directly (no PCA) into k=3 segments.
# NOTE: this rebinds `model` and `k_3`, replacing the PCA-based model and
# labels created earlier in the notebook.
model = KMeans(n_clusters=3, random_state=0)

# df_Chi still carries the string 'Zip' identifier column; it is constant for a
# single zip code and is not a property feature, so it is excluded from the
# data given to K-Means.
chi_cluster_features = df_Chi.drop(columns="Zip", errors="ignore")

# Fit the model
model.fit(chi_cluster_features)

# Make predictions
k_3 = model.predict(chi_cluster_features)

# Work on copies so df_Chi and df_Chi_Orig stay untouched
df_Chi_predictions = df_Chi.copy()
df_Chi_Orig_Pred = df_Chi_Orig.copy()

# Attach the labels through df_Chi's index, so each label lands on the row it
# was predicted for rather than depending on both frames sharing row order.
segment_labels = pd.Series(k_3, index=df_Chi.index)
df_Chi_predictions["property_segments"] = segment_labels
df_Chi_Orig_Pred["property_segments"] = segment_labels
df_Chi_Orig_Pred

Unnamed: 0,Address,City,Zip,SoldDate,Bathrooms,Bedrooms,LivingArea,LandArea,PropPrice,InflationRate,FederalInt,MonthID,property_segments
138,3839 W 65th St,Chicago,60629,2/14/24,1.0,2,1053,4092.00,225000,3.10,5.33,2,0
139,5830 S Tripp Ave,Chicago,60629,2/12/24,3.0,6,1356,3750.00,249900,3.10,5.33,2,1
140,3708 W 56th Pl,Chicago,60629,2/9/24,2.0,4,934,4356.00,309000,3.10,5.33,2,2
141,6847 S Kolin Ave,Chicago,60629,2/9/24,1.0,3,1090,4687.06,270000,3.10,5.33,2,0
142,3854 W 59th St,Chicago,60629,2/7/24,3.0,3,1156,3124.99,318000,3.10,5.33,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
732,6222 S California Ave,Chicago,60629,8/19/22,1.0,3,950,4375.00,192500,8.26,2.33,8,0
733,3814 W 69th St,Chicago,60629,8/19/22,1.0,2,864,4125.00,20000,8.26,2.33,8,0
734,6442 S Francisco Ave,Chicago,60629,8/19/22,1.5,4,1215,3659.00,130000,8.26,2.33,8,2
735,5514 S Sacramento Ave,Chicago,60629,8/18/22,3.0,7,3200,4142.56,359000,8.26,2.33,8,1


In [139]:
# Chicago: number of bedrooms against sale price, one colour per K-Means
# segment; Address and SoldDate are shown on hover.
# Conclusion for this visual: Chicago homeowners are pretty price agnostic.
# They are purchasing based on number of bedrooms and home renovation status.
# 3 out of the 4 pricier outliers were on the same street, newly constructed in 2022.
df_Chi_Orig_Pred.hvplot.scatter(
    x="Bedrooms", y="PropPrice", by="property_segments", hover_cols=["Address", "SoldDate"]
)

In [142]:
# 3-D scatter of the original Chicago data (Bedrooms, PropPrice, LandArea),
# coloured by K-Means segment.
import plotly.express as px

# Cluster ids are categories, not quantities: cast them to strings so plotly
# draws one discrete colour per segment (with a legend) instead of mapping the
# integers onto a continuous colour scale.
chi_segments_for_plot = df_Chi_Orig_Pred.assign(
    property_segments=df_Chi_Orig_Pred["property_segments"].astype(str)
)

fig = px.scatter_3d(chi_segments_for_plot, x='Bedrooms', y='PropPrice', z='LandArea',
                    color='property_segments')
fig.show()