In [48]:
# Import the modules
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import balanced_accuracy_score, confusion_matrix, classification_report

In [49]:
# Load the sold-property CSV (path is relative to the working directory) into a DataFrame.
# The 1731 E Camino Ct property was removed after a previous analysis showed
# there was likely a typo in its number of bathrooms.
file_path = Path("Sold_Inflation_M1_12.csv")
df_Sold_Inflation = pd.read_csv(filepath_or_buffer=file_path)

# Preview the first five rows
df_Sold_Inflation.head(n=5)

Unnamed: 0,Address,City,Zip,SoldDate,Bathrooms,Bedrooms,LivingArea,LandArea,PropPrice,InflationRate,FederalInt,MonthID
0,244 N Delaware St,Chandler,85225,2/19/24,3.0,4,1272,7532.0,275000,3.1,5.33,2
1,923 W Mesquite St,Chandler,85225,2/16/24,2.0,3,1480,7492.0,428000,3.1,5.33,2
2,304 W El Prado Rd,Chandler,85225,2/16/24,2.0,3,1300,8024.0,419000,3.1,5.33,2
3,1412 E Ironwood Dr,Chandler,85225,2/16/24,2.0,3,1308,7823.0,435300,3.1,5.33,2
4,663 E Manor Dr,Chandler,85225,2/16/24,2.0,3,1415,8407.0,494000,3.1,5.33,2


In [50]:
# Convert LandArea values that were reported in acres to square feet.
# Any value below ACRES_CUTOFF is assumed to be an acreage rather than a
# square-footage figure (heuristic carried over from the original cell).
SQFT_PER_ACRE = 43560
ACRES_CUTOFF = 50

df_Sold_Inflation['LandArea'] = df_Sold_Inflation['LandArea'].astype(float)

# Vectorized update: one boolean mask instead of a Python-level iterrows() loop.
# NaN compares False against the cutoff, so missing values are left untouched,
# exactly as in the row-by-row version.
reported_in_acres = df_Sold_Inflation['LandArea'] < ACRES_CUTOFF
df_Sold_Inflation.loc[reported_in_acres, 'LandArea'] *= SQFT_PER_ACRE

# NOTE: this is an alias, not a copy - both names refer to the same DataFrame,
# so later changes made through either name are visible through the other.
df_Sold_Inflation_new = df_Sold_Inflation
df_Sold_Inflation_new

Unnamed: 0,Address,City,Zip,SoldDate,Bathrooms,Bedrooms,LivingArea,LandArea,PropPrice,InflationRate,FederalInt,MonthID
0,244 N Delaware St,Chandler,85225,2/19/24,3.0,4,1272,7532.0,275000,3.10,5.33,2
1,923 W Mesquite St,Chandler,85225,2/16/24,2.0,3,1480,7492.0,428000,3.10,5.33,2
2,304 W El Prado Rd,Chandler,85225,2/16/24,2.0,3,1300,8024.0,419000,3.10,5.33,2
3,1412 E Ironwood Dr,Chandler,85225,2/16/24,2.0,3,1308,7823.0,435300,3.10,5.33,2
4,663 E Manor Dr,Chandler,85225,2/16/24,2.0,3,1415,8407.0,494000,3.10,5.33,2
...,...,...,...,...,...,...,...,...,...,...,...,...
2146,5469 NW Meadowlands Ter,Portland,97229,1/2/24,3.0,4,2592,6969.6,765000,3.09,5.33,1
2147,1837 NW Caitlin Ter,Portland,97229,12/29/23,3.0,4,2146,8712.0,700000,3.35,5.33,12
2148,11060 NW Cornell Rd,Portland,97229,12/29/23,2.0,3,2048,16117.2,574000,3.35,5.33,12
2149,200 NW 101st Ave,Portland,97229,12/29/23,3.0,3,2520,17859.6,600000,3.35,5.33,12


In [51]:
# Store Zip as text so it can be compared against string ZIP codes later on
zip_text = df_Sold_Inflation_new['Zip'].astype(str)
df_Sold_Inflation_new['Zip'] = zip_text
df_Sold_Inflation_new

Unnamed: 0,Address,City,Zip,SoldDate,Bathrooms,Bedrooms,LivingArea,LandArea,PropPrice,InflationRate,FederalInt,MonthID
0,244 N Delaware St,Chandler,85225,2/19/24,3.0,4,1272,7532.0,275000,3.10,5.33,2
1,923 W Mesquite St,Chandler,85225,2/16/24,2.0,3,1480,7492.0,428000,3.10,5.33,2
2,304 W El Prado Rd,Chandler,85225,2/16/24,2.0,3,1300,8024.0,419000,3.10,5.33,2
3,1412 E Ironwood Dr,Chandler,85225,2/16/24,2.0,3,1308,7823.0,435300,3.10,5.33,2
4,663 E Manor Dr,Chandler,85225,2/16/24,2.0,3,1415,8407.0,494000,3.10,5.33,2
...,...,...,...,...,...,...,...,...,...,...,...,...
2146,5469 NW Meadowlands Ter,Portland,97229,1/2/24,3.0,4,2592,6969.6,765000,3.09,5.33,1
2147,1837 NW Caitlin Ter,Portland,97229,12/29/23,3.0,4,2146,8712.0,700000,3.35,5.33,12
2148,11060 NW Cornell Rd,Portland,97229,12/29/23,2.0,3,2048,16117.2,574000,3.35,5.33,12
2149,200 NW 101st Ave,Portland,97229,12/29/23,3.0,3,2520,17859.6,600000,3.35,5.33,12


In [52]:
# One-hot encode (convert to dummy variables) the Zip and MonthID columns
df_zip_dummies, df_month_dummies = (
    pd.get_dummies(df_Sold_Inflation_new[column_name])
    for column_name in ("Zip", "MonthID")
)

In [53]:
# Numeric columns to standardize; listed once so the scaler input and the
# resulting DataFrame's column labels cannot drift apart.
numeric_columns = ["Bathrooms", "Bedrooms", "LivingArea", "LandArea", "PropPrice"]

# Standardize the numeric columns (fit on all rows of the dataset at once)
df_Sold_Inflation_scaled = StandardScaler().fit_transform(df_Sold_Inflation_new[numeric_columns])

# Wrap the scaled array in a DataFrame. The source index is carried over so
# that later index-aligned joins with df_Sold_Inflation_new match row for row
# even if the source frame does not have a default 0..n-1 index.
df_sold_transformed = pd.DataFrame(
    df_Sold_Inflation_scaled,
    columns=numeric_columns,
    index=df_Sold_Inflation_new.index,
)

# Display sample data
df_sold_transformed

Unnamed: 0,Bathrooms,Bedrooms,LivingArea,LandArea,PropPrice
0,-0.102123,0.014309,-0.255756,-0.132224,-0.407042
1,-0.544332,-0.613817,-0.222988,-0.133298,-0.382477
2,-0.544332,-0.613817,-0.251345,-0.119025,-0.383922
3,-0.544332,-0.613817,-0.250085,-0.124418,-0.381305
4,-0.544332,-0.613817,-0.233228,-0.108751,-0.371880
...,...,...,...,...,...
2146,-0.102123,0.014309,-0.047805,-0.147312,-0.328370
2147,-0.102123,0.014309,-0.118067,-0.100568,-0.338806
2148,-0.544332,-0.613817,-0.133506,0.098093,-0.359036
2149,-0.102123,-0.613817,-0.059148,0.144837,-0.354861


In [54]:
# Attach the Zip column to the scaled features so the rows can be split by ZIP code.
# .assign() aligns on the index like the previous concat did, but it replaces an
# existing "Zip" column instead of appending a duplicate, so re-running this
# cell no longer leaves two "Zip" columns behind.
df_sold_transformed = df_sold_transformed.assign(Zip=df_Sold_Inflation_new['Zip'])

# Display sample data
df_sold_transformed

Unnamed: 0,Bathrooms,Bedrooms,LivingArea,LandArea,PropPrice,Zip
0,-0.102123,0.014309,-0.255756,-0.132224,-0.407042,85225
1,-0.544332,-0.613817,-0.222988,-0.133298,-0.382477,85225
2,-0.544332,-0.613817,-0.251345,-0.119025,-0.383922,85225
3,-0.544332,-0.613817,-0.250085,-0.124418,-0.381305,85225
4,-0.544332,-0.613817,-0.233228,-0.108751,-0.371880,85225
...,...,...,...,...,...,...
2146,-0.102123,0.014309,-0.047805,-0.147312,-0.328370,97229
2147,-0.102123,0.014309,-0.118067,-0.100568,-0.338806,97229
2148,-0.544332,-0.613817,-0.133506,0.098093,-0.359036,97229
2149,-0.102123,-0.613817,-0.059148,0.144837,-0.354861,97229


In [55]:
# Split the scaled DataFrame into one subset per ZIP code.
# The target names on the left line up, in order, with the ZIP codes in the tuple.
df_Az, df_Chi, df_LA, df_Fl, df_NY, df_OR = (
    df_sold_transformed[df_sold_transformed['Zip'] == zip_code]
    for zip_code in ('85225', '60629', '90210', '33186', '11368', '97229')
)



In [56]:
# Split the unscaled DataFrame into one subset per ZIP code.
# The target names on the left line up, in order, with the ZIP codes in the tuple.
df_Az_Orig, df_Chi_Orig, df_LA_Orig, df_Fl_Orig, df_NY_Orig, df_OR_Orig = (
    df_Sold_Inflation_new[df_Sold_Inflation_new['Zip'] == zip_code]
    for zip_code in ('85225', '60629', '90210', '33186', '11368', '97229')
)
df_Az_Orig

Unnamed: 0,Address,City,Zip,SoldDate,Bathrooms,Bedrooms,LivingArea,LandArea,PropPrice,InflationRate,FederalInt,MonthID
0,244 N Delaware St,Chandler,85225,2/19/24,3.0,4,1272,7532.0,275000,3.10,5.33,2
1,923 W Mesquite St,Chandler,85225,2/16/24,2.0,3,1480,7492.0,428000,3.10,5.33,2
2,304 W El Prado Rd,Chandler,85225,2/16/24,2.0,3,1300,8024.0,419000,3.10,5.33,2
3,1412 E Ironwood Dr,Chandler,85225,2/16/24,2.0,3,1308,7823.0,435300,3.10,5.33,2
4,663 E Manor Dr,Chandler,85225,2/16/24,2.0,3,1415,8407.0,494000,3.10,5.33,2
...,...,...,...,...,...,...,...,...,...,...,...,...
132,1815 N Cheri Lynn Dr,Chandler,85225,11/6/23,2.0,3,1270,6604.0,380000,3.14,5.33,11
133,428 N Colorado St,Chandler,85225,11/6/23,2.0,3,1633,8002.0,380000,3.14,5.33,11
134,514 W Sundance Way,Chandler,85225,11/3/23,2.0,3,1288,8917.0,499990,3.14,5.33,11
135,703 W Greentree Dr,Chandler,85225,11/3/23,2.0,3,1336,5136.0,480500,3.14,5.33,11


In [57]:
# Import the PCA module
# Import the modules
import hvplot.pandas
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

In [58]:
# PCA reducer configured to keep three principal components
pca = PCA(n_components=3)

In [59]:
# Fit the PCA model on the scaled features of the AZ (ZIP 85225) subset.
# The "Zip" column is a string identifier, not a feature, so it is dropped
# rather than relying on scikit-learn to coerce the strings to floats.
propertyAZ_pca = pca.fit_transform(df_Az.drop(columns="Zip"))

# Review the first 5 rows of the resulting array
propertyAZ_pca[:5]

array([[ 0.6211595 ,  0.08539818, -0.08106901],
       [-0.09685798, -0.12634886,  0.00821329],
       [-0.0995571 , -0.13061204,  0.01621078],
       [-0.09993023, -0.12963676,  0.01152969],
       [-0.09511486, -0.13090898,  0.03056462]])

In [60]:
# Total explained variance ratio, followed by the per-component ratios
explained_ratios = pca.explained_variance_ratio_
sum(explained_ratios), explained_ratios

(0.9897597370174681, array([0.71939141, 0.2204766 , 0.04989173]))

In [61]:
# Wrap the PCA output array in a DataFrame with one named column per component
propertyAZ_pca_df = pd.DataFrame(data=propertyAZ_pca, columns=["PCA1", "PCA2", "PCA3"])

# Preview the first rows
propertyAZ_pca_df.head()

Unnamed: 0,PCA1,PCA2,PCA3
0,0.62116,0.085398,-0.081069
1,-0.096858,-0.126349,0.008213
2,-0.099557,-0.130612,0.016211
3,-0.09993,-0.129637,0.01153
4,-0.095115,-0.130909,0.030565


In [62]:
# Candidate cluster counts (1 through 10) and an empty list for their inertia values
k = [*range(1, 11)]
inertia = list()

In [63]:
# Fit a K-means model on the AZ PCA DataFrame for every candidate k and record
# the inertia reported by each fitted model (`inertia_` attribute).
# The list is re-created here so that re-running this cell on its own does not
# append a second batch of values and leave `inertia` longer than `k`.
inertia = []
for i in k:
    k_model = KMeans(n_clusters=i, random_state=0)
    k_model.fit(propertyAZ_pca_df)
    inertia.append(k_model.inertia_)

In [64]:
# Pair each k with its inertia and load the pairs into a DataFrame for plotting
elbow_data = dict(k=k, inertia=inertia)
df_elbow = pd.DataFrame(data=elbow_data)

# Preview the first rows
df_elbow.head()

Unnamed: 0,k,inertia
0,1,40.339075
1,2,20.725927
2,3,13.069963
3,4,8.942644
4,5,5.973651


In [65]:
# Elbow curve: inertia as a function of k, with one x tick per candidate k
df_elbow.hvplot.line(x="k", y="inertia", title="Elbow Curve", xticks=k)

In [66]:
# K-means model with k=3 (the value chosen for the number of clusters), fixed seed
model = KMeans(n_clusters=3, random_state=0)

# Fit on the AZ PCA components, then label those same rows
model.fit(propertyAZ_pca_df)
k_3 = model.predict(propertyAZ_pca_df)

# New DataFrame: the PCA components plus a column holding the cluster labels
# (propertyAZ_pca_df itself is left unchanged)
propertyAZ_pca_predictions_df = propertyAZ_pca_df.assign(property_segments=k_3)

In [67]:
import plotly.express as px

# 3-D scatter of the AZ properties in PCA space, colored by K-means segment.
# The segment labels are integers; they are cast to strings on a temporary
# copy so Plotly Express draws one discrete color per cluster instead of a
# continuous color scale. propertyAZ_pca_predictions_df itself is not modified.
fig = px.scatter_3d(
    propertyAZ_pca_predictions_df.astype({"property_segments": str}),
    x='PCA1', y='PCA2', z='PCA3',
    color='property_segments',
    title='AZ property segments in PCA space',
)
fig.show()

In [68]:
# Define the K-means model using k=3 clusters
model = KMeans(n_clusters=3, random_state=0)

# "Zip" is a string identifier, not a feature: cluster on the scaled numeric
# columns only instead of relying on scikit-learn to coerce the strings.
az_features = df_Az.drop(columns="Zip")

# Fit the model
model.fit(az_features)

# Make predictions
k_3 = model.predict(az_features)

# The labels are a plain array and are attached positionally below, so the
# scaled and unscaled AZ frames must list the same rows in the same order.
assert df_Az.index.equals(df_Az_Orig.index), "df_Az and df_Az_Orig rows are not aligned"

# Work on copies so the per-ZIP source frames stay untouched
df_Az_predictions = df_Az.copy()
df_Az_Orig_Pred = df_Az_Orig.copy()

# Add a column with the cluster labels to both the scaled and unscaled copies
df_Az_predictions["property_segments"] = k_3
df_Az_Orig_Pred["property_segments"] = k_3
df_Az_Orig_Pred.head()

Unnamed: 0,Address,City,Zip,SoldDate,Bathrooms,Bedrooms,LivingArea,LandArea,PropPrice,InflationRate,FederalInt,MonthID,property_segments
0,244 N Delaware St,Chandler,85225,2/19/24,3.0,4,1272,7532.0,275000,3.1,5.33,2,0
1,923 W Mesquite St,Chandler,85225,2/16/24,2.0,3,1480,7492.0,428000,3.1,5.33,2,1
2,304 W El Prado Rd,Chandler,85225,2/16/24,2.0,3,1300,8024.0,419000,3.1,5.33,2,1
3,1412 E Ironwood Dr,Chandler,85225,2/16/24,2.0,3,1308,7823.0,435300,3.1,5.33,2,1
4,663 E Manor Dr,Chandler,85225,2/16/24,2.0,3,1415,8407.0,494000,3.1,5.33,2,1


In [70]:
# Scatter of Bedrooms against PropPrice for the AZ predictions DataFrame,
# one color per cluster; Address and SoldDate are shown on hover
df_Az_Orig_Pred.hvplot.scatter(x="Bedrooms", y="PropPrice", by="property_segments", hover_cols=['Address', 'SoldDate'])
# Conclusion for this visual: General (TODO: this note was left unfinished)

In [None]:
import plotly.express as px

# 3-D scatter of LandArea / PropPrice / Bedrooms, colored by K-means segment.
# NOTE(review): this cell previously plotted df_OR_Orig_Pred, which is not
# defined anywhere in the visible notebook and would raise NameError on a
# fresh Run All. It now uses df_Az_Orig_Pred, the only predictions frame
# built above - confirm that AZ (not OR) is the intended subset.
# Segment labels are cast to strings on a temporary copy so each cluster gets
# a discrete color rather than a position on a continuous color scale.
fig = px.scatter_3d(
    df_Az_Orig_Pred.astype({"property_segments": str}),
    x='LandArea', y='PropPrice', z='Bedrooms',
    color='property_segments',
)
fig.show()