In [3]:
# Import the modules
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import balanced_accuracy_score, confusion_matrix, classification_report

In [49]:
# Read the CSV file from folder into a Pandas DataFrame
#Removed 7026 NW Penridge Rd property after previous analysis showed there was likely a typo in LandArea
file_path = Path("Sold_Inflation_M1_12.csv")
df_Sold_Inflation = pd.read_csv(file_path)


# Review the DataFrame
df_Sold_Inflation.head()

Unnamed: 0,Address,City,Zip,SoldDate,Bathrooms,Bedrooms,LivingArea,LandArea,PropPrice,InflationRate,FederalInt,MonthID
0,244 N Delaware St,Chandler,85225,2/19/24,3.0,4,1272,7532.0,275000,3.1,5.33,2
1,923 W Mesquite St,Chandler,85225,2/16/24,2.0,3,1480,7492.0,428000,3.1,5.33,2
2,304 W El Prado Rd,Chandler,85225,2/16/24,2.0,3,1300,8024.0,419000,3.1,5.33,2
3,1412 E Ironwood Dr,Chandler,85225,2/16/24,2.0,3,1308,7823.0,435300,3.1,5.33,2
4,663 E Manor Dr,Chandler,85225,2/16/24,2.0,3,1415,8407.0,494000,3.1,5.33,2


In [50]:
#convert LandArea reported in acres to sqft (anything less than a value of 50)
df_Sold_Inflation['LandArea'] = df_Sold_Inflation['LandArea'].astype(float)
for index, row in df_Sold_Inflation.iterrows():
    if row['LandArea'] < 50:
        df_Sold_Inflation.at[index, 'LandArea'] *= 43560

df_Sold_Inflation_new=df_Sold_Inflation
df_Sold_Inflation_new

Unnamed: 0,Address,City,Zip,SoldDate,Bathrooms,Bedrooms,LivingArea,LandArea,PropPrice,InflationRate,FederalInt,MonthID
0,244 N Delaware St,Chandler,85225,2/19/24,3.0,4,1272,7532.0,275000,3.10,5.33,2
1,923 W Mesquite St,Chandler,85225,2/16/24,2.0,3,1480,7492.0,428000,3.10,5.33,2
2,304 W El Prado Rd,Chandler,85225,2/16/24,2.0,3,1300,8024.0,419000,3.10,5.33,2
3,1412 E Ironwood Dr,Chandler,85225,2/16/24,2.0,3,1308,7823.0,435300,3.10,5.33,2
4,663 E Manor Dr,Chandler,85225,2/16/24,2.0,3,1415,8407.0,494000,3.10,5.33,2
...,...,...,...,...,...,...,...,...,...,...,...,...
2147,5469 NW Meadowlands Ter,Portland,97229,1/2/24,3.0,4,2592,6969.6,765000,3.09,5.33,1
2148,1837 NW Caitlin Ter,Portland,97229,12/29/23,3.0,4,2146,8712.0,700000,3.35,5.33,12
2149,11060 NW Cornell Rd,Portland,97229,12/29/23,2.0,3,2048,16117.2,574000,3.35,5.33,12
2150,200 NW 101st Ave,Portland,97229,12/29/23,3.0,3,2520,17859.6,600000,3.35,5.33,12


In [51]:
df_Sold_Inflation_new['Zip'] = df_Sold_Inflation_new['Zip'].astype(str)
df_Sold_Inflation_new

Unnamed: 0,Address,City,Zip,SoldDate,Bathrooms,Bedrooms,LivingArea,LandArea,PropPrice,InflationRate,FederalInt,MonthID
0,244 N Delaware St,Chandler,85225,2/19/24,3.0,4,1272,7532.0,275000,3.10,5.33,2
1,923 W Mesquite St,Chandler,85225,2/16/24,2.0,3,1480,7492.0,428000,3.10,5.33,2
2,304 W El Prado Rd,Chandler,85225,2/16/24,2.0,3,1300,8024.0,419000,3.10,5.33,2
3,1412 E Ironwood Dr,Chandler,85225,2/16/24,2.0,3,1308,7823.0,435300,3.10,5.33,2
4,663 E Manor Dr,Chandler,85225,2/16/24,2.0,3,1415,8407.0,494000,3.10,5.33,2
...,...,...,...,...,...,...,...,...,...,...,...,...
2147,5469 NW Meadowlands Ter,Portland,97229,1/2/24,3.0,4,2592,6969.6,765000,3.09,5.33,1
2148,1837 NW Caitlin Ter,Portland,97229,12/29/23,3.0,4,2146,8712.0,700000,3.35,5.33,12
2149,11060 NW Cornell Rd,Portland,97229,12/29/23,2.0,3,2048,16117.2,574000,3.35,5.33,12
2150,200 NW 101st Ave,Portland,97229,12/29/23,3.0,3,2520,17859.6,600000,3.35,5.33,12


In [52]:
# Encode (convert to dummy variables) the EnergyType column
df_zip_dummies = pd.get_dummies(df_Sold_Inflation_new["Zip"])
df_month_dummies = pd.get_dummies(df_Sold_Inflation_new["MonthID"])

In [53]:
# Scaling the numeric columns
df_Sold_Inflation_scaled = StandardScaler().fit_transform(df_Sold_Inflation_new[["Bathrooms", "Bedrooms", "LivingArea","LandArea","PropPrice"]])

# Creating a DataFrame with with the scaled data
df_sold_transformed = pd.DataFrame(df_Sold_Inflation_scaled, columns=["Bathrooms", "Bedrooms", "LivingArea","LandArea","PropPrice"])

# # Display sample data
df_sold_transformed

Unnamed: 0,Bathrooms,Bedrooms,LivingArea,LandArea,PropPrice
0,-0.104313,0.014305,-0.255728,-0.132186,-0.406954
1,-0.540413,-0.613967,-0.222953,-0.133260,-0.382384
2,-0.540413,-0.613967,-0.251316,-0.118984,-0.383830
3,-0.540413,-0.613967,-0.250055,-0.124378,-0.381212
4,-0.540413,-0.613967,-0.233195,-0.108707,-0.371785
...,...,...,...,...,...
2147,-0.104313,0.014305,-0.047730,-0.147278,-0.328266
2148,-0.104313,0.014305,-0.118008,-0.100523,-0.338704
2149,-0.540413,-0.613967,-0.133451,0.098184,-0.358938
2150,-0.104313,-0.613967,-0.059076,0.144938,-0.354763


In [54]:
# Concatenate the df_zip_dummies and the df_month_dummies DataFrames
df_sold_transformed = pd.concat([df_sold_transformed, df_Sold_Inflation_new['Zip'],df_month_dummies], axis=1)

# Display sample data
df_sold_transformed

Unnamed: 0,Bathrooms,Bedrooms,LivingArea,LandArea,PropPrice,Zip,1,2,3,4,5,6,7,8,9,10,11,12
0,-0.104313,0.014305,-0.255728,-0.132186,-0.406954,85225,0,1,0,0,0,0,0,0,0,0,0,0
1,-0.540413,-0.613967,-0.222953,-0.133260,-0.382384,85225,0,1,0,0,0,0,0,0,0,0,0,0
2,-0.540413,-0.613967,-0.251316,-0.118984,-0.383830,85225,0,1,0,0,0,0,0,0,0,0,0,0
3,-0.540413,-0.613967,-0.250055,-0.124378,-0.381212,85225,0,1,0,0,0,0,0,0,0,0,0,0
4,-0.540413,-0.613967,-0.233195,-0.108707,-0.371785,85225,0,1,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2147,-0.104313,0.014305,-0.047730,-0.147278,-0.328266,97229,1,0,0,0,0,0,0,0,0,0,0,0
2148,-0.104313,0.014305,-0.118008,-0.100523,-0.338704,97229,0,0,0,0,0,0,0,0,0,0,0,1
2149,-0.540413,-0.613967,-0.133451,0.098184,-0.358938,97229,0,0,0,0,0,0,0,0,0,0,0,1
2150,-0.104313,-0.613967,-0.059076,0.144938,-0.354763,97229,0,0,0,0,0,0,0,0,0,0,0,1


In [55]:
#Split above dataframe by zip
df_Az=df_sold_transformed[df_sold_transformed['Zip']=='85225']
df_Chi=df_sold_transformed[df_sold_transformed['Zip']=='60629']
df_LA=df_sold_transformed[df_sold_transformed['Zip']=='90210']
df_Fl=df_sold_transformed[df_sold_transformed['Zip']=='33186']
df_NY=df_sold_transformed[df_sold_transformed['Zip']=='11368']
df_OR=df_sold_transformed[df_sold_transformed['Zip']=='97229']


In [56]:
#Split original dataframe (without transformations) by zip
df_Az_Orig=df_Sold_Inflation_new[df_Sold_Inflation_new['Zip']=='85225']
df_Chi_Orig=df_Sold_Inflation_new[df_Sold_Inflation_new['Zip']=='60629']
df_LA_Orig=df_Sold_Inflation_new[df_Sold_Inflation_new['Zip']=='90210']
df_Fl_Orig=df_Sold_Inflation_new[df_Sold_Inflation_new['Zip']=='33186']
df_NY_Orig=df_Sold_Inflation_new[df_Sold_Inflation_new['Zip']=='11368']
df_OR_Orig=df_Sold_Inflation_new[df_Sold_Inflation_new['Zip']=='97229']

df_OR_Orig

Unnamed: 0,Address,City,Zip,SoldDate,Bathrooms,Bedrooms,LivingArea,LandArea,PropPrice,InflationRate,FederalInt,MonthID
2094,15842 NW Hackney Dr,Portland,97229,2/20/24,3.0,4,2134,6098.4,660000,3.1,5.33,2
2095,15883 NW Gooderham St #L-96,Portland,97229,2/20/24,5.0,6,4425,5709.0,1494955,3.1,5.33,2
2096,10771 NW Harding Ct,Portland,97229,2/16/24,3.0,3,1650,6534.0,610000,3.1,5.33,2
2097,8577 NW Ryan St,Portland,97229,2/16/24,3.0,4,2594,6534.0,770000,3.1,5.33,2
2098,5124 NW Crady Ln,Portland,97229,2/16/24,3.0,5,3275,5227.2,780000,3.1,5.33,2
2099,4578 NW Tamoshanter Way,Portland,97229,2/15/24,3.0,3,2432,7840.8,965000,3.1,5.33,2
2100,14412 NW Whistler Ln,Portland,97229,2/15/24,3.0,5,2835,6969.6,775000,3.1,5.33,2
2101,16790 NW Vetter Dr,Portland,97229,2/15/24,4.0,6,3976,6534.0,1130000,3.1,5.33,2
2102,7899 NW 162nd Ter #L106,Portland,97229,2/15/24,5.0,6,3581,5190.0,1375375,3.1,5.33,2
2103,7273 NW 167th Ave,Portland,97229,2/13/24,3.0,4,2252,3049.2,765000,3.1,5.33,2


In [57]:
# Import the PCA module
# Import the modules
import hvplot.pandas
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

In [58]:
# Instantiate the PCA instance and declare the number of PCA variables
pca=PCA(n_components=3)

In [59]:
# Fit the PCA model on the transformed credit card DataFrame
propertyOR_pca = pca.fit_transform(df_OR)

# Review the first 5 rows of the array of list data
propertyOR_pca[:5]


Feature names only support names that are all strings. Got feature names with dtypes: ['int', 'str']. An error will be raised in 1.2.



array([[ 0.40026758,  0.82901969, -0.15465352],
       [ 1.7197743 ,  0.08772721, -0.10555977],
       [-0.12138804,  1.11617747, -0.20197763],
       [ 0.40889372,  0.82288146, -0.15356213],
       [ 0.93368142,  0.5332471 , -0.11015179]])

In [60]:
# Calculate the PCA explained variance ratio
sum(pca.explained_variance_ratio_), pca.explained_variance_ratio_

(0.9476328936080357, array([0.5646832 , 0.29943717, 0.08351252]))

In [61]:
# Create the PCA DataFrame
propertyOR_pca_df = pd.DataFrame(
    propertyOR_pca,
    columns=["PCA1", "PCA2","PCA3"]
)

# Review the PCA DataFrame
propertyOR_pca_df.head()

Unnamed: 0,PCA1,PCA2,PCA3
0,0.400268,0.82902,-0.154654
1,1.719774,0.087727,-0.10556
2,-0.121388,1.116177,-0.201978
3,0.408894,0.822881,-0.153562
4,0.933681,0.533247,-0.110152


In [62]:
# Create a a list to store inertia values and the values of k
inertia = []
k = list(range(1, 11))

In [63]:
# Create a for-loop where each value of k is evaluated using the K-means algorithm
# Fit the model using the service_ratings DataFrame
# Append the value of the computed inertia from the `inertia_` attribute of the KMeans model instance
for i in k:
    k_model = KMeans(n_clusters=i, random_state=0)
    k_model.fit(propertyOR_pca_df)
    inertia.append(k_model.inertia_)

In [64]:
# Define a DataFrame to hold the values for k and the corresponding inertia
elbow_data = {"k": k, "inertia": inertia}

# Create the DataFrame from the elbow data
df_elbow = pd.DataFrame(elbow_data)

# Review the DataFrame
df_elbow.head()

Unnamed: 0,k,inertia
0,1,63.529568
1,2,37.000997
2,3,23.155657
3,4,16.198065
4,5,10.7023


In [65]:
# Plot the DataFrame
df_elbow.hvplot.line(
    x="k", 
    y="inertia", 
    title="Elbow Curve", 
    xticks=k
)

In [66]:
# Define the model Kmeans model using the optimal value of k for the number of clusters.
model = KMeans(n_clusters=3, random_state=0)

# Fit the model
model.fit(propertyOR_pca_df)

# Make predictions
k_3 = model.predict(propertyOR_pca_df)

# Create a copy of the customers_pca_df DataFrame
propertyOR_pca_predictions_df = propertyOR_pca_df.copy()

# Add a class column with the labels
propertyOR_pca_predictions_df["property_segments"] = k_3

In [67]:
# Plot the clusters
# propertyChi_pca_predictions_df.hvplot.scatter(
#     x="PCA1",
#     y="PCA2",
#     by="property_segments"
# )

import plotly.express as px

fig = px.scatter_3d(propertyOR_pca_predictions_df, x='PCA1', y='PCA2', z='PCA3',
              color='property_segments')
fig.show()

In [68]:
# Define the model Kmeans model using k=3 clusters
model = KMeans(n_clusters=3, random_state=0)

# Fit the model
model.fit(df_OR)

# Make predictions
k_3 = model.predict(df_OR)

# Create a copy of the customers_transformed_df DataFrame
df_OR_predictions = df_OR.copy()
df_OR_Orig_Pred= df_OR_Orig.copy()
# Add a class column with the labels
df_OR_predictions["property_segments"] = k_3
df_OR_Orig_Pred["property_segments"] = k_3
df_OR_Orig_Pred.head()


Feature names only support names that are all strings. Got feature names with dtypes: ['int', 'str']. An error will be raised in 1.2.


Feature names only support names that are all strings. Got feature names with dtypes: ['int', 'str']. An error will be raised in 1.2.



Unnamed: 0,Address,City,Zip,SoldDate,Bathrooms,Bedrooms,LivingArea,LandArea,PropPrice,InflationRate,FederalInt,MonthID,property_segments
2094,15842 NW Hackney Dr,Portland,97229,2/20/24,3.0,4,2134,6098.4,660000,3.1,5.33,2,1
2095,15883 NW Gooderham St #L-96,Portland,97229,2/20/24,5.0,6,4425,5709.0,1494955,3.1,5.33,2,1
2096,10771 NW Harding Ct,Portland,97229,2/16/24,3.0,3,1650,6534.0,610000,3.1,5.33,2,1
2097,8577 NW Ryan St,Portland,97229,2/16/24,3.0,4,2594,6534.0,770000,3.1,5.33,2,1
2098,5124 NW Crady Ln,Portland,97229,2/16/24,3.0,5,3275,5227.2,780000,3.1,5.33,2,1


In [74]:
# Plot the clusters per 2 features from the LA dataframe against property price
df_OR_Orig_Pred.hvplot.scatter(
    x="MonthID",
    y="PropPrice",
    by="property_segments",
    hover_cols=['Address', 'SoldDate']
)
#conclusion for this visual: Certain cluster of Properties are more desireable in Feb   

In [44]:
import plotly.express as px

fig = px.scatter_3d(df_OR_Orig_Pred, x='LandArea', y='PropPrice', z='Bedrooms',
              color='property_segments')
fig.show()