In [79]:
# Import the modules
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import balanced_accuracy_score, confusion_matrix, classification_report

In [80]:
# Load the sold-property / inflation CSV (resolved relative to the working directory).
# 11250 Northern Blvd and 9722 42nd were removed from the file after a previous analysis
# showed their Land Area values were likely typos.
file_path = Path("Sold_Inflation_M1_12.csv")
df_Sold_Inflation = pd.read_csv(filepath_or_buffer=file_path)

# Preview the first five rows
df_Sold_Inflation.head(n=5)

Unnamed: 0,Address,City,Zip,SoldDate,Bathrooms,Bedrooms,LivingArea,LandArea,PropPrice,InflationRate,FederalInt,MonthID
0,244 N Delaware St,Chandler,85225,2/19/24,3.0,4,1272,7532.0,275000,3.1,5.33,2
1,923 W Mesquite St,Chandler,85225,2/16/24,2.0,3,1480,7492.0,428000,3.1,5.33,2
2,304 W El Prado Rd,Chandler,85225,2/16/24,2.0,3,1300,8024.0,419000,3.1,5.33,2
3,1412 E Ironwood Dr,Chandler,85225,2/16/24,2.0,3,1308,7823.0,435300,3.1,5.33,2
4,663 E Manor Dr,Chandler,85225,2/16/24,2.0,3,1415,8407.0,494000,3.1,5.33,2


In [81]:
# Convert LandArea values reported in acres to square feet.
# Any value below ACRE_CUTOFF is treated as having been reported in acres.
ACRE_CUTOFF = 50
SQFT_PER_ACRE = 43560

df_Sold_Inflation['LandArea'] = df_Sold_Inflation['LandArea'].astype(float)

# Vectorised boolean-mask update instead of a row-by-row iterrows() loop that writes
# back into the frame being iterated. NaN compares False against the cutoff, so missing
# values are left untouched, exactly as in the loop version.
reported_in_acres = df_Sold_Inflation['LandArea'] < ACRE_CUTOFF
df_Sold_Inflation.loc[reported_in_acres, 'LandArea'] *= SQFT_PER_ACRE

# NOTE: this binds a second name to the SAME DataFrame (an alias, not a copy), so later
# edits made through df_Sold_Inflation_new are also visible through df_Sold_Inflation.
df_Sold_Inflation_new=df_Sold_Inflation
df_Sold_Inflation_new

Unnamed: 0,Address,City,Zip,SoldDate,Bathrooms,Bedrooms,LivingArea,LandArea,PropPrice,InflationRate,FederalInt,MonthID
0,244 N Delaware St,Chandler,85225,2/19/24,3.0,4,1272,7532.0,275000,3.10,5.33,2
1,923 W Mesquite St,Chandler,85225,2/16/24,2.0,3,1480,7492.0,428000,3.10,5.33,2
2,304 W El Prado Rd,Chandler,85225,2/16/24,2.0,3,1300,8024.0,419000,3.10,5.33,2
3,1412 E Ironwood Dr,Chandler,85225,2/16/24,2.0,3,1308,7823.0,435300,3.10,5.33,2
4,663 E Manor Dr,Chandler,85225,2/16/24,2.0,3,1415,8407.0,494000,3.10,5.33,2
...,...,...,...,...,...,...,...,...,...,...,...,...
2144,5469 NW Meadowlands Ter,Portland,97229,1/2/24,3.0,4,2592,6969.6,765000,3.09,5.33,1
2145,1837 NW Caitlin Ter,Portland,97229,12/29/23,3.0,4,2146,8712.0,700000,3.35,5.33,12
2146,11060 NW Cornell Rd,Portland,97229,12/29/23,2.0,3,2048,16117.2,574000,3.35,5.33,12
2147,200 NW 101st Ave,Portland,97229,12/29/23,3.0,3,2520,17859.6,600000,3.35,5.33,12


In [82]:
# Treat zip codes as text labels rather than integers so they can be compared to
# string literals such as '85225' further down.
zip_as_text = df_Sold_Inflation_new['Zip'].astype(str)
df_Sold_Inflation_new['Zip'] = zip_as_text
df_Sold_Inflation_new

Unnamed: 0,Address,City,Zip,SoldDate,Bathrooms,Bedrooms,LivingArea,LandArea,PropPrice,InflationRate,FederalInt,MonthID
0,244 N Delaware St,Chandler,85225,2/19/24,3.0,4,1272,7532.0,275000,3.10,5.33,2
1,923 W Mesquite St,Chandler,85225,2/16/24,2.0,3,1480,7492.0,428000,3.10,5.33,2
2,304 W El Prado Rd,Chandler,85225,2/16/24,2.0,3,1300,8024.0,419000,3.10,5.33,2
3,1412 E Ironwood Dr,Chandler,85225,2/16/24,2.0,3,1308,7823.0,435300,3.10,5.33,2
4,663 E Manor Dr,Chandler,85225,2/16/24,2.0,3,1415,8407.0,494000,3.10,5.33,2
...,...,...,...,...,...,...,...,...,...,...,...,...
2144,5469 NW Meadowlands Ter,Portland,97229,1/2/24,3.0,4,2592,6969.6,765000,3.09,5.33,1
2145,1837 NW Caitlin Ter,Portland,97229,12/29/23,3.0,4,2146,8712.0,700000,3.35,5.33,12
2146,11060 NW Cornell Rd,Portland,97229,12/29/23,2.0,3,2048,16117.2,574000,3.35,5.33,12
2147,200 NW 101st Ave,Portland,97229,12/29/23,3.0,3,2520,17859.6,600000,3.35,5.33,12


In [83]:
# One-hot encode (convert to dummy variables) the Zip and MonthID columns.
# NOTE(review): neither dummy frame is referenced by the other cells in this section.
zip_codes = df_Sold_Inflation_new["Zip"]
month_ids = df_Sold_Inflation_new["MonthID"]
df_zip_dummies = pd.get_dummies(zip_codes)
df_month_dummies = pd.get_dummies(month_ids)

In [84]:
# Columns that are standardised. Listed once so the scaler input and the column labels
# of the resulting DataFrame cannot drift apart.
NUMERIC_COLUMNS = ["Bathrooms", "Bedrooms", "LivingArea", "LandArea", "PropPrice", "InflationRate"]

# Scaling the numeric columns (StandardScaler is fit on all rows, i.e. across every zip)
df_Sold_Inflation_scaled = StandardScaler().fit_transform(df_Sold_Inflation_new[NUMERIC_COLUMNS])

# Creating a DataFrame with the scaled data. The source index is reused so every scaled
# row stays aligned with its original row when the frames are later joined on the index.
df_sold_transformed = pd.DataFrame(
    df_Sold_Inflation_scaled,
    columns=NUMERIC_COLUMNS,
    index=df_Sold_Inflation_new.index,
)

# Display sample data
df_sold_transformed

Unnamed: 0,Bathrooms,Bedrooms,LivingArea,LandArea,PropPrice,InflationRate
0,-0.103040,0.013159,-0.519985,-0.150507,-0.407218,-1.042619
1,-0.545243,-0.615242,-0.447988,-0.151883,-0.382662,-1.042619
2,-0.545243,-0.615242,-0.510293,-0.133586,-0.384107,-1.042619
3,-0.545243,-0.615242,-0.507524,-0.140499,-0.381491,-1.042619
4,-0.545243,-0.615242,-0.470487,-0.120414,-0.372070,-1.042619
...,...,...,...,...,...,...
2144,-0.103040,0.013159,-0.063082,-0.169849,-0.328577,-1.047437
2145,-0.103040,0.013159,-0.217460,-0.109924,-0.339009,-0.922184
2146,-0.545243,-0.615242,-0.251381,0.144756,-0.359231,-0.922184
2147,-0.103040,-0.615242,-0.088004,0.204681,-0.355058,-0.922184


In [85]:
# Attach each row's Zip label to the scaled features so the data can be split per zip.
# assign() overwrites an existing "Zip" column, so re-running this cell does not append a
# duplicate "Zip" column (which would break the `== '85225'`-style filters below) the
# way repeatedly concatenating onto the same frame would.
df_sold_transformed = df_sold_transformed.assign(Zip=df_Sold_Inflation_new['Zip'])

# Display sample data
df_sold_transformed

Unnamed: 0,Bathrooms,Bedrooms,LivingArea,LandArea,PropPrice,InflationRate,Zip
0,-0.103040,0.013159,-0.519985,-0.150507,-0.407218,-1.042619,85225
1,-0.545243,-0.615242,-0.447988,-0.151883,-0.382662,-1.042619,85225
2,-0.545243,-0.615242,-0.510293,-0.133586,-0.384107,-1.042619,85225
3,-0.545243,-0.615242,-0.507524,-0.140499,-0.381491,-1.042619,85225
4,-0.545243,-0.615242,-0.470487,-0.120414,-0.372070,-1.042619,85225
...,...,...,...,...,...,...,...
2144,-0.103040,0.013159,-0.063082,-0.169849,-0.328577,-1.047437,97229
2145,-0.103040,0.013159,-0.217460,-0.109924,-0.339009,-0.922184,97229
2146,-0.545243,-0.615242,-0.251381,0.144756,-0.359231,-0.922184,97229
2147,-0.103040,-0.615242,-0.088004,0.204681,-0.355058,-0.922184,97229


In [86]:
# Partition the scaled frame into one DataFrame per zip code
scaled_zip = df_sold_transformed['Zip']
df_Az = df_sold_transformed.loc[scaled_zip == '85225']
df_Chi = df_sold_transformed.loc[scaled_zip == '60629']
df_LA = df_sold_transformed.loc[scaled_zip == '90210']
df_Fl = df_sold_transformed.loc[scaled_zip == '33186']
df_NY = df_sold_transformed.loc[scaled_zip == '11368']
df_OR = df_sold_transformed.loc[scaled_zip == '97229']


In [87]:
# Partition the original (unscaled) frame into one DataFrame per zip code
original_zip = df_Sold_Inflation_new['Zip']
df_Az_Orig = df_Sold_Inflation_new.loc[original_zip == '85225']
df_Chi_Orig = df_Sold_Inflation_new.loc[original_zip == '60629']
df_LA_Orig = df_Sold_Inflation_new.loc[original_zip == '90210']
df_Fl_Orig = df_Sold_Inflation_new.loc[original_zip == '33186']
df_NY_Orig = df_Sold_Inflation_new.loc[original_zip == '11368']
df_OR_Orig = df_Sold_Inflation_new.loc[original_zip == '97229']

# Show the zip 90210 subset as a spot check
df_LA_Orig

Unnamed: 0,Address,City,Zip,SoldDate,Bathrooms,Bedrooms,LivingArea,LandArea,PropPrice,InflationRate,FederalInt,MonthID
736,1850 Coldwater Canyon Dr,Beverly Hills,90210,2/16/24,3.0,3,2408,10153.84,2055000,3.10,5.33,2
737,9618 Highland Gorge Dr,Beverly Hills,90210,2/16/24,2.0,2,1264,6655.00,1280000,3.10,5.33,2
738,9249 Burton Way UNIT 302,Beverly Hills,90210,2/15/24,3.0,2,1963,20908.80,1925000,3.10,5.33,2
739,441 N Oakhurst Dr APT 402,Beverly Hills,90210,2/14/24,3.0,3,2126,22651.20,1375000,3.10,5.33,2
740,1099 N Hillcrest Rd,Beverly Hills,90210,2/14/24,5.0,5,4777,20037.60,7200000,3.10,5.33,2
...,...,...,...,...,...,...,...,...,...,...,...,...
1422,61 Beverly Park Ln,Beverly Hills,90210,9/29/21,13.0,10,22408,87555.60,3540000,5.39,0.08,9
1423,1244 Benedict Canyon Dr,Beverly Hills,90210,9/28/21,4.0,3,3281,15246.00,4300500,5.39,0.08,9
1424,203 N Rexford Dr,Beverly Hills,90210,9/27/21,3.0,4,2300,7562.00,3300000,5.39,0.08,9
1425,713 N Hillcrest Rd,Beverly Hills,90210,9/24/21,5.5,4,5514,27878.40,23000000,5.39,0.08,9


In [88]:
# Import the plotting (hvplot), clustering (KMeans), and PCA modules
import hvplot.pandas
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

In [89]:
# Build the PCA reducer, keeping the first three principal components
pca = PCA(
    n_components=3,
)

In [90]:
# Fit the PCA model on the scaled features of the zip 11368 subset (df_NY).
# "Zip" is a constant string label within this subset, not a numeric feature, so it is
# excluded instead of relying on scikit-learn to coerce the strings to floats.
# errors="ignore" keeps the cell working if df_NY carries no "Zip" column.
propertyNY_pca = pca.fit_transform(df_NY.drop(columns="Zip", errors="ignore"))

# Review the first 5 rows of the projected array
propertyNY_pca[:5]

array([[ 0.9059872 ,  1.63304502,  1.19622817],
       [ 2.82501047,  1.48041673,  0.90005133],
       [ 0.15617311,  1.69538721, -0.55161314],
       [-2.18232264,  1.88744328,  0.37020425],
       [-2.9658826 ,  1.95380653,  0.17493433]])

In [91]:
# Total share of variance captured by the fitted components, followed by the
# per-component shares
variance_ratios = pca.explained_variance_ratio_
sum(variance_ratios), variance_ratios

(0.9825512487409791, array([0.63314788, 0.27940737, 0.069996  ]))

In [92]:
# Wrap the projected array in a DataFrame with one labelled column per component
pca_column_names = ["PCA1", "PCA2", "PCA3"]
propertyNY_pca_df = pd.DataFrame(data=propertyNY_pca, columns=pca_column_names)

# Preview the PCA DataFrame
propertyNY_pca_df.head()

Unnamed: 0,PCA1,PCA2,PCA3
0,0.905987,1.633045,1.196228
1,2.82501,1.480417,0.900051
2,0.156173,1.695387,-0.551613
3,-2.182323,1.887443,0.370204
4,-2.965883,1.953807,0.174934


In [93]:
# Containers for the elbow analysis: the candidate cluster counts (1 through 10) and an
# empty list that will receive the inertia computed for each of them
k = [*range(1, 11)]
inertia = list()

In [94]:
# Evaluate K-means for every candidate k on the PCA-reduced DataFrame and record the
# value of each fitted model's `inertia_` attribute.
# The list is rebuilt from scratch here so that re-running this cell cannot append a
# second batch of values and leave `inertia` longer than `k`.
inertia = []
for n_clusters in k:
    k_model = KMeans(n_clusters=n_clusters, random_state=0)
    k_model.fit(propertyNY_pca_df)
    inertia.append(k_model.inertia_)

In [95]:
# Pair each k with its inertia value
elbow_data = dict(k=k, inertia=inertia)

# Turn the pairs into a two-column DataFrame for plotting
df_elbow = pd.DataFrame(data=elbow_data)

# Preview the first rows
df_elbow.head()

Unnamed: 0,k,inertia
0,1,789.061666
1,2,469.82979
2,3,343.264056
3,4,238.463752
4,5,195.703699


In [96]:
# Line plot of inertia against k (the elbow curve), with one x tick per candidate k
df_elbow.hvplot.line(x="k", y="inertia", xticks=k, title="Elbow Curve")

In [98]:
# K-means on the PCA-reduced data with k=4, the value this notebook treats as optimal
model = KMeans(n_clusters=4, random_state=0)

# Fit, then label every row with its cluster
model.fit(propertyNY_pca_df)
k_4 = model.predict(propertyNY_pca_df)

# New frame: a copy of propertyNY_pca_df plus a "property_segments" column holding the
# cluster labels (the source frame is left unmodified)
propertyNY_pca_predictions_df = propertyNY_pca_df.assign(property_segments=k_4)

In [99]:
# Plot the clusters in the space of the three principal components, colored by segment
import plotly.express as px

fig = px.scatter_3d(
    data_frame=propertyNY_pca_predictions_df,
    x='PCA1',
    y='PCA2',
    z='PCA3',
    color='property_segments',
)
fig.show()

In [100]:
# Define the K-means model using k=4 clusters, this time for the scaled (non-PCA) features
model = KMeans(n_clusters=4, random_state=0)

# "Zip" is a constant string label for this subset, not a feature: keep it out of the
# matrix handed to K-means instead of relying on implicit string-to-float coercion.
# errors="ignore" keeps the cell working if df_NY carries no "Zip" column.
ny_features = df_NY.drop(columns="Zip", errors="ignore")

# Fit the model
model.fit(ny_features)

# Make predictions
k_4 = model.predict(ny_features)

# Create copies of the scaled and the original zip 11368 DataFrames
df_NY_predictions = df_NY.copy()
df_NY_Orig_Pred= df_NY_Orig.copy()

# Add a class column with the labels. The label array is assigned positionally, so this
# relies on df_NY and df_NY_Orig holding the same rows in the same order.
df_NY_predictions["property_segments"] = k_4
df_NY_Orig_Pred["property_segments"] = k_4
df_NY_Orig_Pred.head()

Unnamed: 0,Address,City,Zip,SoldDate,Bathrooms,Bedrooms,LivingArea,LandArea,PropPrice,InflationRate,FederalInt,MonthID,property_segments
1881,58-37 Waldron Street,Corona,11368,2/15/24,6.0,6,3980,3000.0,1115000,3.1,5.33,2,2
1882,110-44 Corona Avenue,Flushing,11368,2/12/24,7.0,9,4590,1920.0,1100000,3.1,5.33,2,1
1883,108-34 50th Avenue,Flushing,11368,2/6/24,2.0,6,1890,2500.0,998000,3.1,5.33,2,2
1884,10440 Alstyne Ave,Flushing,11368,1/26/24,2.0,2,1162,2450.0,870000,3.09,5.33,1,3
1885,99-02 Christie Avenue UNIT 1A,Flushing,11368,1/10/24,1.0,1,597,33976.8,325000,3.09,5.33,1,3


In [106]:
# Scatter of Bedrooms against PropPrice for the zip 11368 (df_NY_Orig_Pred) sales,
# one color per cluster, with Address and SoldDate available on hover
df_NY_Orig_Pred.hvplot.scatter(
    x="Bedrooms", y="PropPrice", by="property_segments", hover_cols=["Address", "SoldDate"]
)
# TODO: write up the conclusion for this visual (the original note was left unfinished: "General")

In [103]:
import plotly.express as px

# 3D scatter of the original (unscaled) values: InflationRate, PropPrice and Bedrooms,
# colored by the assigned cluster
fig = px.scatter_3d(
    data_frame=df_NY_Orig_Pred,
    x='InflationRate',
    y='PropPrice',
    z='Bedrooms',
    color='property_segments',
)
fig.show()