In [2]:
# Initial imports
import pandas as pd
import hvplot.pandas
from path import Path
import plotly.express as px
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

In [3]:
# Load the dataset
file_path = "formatted_housedata.csv"
housedata_df = pd.read_csv(file_path,index_col=0)
housedata_df.head(10)

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,SoldAge,RenovatedAge,city_rank
0,221900,3,1.0,1180,5650,1.0,0,0,3,7,1180,0,59,2014,13.0
1,538000,3,2.25,2570,7242,2.0,0,0,3,7,2170,400,63,23,13.0
2,180000,2,1.0,770,10000,1.0,0,0,3,6,770,0,82,2015,14.0
3,604000,4,3.0,1960,5000,1.0,0,0,5,7,1050,910,49,2014,13.0
4,510000,3,2.0,1680,8080,1.0,0,0,3,8,1680,0,28,2015,4.0
5,1230000,4,4.5,5420,101930,1.0,0,0,3,11,3890,1530,13,2014,5.0
6,257500,3,2.25,1715,6819,2.0,0,0,3,7,1715,0,19,2014,24.0
7,291850,3,1.5,1060,9711,1.0,0,0,3,7,1060,0,52,2015,13.0
8,229500,3,1.0,1780,7470,1.0,0,0,3,7,1050,730,55,2015,13.0
9,323000,3,2.5,1890,6560,2.0,0,0,3,7,1890,0,12,2015,20.0


In [46]:
# Calculate Pearsons Correlation coefficient
housedata_corr_full = housedata_df.corr(method = "pearson")
housedata_corr = housedata_corr_full[["price"]].copy()
housedata_corr["price"] = abs(housedata_corr["price"])
housedata_corr = housedata_corr.drop(index="price")
housedata_corr = housedata_corr.sort_values(by=['price'],ascending=False)
# Select features that have correlation greater that 0.5
sel_feature_pearsons = housedata_corr[housedata_corr["price"]>0.65].index.values.tolist()
sel_feature_pearsons

['sqft_living', 'grade']

In [47]:
# Create X datasets 
X = housedata_df[sel_feature_pearsons]

In [48]:
# Standardize the data with StandardScaler().
X_scaled = StandardScaler().fit_transform(X)
X_scaled

array([[-0.98213508, -0.56355038],
       [ 0.53017277, -0.56355038],
       [-1.4282115 , -1.41517294],
       ...,
       [-1.15621368, -0.56355038],
       [-0.52517875,  0.28807218],
       [-1.15621368, -0.56355038]])

In [49]:
# # Using PCA to reduce dimension to three principal components.
# # Initialize PCA model
# pca = PCA(n_components=3)
# # Get three principal components for the data.
# housedata_pca = pca.fit_transform(X_scaled)
# housedata_pca

In [27]:
# # Create a DataFrame with the three principal components.
# pcs_df = pd.DataFrame(data = housedata_pca,columns = ["PC1","PC2","PC3"],index=housedata_df.index)
# pcs_df.head(10)

Unnamed: 0,PC1,PC2,PC3
0,-1.981746,-0.219051,-0.335338
1,0.14973,-0.080047,0.108913
2,-3.164777,0.040855,-0.498105
3,0.234337,1.254805,0.808079
4,-0.285303,-0.587613,-1.557068
5,6.348546,0.4555,-0.234809
6,-1.014019,-0.650793,1.484383
7,-1.828758,-0.21043,-0.254126
8,-1.416258,1.075874,-0.047947
9,-0.580878,-0.715939,0.97024


In [50]:
# Create an elbow curve to find the best value for K.
inertia = []
k = list(range(1, 11))

# Calculate the inertia for the range of K values
for i in k:
    km = KMeans(n_clusters=i, random_state=0)
#     km.fit(pcs_df)
    km.fit(X_scaled)
    inertia.append(km.inertia_)

# Create the elbow curve
elbow_data = {"k": k, "inertia": inertia}
df_elbow = pd.DataFrame(elbow_data)
df_elbow.hvplot.line(x="k", y="inertia", xticks=k, title="Elbow Curve")

In [51]:
# Initialize the K-Means model.
model = KMeans(n_clusters=3, random_state=0)

# Fit the model
# model.fit(pcs_df)
model.fit(X_scaled)


# Predict clusters
predictions = model.predict(X_scaled)
predictions

array([0, 2, 0, ..., 0, 2, 0])

In [52]:
# Create a new DataFrame including predicted clusters and housefeatures features.
# clustered_df = pd.concat([housedata_df,pcs_df],axis=1, join='inner').reindex(housedata_df.index)

#  Add a new column, "Class" to the clustered_df DataFrame that holds the predictions.
clustered_df["Class"] = model.labels_

# Print the shape of the clustered_df
print(clustered_df.shape)
clustered_df.head(10)

(21436, 19)


Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,SoldAge,RenovatedAge,city_rank,PC1,PC2,PC3,Class
0,221900,3,1.0,1180,5650,1.0,0,0,3,7,1180,0,59,2014,13.0,-1.981746,-0.219051,-0.335338,0
1,538000,3,2.25,2570,7242,2.0,0,0,3,7,2170,400,63,23,13.0,0.14973,-0.080047,0.108913,2
2,180000,2,1.0,770,10000,1.0,0,0,3,6,770,0,82,2015,14.0,-3.164777,0.040855,-0.498105,0
3,604000,4,3.0,1960,5000,1.0,0,0,5,7,1050,910,49,2014,13.0,0.234337,1.254805,0.808079,0
4,510000,3,2.0,1680,8080,1.0,0,0,3,8,1680,0,28,2015,4.0,-0.285303,-0.587613,-1.557068,2
5,1230000,4,4.5,5420,101930,1.0,0,0,3,11,3890,1530,13,2014,5.0,6.348546,0.4555,-0.234809,1
6,257500,3,2.25,1715,6819,2.0,0,0,3,7,1715,0,19,2014,24.0,-1.014019,-0.650793,1.484383,0
7,291850,3,1.5,1060,9711,1.0,0,0,3,7,1060,0,52,2015,13.0,-1.828758,-0.21043,-0.254126,0
8,229500,3,1.0,1780,7470,1.0,0,0,3,7,1050,730,55,2015,13.0,-1.416258,1.075874,-0.047947,0
9,323000,3,2.5,1890,6560,2.0,0,0,3,7,1890,0,12,2015,20.0,-0.580878,-0.715939,0.97024,0


In [66]:
# Creating a 3D-Scatter with the PCA data and the clusters
fig = px.scatter_3d(
    clustered_df,
    x="sqft_living",
    y="grade",
    z="price",
    custom_data=['bedrooms', 'sqft_living', 'bathrooms', 'floors', 'city_rank', 'Class'],
    color="Class",
    symbol="Class",


)
fig.update_traces(hovertemplate='Cluster: %{customdata[5]} <br>Price: $%{z:,.2f} <br>Sqft_Living: %{x} sqft <br>Bedrooms: %{customdata[0]} <br>grade: %{y:,f} <br>Bathrooms: %{customdata[2]} <br>Floors: %{customdata[3]} <br>City Rank: %{customdata[4]}')
fig.update_layout(title='House Clusters based on Sqft Living & Grade', legend=dict(x=0, y=1), margin=dict(l=0, r=0, b=0, t=0), scene=dict(xaxis_title="SQFT Living", yaxis_title="Grade", zaxis_title="Price"))
fig.update_layout(
    title={
        'y':0.98,
        'x':0.5,
        'xanchor': 'center',
        'yanchor': 'top'})
fig.show()


In [67]:
# Write 3D-Scatter to an HTML file
fig.write_html('static/styles/chart.html',
                full_html=False,
                include_plotlyjs='cdn')