In [None]:
import pandas as pd
import numpy as np
import pathlib as Path
import datetime as dt
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

In [None]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced

In [None]:
# Read the public-record workbook into a DataFrame and display it.
pr_workbook = "Resources/FullMLS_PublicRecordData/Public_Record_Official_12_5_22.xlsx"
df_pr = pd.read_excel(pr_workbook)
df_pr

In [None]:
# Print the full column-by-column summary of the loaded frame
# (verbose=True forces every column to be listed rather than a truncated view).
df_pr.info(verbose=True)

In [None]:
# Drop unnecessary columns (owner names, mailing flags and descriptive fields).
# Each label appears exactly once; DataFrame.drop raises KeyError if any of
# them is missing from the frame.
columns_to_drop = [
    "OwnerNames",
    "PropertyCityState",
    "OwnerLastName",
    "OwnerFirstName",
    "Owner2LastName",
    "Owner2FirstName",
    "Owner3LastName",
    "Owner3FirstName",
    "Owner4LastName",
    "Owner4FirstName",
    "OwnerCareOf",
    "OwnerDoNotMail",
    "QualCode",
    "CensusTractBlock",
    "LandUse",
    "CountyLandDesc",
    "PoolType",
    "PropertyAddressFormatted",
]
df_pr = df_pr.drop(columns=columns_to_drop)
df_pr

In [None]:
# Count the distinct (non-null) values in each remaining column.
df_pr.nunique()

In [None]:
# Remove four more unneeded columns: remodel year, tax year and the owner ZIP fields.
more_columns_to_drop = ["YearRemod", "TaxYear", "OwnerZipCode", "OwnerZip4"]
df_pr = df_pr.drop(columns=more_columns_to_drop)
df_pr

In [None]:
# Print the full column-by-column summary again, now that columns have been dropped.
df_pr.info(verbose=True)

In [None]:
# Collect the names of all columns whose dtype compares equal to "object";
# these are treated as the categorical variables.
obj_list = [
    column_name
    for column_name, column_dtype in df_pr.dtypes.items()
    if column_dtype == "object"
]
obj_list

In [None]:
# Re-count the distinct (non-null) values per column on the reduced frame.
df_pr.nunique()

In [None]:
# One-hot encode the text features with get_dummies(); every other column is
# passed through unchanged.
# NOTE(review): several entries look like identifiers or free text
# ('MLS Number', 'TaxID', 'TaxIDAlt', 'OwnerAddress'). If they are close to
# unique per row, encoding them adds roughly one column per record -- confirm
# this is intended.
text_features = [
    "MLS Number",
    "CarrierRoute",
    "PropDoNotMail",
    "OwnerAddress",
    "OwnerCityState",
    "OwnerCarrierRoute",
    "Municipality",
    "SubdivisionNeighborhood",
    "TaxID",
    "TaxIDAlt",
    "Block",
    "Lot",
    "SchoolDistrict",
    "SaleType",
    "PropertyClass",
    "CondoYN",
    "LotShape",
    "Zoning",
    "Exterior",
    "BsmtDesc",
    "GrgType",
    "HeatDelivery",
]
X = pd.get_dummies(df_pr, columns=text_features)
X.head(10)

In [None]:
# Standardize the encoded feature matrix with StandardScaler().
# Note: df_pr is rebound here to the scaler's output, replacing the DataFrame
# that previously carried this name.
scaler = StandardScaler()
df_pr = scaler.fit_transform(X)
df_pr[:5]

In [None]:
# Use PCA to reduce the standardized data to two principal components.
# random_state pins the result when scikit-learn selects a randomized SVD
# solver, in line with the fixed seed used for the KMeans models.
pca = PCA(n_components=2, random_state=0)

pr_pca = pca.fit_transform(df_pr)

In [None]:
# Wrap the PCA output in a DataFrame with one labelled column per component.
pc_labels = ["principal component 1", "principal component 2"]
df_pr_pca = pd.DataFrame(pr_pca, columns=pc_labels)
df_pr_pca

In [None]:
# Fraction of the total variance explained by each of the fitted principal components.
pca.explained_variance_ratio_

In [None]:
# Find the best value for K with the elbow method.
# Cluster on the two principal-component columns only, so this cell gives the
# same result even after a "class" label column has been added to df_pr_pca.
pc_features = df_pr_pca[["principal component 1", "principal component 2"]]

inertia = []
k = list(range(1, 11))

# Calculate the inertia for the range of K values
for i in k:
    km = KMeans(n_clusters=i, random_state=0)
    km.fit(pc_features)
    inertia.append(km.inertia_)

# Create the elbow curve
elbow_data = {"k": k, "inertia": inertia}
df_elbow = pd.DataFrame(elbow_data)

# Plot with matplotlib (already imported as plt): the DataFrame.hvplot accessor
# only exists after `import hvplot.pandas`, which the visible cells never run.
fig, ax = plt.subplots()
ax.plot(df_elbow["k"], df_elbow["inertia"], marker="o")
ax.set_xticks(k)
ax.set(xlabel="k", ylabel="inertia", title="Elbow Curve")
plt.show()

In [None]:
# Initialize the K-means model
model = KMeans(n_clusters=3, random_state=0)

# Select the principal-component columns explicitly. This cell writes a
# "class" column into df_pr_pca below, so fitting on the whole frame would
# feed the previous labels back in as a feature whenever the cell is re-run.
pc_features = df_pr_pca[["principal component 1", "principal component 2"]]

# Fit the model
model.fit(pc_features)

# Predict clusters
predictions = model.predict(pc_features)

# Add the predicted class column
df_pr_pca["class"] = model.labels_
df_pr_pca.head()

In [None]:
# Scatter plot of the two principal components, one series per cluster label.
# Drawn with matplotlib (already imported as plt): the DataFrame.hvplot
# accessor only exists after `import hvplot.pandas`, which the visible cells
# never run.
fig, ax = plt.subplots()
for cluster_label, cluster_points in df_pr_pca.groupby("class"):
    ax.scatter(
        cluster_points["principal component 1"],
        cluster_points["principal component 2"],
        label=str(cluster_label),
        s=10,
    )
ax.set(xlabel="principal component 1", ylabel="principal component 2")
ax.legend(title="class")
plt.show()