# K-Means Clustering (scrapped for PCA)



## Imports

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
import os

## Examples of metrics that could be used



In [None]:
# Candidate metric names that could be used as clustering features,
# grouped by kind: general scores, proximity measures, and flags.
metrics = (
    ['walk_score', 'price']  # 'price' -> FAIRMARKETTOTAL (per the original note)
    + [
        'medical_proximity',
        'grocery_proximity',
        'park_proximity',
        'entertainment_proximity',
    ]
    + ['is_senior_living']
)

## DataFrame with Necessary Data and Features

In [23]:
# Resolve the input CSV relative to the parent of the current working directory,
# so the notebook has to be launched from a direct sub-folder of the project root.
BASE_DIR = os.path.dirname(os.getcwd())
cluster_csv = os.path.join(BASE_DIR, 'backend', 'datasets', 'finalpt5.csv')

# Load the CSV, key every row by its property address, and keep only the
# four columns that are used for clustering.
df = (
    pd.read_csv(cluster_csv)
    .set_index('PROPERTYADDRESS')
    [['STORIES', 'CONDITION', 'TOTALROOMS', 'FINISHEDLIVINGAREA']]
)

# Preview the first rows as the cell output.
df.head()

Unnamed: 0_level_0,STORIES,CONDITION,TOTALROOMS,FINISHEDLIVINGAREA
PROPERTYADDRESS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
920 ROTHPLETZ ST,2.0,3.0,8.0,1460.0
3343 HARBISON ST,1.0,3.0,5.0,918.0
3547 CALIFORNIA AVE,2.0,3.0,10.0,4598.0
1325 GEYER AVE,2.0,3.0,7.0,1910.0
3457 MCCLURE AVE,2.0,3.0,6.0,1673.0


## K-Means Clustering

In [28]:
# Columns used as clustering features. Selecting them explicitly keeps this cell
# re-runnable after helper columns (e.g. 'cluster' or 'category') were added to df.
feature_columns = ['STORIES', 'CONDITION', 'TOTALROOMS', 'FINISHEDLIVINGAREA']

# Perform K-Means clustering.
# random_state pins the centroid initialisation so a fresh "Restart & Run All"
# reproduces the same clusters; n_init is set explicitly because its default
# differs between scikit-learn releases.
# NOTE(review): the features are not scaled, so FINISHEDLIVINGAREA (values in the
# thousands) presumably dominates the distance computation -- confirm this is intended.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42).fit(df[feature_columns])

# K-Means label ids are arbitrary. Renumber them by ascending living-area centroid
# so that 0 is always the smallest-home cluster and 2 the largest, which is what the
# Small/Medium/Large mapping further down relies on.
# (kmeans.labels_ / kmeans.predict still use the raw, un-renumbered ids.)
area_centers = kmeans.cluster_centers_[:, feature_columns.index('FINISHEDLIVINGAREA')]
size_rank = np.argsort(np.argsort(area_centers))  # raw label id -> size rank

#TODO: Have DataFrame of predictive features, add cluster labels
df['cluster'] = size_rank[kmeans.labels_]

# Find averages of each feature that falls under each cluster to understand categories
cluster_summary = df.groupby('cluster')[feature_columns].mean()
print(cluster_summary)

          STORIES  CONDITION  TOTALROOMS  FINISHEDLIVINGAREA
cluster                                                     
0        1.756759   3.259618    5.567609         1250.834295
1        2.099275   3.224451    7.339082         2170.721639
2        2.298679   3.134242    9.971234         3982.784573


## Map: Clusters -> Categories

In [29]:
# Category names ordered from the smallest to the largest homes.
size_categories = ['Small', 'Medium', 'Large']

# Mapping dictionary: cluster label -> category.
# K-Means label ids carry no inherent order, so instead of hardcoding
# {0: 'Small', 1: 'Medium', 2: 'Large'} the clusters are ranked by their mean
# finished living area and paired with the category names in that order.
clusters_by_area = (
    df.groupby('cluster')['FINISHEDLIVINGAREA']
    .mean()
    .sort_values()
    .index
    .tolist()
)

# zip() would silently truncate on a length mismatch and leave NaN categories behind.
assert len(clusters_by_area) == len(size_categories), (
    'number of clusters does not match number of category names'
)
category_mapping = dict(zip(clusters_by_area, size_categories))

# Assign categories based on the determined cluster categories
df['category'] = df['cluster'].map(category_mapping)

# Remove cluster label column (numeric labels).
# After this line the cell cannot be re-run without re-running the clustering cell first.
df = df.drop(columns='cluster')

## Export Categorized Data

In [31]:

# Create pkl file path (relative to the parent of the current working directory)
BASE_DIR = os.path.dirname(os.getcwd())
categorized_pkl = os.path.join(BASE_DIR, 'backend', 'datasets', 'output_categorized_data.pkl')

# Output DataFrame to a pkl file.
# The target has a .pkl extension, so it is written with to_pickle; the previous
# to_csv call stored CSV text under a pickle name, which pd.read_pickle cannot load.
# NOTE(review): any consumer that was reading this file with pd.read_csv must
# switch to pd.read_pickle.
df.to_pickle(categorized_pkl)