In [6]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the cleaned dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('../data/cleanedData.csv')

In [5]:
# List the column names available in the loaded frame.
df.columns

Index(['psyte_PSYTECategoryCode', 'psyte_PSYTEGroupCode', 'psyte_censusBlock',
       'psyte_censusBlockGroup', 'psyte_censusBlockPopulation',
       'psyte_censusBlockHouseholds', 'psyte_PSYTESegmentCode.description',
       'psyte_householdIncomeVariable.value',
       'psyte_householdIncomeVariable.description',
       'psyte_propertyValueVariable.value',
       'psyte_propertyValueVariable.description',
       'psyte_propertyTenureVariable.value',
       'psyte_propertyTenureVariable.description',
       'psyte_propertyTypeVariable.value',
       'psyte_propertyTypeVariable.description',
       'psyte_urbanRuralVariable.value',
       'psyte_urbanRuralVariable.description', 'coastal_preciselyID',
       'coastal_waterbodyName', 'coastal_nearestWaterbodyCounty',
       'coastal_nearestWaterbodyState', 'coastal_nearestWaterbodyAdjacentName',
       'coastal_nearestWaterbodyAdjacentType',
       'coastal_distanceToNearestCoastFeet',
       'coastal_nearestWaterbodyType.value',
       

In [9]:
def prepare_features(df):
    """
    Drop the ID/unneeded columns and split the remaining columns by dtype.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table. It is not modified; a new frame is returned.

    Returns
    -------
    df : pandas.DataFrame
        The input without any of the columns named in ``columns_to_drop``.
    numerical_columns : pandas.Index
        Names of all numeric columns, whatever their width (int32, int64,
        float32, float64, ...).
    categorical_columns : pandas.Index
        Names of the ``object`` and ``category`` columns.

    Notes
    -----
    Columns of any other dtype (for example ``bool``) stay in the returned
    frame but are listed in neither index.
    """
    # Remove ID columns and other unnecessary columns
    columns_to_drop = [
        'psyte_preciselyID', 'coastal_preciselyID', 'flood_preciselyID', 'flood_floodID',
        'PBKEY', 'ParcelID', 'BuildingID', 'Geometry', 'GEOID', 'FIPS',
        'ADD_NUMBER', 'STREETNAME', 'CITY', 'STATE', 'ZIPCODE', 'PLUS4'
    ]

    # errors='ignore' skips any listed name that is absent from this frame.
    df = df.drop(columns=columns_to_drop, errors='ignore')

    # Select by dtype family rather than by exact 64-bit dtypes, so narrower
    # numeric columns and pandas categoricals are not silently left out.
    numerical_columns = df.select_dtypes(include='number').columns
    categorical_columns = df.select_dtypes(include=['object', 'category']).columns

    return df, numerical_columns, categorical_columns

def create_preprocessing_pipeline(numerical_columns, categorical_columns):
    """
    Build the (unfitted) ColumnTransformer that preprocesses the feature table.

    The columns named in ``numerical_columns`` go through a StandardScaler;
    the columns named in ``categorical_columns`` go through a OneHotEncoder
    configured with ``drop='first'``, dense output and
    ``handle_unknown='ignore'``.
    """
    # Numerical branch: a single scaling step.
    scaling_steps = [('scaler', StandardScaler())]

    # Categorical branch: a single one-hot encoding step.
    encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False, drop='first')
    encoding_steps = [('onehot', encoder)]

    return ColumnTransformer(transformers=[
        ('num', Pipeline(steps=scaling_steps), numerical_columns),
        ('cat', Pipeline(steps=encoding_steps), categorical_columns),
    ])

def perform_pca_analysis(X_transformed, n_components=0.95):
    """
    Fit a PCA on ``X_transformed`` and project the data onto its components.

    ``n_components`` is passed to ``PCA`` unchanged. Returns a tuple of
    (projected data, fitted PCA object, cumulative sum of the PCA's
    ``explained_variance_ratio_``).
    """
    reducer = PCA(n_components=n_components)
    projected = reducer.fit_transform(X_transformed)

    # Running total of the per-component explained variance ratios.
    running_variance = np.cumsum(reducer.explained_variance_ratio_)

    return projected, reducer, running_variance

def determine_optimal_clusters(X_pca, max_clusters=10, random_state=42,
                               silhouette_sample_size=None):
    """
    Fit KMeans for k = 2 .. max_clusters and record inertia and silhouette score.

    Parameters
    ----------
    X_pca : array-like
        Data to cluster; passed as-is to ``KMeans.fit`` and ``silhouette_score``.
    max_clusters : int, default 10
        Largest number of clusters to try (inclusive). Values below 2 give an
        empty range and empty result lists.
    random_state : int or None, default 42
        Seed forwarded to every ``KMeans`` and to ``silhouette_score``.
    silhouette_sample_size : int or None, default None
        Forwarded to ``silhouette_score`` as ``sample_size``. ``None`` scores
        all samples (the original behaviour); an integer scores a random
        subset of that size instead, which can be used to reduce the scoring
        cost on large inputs.

    Returns
    -------
    tuple
        ``(range(2, max_clusters + 1), inertias, silhouette_scores)`` where the
        two lists hold one value per k, in the same order as the range.
    """
    candidate_ks = range(2, max_clusters + 1)
    inertias = []
    silhouette_scores = []

    for k in candidate_ks:
        kmeans = KMeans(n_clusters=k, random_state=random_state)
        kmeans.fit(X_pca)
        inertias.append(kmeans.inertia_)
        silhouette_scores.append(
            silhouette_score(
                X_pca,
                kmeans.labels_,
                sample_size=silhouette_sample_size,
                random_state=random_state,
            )
        )

    return candidate_ks, inertias, silhouette_scores

def plot_cluster_analysis(n_clusters, inertias, silhouette_scores):
    """
    Draw the two cluster-count diagnostics side by side and return the figure.

    The left panel plots ``inertias`` against ``n_clusters`` (elbow curve);
    the right panel plots ``silhouette_scores`` against ``n_clusters``.
    """
    fig, axes = plt.subplots(1, 2, figsize=(15, 5))

    # One entry per panel, left to right: (values, line format, y label, title).
    panels = (
        (inertias, 'bo-', 'Inertia', 'Elbow Method'),
        (silhouette_scores, 'ro-', 'Silhouette Score', 'Silhouette Analysis'),
    )
    for ax, (values, line_format, y_label, title) in zip(axes, panels):
        ax.plot(n_clusters, values, line_format)
        ax.set_xlabel('Number of Clusters (k)')
        ax.set_ylabel(y_label)
        ax.set_title(title)

    plt.tight_layout()
    return fig

def perform_clustering_analysis(df):
    """
    Run the complete clustering analysis on ``df``.

    Steps: select features, preprocess them, reduce with PCA, scan candidate
    cluster counts, plot the diagnostics, then fit a final KMeans using the
    cluster count with the highest silhouette score.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table, passed to ``prepare_features``.

    Returns
    -------
    dict
        ``cluster_labels``            labels from the final KMeans fit
        ``pca``                       fitted PCA object
        ``cumulative_variance_ratio`` cumulative explained variance of the PCA
        ``optimal_k``                 cluster count with the highest silhouette score
        ``silhouette_scores``         silhouette score per candidate k
        ``preprocessor``              fitted preprocessing ColumnTransformer
        ``X_pca``                     PCA-projected data that was clustered
        ``kmeans``                    fitted final KMeans model
        ``inertias``                  inertia per candidate k
        ``figure``                    matplotlib figure with the diagnostic plots

    Notes
    -----
    Creates a matplotlib figure as a side effect.
    """
    # Prepare features
    df, numerical_columns, categorical_columns = prepare_features(df)

    # Create and fit preprocessing pipeline
    preprocessor = create_preprocessing_pipeline(numerical_columns, categorical_columns)
    X_transformed = preprocessor.fit_transform(df)

    # Perform PCA
    X_pca, pca, cumulative_variance_ratio = perform_pca_analysis(X_transformed)

    # Determine optimal number of clusters
    n_clusters, inertias, silhouette_scores = determine_optimal_clusters(X_pca)

    # Plot cluster analysis
    fig = plot_cluster_analysis(n_clusters, inertias, silhouette_scores)

    # Find optimal number of clusters: the k whose silhouette score is highest
    optimal_k = n_clusters[np.argmax(silhouette_scores)]

    # Perform final clustering with optimal k
    final_kmeans = KMeans(n_clusters=optimal_k, random_state=42)
    cluster_labels = final_kmeans.fit_predict(X_pca)

    return {
        'cluster_labels': cluster_labels,
        'pca': pca,
        'cumulative_variance_ratio': cumulative_variance_ratio,
        'optimal_k': optimal_k,
        'silhouette_scores': silhouette_scores,
        'preprocessor': preprocessor,
        'X_pca': X_pca,
        # Previously computed but discarded; returned so callers can inspect
        # the fitted model, the elbow data and the diagnostic figure.
        'kmeans': final_kmeans,
        'inertias': inertias,
        'figure': fig,
    }
# Run the complete analysis on the loaded frame
results = perform_clustering_analysis(df)

# Pull out the pieces needed for the scatter plot
cluster_labels, optimal_k, X_pca = (
    results[key] for key in ('cluster_labels', 'optimal_k', 'X_pca')
)

# Scatter of the first two PCA components, coloured by cluster label
plt.figure(figsize=(10, 8))
scatter = plt.scatter(
    X_pca[:, 0],
    X_pca[:, 1],
    c=cluster_labels,
    cmap='viridis',
)
plt.colorbar(scatter)
plt.title(f'Clustering Results (k={optimal_k})')
plt.xlabel('First PCA Component')
plt.ylabel('Second PCA Component')
plt.show()

: 