In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import datasets
from sklearn.cluster import KMeans
from sklearn.metrics import classification_report
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Utility Functions

In [None]:
def load_data(filepath):
    """Read a CSV source into a DataFrame.

    Parameters
    ----------
    filepath
        Anything ``pd.read_csv`` accepts as its first argument (a path or a
        file-like object).

    Returns
    -------
    pandas.DataFrame
        The parsed contents, read with the default ``pd.read_csv`` options.
    """
    frame = pd.read_csv(filepath)
    return frame

def plot_data(df, x_col, y_col, title="Data Distribution"):
    """Show a scatter plot of one column of ``df`` against another.

    Parameters
    ----------
    df
        Frame holding both columns.
    x_col, y_col
        Column names; they are also used verbatim as the axis labels.
    title
        Text placed above the plot.

    The figure is displayed with ``plt.show()``; nothing is returned.
    """
    x_values, y_values = df[x_col], df[y_col]
    plt.scatter(x_values, y_values)

    # Decorate after plotting; the three calls are independent of each other.
    plt.title(title)
    plt.xlabel(x_col)
    plt.ylabel(y_col)

    plt.show()

def perform_kmeans(df, features, n_clusters=3):
    """Fit K-Means on the selected columns and store the labels on ``df``.

    Parameters
    ----------
    df
        Frame containing the ``features`` columns. It is modified in place:
        a ``'cluster'`` column with the predicted label of each row is
        added (or overwritten).
    features
        Column names used as clustering inputs.
    n_clusters
        Number of clusters to fit.

    Returns
    -------
    tuple
        ``(fitted KMeans estimator, df)`` where ``df`` is the same object
        that was passed in.
    """
    model = KMeans(random_state=42, n_init=10, n_clusters=n_clusters)
    assignments = model.fit_predict(df[features])
    df['cluster'] = assignments
    return model, df

def plot_clusters(df, features, km):
    """Scatter-plot each cluster in its own colour and mark the centroids.

    Parameters
    ----------
    df
        Frame with a ``'cluster'`` column holding integer labels in
        ``range(km.n_clusters)`` plus the columns named in ``features``.
    features
        Column names; the first two are used as the x and y axes.
    km
        Fitted estimator exposing ``n_clusters`` and ``cluster_centers_``.

    The figure is displayed with ``plt.show()``; nothing is returned.
    """
    # The first three entries match the original fixed colours. The palette
    # is longer and indexed modulo its length so that more than three
    # clusters no longer raise IndexError.
    palette = ['green', 'red', 'black', 'blue', 'orange',
               'cyan', 'magenta', 'brown', 'olive', 'gray']
    x_feature, y_feature = features[0], features[1]

    for i in range(km.n_clusters):
        cluster_data = df[df['cluster'] == i]
        plt.scatter(cluster_data[x_feature], cluster_data[y_feature],
                    color=palette[i % len(palette)], label=f'Cluster {i}')

    # NOTE(review): assumes km was fitted on the columns in `features`, in
    # that order, so centre columns 0 and 1 line up with the plotted axes.
    plt.scatter(km.cluster_centers_[:, 0], km.cluster_centers_[:, 1],
                color='purple', marker='*', label='Centroids')
    plt.xlabel(x_feature)
    plt.ylabel(y_feature)
    plt.legend()
    plt.title("Clustered Data")
    plt.show()

def scale_features(df, features):
    """Min-max scale the given columns of ``df`` in place.

    Parameters
    ----------
    df
        Frame whose ``features`` columns are overwritten with their
        ``MinMaxScaler`` transform (fitted on those same columns).
    features
        Column names to rescale.

    Returns
    -------
    The same ``df`` object, after modification.
    """
    rescaled = MinMaxScaler().fit_transform(df[features])
    df[features] = rescaled
    return df

def plot_elbow_method(df, features, k_range=(1, 10)):
    """Plot K-Means inertia against the number of clusters.

    Parameters
    ----------
    df
        Frame containing the ``features`` columns.
    features
        Column names used as clustering inputs.
    k_range
        Arguments unpacked into ``range(...)``; with the default, K takes
        the values 1 through 9.

    The figure is displayed with ``plt.show()``; nothing is returned.
    """
    candidate_ks = range(*k_range)
    data = df[features]

    # One fit per candidate K; inertia_ is the within-cluster sum of squares.
    sse = [
        KMeans(n_clusters=k, random_state=42, n_init=10).fit(data).inertia_
        for k in candidate_ks
    ]

    plt.plot(candidate_ks, sse, marker='o')
    plt.title('Elbow Method for Optimal K')
    plt.xlabel('Number of Clusters (K)')
    plt.ylabel('Sum of Squared Errors (SSE)')
    plt.show()

# Visualize the clustering obtained for a candidate number of clusters
def scatter_elbow(X, X_scaled, n_clusters):
    """
    Fit K-Means on ``X_scaled`` and scatter-plot the resulting clusters.

    Parameters
    ----------
    X
        Unused; kept so existing calls keep working.
    X_scaled
        2-D array-like of samples by features. Only the first two columns
        are plotted, but all columns are used for clustering.
    n_clusters
        Number of clusters to fit.

    The figure is displayed with ``plt.show()``; nothing is returned.
    """
    # Positional slicing below needs an ndarray; this also lets callers pass
    # a DataFrame without hitting a pandas indexing error.
    points = np.asarray(X_scaled)

    # n_init is set explicitly, like the other helpers in this notebook, so
    # the result does not depend on the scikit-learn version's default.
    kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
    labels = kmeans.fit_predict(points)

    plt.scatter(points[:, 0], points[:, 1], c=labels, cmap='rainbow', s=50, alpha=0.7)
    plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1], c='black', marker='X', s=200, label='Centroids')
    plt.title(f'Clustering with {n_clusters} Clusters')
    plt.xlabel('Feature 1 (Standardized)')
    plt.ylabel('Feature 2 (Standardized)')
    plt.legend()
    plt.show()

# Income Data Analysis

In [None]:
# Columns used both as plot axes and as clustering inputs.
features = ['Age', 'Income($)']
filepath = "../data/income.csv"

# Plot the raw values first so the effect of scaling is visible afterwards.
df = load_data(filepath)
plot_data(df, x_col=features[0], y_col=features[1], title="Initial Data Distribution")

# Bring both features onto a common scale before clustering.
df = scale_features(df, features)
plot_data(df, x_col=features[0], y_col=features[1], title="Scaled Data Distribution")

# Cluster with K=3 and show the assignment together with the centroids.
km, df = perform_kmeans(df, features, n_clusters=3)
plot_clusters(df, features, km)

# Inertia curve over the default range of K, to sanity-check the choice of 3.
plot_elbow_method(df, features)

# Iris Data Analysis

In [None]:
# Load the Iris measurements and the true species labels.
iris = datasets.load_iris()
X = iris.data
y = pd.DataFrame(iris.target, columns=['Targets'])

# n_init is set explicitly so the fit does not depend on the scikit-learn
# version's default number of initialisations.
clustering = KMeans(n_clusters=3, random_state=5, n_init=10).fit(X)
iris_df = pd.DataFrame(X, columns=['Sepal_Length', 'Sepal_Width', 'Petal_Length', 'Petal_Width'])

# K-Means cluster ids are arbitrary. Map each cluster to the species that is
# most common among its members instead of hard-coding a permutation that is
# only valid for one particular cluster numbering.
cluster_to_species = np.array([
    np.bincount(iris.target[clustering.labels_ == cluster_id], minlength=3).argmax()
    for cluster_id in range(clustering.n_clusters)
])
relabel = cluster_to_species[clustering.labels_].astype(np.int64)
color_theme = np.array(['red', 'green', 'blue'])

# Side-by-side comparison: true species (left) vs. K-Means assignment (right).
plt.figure(figsize=(12, 5))
plt.subplot(1, 2, 1)
plt.scatter(iris_df.Petal_Length, iris_df.Petal_Width, c=color_theme[y['Targets']], s=50)
plt.title('Ground Truth Classification')
plt.subplot(1, 2, 2)
plt.scatter(iris_df.Petal_Length, iris_df.Petal_Width, c=color_theme[relabel], s=50)
plt.title('K-Means Classification')
plt.show()
print(classification_report(y, relabel))

# Categorical Data Analysis

In [None]:

# NOTE(review): this path is relative to a different directory than the
# other datasets in this notebook ("../data/...") — confirm it is intended.
df = pd.read_csv('data/Categorical.csv')

# Encode the continent name as an integer code so it can be used for colouring.
df['continent_code'] = df['continent'].astype('category').cat.codes
var1, var2, var3 = 'Longitude', 'Latitude', 'continent_code'

plt.figure(figsize=(8, 6))
plt.scatter(df[var1], df[var2], c=df[var3], cmap='rainbow')
plt.xlabel(var1)
plt.ylabel(var2)
plt.title("Initial Data Distribution")
plt.show()

# NOTE(review): continent_code is a nominal label whose integer order is
# arbitrary, yet it is standardized and clustered on as a numeric feature —
# confirm this is intended.
df_scaled = StandardScaler().fit_transform(df[[var1, var2, var3]])

# Elbow curve: within-cluster sum of squares for K = 2..9.
wcss = []
for i in range(2, 10):
    # n_init is set explicitly, matching the helper functions, so the curve
    # does not depend on the scikit-learn version's default.
    kmeans = KMeans(n_clusters=i, random_state=42, n_init=10).fit(df_scaled)
    wcss.append(kmeans.inertia_)
plt.plot(range(2, 10), wcss, marker='o')
plt.xlabel('Number of Clusters')
plt.ylabel('Within-Cluster Sum of Squares (WCSS)')
plt.title('The Elbow Method')
plt.show()

# Penguins Data Analysis

In [None]:
# NOTE(review): this path climbs three directories, unlike the other
# datasets in this notebook — confirm it resolves from the notebook's folder.
df = pd.read_csv("../../../Datasets/penguins.csv")
features = ['bill_length_mm', 'bill_depth_mm']

# Drop rows with a missing bill measurement, then standardize both columns.
X = df[features].dropna()
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

plt.scatter(X['bill_length_mm'], X['bill_depth_mm'], alpha=0.6, c='gray')
plt.title('Initial Data Distribution')
plt.xlabel('Bill Length (mm)')
plt.ylabel('Bill Depth (mm)')
plt.show()

# n_init is set explicitly, matching the helper functions, so results do not
# depend on the scikit-learn version's default number of initialisations.
kmeans = KMeans(n_clusters=3, random_state=42, n_init=10).fit(X_scaled)
plt.scatter(X_scaled[:, 0], X_scaled[:, 1], c=kmeans.labels_, cmap='viridis', s=50, alpha=0.7)
plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1], c='red', marker='X', s=200, label='Centroids')
plt.title('K-Means Clustering')
plt.xlabel('Feature 1 (Standardized)')
plt.ylabel('Feature 2 (Standardized)')
plt.legend()
plt.show()

# Elbow curve: within-cluster sum of squares for K = 2..9.
wcss = [KMeans(n_clusters=k, random_state=42, n_init=10).fit(X_scaled).inertia_ for k in range(2, 10)]
plt.plot(range(2, 10), wcss, marker='o')
plt.title('Elbow Method for Optimal Clusters')
plt.xlabel('Number of Clusters')
plt.ylabel('Within-Cluster Sum of Squares')
plt.show()


# Compare the clusterings obtained for several candidate cluster counts
for n in range(3, 7):  # n takes the values 3, 4, 5, 6
    scatter_elbow(X, X_scaled, n_clusters=n)