In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import classification_report
from sklearn import datasets

# Utility Functions

In [None]:
def load_data(filepath):
    """Read the CSV at ``filepath`` and return its contents as a DataFrame.

    ``filepath`` is handed straight to ``pd.read_csv``, so anything that
    function accepts (a path string, a file-like object, ...) works here.
    """
    frame = pd.read_csv(filepath)
    return frame

def preprocess(df):
    """Return a cleaned copy of ``df``; the input frame itself is not modified.

    Exact duplicate rows are dropped first, then every row that still
    contains at least one missing value is dropped.
    """
    return df.drop_duplicates().dropna()

def scale_features(df, type="minmax"):
    """Scale every column of ``df`` with the requested scikit-learn scaler.

    Parameters
    ----------
    df : DataFrame or array-like
        Data to scale. All columns are passed to the scaler, so select the
        numeric feature columns before calling.
    type : str
        ``"minmax"`` uses ``MinMaxScaler``; ``"standard"`` uses
        ``StandardScaler``. (The name shadows the builtin; it is kept because
        callers pass it as a keyword.)

    Returns
    -------
    DataFrame or ndarray
        When ``df`` is a DataFrame, a new DataFrame with the same columns and
        index, so the result can still be indexed by column name. Otherwise
        the scaler's output is returned unchanged.

    Raises
    ------
    ValueError
        If ``type`` is not one of the supported names. Previously this case
        fell through and returned ``None``, which only failed later and far
        from the real mistake.
    """
    scaler_classes = {"minmax": MinMaxScaler, "standard": StandardScaler}
    # The isinstance check comes first so an unhashable argument (e.g. a list
    # of feature names passed by mistake) produces a clear error as well.
    if not isinstance(type, str) or type not in scaler_classes:
        raise ValueError(
            f"Unknown scaler type {type!r}; expected one of {sorted(scaler_classes)}"
        )

    scaled = scaler_classes[type]().fit_transform(df)

    if isinstance(df, pd.DataFrame):
        # Re-attach the labels that the scaler output does not carry.
        return pd.DataFrame(scaled, columns=df.columns, index=df.index)
    return scaled

def plot_data(df, x_col, y_col,c_var,title="Data Distribution"):
    """Scatter ``df[x_col]`` against ``df[y_col]``, coloured by ``df[c_var]``.

    Points are coloured with the 'viridis' colormap. The axes are labelled
    with the column names, ``title`` is used as the plot title, and the
    figure is shown immediately.
    """
    xs = df[x_col]
    ys = df[y_col]
    color_values = df[c_var]
    plt.scatter(xs, ys, c=color_values, cmap='viridis')
    plt.xlabel(x_col)
    plt.ylabel(y_col)
    plt.title(title)
    plt.show()

def plot_elbow_method(df, features, k_range=(1, 10)):
    """Plot K-Means inertia (SSE) against the number of clusters.

    ``k_range`` is unpacked into ``range(...)``, so the upper bound is
    exclusive: the default ``(1, 10)`` tries K = 1..9. One model is fitted on
    ``df[features]`` for each K, and the figure is shown immediately.
    """
    candidate_ks = range(*k_range)
    inertias = [
        KMeans(n_clusters=k, random_state=42, n_init=10).fit(df[features]).inertia_
        for k in candidate_ks
    ]
    plt.plot(candidate_ks, inertias, marker='o')
    plt.xlabel('Number of Clusters (K)')
    plt.ylabel('Sum of Squared Errors (SSE)')
    plt.title('Elbow Method for Optimal K')
    plt.show()

def perform_kmeans(df, features, n_clusters=3):
    """Fit K-Means on ``df[features]`` and record each row's cluster label.

    The labels are written into ``df`` itself as a new ``'cluster'`` column
    (the caller's frame is modified in place).

    Returns a ``(fitted_model, df)`` tuple, where ``df`` is the same object
    that was passed in.
    """
    model = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
    labels = model.fit_predict(df[features])
    df['cluster'] = labels
    return model, df

def plot_clusters(df, features, km):
    """Scatter the first two ``features`` coloured by ``df['cluster']``.

    Each cluster gets its own colour from the rainbow colormap. The first two
    coordinates of ``km.cluster_centers_`` are overlaid as purple stars, and
    the figure is shown immediately.
    """
    x_name, y_name = features[0], features[1]
    palette = plt.cm.rainbow(np.linspace(0, 1, km.n_clusters))

    for cluster_id, shade in enumerate(palette):
        members = df[df['cluster'] == cluster_id]
        plt.scatter(members[x_name], members[y_name], color=shade, label=f'Cluster {cluster_id}')

    centers = km.cluster_centers_
    plt.scatter(centers[:, 0], centers[:, 1], color='purple', marker='*', label='Centroids')

    plt.xlabel(x_name)
    plt.ylabel(y_name)
    plt.legend()
    plt.title("Clustered Data")
    plt.show()


# Iris Data Analysis

In [None]:
# Load the bundled iris dataset: X holds the raw measurement matrix, y the
# class labels as a one-column frame, and iris_df the measurements with
# readable column names.
iris = datasets.load_iris()
X = iris.data
iris_df = pd.DataFrame(
    X,
    columns=['Sepal_Length', 'Sepal_Width', 'Petal_Length', 'Petal_Width'],
)
y = pd.DataFrame({'Targets': iris.target})

In [None]:
# Preview the first rows of the measurement table to check the column names.
iris_df.head()
# y.head()

In [None]:
# Fit K-Means clustering on all four measurements. n_init is set explicitly
# (matching the helper functions above) so the result does not depend on the
# installed scikit-learn version's default.
clustering = KMeans(n_clusters=3, random_state=5, n_init=10).fit(iris_df)

# Relabel clusters to match the ground truth order.
# K-Means numbers its clusters arbitrarily, so cluster 0 need not be class 0.
# Map every cluster to the true class that is most common among its members;
# the previous np.choose(labels, [0, 1, 2]) was an identity mapping and left
# the labels unaligned.
true_labels = y['Targets'].to_numpy()
cluster_to_class = np.array([
    np.bincount(true_labels[clustering.labels_ == cluster_id], minlength=3).argmax()
    for cluster_id in range(clustering.n_clusters)
])
relabel = cluster_to_class[clustering.labels_].astype(np.int64)

# Define color mapping for correct classifications
color_theme = np.array(['red', 'green', 'blue'])

# Identify misclassified points (drawn in black in the third panel)
misclassified = relabel != true_labels
colors_with_misclassification = np.where(misclassified, 'black', color_theme[relabel])

# Create the plots: ground truth, K-Means result, and misclassified points
fig, axes = plt.subplots(1, 3, figsize=(18, 5))
panels = [
    ('Ground Truth Classification', color_theme[true_labels]),
    ('K-Means Classification', color_theme[relabel]),
    ('Misclassified Points Highlighted (Black)', colors_with_misclassification),
]
for ax, (panel_title, point_colors) in zip(axes, panels):
    ax.scatter(iris_df.Petal_Length, iris_df.Petal_Width, c=point_colors, s=50)
    ax.set_xlabel('Petal_Length')
    ax.set_ylabel('Petal_Width')
    ax.set_title(panel_title)

plt.show()

In [None]:
# Precision / recall / F1 of `relabel` measured against the targets in `y`.
iris_report = classification_report(y, relabel)
print(iris_report)

# Categorical Data Analysis

In [None]:
# Pipeline
df_raw = load_data('data/Categorical.csv')

# EDA: a bare expression in the middle of a cell is not displayed, so print it.
print(df_raw.head())

# Prepare data: drop duplicate / incomplete rows and keep the result (the
# return value of preprocess() was previously discarded), then encode the
# continent as integer codes so it can colour the plots and act as a feature.
df_clean = preprocess(df_raw)
df_clean = df_clean.assign(
    continent_code=df_clean['continent'].astype('category').cat.codes
)

# Columns to focus on.
# NOTE(review): the earlier placeholder colour column 'OTHER' is replaced by
# 'continent_code' -- presumably 'OTHER' is not a real column; confirm.
# NOTE: `y` is rebound to a column name here, shadowing the iris target frame.
x, y, color = 'Longitude', 'Latitude', 'continent_code'
features = [x, y, color]  # or all numericals

# Plot the cleaned, unscaled data
plot_data(df_clean, x, y, color, title="Raw Data")

# Scale only the numeric feature columns (the raw frame also holds strings).
# Wrapping in pd.DataFrame keeps the column labels needed by df[features].
df_minmax = pd.DataFrame(
    scale_features(df_clean[features], type='minmax'),
    columns=features,
    index=df_clean.index,
)
df_standard = pd.DataFrame(
    scale_features(df_clean[features], type='standard'),
    columns=features,
    index=df_clean.index,
)

# Plot each scaled variant (previously the unscaled frame was plotted twice)
plot_data(df_minmax, x, y, color, title="Min-Max Scaled Data")
plot_data(df_standard, x, y, color, title="Standard Scaled Data")

df = df_minmax
plot_elbow_method(df, features)

km, df = perform_kmeans(df, features, n_clusters=3)
plot_clusters(df, features, km)

# Penguins Data Analysis

In [None]:
# Load the penguins CSV. The path is relative to the notebook's working
# directory, so the notebook must be run from its own folder.
df_raw = load_data("../../../../Datasets/penguins.csv")

In [None]:
# EDA: preview the first rows of the freshly loaded frame.
df_raw.head()

In [None]:
# Plot the raw penguin measurements.
# This must read from df_raw: at this point `df` still refers to the frame
# left over from the previous section, not to the penguins data.
x = 'bill_length_mm'
y = 'bill_depth_mm'
color = 'flipper_length_mm'  # possibly change plot data to handle categorical
plot_data(df_raw, x, y, color)

In [None]:
# Pick columns to focus on
# Prepare data
x = 'bill_length_mm'
y = 'bill_depth_mm'
color = 'flipper_length_mm'  # possibly change plot data to handle categorical
# pick numerical features
features = ['bill_length_mm', 'bill_depth_mm']
# Remove duplicate rows and rows with missing values.
# preprocess() returns a new frame, so the result has to be assigned; it is
# bound back to df_raw because the following cells read that name. Re-running
# this cell is harmless: cleaning already-clean data changes nothing.
df_raw = preprocess(df_raw)

In [None]:
# Scale only the selected feature columns. Previously `features` was passed
# positionally and therefore landed in the `type` parameter.
# Rows with missing feature values are dropped here too, so this cell also
# works on an uncleaned frame (K-Means cannot handle NaN).
penguin_features = df_raw[features].dropna()
# Wrapping in pd.DataFrame keeps the column labels needed by df[features].
df = pd.DataFrame(
    scale_features(penguin_features, type="minmax"),
    columns=features,
    index=penguin_features.index,
)

In [None]:
# Plot SSE against K for the selected features, to choose the cluster count by eye.
plot_elbow_method(df, features)

In [None]:
# Cluster into three groups. perform_kmeans writes the 'cluster' column into
# `df` and returns that same frame, so rebinding `df` here changes nothing.
km, df = perform_kmeans(df, features, n_clusters=3)
plot_clusters(df, features, km)

# Income Data Analysis

In [None]:
# NOTE(review): the income CSV is loaded, but the pipeline below was copied
# from the penguins section and only uses penguin columns. It is bound to its
# own name so it no longer gets plotted with columns it presumably lacks.
# TODO: replace the column names below with the income dataset's columns.
income_raw = load_data("data/income.csv")

# Pipeline
df_raw = load_data("../../../../Datasets/penguins.csv")

# EDA: a bare expression in the middle of a cell is not displayed, so print it.
print(df_raw.head())

# Pick columns to focus on
x = 'bill_length_mm'
y = 'bill_depth_mm'
color = 'flipper_length_mm'  # possibly change plot data to handle categorical
# pick numerical features
features = ['bill_length_mm', 'bill_depth_mm']

# Plot the frame that actually contains these columns
plot_data(df_raw, x, y, color)

# Prepare data: remove duplicates and rows with missing values. The result
# is kept this time (it was previously discarded).
df_clean = preprocess(df_raw)

# Scale only the feature columns; `features` used to be passed as `type`.
# Wrapping in pd.DataFrame keeps the column labels needed by df[features].
df = pd.DataFrame(
    scale_features(df_clean[features], type="minmax"),
    columns=features,
    index=df_clean.index,
)

plot_elbow_method(df, features)

km, df = perform_kmeans(df, features, n_clusters=3)
plot_clusters(df, features, km)