In [None]:
import math
from itertools import zip_longest

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from IPython.display import display
from sklearn.cluster import KMeans
from sklearn.metrics import calinski_harabasz_score, davies_bouldin_score, silhouette_score
from sklearn.preprocessing import StandardScaler


sns.set_theme('notebook', 'whitegrid')


def multi_plots(df, fn, cols=3, figsize=(14, 12)):
    """Draw one subplot per DataFrame column on a grid and show the figure.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are plotted, one axes per column.
    fn : callable
        Called as ``fn(df[col], ax)`` for every column; it is expected to
        draw onto ``ax``. Its return value is ignored.
    cols : int, default 3
        Number of columns in the subplot grid.
    figsize : tuple, default (14, 12)
        Figure size forwarded to ``plt.subplots``.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``. Nothing is drawn when
        ``df`` has no columns.
    """
    df_cols = len(df.columns)
    if df_cols == 0:
        # A zero-row grid is not a valid subplot layout, so there is nothing to draw.
        return
    rows = math.ceil(df_cols / cols)

    # squeeze=False always yields a 2-D array of axes, so .ravel() also works
    # when the grid collapses to a single cell (one column with cols=1).
    fig, axis = plt.subplots(rows, cols, figsize=figsize, squeeze=False)
    for col, ax in zip_longest(df.columns, axis.ravel()):
        if col is not None:
            fn(df[col], ax)
        else:
            # More grid cells than columns: remove the unused axes.
            fig.delaxes(ax)
    plt.tight_layout()
    plt.show()

In [None]:
# Load the wine dataset from CSV. The path is relative (presumably the Kaggle
# input directory — adjust when running elsewhere).
df = pd.read_csv("../input/wine-pca/Wine.csv")
# Preview the first rows, then list every column with its dtype and non-null
# count (memory usage omitted).
display(df.head())
df.info(verbose=True, memory_usage=False)

In [None]:
# Remove the Customer_Segment column so it is not among the features used below.
df = df.drop(columns=["Customer_Segment"])

In [None]:
def _hist_with_kde(col, ax):
    """Draw a histogram with a KDE overlay for one column on the given axes."""
    return sns.histplot(data=col, ax=ax, kde=True)


# One histogram per column.
multi_plots(df, _hist_with_kde)

In [None]:
def _horizontal_box(col, ax):
    """Draw a box plot of one column along the x axis of the given axes."""
    return sns.boxplot(x=col, ax=ax)


# One box plot per column.
multi_plots(df, _horizontal_box)

In [None]:
def _scatter_against_alcohol(col, ax):
    """Scatter one column (y) against the Alcohol column of df (x)."""
    return sns.scatterplot(x="Alcohol", y=col, data=df, ax=ax)


# One scatter plot per column, each with Alcohol on the x axis.
multi_plots(df, _scatter_against_alcohol)

Pela plotagem acima podemos ver que o `Alcohol` possui uma certa relação com `Proline` e `Color_Intensity`.

In [None]:
# Pairwise correlation matrix of the columns of df.
corr = df.corr()

# Annotated heatmap of the matrix on an explicitly created axes.
corr_fig, corr_ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr, annot=True, cmap="Blues", ax=corr_ax)
plt.show()

In [None]:

# Absolute-correlation threshold above which a column is flagged.
limite = 0.6

# Keep only the strict upper triangle so each pair is examined once and the
# diagonal (self-correlation) is ignored.
upper_mask = np.triu(np.ones(corr.shape, dtype=bool), k=1)
corr_filter = corr.abs().where(upper_mask) > limite

# A column is flagged when it exceeds the threshold against any earlier column.
cols = corr_filter.columns[corr_filter.any(axis=0)].tolist()
cols

In [None]:
# Discard the columns flagged as highly correlated.
df = df.drop(columns=cols)


In [None]:
# Scaler used to standardize the remaining feature columns.
se = StandardScaler()

In [None]:
# Remember the column labels so they can be reattached to the scaled values.
col_n = df.columns

In [None]:
# Fit the scaler on df and transform it in a single step.
se_ = se.fit_transform(df)

In [None]:
# Wrap the scaled values in a DataFrame carrying the original column labels.
df_ = pd.DataFrame(se_, columns=col_n)

In [None]:
# Seed NumPy's global random generator. Estimators created without an explicit
# random_state draw from this global state, so results then depend on the
# order in which cells are executed after this one.
np.random.seed(42)

In [None]:
# Candidate numbers of clusters to evaluate: 2 through 15 inclusive.
clusters = range(2, 16)

In [None]:
# Fit K-Means once per candidate k and record four quality measures per fit:
# inertia, silhouette, Calinski-Harabasz and Davies-Bouldin.
inertia = []
sil_score = []
calinski_score = []
davies_score = []

for i in clusters:
    # random_state pins the centroid initialisation so the scores do not depend
    # on the state of NumPy's global generator (i.e. on which cells ran before).
    # n_init is explicit because its default differs between scikit-learn releases.
    kmeans_mod = KMeans(
        n_clusters=i, init="k-means++", n_init=10, random_state=42
    ).fit(df_)
    inertia.append(kmeans_mod.inertia_)

    s_score = silhouette_score(df_, kmeans_mod.labels_)
    cal_score = calinski_harabasz_score(df_, kmeans_mod.labels_)
    dav_score = davies_bouldin_score(df_, kmeans_mod.labels_)

    sil_score.append(s_score)
    calinski_score.append(cal_score)
    davies_score.append(dav_score)

In [None]:
# Side-by-side curves over the candidate k values: inertia (left) and
# silhouette score (right).
elbow_fig, (inertia_ax, sil_ax) = plt.subplots(1, 2, figsize=(12, 5))

inertia_ax.plot(clusters, inertia, marker='o', linestyle='--')
inertia_ax.set_xlabel("Clusters")
inertia_ax.set_ylabel("Inertia")
inertia_ax.set_title("Clusters v/s Inertia")

sil_ax.plot(clusters, sil_score, marker='o', linestyle='--', color='r')
sil_ax.set_xlabel("Clusters")
sil_ax.set_ylabel("Silhouette Score")
sil_ax.set_title("Clusters v/s Silhouette Score")

elbow_fig.tight_layout()

Pelo gráfico acima fica claro que o melhor valor para k será 3.

In [None]:
# Side-by-side curves over the candidate k values: Calinski-Harabasz score
# (left) and Davies-Bouldin score (right).
plt.figure(figsize=(12, 6))

plt.subplot(121)
plt.plot(clusters, calinski_score, marker='o', linestyle='--', color='g')
plt.xlabel("Clusters")
# Axis label previously misspelled the index name as "Calsinki".
plt.ylabel("Calinski Score")
plt.title("Clusters vs Calinski Score")

plt.subplot(122)
plt.plot(clusters, davies_score, marker='o', linestyle='--', color='orange')
plt.xlabel("Clusters")
plt.ylabel("Davies Score")
plt.title("Clusters vs Davies Score")

plt.tight_layout()

In [None]:
# Final model with k=3. random_state and n_init are fixed so repeated runs give
# the same clustering regardless of the global NumPy generator state or the
# scikit-learn release's n_init default.
k_model = KMeans(n_clusters=3, init="k-means++", n_init=10, random_state=42)

In [None]:
# Fit on the feature columns only. A later cell appends a "Label" column to
# df_; excluding it here keeps a re-run of this cell from clustering on its own
# previous output. On a fresh run the column is absent and nothing is dropped.
label_predict = k_model.fit_predict(df_.drop(columns=["Label"], errors="ignore"))

In [None]:
# Cluster centres of the fitted model and the predicted label per row.
centers = k_model.cluster_centers_
label_df = pd.DataFrame(label_predict, columns=["Label"])

# Drop any "Label" column left by a previous execution first, so re-running the
# cell replaces the labels instead of appending a duplicate column.
df_ = pd.concat([df_.drop(columns=["Label"], errors="ignore"), label_df], axis=1)
df_.head()

In [None]:
# Shift cluster ids from 0-based (0, 1, 2) to 1-based (1, 2, 3).
mapping = {0: 1, 1: 2, 2: 3}

# Map from the raw predictions rather than from the column itself: mapping the
# column in place is not repeatable, because a second run would look up the
# already-shifted values and turn label 3 into NaN.
df_["Label"] = pd.Series(label_predict, index=df_.index).map(mapping)
display(df_['Label'].value_counts())

# Name the column as `x` so one bar is drawn per label. Passing the Series as
# `data` without x/y is interpreted as wide-form input instead.
sns.countplot(x="Label", data=df_)
plt.show()