In [1]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import ParameterGrid
from sklearn.cluster import DBSCAN, KMeans, AgglomerativeClustering
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import MinMaxScaler, PowerTransformer


In [5]:
df = pd.read_csv('1.csv', sep=',', header = 0, index_col=0)

In [None]:
df.head()

In [None]:
df.describe()

In [None]:
sns.pairplot(df)

In [None]:
sns.boxplot(df)

In [None]:
df = df.drop(columns=['Territorio','NC_   -  0'])

In [None]:
mms = MinMaxScaler()
pt = PowerTransformer()
df_columns = df.columns
df = pt.fit_transform(df)
df = mms.fit_transform(df)
df = pd.DataFrame(df, columns=df_columns)

# **KMEANS**

In [None]:
n_cluster = [*range(2,11)]
result_km = pd.DataFrame(columns=['n_cluster', 'inertia', 'silhouette_score'])

for n in n_cluster:
  cluster_km = KMeans(n_clusters=n)
  lbls_km = cluster_km.fit_predict(df)
  ss = silhouette_score(df,lbls_km)
  result_km.loc[len(result_km)] = [n, cluster_km.inertia_, ss]

In [None]:
result_km.sort_values(by=['silhouette_score'], ascending=False).head(4)

Unnamed: 0,n_cluster,inertia,silhouette_score
0,2.0,516.069638,0.54211
1,3.0,335.177357,0.460872
2,4.0,237.461182,0.430028
3,5.0,184.388937,0.408397


# **Agglomerative Clustering**

In [None]:
n_cluster = [*range(2,11)]
linkage = ['ward', 'complete', 'average', 'single']
result_ag = pd.DataFrame(columns=['n_cluster', 'linkage', 'silhouette_score'])

for n in n_cluster:
  for l in linkage:
    cluster_ag = AgglomerativeClustering(n_clusters=n, linkage=l)
    lbls_ag = cluster_ag.fit_predict(df)
    ss = silhouette_score(df,lbls_ag)
    result_ag.loc[len(result_ag)] = [n, l, ss]

In [None]:
result_ag.sort_values(by=['silhouette_score'], ascending=False).head(4)

Unnamed: 0,n_cluster,linkage,silhouette_score
0,2,ward,0.538138
3,2,single,0.538138
7,3,single,0.507631
6,3,average,0.485104


# **DBSACN**

In [None]:
eps = np.arange(0.3,1.5,0.1)
min_samples = [*range(5,20)]
params = {'eps': eps, 'min_samples': min_samples}
params = list(ParameterGrid(params))
result_db = pd.DataFrame(columns=['eps', 'min_samples', 'silhouette_score', 'unclust', 'n cluster'])
clusters_db = []

for i in params:
    cluster_db = DBSCAN(**i)
    lbls_db = cluster_db.fit_predict(df)
    if len(lbls_db[lbls_db != -1]) > 1 and len(lbls_db[lbls_db != -1]) < len(lbls_db):
      ss = silhouette_score(df,lbls_db)
      unclust = len(lbls_db[lbls_db == -1])/len(lbls_db) * 100
      n = np.unique(lbls_db[lbls_db != -1], return_counts=True)
      clusters_db.append(cluster_db)
      result_db.loc[len(result_db)] = [i['eps'], i['min_samples'], ss, unclust, len(n[0])]

In [None]:
result_db.sort_values(by=['silhouette_score'], ascending=False)

Unnamed: 0,eps,min_samples,silhouette_score,unclust,n cluster
0,0.3,16.0,0.677802,0.012673,1.0
1,0.3,17.0,0.677802,0.012673,1.0
2,0.3,18.0,0.677802,0.012673,1.0
3,0.3,19.0,0.677802,0.012673,1.0


In [None]:
idbest = result_db['silhouette_score'].idxmax()
lbl_best = clusters_db[idbest].fit_predict(df)
n = np.unique(lbl_best, return_counts=True)

(array([-1,  0]), array([   1, 7890]))