In [1]:
import numpy as np
from psyke import Extractor
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import datasets
from sklearn import metrics
from psyke.clustering.classix.Classix_tuning import tuning_params_classix
import plotly.express as px


# Dataset locations, relative to the notebook's working directory.
# Forward slashes are used on purpose: "\X" and "\Y" are invalid string escape
# sequences in Python (they trigger a warning on current interpreters), and a
# backslash is a path separator on Windows only, whereas "/" is accepted on
# every platform, Windows included.
X_path = "datasets/X_Glass.npy"
Y_path = "datasets/Y_Glass.npy"

In [2]:
# Read both .npy files in order: X first (split as the sample array below),
# then Y (whose distinct values are listed as the expected classes).
X, Y = (np.load(npy_path) for npy_path in (X_path, Y_path))

In [3]:
# Distinct label values found in Y.
# NOTE(review): the message says "Number of", but what gets printed is the
# array of label values itself, not a count.
class_expected = np.unique(Y)
print(
    "Number of expected class are:",
    class_expected,
)

Number of expected class are: [0 1 2 3 4 5]


In [4]:
# Hold out 20% of the samples for evaluation; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.20,
    random_state=42,
)

CLASSIX with group merging based on distance

In [5]:
# Hyper-parameter grid handed to the CLASSIX tuner.
parameters = {
    'minPts': [0, 5, 10, 15, 20, 25, 30, 35, 40, 50],
    'radius': [0.1, 0.2, 0.3, 0.5, 0.75, 1],
    'scale': [0.5, 0.75, 1, 1.25, 1.5, 1.75, 2],
}

# Tune with 'distance' group merging. The rows of `results` are indexed later
# as [score, minPts, radius, scale].
# NOTE(review): the tuner receives X_test / Y_test, so the configuration is
# selected on the same split that is later used for reporting -- the reported
# test metrics are therefore presumably optimistic; consider a separate
# validation split.
results = tuning_params_classix(
    X_train,
    X_test,
    Y_test,
    parameters,
    'distance',
)

In [6]:
# Element 0 of every result row is the score; np.argmax picks the first row
# holding the highest one.
results_list = [row[0] for row in results]
idx_best_configuration = np.argmax(results_list)
print(
    "The best configuration in distance aggregation has ARI score is ",
    results[idx_best_configuration][0],
    " , minPts is: ",
    results[idx_best_configuration][1],
    ' , radius is: ',
    results[idx_best_configuration][2],
    ', scale is: ',
    results[idx_best_configuration][3],
)

The best configuration in distance aggregation has ARI score is  0.3609341825902335  , minPts is:  0  , radius is:  0.5 , scale is:  2


In [7]:
# Keep the winning distance-mode hyper-parameters (elements 1..3 of the best
# row) under dedicated names: `results` and `idx_best_configuration` are
# rebound by the density-mode tuning cells further down.
(
    minPts_best_distance,
    radius_best_distance,
    scale_best_distance,
) = results[idx_best_configuration][1:4]

CLASSIX with group merging based on density

In [8]:
# Same hyper-parameter grid as the distance-mode search.
parameters = {
    'minPts': [0, 5, 10, 15, 20, 25, 30, 35, 40, 50],
    'radius': [0.1, 0.2, 0.3, 0.5, 0.75, 1],
    'scale': [0.5, 0.75, 1, 1.25, 1.5, 1.75, 2],
}

# Tune with 'density' group merging. This rebinds `results`, replacing the
# rows produced by the distance-mode search.
# NOTE(review): as in the distance-mode search, the configuration is selected
# using X_test / Y_test, so later test metrics are presumably optimistic.
results = tuning_params_classix(
    X_train,
    X_test,
    Y_test,
    parameters,
    'density',
)

In [9]:
# Element 0 of every result row is the score; np.argmax picks the first row
# holding the highest one.
results_list = [row[0] for row in results]
idx_best_configuration = np.argmax(results_list)
print(
    "The best configuration in density aggregation has ARI score is: ",
    results[idx_best_configuration][0],
    " , minPts is: ",
    results[idx_best_configuration][1],
    ' , radius is: ',
    results[idx_best_configuration][2],
    ', scale is: ',
    results[idx_best_configuration][3],
)

The best configuration in density aggregation has ARI score is:  0.34710743801652894  , minPts is:  0  , radius is:  0.75 , scale is:  0.5


In [10]:
# Store the winning density-mode hyper-parameters (elements 1..3 of the best
# row) under dedicated names for the final fit.
(
    minPts_best_density,
    radius_best_density,
    scale_best_density,
) = results[idx_best_configuration][1:4]

Plot results

In [11]:
# Test samples (columns 0-8) placed side by side with their true labels
# (column 9), ready for plotting.
df_test = pd.DataFrame(
    pd.concat(
        [
            pd.DataFrame(X_test, columns=list(range(9))),
            pd.DataFrame(Y_test, columns=[9]),
        ],
        axis=1,
    )
)

In [12]:
# Pairwise scatter plots of columns 0-8 of the test frame, coloured by the
# true label stored in column 9.
fig = px.scatter_matrix(df_test, dimensions=list(range(9)), color=9)
fig.show()

In [13]:
# Build CLASSIX with distance-based merging using the best parameters found
# by the distance-mode search, fit it on the training split and predict
# cluster labels for the test split.
classix_distance = Extractor.classix(
    group_merging_mode='distance',
    minPts=minPts_best_distance,
    radius=radius_best_distance,
    scale=scale_best_distance,
)
classix_distance.fit(X_train)
Y_pred_distance = classix_distance.predict(X_test)

# Test samples (columns 0-8) next to the predicted cluster label (column 9).
df_pred_distance = pd.DataFrame(
    pd.concat(
        [
            pd.DataFrame(X_test, columns=list(range(9))),
            pd.DataFrame(Y_pred_distance, columns=[9]),
        ],
        axis=1,
    )
)

# Same scatter matrix as for the true labels, coloured by predicted cluster.
fig = px.scatter_matrix(df_pred_distance, dimensions=list(range(9)), color=9)
fig.show()

Rand score of the best configuration with merging based on distance

In [14]:
# Rand score between the true test labels and the distance-mode predictions.
print(
    metrics.rand_score(
        labels_true=Y_test,
        labels_pred=Y_pred_distance,
    )
)

0.7331118493909191


In [15]:
# Build CLASSIX with density-based merging using the best parameters found
# by the density-mode search, fit it on the training split and predict
# cluster labels for the test split.
classix_density = Extractor.classix(
    group_merging_mode='density',
    minPts=minPts_best_density,
    radius=radius_best_density,
    scale=scale_best_density,
)
classix_density.fit(X_train)
Y_pred_density = classix_density.predict(X_test)

# Test samples (columns 0-8) next to the predicted cluster label (column 9).
df_pred_density = pd.DataFrame(
    pd.concat(
        [
            pd.DataFrame(X_test, columns=list(range(9))),
            pd.DataFrame(Y_pred_density, columns=[9]),
        ],
        axis=1,
    )
)

# Same scatter matrix as for the true labels, coloured by predicted cluster.
fig = px.scatter_matrix(df_pred_density, dimensions=list(range(9)), color=9)
fig.show()

Rand score of the best configuration with merging based on density

In [16]:
# Rand score between the true test labels and the density-mode predictions.
print(
    metrics.rand_score(
        labels_true=Y_test,
        labels_pred=Y_pred_density,
    )
)

0.7375415282392026


In [18]:
# Ask the fitted density-mode model to explain its clustering; the report it
# produces (general explanation plus the groups --> cluster mapping) is shown
# as this cell's output.
classix_density.explain()

	GeneralExplanation
A clustering of 171 data points with 9 features has been performed. 
The radius parameter was set to 0.75 and MinPts was set to 0. 
As the provided data has been scaled by a factor of 1/1.36, data points within a radius of R=0.75*1.36=1.02 were aggregated into groups. 
This resulted in 41 groups, each uniquely associated with a starting point. 
These 41 groups were subsequently merged into 30 clusters resulting in the following mapping groups --> cluster:
Groups [26, 28, 30, 31, 32, 33, 34, 35, 36] --> Cluster 0
Groups [13, 15, 18] --> Cluster 1
Groups [21, 22] --> Cluster 2
Groups [19] --> Cluster 3
Groups [7] --> Cluster 4
Groups [12] --> Cluster 5
Groups [10] --> Cluster 6
Groups [9] --> Cluster 7
Groups [40] --> Cluster 8
Groups [6] --> Cluster 9
Groups [14] --> Cluster 10
Groups [27] --> Cluster 11
Groups [2] --> Cluster 12
Groups [29] --> Cluster 13
Groups [37] --> Cluster 14
Groups [39] --> Cluster 15
Groups [38] --> Cluster 16
Groups [5] --> Cluster 17
Group

In [17]:
# Adjusted mutual information (against the true test labels) and silhouette
# score (computed on the test samples) for the density-mode predictions.
print(
    'The configuration with aggregation on density has AMI score is: ',
    metrics.adjusted_mutual_info_score(Y_test, Y_pred_density),
    ' , silhoutte score is: ',
    metrics.silhouette_score(X_test, Y_pred_density),
)
# Same two metrics for the distance-mode predictions.
print(
    'The configuration with aggregation on distance has AMI score is: ',
    metrics.adjusted_mutual_info_score(Y_test, Y_pred_distance),
    ' , silhoutte score is: ',
    metrics.silhouette_score(X_test, Y_pred_distance),
)

The configuration with aggregation on density has AMI score is:  0.4741381141985289  , silhoutte score is:  0.19366479387508476
The configuration with aggregation on distance has AMI score is:  0.49563117326329115  , silhoutte score is:  0.3283018467215812
