In [1]:
import numpy as np
from psyke import Extractor
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import datasets
from sklearn import metrics
from sklearn.model_selection import GridSearchCV
from psyke.clustering.classix.Classix_tuning import tuning_params_classix
import plotly.express as px


# Locations of the Banknote feature/label arrays, relative to the notebook's
# working directory. Forward slashes are used on purpose: they resolve on
# Windows, Linux and macOS alike, and they avoid the invalid backslash escape
# sequences (backslash-X / backslash-Y) that the previous literals contained.
X_path = "datasets/X_Banknote.npy"
Y_path = "datasets/Y_Banknote.npy"

In [2]:
# Read the feature array and the label array from their .npy files.
X, Y = np.load(X_path), np.load(Y_path)

In [3]:
# Collect the distinct label values present in Y and display them.
class_expected = np.unique(Y)
print(
    "Number of expected class are:",
    class_expected,
)

Number of expected class are: [0 1]


In [4]:
# Hold out 20% of the samples as a test split; the fixed random_state keeps
# the split identical across re-runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.20,
    random_state=42,
)

CLASSIX with distance-based group merging

In [5]:
# Hyper-parameter grid handed to the CLASSIX tuning helper.
parameters = {
    'minPts': [0, 5, 10, 15, 20, 25, 30, 35, 40, 50],
    'radius': [0.1, 0.2, 0.3, 0.5, 0.75, 1],
    'scale': [0.5, 0.75, 1, 1.25, 1.5, 1.75, 2],
}

# NOTE(review): the held-out test split is passed to the tuning helper here and
# is used again for the final scores later in the notebook. If the helper ranks
# candidates on that split, the final scores are optimistic -- consider a
# separate validation split. TODO confirm against the helper's implementation.
results = tuning_params_classix(X_train, X_test, Y_test, parameters, 'distance')


In [6]:
# Entry 0 of every result row is its score; locate the row with the highest one
# and report that row's score and hyper-parameters.
results_list = [row[0] for row in results]
idx_best_configuration = np.argmax(results_list)
print(
    "The best configuration in distance aggregation has ARI score is ",
    results[idx_best_configuration][0],
    " , minPts is: ",
    results[idx_best_configuration][1],
    ' , radius is: ',
    results[idx_best_configuration][2],
    ', scale is: ',
    results[idx_best_configuration][3],
)

The best configuration in distance aggregation has ARI score is  0.6016677004318726  , minPts is:  35  , radius is:  0.2 , scale is:  1.5


In [7]:
# Keep the winning distance-mode hyper-parameters: entries 1, 2 and 3 of the
# best result row (entry 0 is the score).
(
    minPts_best_distance,
    radius_best_distance,
    scale_best_distance,
) = results[idx_best_configuration][1:4]

CLASSIX with density-based group merging

In [8]:
# Hyper-parameter grid handed to the CLASSIX tuning helper.
parameters = {
    'minPts': [0, 5, 10, 15, 20, 25, 30, 35, 40, 50],
    'radius': [0.1, 0.2, 0.3, 0.5, 0.75, 1],
    'scale': [0.5, 0.75, 1, 1.25, 1.5, 1.75, 2],
}

# NOTE(review): the held-out test split is passed to the tuning helper here and
# is used again for the final scores later in the notebook. If the helper ranks
# candidates on that split, the final scores are optimistic -- consider a
# separate validation split. TODO confirm against the helper's implementation.
results = tuning_params_classix(X_train, X_test, Y_test, parameters, 'density')

In [9]:
# Entry 0 of every result row is its score; locate the row with the highest one
# and report that row's score and hyper-parameters.
results_list = [row[0] for row in results]
idx_best_configuration = np.argmax(results_list)
print(
    "The best configuration in density aggregation has ARI score is: ",
    results[idx_best_configuration][0],
    " , minPts is: ",
    results[idx_best_configuration][1],
    ' , radius is: ',
    results[idx_best_configuration][2],
    ', scale is: ',
    results[idx_best_configuration][3],
)

The best configuration in density aggregation has ARI score is:  0.6635677671745724  , minPts is:  30  , radius is:  0.3 , scale is:  0.5


In [10]:
# Keep the winning density-mode hyper-parameters: entries 1, 2 and 3 of the
# best result row (entry 0 is the score).
(
    minPts_best_density,
    radius_best_density,
    scale_best_density,
) = results[idx_best_configuration][1:4]

Plot results

In [11]:
# Test features (columns 0-3) placed side by side with the true labels (column 4).
test_features = pd.DataFrame(X_test, columns=[0, 1, 2, 3])
test_labels = pd.DataFrame(Y_test, columns=[4])
df_test = pd.DataFrame(pd.concat([test_features, test_labels], axis=1))

In [12]:
# Pairwise scatter plots of the four feature columns, coloured by column 4
# (the true label).
fig = px.scatter_matrix(df_test, dimensions=[0, 1, 2, 3], color=4)
fig.show()

In [13]:
# Fit CLASSIX in distance-merging mode with the best hyper-parameters found
# above, then assign a cluster to every test sample.
classix_distance = Extractor.classix(
    group_merging_mode='distance',
    minPts=minPts_best_distance,
    radius=radius_best_distance,
    scale=scale_best_distance,
)
classix_distance.fit(X_train)
Y_pred_distance = classix_distance.predict(X_test)

# Features in columns 0-3, predicted cluster in column 4.
df_pred_distance = pd.DataFrame(
    pd.concat(
        [
            pd.DataFrame(X_test, columns=[0, 1, 2, 3]),
            pd.DataFrame(Y_pred_distance, columns=[4]),
        ],
        axis=1,
    )
)

# Pairwise scatter plots of the features, coloured by the predicted cluster.
fig = px.scatter_matrix(df_pred_distance, dimensions=[0, 1, 2, 3], color=4)
fig.show()

Rand score of the best configuration with distance-based merging

In [14]:
# Rand index between the true test labels and the distance-mode predictions.
rand_distance = metrics.rand_score(Y_test, Y_pred_distance)
print(rand_distance)

0.7954346383543464


In [15]:
# Fit CLASSIX in density-merging mode with the best hyper-parameters found
# above, then assign a cluster to every test sample.
classix_density = Extractor.classix(
    group_merging_mode='density',
    minPts=minPts_best_density,
    radius=radius_best_density,
    scale=scale_best_density,
)
classix_density.fit(X_train)
Y_pred_density = classix_density.predict(X_test)

# Features in columns 0-3, predicted cluster in column 4.
df_pred_density = pd.DataFrame(
    pd.concat(
        [
            pd.DataFrame(X_test, columns=[0, 1, 2, 3]),
            pd.DataFrame(Y_pred_density, columns=[4]),
        ],
        axis=1,
    )
)

# Pairwise scatter plots of the features, coloured by the predicted cluster.
fig = px.scatter_matrix(df_pred_density, dimensions=[0, 1, 2, 3], color=4)
fig.show()

In [16]:
# Show the fitted density-mode model's own textual explanation of the clustering
# (per the recorded output: parameters used, number of groups, and the
# group-to-cluster mapping).
classix_density.explain()

	GeneralExplanation
A clustering of 1097 data points with 4 features has been performed. 
The radius parameter was set to 0.30 and MinPts was set to 30. 
As the provided data has been scaled by a factor of 1/6.48, data points within a radius of R=0.30*6.48=1.94 were aggregated into groups. 
This resulted in 125 groups, each uniquely associated with a starting point. 
These 125 groups were subsequently merged into 5 clusters resulting in the following mapping groups --> cluster:
Groups [25, 32, 34, 36, 38, 40, 42, 43, 45, 50, 51, 53, 54, 56, 58, 61, 64, 65, 67, 69, 70, 75, 76, 81, 82, 83, 84, 87, 90, 91, 92, 93, 95, 96, 97, 99, 101, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 117, 118, 120, 121] --> Cluster 0
Groups [30, 31, 35, 39, 41, 46, 47, 48, 49, 52, 55, 57, 59, 60, 62, 63, 66, 68, 71, 72, 73, 74, 77, 78, 79, 80, 85, 86, 88, 89, 94, 98, 100, 102, 103] --> Cluster 1
Groups [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 19, 20, 22, 23] --> Cluster 2
G

Rand score of the best configuration with density-based merging

In [17]:
# Rand index between the true test labels and the density-mode predictions.
rand_density = metrics.rand_score(Y_test, Y_pred_density)
print(rand_density)

0.7935235567352356


In [18]:
# Density mode: adjusted mutual information against the true labels, and the
# silhouette of the predicted clusters in feature space.
ami_density = metrics.adjusted_mutual_info_score(Y_test, Y_pred_density)
silhouette_density = metrics.silhouette_score(X_test, Y_pred_density)
print(
    'The configuration with aggregation on density has AMI score is: ',
    ami_density,
    ' , silhoutte score is: ',
    silhouette_density,
)

# Distance mode: the same two measures.
ami_distance = metrics.adjusted_mutual_info_score(Y_test, Y_pred_distance)
silhouette_distance = metrics.silhouette_score(X_test, Y_pred_distance)
print(
    'The configuration with aggregation on distance has AMI score is: ',
    ami_distance,
    ' , silhoutte score is: ',
    silhouette_distance,
)

The configuration with aggregation on density has AMI score is:  0.6427509263211951  , silhoutte score is:  0.15398742755626305
The configuration with aggregation on distance has AMI score is:  0.6305021143739697  , silhoutte score is:  0.24275482469164295
