In [1]:
import numpy as np
from psyke import Extractor
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import datasets
from sklearn import metrics
from psyke.clustering.classix.Classix_tuning import tuning_params_classix
import plotly.express as px


# Locations of the saved Wine arrays that are loaded into X and Y, relative to
# the notebook's working directory.
# Forward slashes are used on purpose: "\X" and "\Y" are not valid Python escape
# sequences (recent Python versions warn about them), and a backslash separator
# only resolves on Windows, whereas "/" is accepted on every platform.
X_path = "datasets/X_Wine.npy"
Y_path = "datasets/Y_Wine.npy"

In [2]:
# Read the two saved arrays from disk: X is later split into train/test inputs,
# Y holds the labels used as ground truth for the clustering scores.
X, Y = (np.load(array_path) for array_path in (X_path, Y_path))

In [3]:
# Distinct label values found in Y.
class_expected = np.unique(Y)
# NOTE(review): the message says "Number" but what is printed is the array of
# label values itself, not a count (len(class_expected) would give the count).
print(
    "Number of expected class are:",
    class_expected,
)

Number of expected class are: [1 2 3]


In [4]:
# Hold out 20% of the samples as a test split; the fixed random_state keeps the
# split identical across re-runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.20,
    random_state=42,
)

CLASSIX hyperparameter tuning with group merging based on distance

In [5]:
# Grid of CLASSIX hyper-parameter values handed to the tuning routine.
parameters = dict(
    minPts=[0, 5, 10, 15, 20, 25, 30, 35, 40, 50],
    radius=[0.1, 0.2, 0.3, 0.5, 0.75, 1],
    scale=[0.5, 0.75, 1, 1.25, 1.5, 1.75, 2],
)

# NOTE(review): the test split (X_test, Y_test) is passed to the tuning routine
# and is also used for the final scores reported later in this notebook. If the
# search is scored on it, those final scores are optimistically biased - confirm
# and consider tuning on a separate validation split.
results = tuning_params_classix(
    X_train,
    X_test,
    Y_test,
    parameters,
    'distance',
)

In [6]:
# Entries of `results` are read positionally below:
# [0] score, [1] minPts, [2] radius, [3] scale.
results_list = [entry[0] for entry in results]
# Position of the highest score among the evaluated configurations.
idx_best_configuration = np.argmax(results_list)
best_distance_entry = results[idx_best_configuration]
print(
    "The best configuration in distance aggregation has ARI score is ",
    best_distance_entry[0],
    " , minPts is: ",
    best_distance_entry[1],
    ' , radius is: ',
    best_distance_entry[2],
    ', scale is: ',
    best_distance_entry[3],
)

The best configuration in distance aggregation has ARI score is  0.6470588235294118  , minPts is:  5  , radius is:  0.2 , scale is:  1.5


In [7]:
# Keep the hyper-parameters of the best distance-merging configuration
# (positions 1, 2 and 3 of the winning entry) for the final fit.
minPts_best_distance, radius_best_distance, scale_best_distance = (
    results[idx_best_configuration][position] for position in (1, 2, 3)
)

CLASSIX hyperparameter tuning with group merging based on density

In [8]:
# Grid of CLASSIX hyper-parameter values handed to the tuning routine.
parameters = dict(
    minPts=[0, 5, 10, 15, 20, 25, 30, 35, 40, 50],
    radius=[0.1, 0.2, 0.3, 0.5, 0.75, 1],
    scale=[0.5, 0.75, 1, 1.25, 1.5, 1.75, 2],
)

# NOTE(review): the test split (X_test, Y_test) is passed to the tuning routine
# and is also used for the final scores reported later in this notebook. If the
# search is scored on it, those final scores are optimistically biased - confirm
# and consider tuning on a separate validation split.
results = tuning_params_classix(
    X_train,
    X_test,
    Y_test,
    parameters,
    'density',
)

In [9]:
# Entries of `results` are read positionally below:
# [0] score, [1] minPts, [2] radius, [3] scale.
results_list = [entry[0] for entry in results]
# Position of the highest score among the evaluated configurations.
idx_best_configuration = np.argmax(results_list)
best_density_entry = results[idx_best_configuration]
print(
    "The best configuration in density aggregation has ARI score is: ",
    best_density_entry[0],
    " , minPts is: ",
    best_density_entry[1],
    ' , radius is: ',
    best_density_entry[2],
    ', scale is: ',
    best_density_entry[3],
)

The best configuration in density aggregation has ARI score is:  0.4444444444444444  , minPts is:  10  , radius is:  0.2 , scale is:  0.5


In [10]:
# Keep the hyper-parameters of the best density-merging configuration
# (positions 1, 2 and 3 of the winning entry) for the final fit.
minPts_best_density, radius_best_density, scale_best_density = (
    results[idx_best_configuration][position] for position in (1, 2, 3)
)

Plot results

In [11]:
# Test inputs in columns 0-12 with the ground-truth label appended as column 13.
df_test = pd.concat(
    [
        pd.DataFrame(X_test, columns=list(range(13))),
        pd.DataFrame(Y_test, columns=[13]),
    ],
    axis=1,
)

In [12]:
# Pairwise scatter plots of columns 0-12 of the test frame, coloured by the
# ground-truth label stored in column 13.
fig = px.scatter_matrix(df_test, dimensions=list(range(13)), color=13)
fig.show()

In [13]:
# Build CLASSIX in distance-merging mode with the tuned hyper-parameters,
# fit it on the training inputs and predict cluster labels for the test inputs.
classix_distance = Extractor.classix(
    group_merging_mode='distance',
    minPts=minPts_best_distance,
    radius=radius_best_distance,
    scale=scale_best_distance,
)
classix_distance.fit(X_train)
Y_pred_distance = classix_distance.predict(X_test)

# Test inputs in columns 0-12 with the predicted cluster appended as column 13.
df_pred_distance = pd.concat(
    [
        pd.DataFrame(X_test, columns=list(range(13))),
        pd.DataFrame(Y_pred_distance, columns=[13]),
    ],
    axis=1,
)

# Same scatter matrix as for the ground truth, coloured by predicted cluster.
fig = px.scatter_matrix(df_pred_distance, dimensions=list(range(13)), color=13)
fig.show()

Rand score on best configuration with merging based on distance

In [14]:
# Rand index between the ground-truth test labels and the distance-merging predictions.
print(
    metrics.rand_score(
        labels_true=Y_test,
        labels_pred=Y_pred_distance,
    )
)

0.8222222222222222


In [18]:
# Ask the fitted distance-merging model for its textual explanation of the clustering.
# NOTE(review): this cell's execution count (18) is out of sequence with its
# position in the notebook; re-run top to bottom to confirm the saved output.
classix_distance.explain()

	GeneralExplanation
A clustering of 142 data points with 13 features has been performed. 
The radius parameter was set to 0.20 and MinPts was set to 5. 
As the provided data has been scaled by a factor of 1/230.52, data points within a radius of R=0.20*230.52=46.10 were aggregated into groups. 
This resulted in 26 groups, each uniquely associated with a starting point. 
These 26 groups were subsequently merged into 2 clusters resulting in the following mapping groups --> cluster:
Groups [13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25] --> Cluster 0
Groups [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12] --> Cluster 1

Below the list of all the groups starting points (unscaled):

Starting point 0 has alpha score = -3.523 and coordinates [1.438e+01 1.870e+00 2.380e+00 1.200e+01 1.020e+02 3.300e+00 3.640e+00
 2.900e-01 2.960e+00 7.500e+00 1.200e+00 3.000e+00 1.547e+03]
Starting point 1 has alpha score = -3.233 and coordinates [1.437e+01 1.950e+00 2.500e+00 1.680e+01 1.130e+02 3.850e+00 3.49

In [15]:
# Build CLASSIX in density-merging mode with the tuned hyper-parameters,
# fit it on the training inputs and predict cluster labels for the test inputs.
classix_density = Extractor.classix(
    group_merging_mode='density',
    minPts=minPts_best_density,
    radius=radius_best_density,
    scale=scale_best_density,
)
classix_density.fit(X_train)
Y_pred_density = classix_density.predict(X_test)

# Test inputs in columns 0-12 with the predicted cluster appended as column 13.
df_pred_density = pd.concat(
    [
        pd.DataFrame(X_test, columns=list(range(13))),
        pd.DataFrame(Y_pred_density, columns=[13]),
    ],
    axis=1,
)

# Same scatter matrix as for the ground truth, coloured by predicted cluster.
fig = px.scatter_matrix(df_pred_density, dimensions=list(range(13)), color=13)
fig.show()

Rand score on best configuration with merging based on density

In [16]:
# Rand index between the ground-truth test labels and the density-merging predictions.
print(
    metrics.rand_score(
        labels_true=Y_test,
        labels_pred=Y_pred_density,
    )
)

0.7142857142857143


In [17]:
# Report, for each merging mode, the adjusted mutual information against the
# ground-truth test labels and the silhouette score of the predicted clusters
# on the test inputs. Density is reported first, then distance.
print(
    'The configuration with aggregation on density has AMI score is: ',
    metrics.adjusted_mutual_info_score(labels_true=Y_test, labels_pred=Y_pred_density),
    ' , silhoutte score is: ',
    metrics.silhouette_score(X_test, labels=Y_pred_density),
)
print(
    'The configuration with aggregation on distance has AMI score is: ',
    metrics.adjusted_mutual_info_score(labels_true=Y_test, labels_pred=Y_pred_distance),
    ' , silhoutte score is: ',
    metrics.silhouette_score(X_test, labels=Y_pred_distance),
)

The configuration with aggregation on density has AMI score is:  0.543235279953808  , silhoutte score is:  0.6633553831614185
The configuration with aggregation on distance has AMI score is:  0.7611683049109831  , silhoutte score is:  0.622533325445458
