# <font color='lightgreen'>Clustering</font>

## Imports

In [None]:
import pandas as pd 
import numpy as np

#connect to drive
from google.colab import drive
# visualisation
from plotly.subplots import make_subplots
import plotly.express as px
import plotly.graph_objects as go
from pprint import pprint
from scipy.special import softmax

# for dummies
from sklearn import preprocessing
# for scailng and standartization
from sklearn.preprocessing import MinMaxScaler
from sklearn import preprocessing
# unbalanced data
from collections import Counter
from imblearn.over_sampling import ADASYN 
# for clustring - kmeans
from sklearn.cluster import KMeans
import sklearn
# for pca
from sklearn import decomposition
# for knn
from sklearn.neighbors import NearestNeighbors
# dbscan
from sklearn.cluster import DBSCAN


from sklearn.model_selection import GridSearchCV

from sklearn.neighbors import KNeighborsClassifier


## Helper Functions

In [None]:
def opt_kmeans_k_silhouette(x,k_start,k_end,algos=('full','elkan'),random_state=None):
  """Search for the best number of K-means clusters using the silhouette score.

  For every algorithm in `algos` and every K in range(k_start, k_end) a KMeans
  model is fitted on `x` and scored with the euclidean silhouette score. The
  scores are then drawn as one line per algorithm (higher is better).

  Parameters
  ----------
  x : feature matrix, passed unchanged to KMeans.fit and silhouette_score.
  k_start : first K to try (inclusive). The silhouette score needs at least
      2 clusters, so this should be >= 2.
  k_end : end of the K range (exclusive, same convention as range()).
  algos : iterable of KMeans `algorithm` values to compare. The default keeps
      the original ('full', 'elkan') pair; newer scikit-learn releases name
      the classic algorithm 'lloyd' instead of 'full', so pass
      algos=('lloyd', 'elkan') there.
  random_state : forwarded to KMeans. Leave as None for the original
      (non-deterministic) behaviour or pass an int for reproducible scores.

  Returns
  -------
  None. The figure is displayed with fig.show().
  """
  # Collect one record per (algorithm, K) and build the frame once at the end;
  # DataFrame.append no longer exists in pandas 2.x and was quadratic anyway.
  records = []
  for algo in algos: #for each model option
    for k in range(k_start, k_end): #for k in the search grid
      kmeans = KMeans(n_clusters=k,algorithm=algo,random_state=random_state).fit(x)
      #calculate silhouette_score score
      score = sklearn.metrics.silhouette_score(x, kmeans.labels_, metric='euclidean')
      records.append({'K':k, 'silhouette_score':score,'fotmated_score':"%.2f" % score,'algorithm':algo})
  df_results = pd.DataFrame(records, columns=['K','silhouette_score','fotmated_score','algorithm'])
  #plot silhouette score per K (y must be the score column, one line per algorithm)
  fig = px.line(df_results, x="K", y="silhouette_score",color_discrete_sequence=px.colors.qualitative.Pastel, text="fotmated_score",color='algorithm',title='Kmeans-find the best K')
  fig.update_traces(textposition="bottom right")
  fig.show()

In [None]:
def present_eval_for_chosen_model(X,model_name,k=3,algo_method='elkan',epsilon=1,min_samples=10,random_state=None): # model_name = kmeans/dbscan
  """Fit the chosen clustering model on X and present its evaluation.

  Prints the cluster size distribution and the Davies-Bouldin score, then shows
  a 2-D PCA scatter plot and a scatter matrix of all features, both coloured
  by cluster label.

  Parameters
  ----------
  X : pandas DataFrame of features. It is NOT modified; labels are attached to
      an internal copy only.
  model_name : 'kmeans' fits KMeans; any other value fits DBSCAN.
  k, algo_method : n_clusters and algorithm for KMeans (ignored for DBSCAN).
  epsilon, min_samples : eps and min_samples for DBSCAN (ignored for KMeans).
  random_state : forwarded to KMeans and PCA. None keeps the original
      unseeded behaviour; pass an int for reproducible output.

  Returns
  -------
  None. Results are printed and the figures are shown.

  Notes
  -----
  The labels are passed to davies_bouldin_score exactly as the model produced
  them, so DBSCAN's noise label (-1) is scored like any other label.
  """
  # fit data
  if model_name=='kmeans':
    model = KMeans(n_clusters=k,algorithm=algo_method,random_state=random_state).fit(X)
  else: # dbscan
    # use the min_samples argument (the old code read an unrelated global name)
    model = DBSCAN(eps=epsilon, min_samples=min_samples).fit(X)
  # get labels of each sample
  labels=model.labels_

  # Score on the untouched feature matrix, before any label column exists,
  # so the labels cannot leak into the distance computation.
  db_score=sklearn.metrics.davies_bouldin_score(X, labels)

  # reduce dimension into 2 using PCA for visualisation
  pca = decomposition.PCA(n_components=2,random_state=random_state)
  pca_model_df = pd.DataFrame(pca.fit_transform(X), columns = ['Column_A','Column_B'])
  pca_model_df['labels']=labels
  pca_model_df['labels_str']=pca_model_df['labels'].astype(str)

  # combine labels and X on a copy, leaving the caller's frame unchanged
  model_res=X.copy()
  model_res['label']=labels
  model_res['label']=model_res['label'].astype(str)

  # print split distribution and score
  print("The split of the data between clusters: \n",model_res['label'].value_counts())
  print("Davis Bouldin score: ","%.4f" % db_score)
  # scatter plot of the two PCA components
  fig = px.scatter(pca_model_df, x="Column_A", y="Column_B", color="labels_str",title=model_name+' labels split (PCA dimensions)'
                  ,color_discrete_sequence=px.colors.qualitative.Pastel,width=800, height=1000)
  fig.show()
  # scatter matrix of all feature combinations (every column except the label)
  data_cols=[col for col in model_res.columns if col != 'label']
  fig2 = px.scatter_matrix(model_res, dimensions=data_cols,color='label',title=model_name+' labels split, matrix of all features'
                           ,color_discrete_sequence=px.colors.qualitative.Pastel,width=800, height=1000)
  fig2.show()

In [None]:
def find_the_best_epsilon_for_dbscan_knng(X,neighbors):
  """Plot the sorted average neighbour distance of every point in X.

  A NearestNeighbors model (ball tree) is fitted on X and queried with X
  itself. For each point the returned distances are averaged over all columns
  except the first one, the averages are sorted ascending, and the resulting
  curve is shown as a line chart.

  Parameters
  ----------
  X : feature matrix passed to NearestNeighbors.fit / kneighbors.
  neighbors : n_neighbors used for the query; also shown in the plot title.

  Returns
  -------
  None. The figure is displayed with fig.show().
  """
  fitted_nn = NearestNeighbors(n_neighbors=neighbors, algorithm='ball_tree').fit(X)
  # kneighbors returns (distances, indices); only the distances are needed
  dist_table = pd.DataFrame(fitted_nn.kneighbors(X)[0])
  # Column 0 is left out of the average - presumably it is each point's zero
  # distance to itself, since the query set equals the fitted set.
  mean_per_point = dist_table.iloc[:, 1:].mean(axis=1)
  ascending = mean_per_point.sort_values().to_numpy()
  knn_curve = pd.DataFrame({'sorted_point': range(len(ascending)), 'avg_distance': ascending})
  # plot knng
  fig = px.line(
      knn_curve,
      x="sorted_point",
      y="avg_distance",
      color_discrete_sequence=px.colors.qualitative.Pastel,
      text="avg_distance",
      title='k-nearest neighbor graph, K='+str(neighbors),
  )
  fig.update_traces(textposition="bottom right")
  fig.show()

In [None]:
def opt_params_dbscan_silhouette(X,search_grid_epsilon_lst,search_grid_min_pts_lst):
  """Compare paired (epsilon, min_pts) DBSCAN settings by silhouette score.

  The two lists are paired element by element (first epsilon with first
  min_pts, and so on); if their lengths differ the extra entries of the longer
  one are ignored, as zip() does. For every pair a DBSCAN model is fitted on X
  and scored with the euclidean silhouette score, and the scores are plotted.

  Parameters
  ----------
  X : feature matrix passed unchanged to DBSCAN.fit and silhouette_score.
  search_grid_epsilon_lst : iterable of eps values.
  search_grid_min_pts_lst : iterable of min_samples values.

  Returns
  -------
  None. The figure is displayed with fig3.show().
  """
  # Iterate over the pairs directly. Going through DataFrame.iterrows would
  # upcast the integer min_pts to float (e.g. 12 -> 12.0), both in the
  # min_samples argument and in the plotted label.
  records = []
  for min_pts, epsilon in zip(search_grid_min_pts_lst, search_grid_epsilon_lst):
    labels = DBSCAN(eps=epsilon, min_samples=min_pts).fit(X).labels_
    n_labels = len(set(labels))
    # The silhouette score is only defined for 2..n_samples-1 distinct labels;
    # record NaN for degenerate results (e.g. everything marked as noise)
    # instead of aborting the whole search.
    if 2 <= n_labels <= len(labels) - 1:
      score = sklearn.metrics.silhouette_score(X, labels, metric='euclidean')
    else:
      score = float('nan')
    records.append({'params':"epsilon="+str(epsilon)+", +min_pts="+str(min_pts),'silhouette_score':score,'fotmated_score':"%.2f" % score})
  # Build the frame once; DataFrame.append no longer exists in pandas 2.x.
  df_results = pd.DataFrame(records, columns=['params','silhouette_score','fotmated_score'])
  fig3 = px.line(df_results, x="params", y="silhouette_score",color_discrete_sequence=px.colors.qualitative.Pastel, text="fotmated_score",title='DBSCAN-find the best params')
  fig3.update_traces(textposition="bottom right")
  fig3.show()

## Load data from google drive

In [None]:
# Mount Google Drive (Colab) and read the clustering data set from it.
drive.mount('/content/drive')
path = "/content/drive/MyDrive/data_files/clustering_data.csv"
raw_data_c = (
    pd.read_csv(path)
    # 'Unnamed: 0' is presumably the index column written when the CSV was saved
    .drop(columns=['Unnamed: 0'])
    # "feature 1" .. "feature 10" -> "feature_1" .. "feature_10"
    .rename(columns={f"feature {i}": f"feature_{i}" for i in range(1, 11)})
)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Sanity check of the loaded frame: print its shape, then display the first rows
# (head() is the cell's last expression, so the notebook renders it).
print("Train shape: ",raw_data_c.shape)
raw_data_c.head()

Train shape:  (2500, 10)


Unnamed: 0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,feature_10
0,-1.78422,9.139177,2.517077,-6.6913,3.332583,-6.189314,-8.294717,-5.527558,-9.150377,-3.437419
1,4.513056,-6.242134,-9.764999,8.975386,0.213133,-5.610271,6.194787,-0.247325,-1.000023,-9.091131
2,1.07657,-6.806062,-11.354346,11.589776,1.223487,-4.481565,5.119435,-0.036395,-2.30885,-5.820222
3,-0.203146,-6.098289,10.389102,-3.52788,-6.376361,3.444334,-0.331659,-3.78406,-6.922082,-3.463537
4,-4.488164,9.539276,0.414895,-3.298754,4.835677,-6.927819,-9.12349,-3.135282,-10.179017,-3.273429


## scale data

Since the clustering methods are based on distances, we need to make sure all our features are on the same scale.

In [None]:
# Min-max scale every column of the raw data and keep the original column names.
scaler2 = MinMaxScaler().fit(raw_data_c)
scaled2 = scaler2.transform(raw_data_c)
clustering_data = pd.DataFrame(scaled2, columns=list(raw_data_c.columns))

## feature selection - based on feature correlation

In [None]:
# Pairwise correlation between all scaled features (DataFrame.corr with its
# default method); used below to spot redundant features.
corr_matrix2 = clustering_data.corr()
print(corr_matrix2)

            feature_1  feature_2  feature_3  feature_4  feature_5  feature_6  \
feature_1    1.000000  -0.208650  -0.764973   0.201233  -0.026168   0.361703   
feature_2   -0.208650   1.000000   0.235537  -0.670975   0.652018  -0.314174   
feature_3   -0.764973   0.235537   1.000000  -0.599773  -0.259167   0.174779   
feature_4    0.201233  -0.670975  -0.599773   1.000000  -0.049767  -0.318775   
feature_5   -0.026168   0.652018  -0.259167  -0.049767   1.000000  -0.626181   
feature_6    0.361703  -0.314174   0.174779  -0.318775  -0.626181   1.000000   
feature_7    0.891290  -0.468969  -0.800241   0.449403  -0.122889   0.357148   
feature_8    0.294168  -0.050696  -0.379975   0.301713   0.354999  -0.024913   
feature_9   -0.394945  -0.387142  -0.064906   0.685216   0.096788  -0.522700   
feature_10   0.476305   0.426279  -0.120283  -0.507082   0.218205   0.408591   

            feature_7  feature_8  feature_9  feature_10  
feature_1    0.891290   0.294168  -0.394945    0.476305  
fea

Correlated features (absolute correlation values):
feature_1&feature_3 0.76

feature_1&feature_7 0.89

feature_3&feature_7 0.8

Since feature_1 and feature_3 are highly correlated with feature_7, we can keep only feature_7 and remove feature_1 and feature_3, which add little independent information.

In [None]:
# Keep every feature except feature_1 and feature_3 (dropped as redundant with feature_7).
clustering_data=clustering_data[['feature_2','feature_4','feature_5','feature_6','feature_7','feature_8','feature_9','feature_10']]

##  Kmeans

### optimization

In [None]:
# Grid search for the best K: tries K = 2..14 (the end of the range is exclusive).
opt_kmeans_k_silhouette(clustering_data,2,15) #searchgrid for the best K

- The best K is the one with the best (highest) silhouette score - K=5
- There is no major difference between the algorithm methods; we will choose elkan

### evaluation

In [None]:
# Evaluate K-means with the chosen K=5 and the 'elkan' algorithm.
present_eval_for_chosen_model(clustering_data,'kmeans',k=5,algo_method='elkan') # model_name = kmeans/dbscan

The split of the data between clusters: 
 1    502
3    500
0    500
2    499
4    499
Name: label, dtype: int64
Davis Bouldin score:  0.2913


## DB scan

### optimization

In [None]:
# Re-select the eight feature columns so that only features (and no 'label'
# column left over from an earlier evaluation call) reach the DBSCAN steps.
clustering_data=clustering_data[['feature_2','feature_4','feature_5','feature_6','feature_7','feature_8','feature_9','feature_10']]

In [None]:
# Candidate min_pts values centred on 2 * number_of_features, in steps of 2
# (with the 8 selected features: 12, 14, 16, 18, 20).
min_pts_search_lst=range((clustering_data.shape[1]*2)-4,(clustering_data.shape[1]*2)+5,2)
# One k-nearest-neighbour distance graph per candidate; epsilon is read off each graph.
for min_pts in min_pts_search_lst:
  find_the_best_epsilon_for_dbscan_knng(clustering_data,min_pts)

Using the k-nearest-neighbor graph (KNNG) we can find, for a given k, the best epsilon: the distance at the "elbow" of the curve, where the sorted average distance starts to rise sharply.

In [None]:
# Epsilons read from the graphs above, one per min_pts candidate and in the same order.
chosen_epsilons = [0.22, 0.214, 0.213, 0.22, 0.218] #matching epsilon to the k in the graphs
# Score each (epsilon, min_pts) pair with the silhouette score to pick the best model.
opt_params_dbscan_silhouette(clustering_data,chosen_epsilons,min_pts_search_lst) #search grid for the combination of min-points and epsilon for the best model

The best parameters are the ones with the best (highest) silhouette score - epsilon=0.218, min_points=20
(no major difference between the candidates)

### evaluation

In [None]:
# Evaluate DBSCAN with the chosen parameters (epsilon=0.218, min_samples=20).
present_eval_for_chosen_model(clustering_data,'dbscan',epsilon=0.218,min_samples=20)

The split of the data between clusters: 
 1     500
2     500
0     499
3     499
4     499
-1      3
Name: label, dtype: int64
Davis Bouldin score:  0.4436


## comparison between Kmeans and DBSCAN

- In terms of number of clusters - both models have 5 clusters
- In terms of balanced clusters, we can see similar numbers (~1/5 of the data in each cluster)
- In terms of unlabeled data points, DBSCAN has 3 unlabeled (noise, label -1) data points; from the visualisation it seems those points should have been clustered.
- **In terms of DB score: Kmeans has the better score (lower score) 0.2913 vs 0.4436**
- In terms of visualisation, we can see a good separation between the clusters; in DBSCAN, the separation between the close clusters is a bit worse.

Overall, Kmeans has better performance for clustering the given dataset.