# Libraries


In [21]:
import pandas as pd
from collections import Counter
import re
from sklearn.feature_extraction.text import CountVectorizer
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pycountry
from sklearn.decomposition import PCA
from sklearn.cluster import DBSCAN
import gower
from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans
from sklearn import metrics
from scipy.stats import chi2_contingency
from scipy.stats import f_oneway


# Clustering

## Loading the PCA data and Gower Distance data

In [22]:
# Path settings csv file
csv_filepath = '../data/interim/pca.csv'

# Loading the csv file
data_pca = pd.read_csv(csv_filepath)
print("Data loaded successfully")

Data loaded successfully


In [23]:
# Path settings csv file
csv_filepath = '../data/interim/gower.csv'

# Loading the csv file
data_gower = pd.read_csv(csv_filepath)
print("Data loaded successfully")

Data loaded successfully


## Gridsearch Optimal Hyperparamters for PCA Data

In [24]:
start = 0.5
end = 30
step = 0.5
eps_values = np.arange(start, end, step)

In [25]:
start = 2
end = 50
step = 2
min_samples = np.arange(start, end, step)

In [26]:
df_analysis = pd.DataFrame(columns = ["eps_value", "min_sample", "score", "n_clusters", "cluster_types"])

for eps_value in eps_values:
    for min_sample in min_samples:
    
    # Perform DBSCAN clustering
        dbscan_cluster = DBSCAN(eps=eps_value, min_samples=min_sample, metric="euclidean")
        labels = dbscan_cluster.fit_predict(data_pca)
        n_clusters = len(set(labels))
        cluster_types = set(labels)
        try:
            score = metrics.silhouette_score(data_pca, labels)
        except ValueError:
            score = 0
        new_row = {'eps_value': eps_value, 'min_sample': min_sample, 'score' : score, 'n_clusters' : n_clusters, 'cluster_types': cluster_types}
        # Adding the new row without using append
        df_analysis.loc[len(df_analysis)] = new_row

In [27]:
# I want at least 3 Clusters so I query the for n > 2
df_analysis_3 = df_analysis.query("n_clusters > 2")

# Determine the best silhouette scores for clusters that are bigger than 2
best_clusters = df_analysis_3[df_analysis_3['score'] == df_analysis_3['score'].max()]
best_clusters

Unnamed: 0,eps_value,min_sample,score,n_clusters,cluster_types
136,3.0,34,0.105063,3,"{0, 1, -1}"


## Gridsearch Optimal Hyperparamters for Gower Distance

In [28]:
start = 0.1
end = 1
step = 0.02
eps_values = np.arange(start, end, step)

In [29]:
start = 2
end = 50
step = 2
min_samples = np.arange(start, end, step)

In [30]:
df_analysis_gower = pd.DataFrame(columns = ["eps_value", "min_sample", "score", "n_clusters", "cluster_types"])

for eps_value in eps_values:
    for min_sample in min_samples:
    
    # Perform DBSCAN clustering
        dbscan_cluster = DBSCAN(eps=eps_value, min_samples=min_sample, metric="precomputed")
        labels = dbscan_cluster.fit_predict(data_gower)
        n_clusters = len(set(labels))
        cluster_types = set(labels)
        try:
            score = metrics.silhouette_score(data_gower, labels)
        except ValueError:
            score = 0
        new_row = {'eps_value': eps_value, 'min_sample': min_sample, 'score' : score, 'n_clusters' : n_clusters, 'cluster_types': cluster_types}
        
        # Adding the new row without using append
        df_analysis_gower.loc[len(df_analysis_gower)] = new_row

In [31]:
# I want at least 3 Clusters so I query n > 2
df_analysis_gower_3 = df_analysis_gower.query("n_clusters > 2")

# Determine the best silhouette scores for clusters that are bigger than 2
best_clusters_gower = df_analysis_gower_3[df_analysis_gower_3['score'] == df_analysis_gower_3['score'].max()]
best_clusters_gower

Unnamed: 0,eps_value,min_sample,score,n_clusters,cluster_types
83,0.16,24,0.213477,3,"{0, 1, -1}"


## PCA Clustering

### Clustering with the set Hyperparameters PCA

In [32]:
# Best parameters
best_eps_value = 3.0
best_min_sample = 34

# Perform DBSCAN clustering with the best parameters
best_dbscan_cluster = DBSCAN(eps=best_eps_value, min_samples=best_min_sample, metric="euclidean")
cluster_labels = best_dbscan_cluster.fit_predict(data_pca)

# Find number of clusters and noise points
num_clusters = len(set(cluster_labels)) 
num_noise_points = list(cluster_labels).count(-1)

print("Number of clusters:", num_clusters)

# Calculate the number of data points in each cluster
unique_clusters, counts = np.unique(cluster_labels, return_counts=True)

# Output the number of data points in each cluster
for cluster, count in zip(unique_clusters, counts):
    if cluster == -1:
        print(f"Noise points: {count}")
    else:
        print(f"Cluster {cluster}: {count} data points")

Number of clusters: 3
Noise points: 525
Cluster 0: 555 data points
Cluster 1: 353 data points


## Gower Clustering

### Clustering with the set Hyperparameters Gower

In [33]:
# Best parameters
best_eps_value = 0.16
best_min_sample = 24

# Perform DBSCAN clustering with the best parameters
best_dbscan_cluster_gower = DBSCAN(eps=best_eps_value, min_samples=best_min_sample, metric="precomputed")
cluster_labels_gower = best_dbscan_cluster_gower.fit_predict(data_gower)

# Find number of clusters and noise points
num_clusters_gower = len(set(cluster_labels_gower)) 
num_noise_points_gower = list(cluster_labels_gower).count(-1)

print("Number of clusters:", num_clusters_gower)

# Calculate the number of data points in each cluster
unique_clusters_gower, counts_gower = np.unique(cluster_labels_gower, return_counts=True)

# Output the number of data points in each cluster
for cluster_gower, count_gower in zip(unique_clusters_gower, counts_gower):
    if cluster_gower == -1:
        print(f"Noise points: {count_gower}")
    else:
        print(f"Cluster {cluster_gower}: {count_gower} data points")

Number of clusters: 3
Noise points: 487
Cluster 0: 564 data points
Cluster 1: 382 data points


# Comparison between Gower Clusters and PCA Clusters

In [34]:
df_original_pca = pd.read_csv('../data/interim/final_df.csv')
df_original_pca['clusters'] = cluster_labels 

In [35]:
df_original = pd.read_csv('../data/interim/final_df.csv')
df_original['clusters'] = cluster_labels_gower
df_original.to_csv('../data/processed/df_analysis.csv', index=False)

In [36]:
df_original_0 = df_original.query("clusters == 0")
df_original_pca_0 = df_original_pca.query("clusters == 0")
df_original_1 = df_original.query("clusters == 1")
df_original_pca_1 = df_original_pca.query("clusters == 1")
df_original_n1 = df_original.query("clusters == -1")
df_original_pca_n1 = df_original_pca.query("clusters == -1")

In [37]:
def count_similar_values(list1, list2):
    # Convert lists to sets for efficient comparison
    set1 = set(list1)
    set2 = set(list2)
    
    # Calculate the intersection of the two sets
    intersection = set1.intersection(set2)
    
    # Return the number of elements in the intersection
    return len(intersection)

## Cluster 0

In [38]:
union_cluster_0 = count_similar_values(df_original_0.index, df_original_pca_0.index)
print(union_cluster_0 / len(df_original_0.index))
print(union_cluster_0 / len(df_original_pca_0.index))

0.925531914893617
0.9405405405405406


## Cluster 1

In [39]:
union_cluster_1 = count_similar_values(df_original_1.index, df_original_pca_1.index)
print(union_cluster_1 / len(df_original_1.index))
print(union_cluster_1 / len(df_original_pca_1.index))

0.7041884816753927
0.7620396600566572


## Cluster -1

In [40]:
union_cluster_n1 = count_similar_values(df_original_n1.index, df_original_pca_n1.index)
print(union_cluster_n1 / len(df_original_n1.index))
print(union_cluster_n1 / len(df_original_pca_n1.index))

0.7700205338809035
0.7142857142857143


## Conclusion

As we have seen, we achieve a similar result with a dimension reduction using PCA as well as with the help of Gower Distance and a subsequent clustering with DBSCAN. For the sake of simplicity, however, I will only use the results of the data set processed with Gower Distance for the further analysis of the clustering, as this provides a better silhouette score and therefore a better clustering. 