In [1]:
# Import necessary libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans, AgglomerativeClustering,DBSCAN
from sklearn.mixture import GaussianMixture
from sklearn.metrics import silhouette_score
from sklearn.decomposition import PCA
from sklearn.model_selection import ParameterGrid
import warnings
from sklearn.model_selection import train_test_split
from first_approach import preprocess_data
from second_approach import processed_df
np.random.seed(42)
warnings.filterwarnings('ignore')


# Clustering Algorithms with Hyperparameter Research and 3D Visualization

In [5]:
data=processed_df
train, test = train_test_split(
   data, test_size=0.3, random_state=42
) 


print("Dataset Info:")
print(data.info())
print("\nDataset Description:")
print(data.describe())

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9864 entries, 0 to 9863
Data columns (total 13 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Administrative           9864 non-null   float64
 1   Administrative_Duration  9864 non-null   float64
 2   ProductRelated           9864 non-null   float64
 3   ProductRelated_Duration  9864 non-null   float64
 4   ExitRates                9864 non-null   float64
 5   Month                    9864 non-null   float64
 6   OperatingSystems         9864 non-null   float64
 7   Browser                  9864 non-null   float64
 8   Region                   9864 non-null   float64
 9   TrafficType              9864 non-null   float64
 10  VisitorType              9864 non-null   float64
 11  Weekend                  9864 non-null   float64
 12  Revenue                  9864 non-null   float64
dtypes: float64(13)
memory usage: 1001.9 KB
None

Dataset Description

In [None]:
def apply_pca(data, n_components=3):
    pca = PCA(n_components=n_components)
    data_pca = pca.fit_transform(data)
    return pd.DataFrame(data_pca, columns=[f'PC{i+1}' for i in range(n_components)])

data_pca = apply_pca(train)

## Data Preprocessing

In [None]:
def apply_pca(data, n_components=3):
    pca = PCA(n_components=n_components)
    data_pca = pca.fit_transform(data)
    return pd.DataFrame(data_pca, columns=[f'PC{i+1}' for i in range(n_components)])

data_pca = apply_pca(train)

In [7]:
data_pca

Unnamed: 0,PC1,PC2,PC3
0,-1.212880,-1.802703,0.711119
1,0.523544,0.760199,-0.848283
2,-0.688062,0.637907,-1.132569
3,-0.836235,-0.114541,-1.733480
4,0.671098,-1.245651,0.639613
...,...,...,...
6899,1.803100,1.372228,2.038628
6900,0.103510,-1.466053,0.832955
6901,-0.411104,-0.176370,-0.700667
6902,1.859598,0.016542,0.612038


## Hyperparameter Research


In [8]:

def evaluate_clustering(data, labels):
    score = silhouette_score(data, labels)
    return score


def kmeans_hyperparameter_research(data, param_grid):
    best_score = -1
    best_params = None
    for params in ParameterGrid(param_grid):
        kmeans = KMeans(**params, random_state=42)
        labels = kmeans.fit_predict(data)
        score = evaluate_clustering(data, labels)
        if score > best_score:
            best_score = score
            best_params = params
    return best_params, best_score


def hierarchical_hyperparameter_research(data, param_grid):
    best_score = -1
    best_params = None
    for params in ParameterGrid(param_grid):
        agg_clustering = AgglomerativeClustering(**params)
        labels = agg_clustering.fit_predict(data)
        score = evaluate_clustering(data, labels)
        if score > best_score:
            best_score = score
            best_params = params
    return best_params, best_score


def gmm_hyperparameter_research(data, param_grid):
    best_score = -1
    best_params = None
    for params in ParameterGrid(param_grid):
        gmm = GaussianMixture(**params, random_state=42)
        labels = gmm.fit_predict(data)
        score = evaluate_clustering(data, labels)
        if score > best_score:
            best_score = score
            best_params = params
    return best_params, best_score

def dbscan_hyperparameter_research(data, param_grid):
    best_score = -1
    best_params = None
    for params in ParameterGrid(param_grid):
        dbscan = DBSCAN(**params)
        labels = dbscan.fit_predict(data)
        # Silhouette score requires at least 2 clusters
        if len(set(labels)) > 1:
            score = evaluate_clustering(data, labels)
            if score > best_score:
                best_score = score
                best_params = params
    return best_params, best_score

In [9]:

kmeans_param_grid = {
    'n_clusters': [2, 3, 4, 5],
    'init': ['k-means++', 'random'],
    'n_init': [10, 20],
    'max_iter': [300, 600]
}

hierarchical_param_grid = {
    'n_clusters': [2, 3, 4, 5],
    'linkage': ['ward', 'complete', 'average', 'single']
}

gmm_param_grid = {
    'n_components': [2, 3, 4, 5],
    'covariance_type': ['full', 'tied', 'diag', 'spherical']
}

dbscan_param_grid = {
    'eps': [0.3, 0.5, 0.7],
    'min_samples': [5, 10, 15]
}

In [10]:
print("\nK-Means Hyperparameter Research:")
best_kmeans_params, best_kmeans_score = kmeans_hyperparameter_research(data_pca, kmeans_param_grid)
print(f"Best K-Means Params: {best_kmeans_params}, Best Score: {best_kmeans_score:.2f}")


K-Means Hyperparameter Research:


  File "c:\anaconda\lib\site-packages\joblib\externals\loky\backend\context.py", line 217, in _count_physical_cores
    raise ValueError(


Best K-Means Params: {'init': 'random', 'max_iter': 300, 'n_clusters': 4, 'n_init': 10}, Best Score: 0.34


In [11]:
print("\nHierarchical Clustering Hyperparameter Research:")
best_hierarchical_params, best_hierarchical_score = hierarchical_hyperparameter_research(data_pca, hierarchical_param_grid)
print(f"Best Hierarchical Params: {best_hierarchical_params}, Best Score: {best_hierarchical_score:.2f}")


Hierarchical Clustering Hyperparameter Research:
Best Hierarchical Params: {'linkage': 'single', 'n_clusters': 2}, Best Score: 0.54


In [12]:

print("\nGMM Hyperparameter Research:")
best_gmm_params, best_gmm_score = gmm_hyperparameter_research(data_pca, gmm_param_grid)
print(f"Best GMM Params: {best_gmm_params}, Best Score: {best_gmm_score:.2f}")



GMM Hyperparameter Research:
Best GMM Params: {'covariance_type': 'spherical', 'n_components': 2}, Best Score: 0.37


In [13]:
print("\nDBSCAN Hyperparameter Research:")
best_dbscan_params, best_dbscan_score = dbscan_hyperparameter_research(data_pca, dbscan_param_grid)
print(f"Best DBSCAN Params: {best_dbscan_params}, Best Score: {best_dbscan_score:.2f}")


DBSCAN Hyperparameter Research:
Best DBSCAN Params: {'eps': 0.7, 'min_samples': 5}, Best Score: 0.44


In [14]:
data=processed_df
train, test = train_test_split(
   data, test_size=0.3, random_state=42
) 
def apply_pca(data, n_components=2):
    pca = PCA(n_components=n_components)
    data_pca = pca.fit_transform(data)
    return pd.DataFrame(data_pca, columns=[f'PC{i+1}' for i in range(n_components)])

data_pca = apply_pca(train)

print("\nK-Means Hyperparameter Research:")
best_kmeans_params, best_kmeans_score = kmeans_hyperparameter_research(data_pca, kmeans_param_grid)
print(f"Best K-Means Params: {best_kmeans_params}, Best Score: {best_kmeans_score:.2f}")

print("\nHierarchical Clustering Hyperparameter Research:")
best_hierarchical_params, best_hierarchical_score = hierarchical_hyperparameter_research(data_pca, hierarchical_param_grid)
print(f"Best Hierarchical Params: {best_hierarchical_params}, Best Score: {best_hierarchical_score:.2f}")

print("\nGMM Hyperparameter Research:")
best_gmm_params, best_gmm_score = gmm_hyperparameter_research(data_pca, gmm_param_grid)
print(f"Best GMM Params: {best_gmm_params}, Best Score: {best_gmm_score:.2f}")

print("\nDBSCAN Hyperparameter Research:")
best_dbscan_params, best_dbscan_score = dbscan_hyperparameter_research(data_pca, dbscan_param_grid)
print(f"Best DBSCAN Params: {best_dbscan_params}, Best Score: {best_dbscan_score:.2f}")


K-Means Hyperparameter Research:
Best K-Means Params: {'init': 'random', 'max_iter': 300, 'n_clusters': 2, 'n_init': 10}, Best Score: 0.39

Hierarchical Clustering Hyperparameter Research:
Best Hierarchical Params: {'linkage': 'single', 'n_clusters': 2}, Best Score: 0.50

GMM Hyperparameter Research:
Best GMM Params: {'covariance_type': 'tied', 'n_components': 2}, Best Score: 0.45

DBSCAN Hyperparameter Research:
Best DBSCAN Params: {'eps': 0.5, 'min_samples': 15}, Best Score: 0.55
