In [1]:
import numpy as np
from concurrent.futures import ProcessPoolExecutor, as_completed
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import umap


In [2]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import KNNImputer
from sklearn.preprocessing import OneHotEncoder,OrdinalEncoder, MinMaxScaler
from sklearn.decomposition import PCA
import numpy as np
from sklearn.cluster import KMeans
from kneed import KneeLocator
import numpy as np
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
# import umap
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering, SpectralClustering
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score
from sklearn.preprocessing import MinMaxScaler, StandardScaler
import umap.umap_ as umap
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score, davies_bouldin_score
# from tqdm import tqdm 
# import multiprocessing as mp
import multiprocess as mp

In [3]:

class DimensionReductionOptimiser:
    def __init__(self, data):
        self.data = data
        self.pca_threshold = 0.95
        self.tsne_min_components = 2
        self.tsne_max_components = 3
        self.umap_min_components = 2
        self.umap_max_components = 10
          
    def pca_n_components_optimizer(self):
        print('  ** Selecting best N component for PCA')
        # Initialize PCA
        pca = PCA()

        # Fit PCA
        X_pca = pca.fit_transform(self.data)
        cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
        best_n_components = len(cumulative_variance[cumulative_variance <= self.pca_threshold])  # Select PCs covering 95% variance
        print(f'  ** best N component: {best_n_components}')
        return 'PCA', best_n_components

    def tsne_n_components_optimizer(self):
        print('  ** Selecting best N component for TSNE')
        
        best_n_components = None
        best_score = -1

        for n_components in range(self.tsne_min_components, self.tsne_max_components + 1):
            tsne = TSNE(n_components=n_components)
            transformed_data = tsne.fit_transform(self.data)
            
            # Example with KMeans clustering (can replace with other methods)
            kmeans = KMeans(n_clusters=4)
            kmeans.fit(transformed_data)
            labels = kmeans.labels_
            
            if len(set(labels)) > 1:
                score = silhouette_score(transformed_data, labels)
                if score > best_score:
                    best_score = score
                    best_n_components = n_components

        print(f'  ** best N component for TSNE: {best_n_components}')
        return 'TSNE', best_n_components

    def umap_n_components_optimizer(self):
        print('  ** Selecting best N component for UMAP')
        
        best_n_components = None
        best_score = -1

        for n_components in range(self.umap_min_components, self.umap_max_components + 1):
            reducer = umap.UMAP(n_components=n_components)
            reduced_data = reducer.fit_transform(self.data)
            
            # Example with KMeans clustering (can replace with other methods)
            kmeans = KMeans(n_clusters=4)
            kmeans.fit(reduced_data)
            labels = kmeans.labels_
            
            if len(set(labels)) > 1:
                score = silhouette_score(reduced_data, labels)
                if score > best_score:
                    best_score = score
                    best_n_components = n_components

        print(f'  ** best N component for UMAP: {best_n_components}')
        return 'UMAP', best_n_components
    
    def optimize_all(self):
        with ProcessPoolExecutor() as executor:
            # Submit tasks to the executor
            futures = {
                executor.submit(self.pca_n_components_optimizer): 'PCA',
                executor.submit(self.tsne_n_components_optimizer): 'TSNE',
                executor.submit(self.umap_n_components_optimizer): 'UMAP'
            }
            
            for future in as_completed(futures):
                method = futures[future]
                try:
                    result = future.result()
                    print(f'{method} result: {result}')
                except Exception as exc:
                    print(f'{method} generated an exception: {exc}')

# Example usage:
if __name__ == '__main__':
    # Replace this with your dataset
    data = np.random.rand(100, 50)  # Example dataset with 100 samples and 50 features

    # Initialize the optimizer class with data
    optimiser = DimensionReductionOptimiser(data)

    # Run optimizers in parallel
    optimiser.optimize_all()


PCA generated an exception: A process in the process pool was terminated abruptly while the future was running or pending.
TSNE generated an exception: A process in the process pool was terminated abruptly while the future was running or pending.
UMAP generated an exception: A process in the process pool was terminated abruptly while the future was running or pending.


In [12]:
     

# Class to fetch the best n_cluster parameter for the clustering models

class clustering_model_optimizer:
    def __init__(self, data):
        self.data = data
        self.min_clusters = 4
        self.max_clusters = 10
        
        
    def kmeans_n_cluster_optimizer(self):
        print('  *** Entered the elbow_plot method of the KMeansClustering class')
        wcss=[] # initializing an empty list
        for i in range (self.min_clusters,self.max_clusters+1):
            kmeans=KMeans(n_clusters=i,init='k-means++',random_state=42) # initializing the KMeans object
            kmeans.fit(self.data) # fitting the data to the KMeans Algorithm
            wcss.append(kmeans.inertia_)
        kn = KneeLocator(range(self.min_clusters,self.max_clusters+1), wcss, curve='convex', direction='decreasing')
        self.best_N_cluster =  kn.knee    

    def agglomerative_spectural_n_cluster_optimizer(self,model_name):
        print(f"selecting best n_cluster parameter for model:{model_name}")
        score_dict = {'n_clusters':[],
                     'silhouette_score':[],
                     'DB_score':[]}

        for n_clusters in range( self.min_clusters, self.max_clusters + 1):
            if model_name == 'agglomerative':
                model = AgglomerativeClustering(n_clusters=n_clusters)
            elif model_name == 'spectral':
                print(n_clusters)
                model = SpectralClustering(n_clusters=n_clusters)
            else:
                raise ValueError('Unknown cluster model name')
                
            labels = model.fit_predict(self.data)
            silhouette_score_ = silhouette_score(self.data, labels)
            davies_bouldin_score_ = davies_bouldin_score(self.data, labels)
            score_dict['n_clusters'].append(n_clusters)
            score_dict['silhouette_score'].append(silhouette_score_)
            score_dict['DB_score'].append(davies_bouldin_score_)
        score_df = pd.DataFrame(score_dict).sort_values(['silhouette_score','DB_score'],ascending=[False, True])        
        self.best_N_cluster = score_df.n_clusters[0]
        
    def agglomerative_n_cluster_optimizer(self):
        self.agglomerative_spectural_n_cluster_optimizer(model_name='agglomerative')
    def spectral_n_cluster_optimizer(self):
        self.agglomerative_spectural_n_cluster_optimizer(model_name='spectral')




class best_clustering_model_selector:
    def __init__(self, df):
        print('-'*50)
        print('Data Loaded')
        self.best_n_cluster = None
        self.data = df
        self.scalers = {
                        'normalization': MinMaxScaler(),
                        'standardization': StandardScaler()
                        }
        self.dimension_reduction={
                        'PCA': PCA(),
                        'UMAP':umap.UMAP(),
                        'TSNE': TSNE()
                        }

        self.clusters = {
                        'KMeans':KMeans(random_state=42),
                        'DBSCAN':DBSCAN(eps=0.5, min_samples=5),
                        'Agglomerative':AgglomerativeClustering(),
                        'Spectral':SpectralClustering(random_state=42)
                        }
        # self.feature_engineering()
        
        
    def prepare_data_for_preprocessing(self):
    # Build preprocessing pipeline for categorical features
        print(self.data.shape)
        self.data.set_index('Customer ID',inplace=True)
        print(self.data.shape)
        
        one_hot_cat_col = []
        ordinal_cat_col = []
        # creating dummies
        numerical_features = self.data.select_dtypes(include=['int64', 'float64']).columns
        categorical_features = self.data.select_dtypes(include=['object']).columns
        for feature in categorical_features:
            unique_values = self.data[feature].nunique()
            if unique_values < 4:
                one_hot_cat_col.append(feature)
            else:
                ordinal_cat_col.append(feature)
        self.one_hot_cat_col = one_hot_cat_col
        self.ordinal_cat_col = ordinal_cat_col
        self.numerical_col = numerical_features
        
    def data_preprocess_pipeline(self):
        # Build preprocessing pipeline for numerical features
        print("** KNN imputer for missing values")
        numerical_pipeline = Pipeline(steps=[
            ('imputer', KNNImputer(n_neighbors=5)),
        #     ('scaler', MinMaxScaler())
        ])
        # Build preprocessing pipeline for categorical features
        print("** One hot encoding for the Categorical features with less than 4 unique values")
        
        onehot_categorical_pipeline = Pipeline(steps=[
            ('onehot_encoding', OneHotEncoder( handle_unknown='ignore'))])
        
        print("** Ordinal encoding for the Categorical features with greater than 4 unique values")

        ordinal_categorical_pipeline = Pipeline(steps=[
            ('ordinal_encoding', OrdinalEncoder())

        ])
        column_transform = ColumnTransformer([
            ('numerical_columns', numerical_pipeline, self.numerical_col),
            ('onehot_categorical_columns', onehot_categorical_pipeline, self.one_hot_cat_col),
            ('ordinal_categorical_columns', ordinal_categorical_pipeline, self.ordinal_cat_col)   
        ])
        self.trans_df = pd.DataFrame(column_transform.fit_transform(self.data)) 
        one_hot_cols = list(onehot_categorical_pipeline.fit(self.data[self.one_hot_cat_col]).get_feature_names_out())   
        trans_cols = list(self.numerical_col)+one_hot_cols+list(self.ordinal_cat_col)  
        self.trans_df.columns = trans_cols 
        
        
    def remove_constant_multicolinear_feature(self):
        ### Removing constant features
        print('** Removing Constant features with 0 std')
        std_df = self.trans_df.describe().T['std']
        const_feature_list = list(std_df[std_df==0].index)
        if len(const_feature_list) > 0:
            self.trans_df.drop(const_feature_list,axis=1,inplace=True)

        ### Reducing number of features using correlation matrix 
        print('** Removing Multicolinear features')
        corr_mat = self.trans_df.corr().abs()
        upper_cor_mat_df = corr_mat.where(np.triu(np.ones(corr_mat.shape),k=1).astype(bool))
        drop_col = [col for col in upper_cor_mat_df.columns if any(upper_cor_mat_df[col] > 0.85)]

        self.data.reset_index(inplace=True)
        self.trans_df = pd.concat([self.data['Customer ID'],self.trans_df],axis=1)
        self.trans_df.drop(drop_col,axis=1,inplace=True)
#         self.trans_df.set_index('Customer ID',inplace=True)
        
    def feature_engineering(self):
        print('-'*50)
        print('Entering Data preprocessing ')
        self.prepare_data_for_preprocessing()
        print('--Column transformation')
        self.data_preprocess_pipeline()
        print('--Feature Removing')
        self.remove_constant_multicolinear_feature()
        
    def scaling_data(self,scaling_method):
        if scaling_method in self.scalers.keys():
            # Build preprocessing pipeline for numerical features
            print("** Scaling Numerical features")
            scaling_pipeline = Pipeline(steps=[('scaler', self.scalers[scaling_method])])
            temp_df = self.trans_df.drop(self.numerical_col,axis=1)
            column_transform = ColumnTransformer([('numerical_columns', scaling_pipeline, self.numerical_col)])
            self.scaled_df = pd.DataFrame(column_transform.fit_transform(self.trans_df))
            self.scaled_df.columns=self.numerical_col
            self.scaled_df = pd.concat([temp_df,self.scaled_df],axis=1)
            self.scaled_df.set_index('Customer ID',inplace=True)
            
        else:
            raise ValueError("Unknown Data Scaling method")
            
       
    
    def reduce_dimensions(self, dimension_method='PCA'):
        if dimension_method in self.dimension_reduction.keys():

            # Create an instance of the class
            n_comp_selector = dimension_redution_optimiser(self.scaled_df)

            # Construct the method name dynamically and call it
            method_name = f'{dimension_method.lower()}_n_components_optimizer'
            # Use getattr to call the method
            method = getattr(n_comp_selector, method_name, None)
            if method:
                method()  # Call the dynamically selected method
                self.dim_red_n_component = n_comp_selector.best_n_components
                self.reducer = self.dimension_reduction[dimension_method]
                self.reducer.n_components = self.dim_red_n_component
            else:
                raise ValueError(f"{method} not found")
            
        else:
            raise ValueError("Unknown dimensionality reduction method")
        
        self.reduced_df = self.reducer.fit_transform(self.scaled_df)
    
    
    
    
    def clustering_data(self, cluster_method='KMeans'):
        if cluster_method in self.clusters.keys():
            
            if cluster_method != 'DBSCAN':
                # Create an instance of the class
                n_cluster_selector = clustering_model_optimizer(self.reduced_df)

                # Construct the method name dynamically and call it
                method_name = f'{cluster_method.lower()}_n_cluster_optimizer'
                # Use getattr to call the method
                method = getattr(n_cluster_selector, method_name, None)
                if method:
                    method()  # Call the dynamically selected method
                    self.best_n_cluster = n_cluster_selector.best_N_cluster
                    self.clusterer = self.clusters[cluster_method]
                    self.clusterer.n_clusters = self.best_n_cluster
                else:
                    raise ValueError(f"{method} not found")
            elif cluster_method == 'DBSCAN':
                self.clusterer = self.clusters[cluster_method]
                
        else:
            raise ValueError("Unknown clustering method")

        if self.best_n_cluster != None or cluster_method == 'DBSCAN':
            self.labels = self.clusterer.fit_predict(self.reduced_df)
            print(f'fit predict for methof {cluster_method}')
            
        else:
            self.labels = []
        print(f'completed cluster_data function dor cluster method {cluster_method}')
        return self.labels
    
    def evaluate_clustering(self):
        silhouette = silhouette_score(self.scaled_df, self.labels)
        davies_bouldin = davies_bouldin_score(self.scaled_df, self.labels)
        return silhouette, davies_bouldin
    
    def prepare_reduced_data(self,scaling_methods,dim_reduction_methods):
        reduce_data_dict ={}
        reduced_data_dict={}
        scaled_data_dict = {}
        for scaling_method in scaling_methods:
            print('-'*50)
            print(scaling_method)
            self.scaling_data(scaling_method)
            scaled_data_dict[scaling_method]=self.scaled_df
            result_dict = {}
            for dim_red_method in dim_reduction_methods:
                print('*' * 50)
                print(f' --{dim_red_method}')
                self.reduce_dimensions(dimension_method=dim_red_method)
                result_dict[dim_red_method]={'data':self.reduced_df,'n_components':self.reducer.n_components}
#                 print(scaling,dim)
            reduced_data_dict[scaling_method] = result_dict
        print("Completed...")
        self.reduced_data_dict = reduced_data_dict
        self.scaled_data_dict = scaled_data_dict


        
      #   With paraller processing 

    # Helper function to perform clustering in parallel
    def clustering_worker(self, params):
        scaling_method, dim_red_method, cluster_method = params
        # self.reduced_df = self.reduced_data_dict[scaling_method][dim_red_method]['data']
        # self.reduce_dimensions(dimension_method=dim_red_method)
        print('*'*50)
        print(scaling_method, dim_red_method, cluster_method)
        self.reduced_df = self.reduced_data_dict[scaling_method][dim_red_method]['data']
        self.scaled_df = self.scaled_data_dict[scaling_method]
        self.clustering_data(cluster_method=cluster_method)
        
        if len(set(self.labels)) > 1:
            silhouette, davies_bouldin = self.evaluate_clustering()
            print('model Evaluation')
            return (scaling_method, dim_red_method, cluster_method, silhouette, davies_bouldin,self.best_n_cluster)
        else:
            return None
            
    def compare_models_parallel(self, scaling_methods, dim_reduction_methods, clustering_methods):
        best_score = -float('inf')
        best_dim_red_method = None
        best_clustering_method = None
        best_labels = None
        best_X_reduced = None
        score_dict = {  
            'norma_method': [],
            'dim_red_method': [],
            'dim_red_n_component': [],
            'clustering_method': [],
            'clustering_n_clusters': [],
            'silhouette_score': [],
            'DB_score': []
        }

        # Create a list of all parameter combinations for parallel processing
        param_combinations = [(scaling_method, dim_red_method, cluster_method)
                              for scaling_method in scaling_methods
                              for dim_red_method in dim_reduction_methods
                              for cluster_method in clustering_methods]
        
        

        # Use multiprocessing to evaluate each combination
        print(f"CPU count: {mp.cpu_count}")
        with mp.Pool(mp.cpu_count()-3) as p:
            results = p.map(self.clustering_worker, param_combinations)

        # Filter out None results (when no clusters were generated)
        results = [result for result in results if result is not None]

        # Process the results
        for result in results:
            scaling_method, dim_red_method, cluster_method, silhouette, davies_bouldin,best_n_cluster = result

            score_dict['norma_method'].append(scaling_method)
            score_dict['dim_red_method'].append(dim_red_method)
            score_dict['dim_red_n_component'].append(self.reduced_data_dict[scaling_method][dim_red_method]['n_components'])
            score_dict['clustering_method'].append(cluster_method)
            score_dict['clustering_n_clusters'].append(best_n_cluster)
            score_dict['silhouette_score'].append(silhouette)
            score_dict['DB_score'].append(davies_bouldin)
            
        self.score_dict = score_dict 
        
    
    
#     def compare_models(self, scaling_methods, dim_reduction_methods, clustering_methods):
#         best_score = -float('inf')
#         best_dim_red_method = None
#         best_clustering_method = None
#         best_labels = None
#         best_X_reduced = None
#         score_dict = {  'norma_method':[],
#                         'dim_red_method':[],
#                         'dim_red_n_component':[],
#                         'clustering_method':[],
#                         'clustering_n_clusters':[],
#                         'silhouette_score':[],
#                         'DB_score':[]}
#         for scaling_method in scaling_methods:
#             print('-'*50)
#             print(scaling_method)
#             # self.scaling_data(scaling_method)

#             for dim_red_method in dim_reduction_methods:
#                 print('*' * 50)
#                 print(f' --{dim_red_method}')

#                 # self.reduce_dimensions(dimension_method=dim_red_method)
#                 self.reduced_df = self.reduced_data_dict[scaling_method][dim_red_method]['data']
#                 self.scaled_df = self.scaled_data_dict[scaling_method]
#                 for cluster_method in clustering_methods:
#                     print(f'  ** Dimensionality Reduction: {dim_red_method}, Clustering: {cluster_method}')

#                     self.clustering_data(cluster_method=cluster_method)
#                     if len(set(self.labels)) >1:
#                         silhouette, davies_bouldin= self.evaluate_clustering()

# #                         print(f'Silhouette Score: {silhouette}')
# #                         print(f'Davies-Bouldin Index: {davies_bouldin}')


#                         score_dict['norma_method'].append(scaling_method)
#                         score_dict['dim_red_method'].append(dim_red_method)
#                         score_dict['dim_red_n_component'].append(self.dim_red_n_component)
#                         score_dict['clustering_method'].append(cluster_method)
#                         score_dict['clustering_n_clusters'].append(self.best_n_cluster)
#                         score_dict['silhouette_score'].append(silhouette)
#                         score_dict['DB_score'].append(davies_bouldin)



#                         # Track the best configuration
#                         score = silhouette  # Choose the metric to maximize, here we use silhouette score
#                         if score > best_score:
#                             best_score = score
#                             best_dim_red_method = dim_red_method
#                             best_clustering_method = cluster_method
#                             best_labels = self.labels
#                             best_X_reduced = self.reduced_df
#                     else:
#                         print('Unable to generate clusters')
                    
                    
#         return best_dim_red_method, best_clustering_method, best_X_reduced, best_labels,score_dict
    
    def plot_clusters(self, X_reduced, labels, title='Cluster Plot'):
        plt.figure(figsize=(10, 6))
        scatter = plt.scatter(X_reduced[:, 0], X_reduced[:, 1], c=labels, s=50, cmap='viridis')
        plt.colorbar(scatter)
        plt.title(title)
        plt.xlabel('Feature 1')
        plt.ylabel('Feature 2')
        plt.show()

In [13]:
df = pd.read_csv("data.csv")


In [14]:
# Define dimensionality reduction and clustering methods
scaling_methods = ['normalization', 'standardization']
dim_reduction_methods = ['PCA', 'UMAP','TSNE']
# dim_reduction_methods = ['PCA',]

# clustering_methods = ['KMeans', 'DBSCAN', 'Agglomerative', 'Spectral']
clustering_methods = ['KMeans', 'DBSCAN', 'Agglomerative',]
clustering_methods = ['KMeans', 'DBSCAN',]

In [15]:
model_selector = best_clustering_model_selector(df)


--------------------------------------------------
Data Loaded


In [16]:
model_selector.prepare_data_for_preprocessing()

(3900, 19)
(3900, 18)


In [19]:
model_selector.data_preprocess_pipeline()

** KNN imputer for missing values
** One hot encoding for the Categorical features with less than 4 unique values
** Ordinal encoding for the Categorical features with greater than 4 unique values


AttributeError: 'Pipeline' object has no attribute 'get_feature_names_out'

In [17]:
model_selector.remove_constant_multicolinear_feature()


** Removing Constant features with 0 std


AttributeError: 'best_clustering_model_selector' object has no attribute 'trans_df'