### A.9 Function to Optimize Parameters of Clustering Algorithms

In [1]:

# ==================================================================================

def optimize_algorithms(dataset, params=None, display_results=True):
    """
    Return optimized parameters of the clustering algorithms as a dictionary.

    The per-algorithm search helpers are run one after another (k-means, the
    four agglomerative linkages, DBSCAN, spectral clustering, Gaussian
    mixture models). Each helper returns a pair; only its first item, the
    parameters, is collected here.

    Parameters:
    ----------

    dataset: a tuple of data matrix X and true label y
        The first item, X, is an array with the shape (n_samples, n_attributes)
        and the second item y is an array of integers, shape n_samples, representing
        the labels of datapoints.

    params: a dictionary, optional
        It contains the parameters of the dataset. It is not read by this
        function; it is kept so that existing callers that pass it keep working.

    display_results: a boolean
        Forwarded unchanged to every search helper. If true the output of
        optimization step will be displayed.

    Returns:
    -------

    optimal_algos_para: a dictionary
        It contains optimized parameters of the algorithms, stored under the
        keys 'kmean', 'dbscan', 'spectral_c', 'gmm', 'agglo-single',
        'agglo-complete', 'agglo-average' and 'agglo-ward' (in that order).

    Notes:
    -----

    The DBSCAN search reads the module-level names ``eps_range`` and
    ``min_point_range``; both must be defined before this function is called.

    """

    # Fail fast when dataset is not a two-item (X, y) pair. The unpacked values
    # are not needed here because every helper receives the whole tuple.
    _X, _y_true = dataset

    #### Kmean #####
    para_km, _ = get_parameter_kmean(dataset, display_results=display_results)

    #### Agglomerative: single, complete, average, ward #####
    # Same helper, one call per linkage, in the order the results are displayed.
    para_agglo = {}
    for linkage in ('single', 'complete', 'average', 'ward'):
        para_agglo[linkage], _ = get_parameter_agglo(
            dataset, linkage=linkage, display_results=display_results)

    #### DBSCAN #####
    # NOTE(review): eps_range and min_point_range are not parameters of this
    # function, they are looked up as module globals. The meaning of the
    # positional 50 is not visible here; confirm against get_parameter_dbscan.
    para_dbscan, _ = get_parameter_dbscan(
        dataset, eps_range, min_point_range, 50, display_results=display_results)

    #### Spectral clustering #####
    # NOTE: an earlier call that also passed tot_clusters=params['n_cluster']
    # (and returned a third value) was disabled; the note left beside it reads
    # "LinAlgError: 3-th leading minor of the array is not positive definite".
    para_sc, _ = get_parameter_sc(dataset, display_results=display_results)

    #### Gaussian Mixture Models #####
    para_gmm, _ = get_parameter_gmm(dataset, display_results=display_results)

    # store the optimal parameters of algorithms
    optimal_algos_para = {
        'kmean': para_km,
        'dbscan': para_dbscan,
        'spectral_c': para_sc,
        'gmm': para_gmm,
        'agglo-single': para_agglo['single'],
        'agglo-complete': para_agglo['complete'],
        'agglo-average': para_agglo['average'],
        'agglo-ward': para_agglo['ward'],
    }

    return optimal_algos_para
