### A.5 Functions For Agglomerative Clustering Algorithms

In [None]:

# ==================================================================================

# def get_cluster_label_agglomerative(X, para):
     
        
#     """ 
#     Return the results (cluster labels) of each data points using Agglomerative clustering algorithm
    
    
#     Parameters:
#     -----------
    
#     X: an array shape: (n_samples, n_attributes)
#         It is data matrix
        
        
#     para: a dictionary 
#         It contains the parameters of the clustering algorithm
        
           
        
#     Return:
#     -------
    
#     label_predicted: an array shape: n_samples
    
#         It is the labels assigned by the Agglomerative clustering algorithm
    
    
    
#     """
#     # Create an object of agglomerative clustering and fit the model
#     c_obj = cluster.AgglomerativeClustering(n_clusters=para['n_clusters'], 
#                                             affinity = para['affinity'],
#                                             linkage  = para['linkage']).fit(X) # no random_state
#     # get the predicted labels
#     if hasattr(c_obj, 'labels_'):
#         label_predicted = c_obj.labels_.astype(np.int)
#     else:
#         label_predicted = c_obj.predict(X)
    
#     return label_predicted



def get_cluster_label_agglo(X, para):
    """
    Cluster the rows of X hierarchically and return one flat label per row.

    Note: implemented with the SciPy hierarchy routines (``linkage`` followed
    by ``fcluster``).

    Parameters:
    -----------

    X: an array shape: (n_samples, n_attributes)
        Data matrix handed to ``linkage``.

    para: a dictionary
        Settings of the algorithm. The keys read here are:
        'linkage'    -> forwarded to ``linkage`` as ``method``
        'affinity'   -> forwarded to ``linkage`` as ``metric``
        'n_clusters' -> forwarded to ``fcluster`` as ``t`` with the
                        'maxclust' criterion

    Return:
    -------

    y_pred: an array shape: n_samples
        Cluster label of every row of X, shifted so that labels start at 0.
    """
    merge_tree = linkage(X, method=para['linkage'], metric=para['affinity'])

    # fcluster numbers its clusters from 1; subtract 1 so labels start at 0.
    one_based = fcluster(merge_tree, t=para['n_clusters'], criterion='maxclust')
    return one_based - 1



# ==================================================================================

def get_parameter_agglo(data, linkage=None, display_results=False):
    """
    Return the best parameter setting for Agglomerative clustering together
    with the scores of every setting that was tried.

    The search runs over a range of ``n_clusters`` values centred on the
    number of clusters found in the true labels, for one linkage method and
    the 'euclidean' affinity. The setting with the highest ``score_l2`` wins.

    Parameters:
    -----------

    data: a tuple
        The first element, X, is a 2 dimensional array of shape
        (n_samples, n_attributes) and the second element, the cluster labels,
        is a one dimensional array of shape (n_samples) holding the true
        cluster label of each point. The label -1 is not counted as a cluster
        (presumably it marks noise points -- confirm against the data source).

    linkage: a string
        Linkage method to use; one of 'single', 'complete', 'average', 'ward'.

    display_results: a boolean
        When True, print the size of the parameter grid and a table with the
        best-scoring row for each ``n_clusters`` value.

    Return:
    -------

    opt_para, (score_l2, score_l1, score_cosine): a tuple of two elements

        opt_para: a dictionary
            The grid entry with the highest ``score_l2``. Besides
            'n_clusters', 'linkage' and 'affinity' it also carries its own
            'score_l2' value.

        (score_l2, score_l1, score_cosine): a tuple
            Three lists with one entry per grid setting, holding the three
            values returned by ``get_performance_metrics`` (named after the
            'euclidean', 'manhattan' and 'cosine' Silhouette scores).

    Raises:
    -------

    ValueError
        If ``linkage`` is None or not one of the supported methods.
    """
    # NOTE: inside this function the `linkage` argument hides any module-level
    # function of the same name; only the string value is used here.
    supported_linkages = ('single', 'complete', 'average', 'ward')
    if linkage not in supported_linkages:
        raise ValueError(
            "linkage must be one of {}, got {!r}".format(supported_linkages, linkage)
        )

    X, y_true = data

    # Number of clusters in the true labels; -1 is excluded from the count.
    tot_clusters = len(np.unique(y_true))
    if np.any(y_true == -1):
        tot_clusters -= 1

    # Candidate n_clusters values: c_start .. c_end - 1.
    c_start = 2 if tot_clusters < 6 else tot_clusters - 3
    c_end = tot_clusters + 3
    n_cluster_values = list(range(c_start, c_end))

    # Only 'euclidean' is searched; 'ward' only works with euclidean.
    para = {
        'n_clusters': n_cluster_values,
        'linkage': [linkage],
        'affinity': ['euclidean'],
    }
    param_grid = list(model_selection.ParameterGrid(para))

    if display_results:
        print()
        print("Optimizing Parameters: Agglomerative {}-linkage =====".format(linkage))
        print()
        print("Total Parameter Settings: ", len(param_grid))

    score_l2, score_l1, score_cosine = [], [], []

    for setting in param_grid:
        y_pred = get_cluster_label_agglo(X, setting)

        s_l2, s_l1, s_cosine = get_performance_metrics(X, y_true, y_pred)
        score_l2.append(s_l2)
        score_l1.append(s_l1)
        score_cosine.append(s_cosine)

        # Stored on the grid entry itself so the returned opt_para (and the
        # summary table below) carry the score next to the parameters.
        setting['score_l2'] = s_l2

    # Optimal setting = first grid entry with the maximum score_l2.
    opt_para = param_grid[np.argmax(score_l2)]

    if display_results:
        # Build the table in one go; DataFrame.append no longer exists in
        # pandas >= 2.0.
        df = pd.DataFrame(param_grid)

        print()
        # Row index of the best score_l2 for each candidate n_clusters value.
        best_rows = [
            df.loc[df['n_clusters'] == n, 'score_l2'].idxmax()
            for n in n_cluster_values
        ]
        new_df = df.loc[best_rows].reset_index(drop=True)

        print()
        print(new_df)

    return opt_para, (score_l2, score_l1, score_cosine)



# ==================================================================================

def plot_dendrogram(model, **kwargs):
    """
    Build a linkage matrix from a fitted sklearn model, draw its dendrogram
    and return the matrix.

    Parameter:
    ---------

    model: an sklearn object
        Fitted agglomerative clustering object; only its ``children_``
        attribute is read.

    **kwargs:
        Forwarded unchanged to ``dendrogram``.

    Return:
    ------

    linkage_matrix: an array
        Float array with one row per merge: the two children, a placeholder
        distance and a placeholder observation count.

    Acknowledgement:

    Thanks to the author, Mathew Kallada, for creating this function.
    The original code can be found via the following link.

    https://github.com/scikit-learn/scikit-learn/blob/70cf4a676caa2d2dad2e3f6e4478d64bcb0506f7/examples/cluster/plot_hierarchical_clustering_dendrogram.py

    >>> plot_dendrogram(model, labels=model.labels_)

    """
    merges = model.children_
    n_merges = merges.shape[0]

    # The real merge distances are not available here, so a uniform
    # 0, 1, 2, ... sequence is used purely for plotting.
    heights = np.arange(n_merges)

    # Placeholder observation counts: 2, 3, ..., n_merges + 1.
    sizes = np.arange(2, n_merges + 2)

    # Columns: child a, child b, distance, count.
    linkage_matrix = np.column_stack((merges, heights, sizes)).astype(float)

    dendrogram(linkage_matrix, **kwargs)

    return linkage_matrix

# ==================================================================================

def display_agglomerative_outputs(test_samples, para_agglo):
    """
    Run agglomerative clustering on one dataset and show a four-panel figure:
    the original data, the dendrogram and the two Silhouette panels.

    Parameters:
    -----------

    test_samples: a tuple
        The first element, the data matrix, is a 2 dimensional array of shape
        (n_samples, n_attributes) and the second element, the cluster labels,
        is a one dimensional array of shape (n_samples). Only the data matrix
        is used for clustering; the whole tuple is passed to ``plot_dataset``.

    para_agglo: a dictionary
        Parameters of the agglomerative algorithm. The keys read here are
        'linkage', 'affinity' and 'n_clusters'.

    Return:
    ------

    results: a dictionary
        'label':    zero-based cluster labels from ``fcluster``
        'Z':        linkage matrix returned by ``linkage``
        'tot_time': seconds spent in ``linkage`` plus ``fcluster``
    """
    X, _ = test_samples

    print("Parameters =====")
    print(para_agglo)

    # Time only the clustering itself (tree construction + flat cut).
    started = time.time()
    Z = linkage(X, method=para_agglo['linkage'], metric=para_agglo['affinity'])
    label = fcluster(Z, t=para_agglo['n_clusters'], criterion='maxclust') - 1
    elapsed = time.time() - started

    n_row, n_col = 1, 4

    # NOTE(review): f_w and f_h are not defined in this function; presumably
    # they are module-level per-panel width/height settings -- confirm.
    plt.figure(figsize=(n_col * f_w, n_row * f_h))

    # Panel 1: the raw dataset.
    plt.subplot(n_row, n_col, 1)
    plot_dataset(test_samples, "Original Data")

    # Panel 2: dendrogram of the merge tree.
    plt.subplot(n_row, n_col, 2)
    plt.title("Dendrogram-{}".format(para_agglo['linkage']))
    dendrogram(Z)

    # Panels 3 and 4 are handed to silhouette_plot as a pair of axes.
    silhouette_axes = (plt.subplot(n_row, n_col, 3), plt.subplot(n_row, n_col, 4))
    silhouette_plot(X, label, silhouette_axes, elapsed)

    return {'label': label, 'Z': Z, 'tot_time': elapsed}

#     return Z
