### A.8 Functions For GMM Clustering Algorithms

In [None]:
def get_cluster_label_gmm(X, para):
    """
    Return the cluster labels of data points using GMM.

    Parameters:
    -----------

    X: an array shape: (n_samples, n_attributes)
        It is the data matrix.

    para: a dictionary
        It contains the parameters of the clustering algorithm. The keys
        'n_components', 'covariance_type', 'init_params' and 'n_init' are
        read; other keys are ignored. The random state is always taken
        from the module-level ``seed_num``, not from ``para``.

    Return:
    -------

    label_predicted: an array shape: n_samples
        It is the labels assigned by the GMM clustering algorithm.
    """
    # Create an object of gmm and fit the model
    c_obj = mixture.GaussianMixture(n_components=para['n_components'],
                                    covariance_type=para['covariance_type'],
                                    init_params=para['init_params'],
                                    n_init=para['n_init'],
                                    random_state=seed_num).fit(X)

    # Use the fitted ``labels_`` when the estimator provides one, otherwise
    # ask the model to predict the labels of the training points.
    if hasattr(c_obj, 'labels_'):
        # builtin int: the ``np.int`` alias no longer exists in recent NumPy
        label_predicted = c_obj.labels_.astype(int)
    else:
        label_predicted = c_obj.predict(X)

    return label_predicted


# ==================================================================================

def get_parameter_gmm(data, display_results = False):
    """
    Return a dictionary containing the optimal values of parameter for the
    GaussianMixture Model based on Silhouette score and a tuple of
    Silhouette metrics.

    Parameters:
    -----------

    data: a tuple
        The first element, X, is a 2 dimensional array of shape
        (n_samples, n_attributes) and the second element, labels of clusters,
        is a one dimensional array of shape (n_samples) representing the
        cluster label of individual points. A label of -1 is not counted as
        a cluster when the candidate range of n_components is derived.

    display_results: a boolean
        When True, print the size of the parameter grid and, for every
        candidate n_components, the grid row with the highest 'score_l2'.

    Return:
    -------

    opt_para, (score_l2, score_l1, score_cosine): a tuple of two elements

        opt_para: a dictionary
            It holds the parameters for GaussianMixture Model at the first
            maximum of score_l2. Besides the grid parameters it also carries
            the key 'score_l2' with the score obtained for that setting.

        (score_l2, score_l1, score_cosine): a tuple
            Three lists, one entry per parameter setting, holding the values
            returned by get_performance_metrics (documented by the original
            author as average Silhouette scores for the 'euclidean',
            'manhattan' and 'cosine' distance metrics respectively).
    """
    X, y_true = data

    # get the true number of clusters in the dataset (label -1 is excluded)
    if np.any(y_true == -1):
        tot_clusters = len(np.unique(y_true)) - 1
    else:
        tot_clusters = len(np.unique(y_true))

    # get range for n_components: a window around the true number of clusters
    c_start = 2 if tot_clusters < 6 else tot_clusters - 3
    c_end = tot_clusters + 5

    # define parameters
    n_components = list(range(c_start, c_end))
    covariance_type = ['full']  # other options: 'tied', 'diag', 'spherical'
    init_params = ['kmeans']
    n_init = [15]

    # create grid of parameters
    param_grid = list(model_selection.ParameterGrid({'n_components': n_components,
                                                     'covariance_type': covariance_type,
                                                     'init_params': init_params,
                                                     'n_init': n_init}))

    if display_results:
        print()
        print("Optimizing Parameters: Gaussian mixture model =====")
        print()
        print("Total Parameter Settings: ", len(param_grid))

    score_l2, score_l1, score_cosine = [], [], []

    for para in param_grid:
        y_pred = get_cluster_label_gmm(X, para)

        s_l2, s_l1, s_cosine = get_performance_metrics(X, y_true, y_pred)
        score_l2.append(s_l2)
        score_l1.append(s_l1)
        score_cosine.append(s_cosine)

        # record the score next to its parameters; this key is also present
        # in the returned opt_para
        para['score_l2'] = s_l2

    # get optimal parameters based on the l2 metric (first maximum on ties)
    index_l2 = np.argmax(score_l2)
    opt_para = param_grid[index_l2]

    if display_results:
        # Build the frame in one go; DataFrame.append is gone in pandas 2.x.
        df = pd.DataFrame(param_grid)

        # for each candidate n_components keep the row with the best score
        best_rows = [df.loc[df['n_components'] == k, 'score_l2'].idxmax()
                     for k in range(c_start, c_end)]
        new_df = df.loc[best_rows].reset_index(drop=True)

        print()
        print()
        print(new_df)

    return opt_para, (score_l2, score_l1, score_cosine)
      
# ==================================================================================

def display_gmm_outputs(test_samples, para_gmm):
    '''
    Display the outputs of the GMM clustering algorithm and Silhouette plot

    Parameters:
    -----------

    test_samples: a tuple
        The first element, data matrix, is a 2 dimensional array of shape
        (n_samples, n_attributes) and the second element, clusters labels,
        is a one dimensional array of shape (n_samples) representing the
        cluster labels of individual points.

    para_gmm: a dictionary
        It contains the parameters of the GMM algorithm. The keys
        'n_components', 'covariance_type', 'init_params' and 'n_init' are
        all required. It looks like
        para_gmm = {'n_components':3, 'covariance_type':'full',
                    'init_params':'kmeans', 'n_init':15}

    Return:
    ------

    results: a dictionary
        It contains the outputs related to the model object: 'labels',
        'weights', 'means', 'covarances', 'iteration', 'AIC', 'converged'
        and 'tot_time' (seconds spent in fit_predict).
    '''
    X, y = test_samples

    print("Parameters =====")
    print(para_gmm)

    # build the model and time only the fit/predict step
    fit_started = time.time()
    model = mixture.GaussianMixture(
        n_components=para_gmm['n_components'],
        covariance_type=para_gmm['covariance_type'],
        init_params=para_gmm['init_params'],
        n_init=para_gmm['n_init'],
        random_state=seed_num,
    )
    labels = model.fit_predict(X)
    fit_finished = time.time()

    # one row of four panels: original data, GMM ellipses, two silhouette axes
    n_col, n_row = 4, 1

    # NOTE(review): f_w and f_h are not defined in this function; presumably
    # module-level figure-size constants -- confirm they exist.
    plt.figure(figsize=(n_col * f_w, n_row * f_h))

    plt.subplot(1, 4, 1)
    plot_dataset(test_samples, name="Original Data")

    plt.subplot(1, 4, 2)
    plot_gmm(model, X)

    silhouette_axes = (plt.subplot(1, 4, 3), plt.subplot(1, 4, 4))
    silhouette_plot(X, labels, silhouette_axes, fit_finished - fit_started)

    # collect the model attributes (read after plot_gmm has run on the model)
    results = {}
    results['labels'] = labels
    results['weights'] = model.weights_
    results['means'] = model.means_
    results['covarances'] = model.covariances_
    results['iteration'] = model.n_iter_
    results['AIC'] = model.aic(X)  # not relevant for clustering
    results['converged'] = model.converged_
    results['tot_time'] = fit_finished - fit_started

    return results




def draw_ellipse(position, covariance, ax=None, **kwargs):
    """
    Draw 1-sigma and 2-sigma ellipses for a given position and covariance

    Parameters:
    -----------

    position: centre of the ellipses, passed to Ellipse unchanged

    covariance: an array
        Either a (2, 2) covariance matrix, a pair of per-axis variances,
        or a single scalar variance shared by both axes.

    ax: the axes to draw on; the current axes (plt.gca()) when None

    **kwargs: forwarded to Ellipse (e.g. alpha)

    Acknowledgement:

    https://jakevdp.github.io/PythonDataScienceHandbook/05.12-gaussian-mixtures.html
    """
    if ax is None:
        ax = plt.gca()

    # Convert covariance to principal axes
    if covariance.shape == (2, 2):
        # the first column of U gives the direction of the major axis
        U, s, Vt = la.svd(covariance)
        angle = np.degrees(np.arctan2(U[1, 0], U[0, 0]))
        width, height = 2 * np.sqrt(s)
    else:
        angle = 0
        extent = 2 * np.sqrt(covariance)
        if covariance.shape == ():
            # a single variance: same extent along both axes
            width = height = extent
        else:
            width, height = extent

    # Draw the ellipses, scaling width and height by the number of sigmas
    for nsig in range(1, 3):
        # angle must be given by keyword: newer matplotlib releases no longer
        # accept it as a positional argument of Ellipse
        ax.add_patch(Ellipse(position, nsig * width, nsig * height,
                             angle=angle, **kwargs))
        
def plot_gmm(gmm, X, label=True, ax=None):
    """
    Plot the data points and each component of GMM as ellipses

    Parameters:
    -----------

    gmm: a GMM model object
        It is (re)fitted on X inside this function before plotting.

    X: an array
        The data matrix; only the first two columns are plotted.

    label: a boolean
        When True the points are coloured by their predicted component.

    ax: the axes to draw on; the current axes (plt.gca()) when None

    Return:
        None

    Acknowledgement:

    https://jakevdp.github.io/PythonDataScienceHandbook/05.12-gaussian-mixtures.html
    """
    if ax is None:
        ax = plt.gca()

    # NOTE: this fits the model again, so the attributes of ``gmm`` after the
    # call are those of this fit.
    labels = gmm.fit(X).predict(X)

    # one colour per predicted component, cycling through the palette
    col = np.array(list(islice(cycle(colors), int(max(labels) + 1))))

    if label:
        ax.scatter(X[:, 0], X[:, 1], c=col[labels], s=size, marker='.',
                   zorder=2)
    else:
        ax.scatter(X[:, 0], X[:, 1], s=size, marker='.', zorder=2)

    ax.axis('equal')

    # the higher the weight of a component, the more opaque its ellipse
    w_factor = 0.2 / gmm.weights_.max()

    for pos, covar, w in zip(gmm.means_, gmm.covariances_, gmm.weights_):
        # draw on the same axes as the scatter, not on whatever is current
        draw_ellipse(pos, covar, ax=ax, alpha=w * w_factor)

    return None


# 13 August
def test_gmm_varying_n_init(X, test_para, n_inits=None):
    """
    Plot the result of GMM by varying the parameter n_init and by fixing
    the other parameters

    Parameters:
    -----------

    X: an array
        The data matrix, X, is a 2 dimensional array of shape
        (n_samples, n_attributes)

    test_para: a dictionary
        It holds the parameters of the GMM clustering algorithm. It is not
        modified; each run works on a copy with 'n_init' replaced.

    n_inits: a list
        The values of n_init (number of initializations) to try, one run per
        value. When None, a single run is made with test_para['n_init'].

    Return:
        None

    Example of test parameter:

    test_para   = {'n_components': 2,
                    'covariance_type': 'full',
                    'init_params': 'kmeans',
                     'n_init': 10,
                     'random_state': 123
                     }

    Note: get_cluster_label_gmm takes its random state from the module-level
    seed_num, so the 'random_state' entry is only shown by print_parameters.
    """
    if n_inits is None:
        # the default used to fail on iteration; fall back to a single run
        n_inits = [test_para['n_init']]

    for n in n_inits:
        # copy so the caller's dictionary keeps its original n_init
        para = dict(test_para, n_init=n)
        print_parameters(para)
        label = get_cluster_label_gmm(X, para)
        silhouette_plot(X, label)
        c_label, cluster_size = check_cluster_size(label)
        print('-----------------------')
        print("Total cluster = {} \t Cluster size {}\n".format(len(c_label), cluster_size))
    return None
