## A.4 Functions For Kmean Clustering Algorithms

In [None]:

def get_cluster_label_kmean(X, para, is_model=False):
    """
    Fit a KMeans model on X and return the cluster label of every sample.

    Parameters:
    -----------

    X: an array shape: (n_samples, n_attributes)
        The data matrix to cluster.

    para: a dictionary
        Parameters of the clustering algorithm. The keys read here are
        'n_clusters', 'init' and 'algorithm'; any other keys are ignored.

    is_model: a boolean
        True  - return both the labels and the fitted model
        False - return only the labels

    Return:
    -------

    label_predicted: an array shape: n_samples
        The labels assigned by the KMeans clustering algorithm.

    (label_predicted, c_obj): a tuple, only when is_model is True
        The labels together with the fitted KMeans object.
    """
    # Build the estimator and fit it in one go; the random state comes from
    # the module-level seed_num so repeated runs give the same clustering.
    c_obj = cluster.KMeans(n_clusters=para['n_clusters'],
                           init=para['init'],
                           algorithm=para['algorithm'],
                           random_state=seed_num).fit(X)

    if hasattr(c_obj, 'labels_'):
        # np.int was only an alias of the builtin int and was removed in
        # NumPy 1.24, so use the builtin directly.
        label_predicted = c_obj.labels_.astype(int)
    else:
        label_predicted = c_obj.predict(X)

    if is_model:
        return (label_predicted, c_obj)
    return label_predicted


# ==================================================================================

def get_parameter_kmean(data, display_results=True):
    """
    Search a small grid of cluster counts for KMeans and return the setting
    with the highest first ("l2") score.

    Parameters:
    -----------

    data: a tuple
        The first element, X, is a 2 dimensional array of shape
        (n_samples, n_attributes) and the second element, the cluster labels,
        is a one dimensional array of shape (n_samples) giving the label of
        each point. A label of -1 is not counted as a cluster (presumably it
        marks noise points -- confirm against the data generator).

    display_results: a boolean
        If true, print the grid size and a per-n_clusters summary table.

    Return:
    -------

    opt_para, (score_l2, score_l1, score_cosine): a tuple of two elements

        opt_para: a dictionary
            The grid entry ('n_clusters', 'init', 'algorithm') with the
            largest l2 score. Each grid entry is annotated in place, so the
            dictionary also carries a 'score_l2' key.

        (score_l2, score_l1, score_cosine): a tuple of three lists
            One score per grid entry, in grid order, as returned by
            get_performance_metrics (defined elsewhere; per the original
            documentation these are average Silhouette scores under the
            'euclidean', 'manhattan' and 'cosine' metrics).
    """
    X, y_true = data

    # Number of clusters in the reference labelling, not counting label -1.
    tot_clusters = len(np.unique(y_true))
    if np.any(y_true == -1):
        tot_clusters -= 1

    # Candidate cluster counts: a window around the reference count, never
    # starting below 2.
    c_start = 2 if tot_clusters < 6 else tot_clusters - 3
    c_end = tot_clusters + 3

    # NOTE(review): scikit-learn 1.1 deprecated algorithm='full' in favour of
    # 'lloyd' and 1.3 removed it; kept as-is so older installs keep working.
    params = {'n_clusters': list(range(c_start, c_end)),
              'init': ['k-means++'],
              'algorithm': ['full'],
              }
    param_grid = list(model_selection.ParameterGrid(params))

    if display_results:
        print()
        print("Optimizing Parameters:  Kmean =====")
        print()
        print("Total Parameter Settings: ", len(param_grid))

    score_l2, score_l1, score_cosine = [], [], []

    for para in param_grid:
        y_pred = get_cluster_label_kmean(X, para)

        s_l2, s_l1, s_cosine = get_performance_metrics(X, y_true, y_pred)
        score_l2.append(s_l2)
        score_l1.append(s_l1)
        score_cosine.append(s_cosine)

        # Annotate the grid entry after it has been used for fitting, so the
        # summary table (and the returned opt_para) include the score.
        para['score_l2'] = s_l2

    # Only the l2 score decides the optimum; ties resolve to the first entry.
    opt_para = param_grid[int(np.argmax(score_l2))]

    if display_results:
        # Build the frame once from the annotated grid. DataFrame.append was
        # removed in pandas 2.0 and was quadratic when used inside the loop.
        df = pd.DataFrame(param_grid)

        # For every candidate cluster count keep the row with the best score.
        best_rows = [df.loc[df['n_clusters'] == k, 'score_l2'].idxmax()
                     for k in range(c_start, c_end)]
        new_df = df.loc[best_rows].reset_index(drop=True)

        print()
        print()
        print(new_df)

    return opt_para, (score_l2, score_l1, score_cosine)

# ==================================================================================

def display_kmean_outputs(test_samples, para_km, name=None):
    """
    Fit KMeans on a dataset and draw a four-panel figure: the original data,
    the KMeans result, and two silhouette panels.

    Parameters:
    -----------

    test_samples: a tuple
        (data matrix, cluster labels). The data matrix is passed to KMeans;
        the whole tuple is handed to plot_dataset for the first panel.

    para_km: a dictionary
        KMeans parameters. The keys read here are 'n_clusters', 'init' and
        'algorithm'.

    name: a string
        Name of the dataset. Not used by this function.

    Return:
    ------

    result: a dictionary
        'label' (model labels), 'cluster_centers', 'iterations' (n_iter_)
        and 'tot_time' (seconds spent constructing and fitting the model).
    """
    X, _ = test_samples

    print("Parameters =====")
    print(para_km)

    # Time the model construction together with the fit.
    started = time.time()
    model = cluster.KMeans(
        n_clusters=para_km['n_clusters'],
        init=para_km['init'],
        algorithm=para_km['algorithm'],
        random_state=seed_num,
    )
    model.fit(X)
    elapsed = time.time() - started

    # One row of four panels; f_w / f_h (panel width and height) are not
    # defined in this function and are expected to exist at module level.
    n_row, n_col = 1, 4
    plt.figure(figsize=(n_col * f_w, n_row * f_h))

    # Panel 1: plot_dataset draws on the current axes, so select it first.
    plt.subplot(1, 4, 1)
    plot_dataset(test_samples, "Original Data")

    # Panel 2: clustering result.
    ax_model = plt.subplot(1, 4, 2)
    ax_model.set_title('Kmean')
    plot_kmeans(model, X, ax=ax_model)

    # Panels 3 and 4: silhouette output, annotated with the fit time.
    ax_sil_left = plt.subplot(1, 4, 3)
    ax_sil_right = plt.subplot(1, 4, 4)
    silhouette_plot(X, model.labels_, (ax_sil_left, ax_sil_right), elapsed)

    return {
        'label': model.labels_,
        'cluster_centers': model.cluster_centers_,
        'iterations': model.n_iter_,
        'tot_time': elapsed,
    }



def display_kmean_outputs_save(test_samples, para_km, name=None):
    """
    Fit KMeans on a dataset and draw only the two silhouette panels.

    Parameters:
    -----------

    test_samples: a tuple
        (data matrix, cluster labels). Only the data matrix is used here.

    para_km: a dictionary
        KMeans parameters. The keys read here are 'n_clusters', 'init' and
        'algorithm'.

    name: a string
        Name of the dataset. Not used by this function.

    Return:
    ------

    result: a dictionary
        'label' (model labels), 'cluster_centers', 'iterations' (n_iter_)
        and 'tot_time' (seconds spent constructing and fitting the model).
    """
    X, _ = test_samples

    print("Parameters =====")
    print(para_km)

    # Time the model construction together with the fit.
    started = time.time()
    model = cluster.KMeans(
        n_clusters=para_km['n_clusters'],
        init=para_km['init'],
        algorithm=para_km['algorithm'],
        random_state=seed_num,
    )
    model.fit(X)
    elapsed = time.time() - started

    # One row of two panels; f_w / f_h (panel width and height) are not
    # defined in this function and are expected to exist at module level.
    n_row, n_col = 1, 2
    plt.figure(figsize=(n_col * f_w, n_row * f_h))

    ax_left = plt.subplot(1, n_col, 1)
    ax_right = plt.subplot(1, n_col, 2)

    # Unlike display_kmean_outputs, the fit time is not passed along here.
    silhouette_plot(X, model.labels_, (ax_left, ax_right))

    return {
        'label': model.labels_,
        'cluster_centers': model.cluster_centers_,
        'iterations': model.n_iter_,
        'tot_time': elapsed,
    }





def plot_kmeans(kmeans, X, ax=None):
    """
    Scatter the first two columns of X coloured by KMeans label and draw one
    grey circle per cluster, centred on the cluster centre and reaching its
    farthest member.

    Note that the model is (re)fitted on X via fit_predict.

    Parameters:
    -----------

    kmeans: a KMeans-like estimator providing fit_predict and cluster_centers_

    X: an array
        Data matrix; columns 0 and 1 are used as plot coordinates.

    ax: a matplotlib axes, optional
        Axes to draw on; the current axes are used when omitted.

    Acknowledgement:

    https://jakevdp.github.io/PythonDataScienceHandbook/05.12-gaussian-mixtures.html
    """
    labels = kmeans.fit_predict(X)

    # Fixed palette, repeated as often as needed to give every label a colour.
    palette = ['DarkGreen', 'Orange', 'Blue', 'Brown', 'Teal',
               'Indigo', 'Gold', 'Lime', 'Aqua', 'Red', 'Purple',
               'LightGreen', 'DarkKhaki', 'Maroon', 'MediumAquamarine',
               'Magenta', 'DeepSkyBlue', 'Grey']
    n_colours = int(max(labels) + 1)
    col = np.array(list(islice(cycle(palette), n_colours)))

    if not ax:
        ax = plt.gca()

    # zorder=2 keeps the points in front of the circles drawn below;
    # the marker size comes from the module-level name `size`.
    ax.scatter(X[:, 0], X[:, 1], c=col[labels], s=size, marker='.', zorder=2)

    # Radius of each cluster = distance from its centre to its farthest point.
    centers = kmeans.cluster_centers_
    radii = []
    for idx, center in enumerate(centers):
        members = X[labels == idx]
        radii.append(cdist(members, [center]).max())

    # zorder=1 puts the circles behind the scattered points.
    for center, radius in zip(centers, radii):
        disc = Circle(xy=center, radius=radius, facecolor='#CCCCCC',
                      lw=3, alpha=0.5, zorder=1)
        ax.add_patch(disc)

    ax.axis('equal')

    return None