### A.7 Functions For Spectral Clustering Algorithms

In [None]:

# ==================================================================================

def get_cluster_label_sc(X, para):
    """
    Return the cluster label of every data point using the Spectral Clustering algorithm.

    Parameters
    ----------

    X : an array of size (n_samples, n_attributes)
        It represents the data matrix. It is passed unchanged to
        SpectralClustering.fit (NOTE: with affinity 'precomputed' scikit-learn
        presumably expects an affinity matrix here -- confirm against callers).

    para : a dictionary
        It contains the parameters of the Spectral Clustering algorithm.
        'affinity', 'n_clusters' and 'assign_labels' are always read.
        'gamma' is read when affinity is 'rbf'; 'n_neighbors' is read when
        affinity is 'nearest_neighbors' or 'precomputed'.


    Returns
    -------
    label_predicted : an array of integers, shape: n_samples

        It represents the cluster labels assigned by the algorithm.


    Raises
    ------
    ValueError
        If para['affinity'] is not 'rbf', 'nearest_neighbors' or 'precomputed'.

    """

    affinity = para['affinity']

    # Select the affinity-specific keyword up front so that an unsupported
    # affinity fails with a clear message instead of an unbound local later on.
    if affinity == 'rbf':
        affinity_kwargs = {'gamma': para['gamma']}
    elif affinity in ('nearest_neighbors', 'precomputed'):
        affinity_kwargs = {'n_neighbors': para['n_neighbors']}
    else:
        raise ValueError(
            "Unsupported affinity {!r}; expected 'rbf', 'nearest_neighbors' "
            "or 'precomputed'.".format(affinity))

    # catch warnings related to kneighbors_graph
    with warnings.catch_warnings():
        warnings.filterwarnings(
            "ignore",
            message="the number of connected components of the " +
            "connectivity matrix is [0-9]{1,2}" +
            " > 1. Completing it to avoid stopping the tree early.",
            category=UserWarning)
        warnings.filterwarnings(
            "ignore",
            message="Graph is not fully connected, spectral embedding" +
            " may not work as expected.",
            category=UserWarning)

        # seed_num is read from module scope so that repeated runs are reproducible
        c_obj = cluster.SpectralClustering(affinity=affinity,
                                           n_clusters=para['n_clusters'],
                                           eigen_solver='arpack',
                                           assign_labels=para['assign_labels'],
                                           random_state=seed_num,
                                           **affinity_kwargs).fit(X)

    # get the predicted labels
    if hasattr(c_obj, 'labels_'):
        # the builtin int replaces np.int, an alias that newer NumPy releases removed
        label_predicted = c_obj.labels_.astype(int)
    else:
        label_predicted = c_obj.predict(X)

    return label_predicted

# ==================================================================================

def get_parameter_sc(data, n_comp=None, display_results=True):
    """

    Return a dictionary containing the optimal values of the parameters for the Spectral Clustering
    algorithm based on the Silhouette score, and a tuple of Silhouette metrics.


    Parameters:
    -----------

    data:  a tuple

    The first element, X, is a 2 dimensional array of shape
    (n_samples, n_attributes) and the second element, labels of clusters, is a one dimensional
    array of shape (n_samples) representing the cluster label of individual points.
    The label -1 is not counted as a cluster when the number of true clusters is determined.


    n_comp: a scalar
        It holds the value of n_clusters for the spectral clustering algorithm. If it is None,
        a range of values around the true number of clusters is searched instead.

    display_results: a boolean
        If true, display the output of the optimization step.

    Return:
    -------

    opt_para, (score_l2, score_l1, score_cosine): a tuple of two elements

        opt_para: a dictionary

        It holds the parameters for the Spectral Clustering algorithm that are optimized for the
        dataset (data) using Silhouette scores. Besides the parameters it also carries the key
        'score_l2' with the score obtained for that setting.

        (score_l2, score_l1, score_cosine): a tuple

        From left to right, these are three lists holding the three values returned by
        get_performance_metrics for every parameter setting (presumably average Silhouette
        scores computed with the 'euclidean', 'manhattan' and 'cosine' distance metrics --
        confirm against get_performance_metrics).

    Raises:
    -------

    ValueError
        If the search grid is empty, i.e. no candidate value is left for n_clusters or n_neighbors.


    ========= Note: ===========

    # affinity': ['linear', 'poly', 'chi2', 'sigmoid] yields error
    # X, y = D4
    # obj = cluster.SpectralClustering(n_clusters=3, gamma=1, affinity='rbf').fit(X)
    # Produces errors: LinAlgError: 3-th leading minor of the array is not positive definite

    # The upper end of the n_neighbors range is taken from the module-level value
    # max_number_of_neighbours and is additionally capped at the number of samples.

    """

    X, y_true = data

    # get the true number of clusters in the dataset (label -1 is not counted)
    n_unique = len(np.unique(y_true))
    tot_clusters = n_unique - 1 if np.any(y_true == -1) else n_unique

    # get range for n_clusters parameter
    if n_comp is None:  # if n_comp is not provided
        c_start = 2 if tot_clusters < 6 else tot_clusters - 3
        c_end = tot_clusters + 3
    else:  # if true n_comp is provided
        c_start = n_comp
        c_end = n_comp + 1

    # get range for n_neighbors parameter
    neighbor_start = tot_clusters
    # n_neighbors must not exceed n_samples: scikit-learn's neighbour search rejects
    # larger values, so such settings would abort the whole search.
    neighbor_end = min(max_number_of_neighbours, X.shape[0] + 1)

    # Only the 'nearest_neighbors' affinity with k-means label assignment is searched;
    # see the note in the docstring for the problems met with the other affinities.
    param_grid = list(model_selection.ParameterGrid({
        'n_clusters': range(c_start, c_end),
        'affinity': ['nearest_neighbors'],
        'n_neighbors': range(neighbor_start, neighbor_end),
        'assign_labels': ['kmeans'],
    }))

    if not param_grid:
        raise ValueError(
            "Empty parameter grid for Spectral Clustering: n_clusters range = "
            "[{}, {}), n_neighbors range = [{}, {}).".format(
                c_start, c_end, neighbor_start, neighbor_end))

    if display_results:
        print()
        print("Optimizing Parameters: Spectral Clustering =====")
        print()
        print("Total Parameter Settings: ", len(param_grid))

    score_l2, score_l1, score_cosine = [], [], []

    for para in param_grid:
        y_pred = get_cluster_label_sc(X, para)
        s_l2, s_l1, s_cosine = get_performance_metrics(X, y_true, y_pred)
        score_l2.append(s_l2)
        score_l1.append(s_l1)
        score_cosine.append(s_cosine)
        # store the score next to the parameters; the summary table and the
        # returned dictionary both rely on this extra key
        para['score_l2'] = s_l2

    # get optimal parameters based on the first metric (first maximum wins on ties)
    opt_para = param_grid[np.argmax(score_l2)]

    if display_results:
        # summarize the optimization: for every n_clusters value show the
        # setting that reached the maximum score
        df = pd.DataFrame(param_grid)
        best_rows = [df.loc[df['n_clusters'] == k, 'score_l2'].idxmax()
                     for k in range(c_start, c_end)]
        new_df = df.loc[best_rows].reset_index(drop=True)

        print()
        print()
        print(new_df)

    return opt_para, (score_l2, score_l1, score_cosine)


# ==================================================================================

def display_spectral_c_outputs(test_samples, para_sc):

    '''
    Display the outputs of the Spectral clustering algorithm and the Silhouette plot

    Parameters:
    -----------

    test_samples:  a tuple

    The first element, data matrix, is a 2 dimensional array of shape
    (n_samples, n_attributes) and the second element, clusters labels, is a one dimensional
    array of shape (n_samples) representing the cluster labels of individual points.


    para_sc: a dictionary
        It contains the parameters of the Spectral clustering algorithm. It looks like
        para_sc = {'n_clusters': 2,  'affinity':'nearest_neighbors', 'n_neighbors': 10, 'assign_labels':'kmeans'}

    Return:
    ------

    result: a dictionary
        It holds the results of the Spectral clustering object: the cluster labels
        under 'label' and the dense affinity matrix under 'affinity_matrix'.

    Note: the figure size is derived from the values f_w and f_h, which are read
    from module scope.

    '''

    # only the data matrix is needed here; the whole tuple goes to plot_dataset
    X, _ = test_samples

    # initialize the algorithm with parameters
    print("Parameters =====")
    print(para_sc)
    model = cluster.SpectralClustering(n_clusters=para_sc['n_clusters'],
                                       affinity=para_sc['affinity'],
                                       n_neighbors=para_sc['n_neighbors'],
                                       assign_labels=para_sc['assign_labels'],
                                       random_state=seed_num)
    # fit the data to the model
    model.fit(X)

    # get the attributes of the clustering results
    cluster_labels = model.labels_
    aff_mat = model.affinity_matrix_
    # The affinity matrix is not always sparse (e.g. a dense array for kernel
    # affinities), so convert only when a conversion method is available.
    if hasattr(aff_mat, 'toarray'):
        aff_mat = aff_mat.toarray()

    n_col = 2
    n_row = 1

    # left panel: original data, right panel: affinity matrix
    plt.figure(figsize=(n_col * f_w, n_row * f_h))
    plt.subplot(1, 2, 1)
    plot_dataset(test_samples, name="Original Data")
    plt.subplot(1, 2, 2)
    im = plt.imshow(aff_mat, interpolation='None', cmap='viridis')
    plt.title('Affinity matrix')
    plt.colorbar(im)
    plt.show()

    print('Using Sklearn')
    silhouette_plot(X, cluster_labels)

    result = {'label': cluster_labels,
              'affinity_matrix': aff_mat,
              }

    return result

 
 # August 5

def display_sc_outputs(test_samples, para_sc):

    '''
    Display the outputs of the Spectral clustering algorithm and the Silhouette plot
    in a single row of four panels, and measure the time taken by the clustering.

    Parameters:
    -----------

    test_samples:  a tuple

    The first element, data matrix, is a 2 dimensional array of shape
    (n_samples, n_attributes) and the second element, clusters labels, is a one dimensional
    array of shape (n_samples) representing the cluster labels of individual points.


    para_sc: a dictionary
        It contains the parameters of the Spectral clustering algorithm. It looks like
        para_sc = {'n_clusters': 2,  'affinity':'nearest_neighbors', 'n_neighbors': 10, 'assign_labels':'kmeans'}

    Return:
    ------

    result: a dictionary
        It holds the results of the Spectral clustering object: the cluster labels
        under 'labels', the dense affinity matrix under 'affinity_matrix' and the
        time in seconds spent on creating and fitting the model under 'tot_time'.

    Note: the figure size is derived from the values f_w and f_h, which are read
    from module scope.

    '''

    # only the data matrix is needed here; the whole tuple goes to plot_dataset
    X, _ = test_samples

    print("Parameters =====")
    print('n_components = {}, affinity = {}, n-neighbors = {}, assign_labels = {}'.format(
        para_sc['n_clusters'], para_sc['affinity'],
        para_sc['n_neighbors'], para_sc['assign_labels']))
    print()

    # catch warnings related to kneighbors_graph
    with warnings.catch_warnings():
        warnings.filterwarnings(
            "ignore",
            message="the number of connected components of the " +
            "connectivity matrix is [0-9]{1,2}" +
            " > 1. Completing it to avoid stopping the tree early.",
            category=UserWarning)
        warnings.filterwarnings(
            "ignore",
            message="Graph is not fully connected, spectral embedding" +
            " may not work as expected.",
            category=UserWarning)

        # time the construction and the fit of the model together
        t0 = time.time()
        model = cluster.SpectralClustering(n_clusters=para_sc['n_clusters'],
                                           affinity=para_sc['affinity'],
                                           n_neighbors=para_sc['n_neighbors'],
                                           assign_labels=para_sc['assign_labels'],
                                           random_state=seed_num)
        # fit the data to the model
        model.fit(X)
        t1 = time.time()

        # get the attributes of the clustering results
        cluster_labels = model.labels_
        aff_mat = model.affinity_matrix_
        # The affinity matrix is not always sparse (e.g. a dense array for kernel
        # affinities), so convert only when a conversion method is available.
        if hasattr(aff_mat, 'toarray'):
            aff_mat = aff_mat.toarray()

    tot_time = t1 - t0

    n_col = 4
    n_row = 1

    # panel 1: original data, panel 2: affinity matrix,
    # panels 3 and 4: axes handed over to silhouette_plot
    plt.figure(figsize=(n_col * f_w, n_row * f_h))
    plt.subplot(1, 4, 1)
    plot_dataset(test_samples, name="Original Data")
    plt.subplot(1, 4, 2)
    im = plt.imshow(aff_mat, interpolation='None', cmap='viridis')
    plt.title('Affinity matrix')
    plt.colorbar(im)

    ax3 = plt.subplot(1, 4, 3)
    ax4 = plt.subplot(1, 4, 4)

    silhouette_plot(X, cluster_labels, (ax3, ax4), tot_time)

    result = {'labels': cluster_labels,
              'affinity_matrix': aff_mat,
              'tot_time': tot_time
              }

    return result


# Test the result of Spectral clustering
def test_sc_varying_neighbour(X, test_para, n_neighbour_range=None):
    """
    Plot the result of spectral clustering by varying the parameter n_neighbors while
    keeping the other parameters fixed.

    Parameters:
    -----------

    X: an array

    The data matrix, X, is a 2 dimensional array of shape (n_samples, n_attributes)

    test_para: a dictionary
        It holds the parameters of the spectral clustering algorithm. The dictionary
        itself is not modified; every run works on a copy with its own 'n_neighbors'.

    n_neighbour_range: a tuple
        It holds the range (start, end) of the number of neighbours; end is excluded.
        If it is None, the range goes from 2 up to 2 * n_clusters + 3 (end excluded).

    Return:
        None

    Example of test parameter:

    test_para   = {'n_clusters': 2,
                    'affinity':'nearest_neighbors',
                    'n_neighbors': 7,
                    'assign_labels':'kmeans'}

    """

    if n_neighbour_range is None:
        # The parameter dictionaries of this module name the cluster count
        # 'n_clusters'; 'n_components' is still honoured when a caller supplies it.
        if 'n_components' in test_para:
            n_groups = test_para['n_components']
        else:
            n_groups = test_para['n_clusters']
        n_start = 2
        n_end = n_groups * 2 + 3
    else:
        n_start, n_end = n_neighbour_range[0], n_neighbour_range[1]

    for n_neig in range(n_start, n_end):
        # work on a copy so that the caller's dictionary keeps its own n_neighbors
        run_para = dict(test_para, n_neighbors=n_neig)
        print_parameters(run_para)
        label = get_cluster_label_sc(X, run_para)
        silhouette_plot(X, label)
        c_label, cluster_size = check_cluster_size(label)
        print('-----------------------')
        print("Total cluster = {} \t Cluster size {}\n".format(len(c_label), cluster_size))
    return None

