### A.11 Functions to Display Datasets

In [1]:
def plot_dataset(data, name=None, axis_name=None, ax=None):
    """
    Draw a 2-D scatter plot of a labelled dataset.

    Parameters:
    ----------

    data: a tuple

        ``(X, y)``. ``X`` is the data matrix; its first two columns
        (``X[:, 0]`` and ``X[:, 1]``) are plotted. ``y`` holds one cluster
        label per point. Points labelled ``-1`` are drawn in grey with the
        marker ``mark[-1]``; every other label is drawn with the '.' marker
        and a colour taken from the ``nipy_spectral`` colormap.

    name: a string, optional

        Title of the axes. An empty title is set when it is omitted.

    axis_name: a tuple of two strings, optional

        Names of the x-axis and y-axis. Defaults to 'Feature 1' and
        'Feature 2'.

    ax: matplotlib axes, optional

        Axes to draw on. The current axes (``plt.gca()``) are used when it
        is omitted.

    Return:
    ------

    None

    Notes:
    -----

    The marker table ``mark``, the marker size ``size`` and the transparency
    ``aph`` are read from module level; they are defined outside this
    function.
    """
    X, y = data
    y = np.asarray(y)  # boolean masks and the colour scaling below need an array
    labels = np.unique(y)
    n_cluster = len(labels)

    if ax is None:
        ax = plt.gca()

    if n_cluster == 1:
        # no structure: a single group is drawn uniformly in grey, unlabelled
        ax.scatter(X[:, 0], X[:, 1], marker=mark[-1], s=size, c='Grey', alpha=aph)
    else:
        # one RGBA colour per point; labels are scaled by the number of groups
        colors = cm.nipy_spectral(y / n_cluster)

        for k in labels:
            mask = y == k
            if k == -1:
                point_color, point_marker = 'Grey', mark[k]
            else:
                point_color, point_marker = colors[mask], '.'

            ax.scatter(X[:, 0][mask], X[:, 1][mask],
                       c=point_color,
                       marker=point_marker,
                       s=size,
                       alpha=aph,
                       label='{}'.format(k))

    ax.set_title(name if name else '')

    if axis_name:
        x_label, y_label = axis_name[0], axis_name[1]
    else:
        x_label, y_label = 'Feature 1', 'Feature 2'
    ax.set_xlabel(x_label)
    ax.set_ylabel(y_label)

    # Keep the legend only for a handful of groups to avoid cluttering the
    # figure. The single-group scatter carries no label, so asking for a
    # legend there would only trigger a "no labelled artists" warning.
    if 1 < n_cluster < 5:
        ax.legend(loc='best', prop={'size': 8}, ncol=n_cluster)

    ax.axis('equal')

    return None



def plot_datasets(data, default_data=None):
    """
    Plot the given datasets as a grid of 2-D scatter plots.

    Parameters:
    ----------

    data: a list of tuple of two elements

        The first element in the tuple is another tuple of two elements. The inner tuple (D1, D2, etc see example below)
        contains X and y as two elements with array shapes (n_samples, n_attributes) and (n_samples, ) respectively.
        The second element of the tuple (outer tuple) is a dictionary that holds parameters of datasets.

        Example of data:

        data = [(D1, {'name': 'D1', 'n_cluster': 2}),
                (D2, {'name': 'D2', 'n_cluster': 3}),
                (D3, {'name': 'D3', 'n_cluster': 5})
               ]

    default_data: a dictionary, optional

        Default parameters of the datasets. It is accepted for backward
        compatibility only: neither these defaults nor the per-dataset
        dictionaries influence the plots, each dataset is simply passed to
        ``plot_dataset`` together with its axes.

    Return:
    ------

    None

    Notes:
    -----

    Fewer than five datasets are laid out in a single row; otherwise the
    grid is five columns wide with as many rows as needed. The panel size
    ``f_w`` x ``f_h`` is read from module level. The figure is not saved
    here; call ``plt.savefig`` afterwards if a file is needed.
    """
    max_col = 5

    # create plotting window
    num_datasets = len(data)
    if num_datasets < max_col:
        n_col, n_row = num_datasets, 1
    else:
        n_col = max_col
        n_row = -(-num_datasets // n_col)  # ceiling division

    plt.figure(figsize=(n_col * f_w, n_row * f_h))
    plt.subplots_adjust(top=0.92, bottom=0.15, left=0.10, right=0.95,
                        hspace=0.2, wspace=0.35)

    # subplot positions are 1-based
    for position, (dataset, _data_params) in enumerate(data, start=1):
        ax = plt.subplot(n_row, n_col, position)
        plot_dataset(data=dataset, ax=ax)

    return None


# --------------------------------------------------------------------

def plot_3d_dataset(X, label, ax, name=" "):
    """
    Plot a 2- or 3-dimensional dataset on a 3-D axes.

    Parameters:
    ----------

    X: a 2 dimensional array with 2 or 3 columns

        Columns 0, 1 and (when present) 2 are used as the x, y and z
        coordinates. Data with only two columns is drawn in the plane z = 0.

    label: a one dimensional array

        One label per point, used to colour the points. Points labelled
        ``-1`` are additionally over-plotted in grey with a '*' marker.

    ax: a 3-D matplotlib axes

        Axes to draw on (it must provide ``set_zlabel`` and ``view_init``).

    name: a string

        Title of the axes.

    Return:
    ------

    None

    Raises:
    ------

    ValueError
        If ``X`` has neither 2 nor 3 columns.

    Notes:
    -----

    The marker size ``size`` and the transparency ``aph`` are read from
    module level; they are defined outside this function.
    """
    n_dims = X.shape[1]
    if n_dims not in (2, 3):
        raise ValueError(
            "X must have 2 or 3 columns, got {}".format(n_dims)
        )

    x_points = X[:, 0]
    y_points = X[:, 1]
    if n_dims == 3:
        z_points = X[:, 2]
    else:
        # A full array of zeros rather than the scalar 0, so that the noise
        # mask below can index it just like the x and y coordinates.
        z_points = np.zeros(len(x_points))

    label = np.asarray(label)
    mask = label == -1

    ax.scatter(xs=x_points, ys=y_points, zs=z_points,
               c=label, s=size, marker='.', alpha=aph)

    # if noises are detected then plot them on top with gray colour
    if np.any(mask):
        ax.scatter(xs=x_points[mask], ys=y_points[mask], zs=z_points[mask],
                   c='Grey', s=size, marker='*', alpha=aph)

    ax.set_xlabel("X")
    ax.set_ylabel("Y")
    ax.set_zlabel("Z")
    ax.set_title(name)

    # rotate 3d figure: angles are in degrees, negative azim is a clockwise rotation
    ax.view_init(elev=30, azim=-60)

    return None


def visualize_3d_datasets(data):
    """
    Show each dataset as a 3-D scatter plot, all panels side by side in one row.

    Parameters:
    ----------

    data: a list of tuple of two elements

        Each entry is ``(D, params)`` where ``D[0]`` holds the points and
        ``D[1]`` the labels. ``params`` is unpacked but not used: the panels
        are drawn with the default title of ``plot_3d_dataset``.

        Example:

        data = [(data3d_repeated_D1, {'name': 'Repeated Column'}),
                (data3d_linComb_D1, {'name': 'Linear-Combination'}),
                (data3d_gaussianNoise_D1, {'name': 'Gaussian Noise'})
               ]

    Return:
    ------

    None
    """
    n_panels = len(data)

    # one panel of size f_w x f_h (module-level settings) per dataset
    figure = plt.figure(figsize=(n_panels * f_w, 1 * f_h))
    plt.subplots_adjust(
        top=0.95, bottom=0.08, left=0.05, right=0.9, hspace=0.2, wspace=0.2
    )

    # subplot positions are 1-based
    for panel, (dataset, _params) in enumerate(data, start=1):
        axes_3d = figure.add_subplot(1, n_panels, panel, projection='3d')
        points, labels = dataset[0], dataset[1]
        plot_3d_dataset(points, labels, axes_3d)

    return None



def visualize_3d_datasets_update(data):
    """
    Show each 3-D dataset on its own row: the 3-D view followed by its
    x-y, y-z and x-z projections.

    Parameters:
    ----------

    data: a list of tuple of two elements

        Each entry is ``(D, params)`` where ``D[0]`` holds the points,
        ``D[1]`` the labels and ``params['name']`` the title of the 3-D
        panel.

        Example:

        data = [(data3d_repeated_D1, {'name': 'Repeated Column'}),
                (data3d_linComb_D1, {'name': 'Linear-Combination'}),
                (data3d_gaussianNoise_D1, {'name': 'Gaussian Noise'})
               ]

    Return:
    ------

    None
    """
    # (column pair, axis names) of the three 2-D projections, in plotting order
    projections = (
        ((0, 1), ('X', 'Y')),
        ((1, 2), ('Y', 'Z')),
        ((0, 2), ('X', 'Z')),
    )

    n_row = len(data)
    n_col = 4

    # fixed panel width of 3.5 so that the z-axis stays visible in the plot
    fig = plt.figure(figsize=(n_col * 3.5, n_row * f_h))
    plt.subplots_adjust(
        top=0.95, bottom=0.08, left=0.05, right=0.9, hspace=0.4, wspace=0.3
    )

    for row, (dataset, params) in enumerate(data):
        first_cell = row * n_col + 1  # 1-based position of the row's 3-D panel

        axes_3d = fig.add_subplot(
            n_row, n_col, first_cell, projection='3d', proj_type='ortho'
        )
        plot_3d_dataset(dataset[0], dataset[1], axes_3d, params['name'])

        # the 2-D panels are drawn on the current axes selected by plt.subplot
        for offset, (columns, axis_labels) in enumerate(projections, start=1):
            plt.subplot(n_row, n_col, first_cell + offset)
            plot_dataset(
                data=(dataset[0][:, columns], dataset[1]),
                axis_name=axis_labels,
            )

    return None


# ---------------------------------------------------------------

def plot_cluster_hist(data_tuple):
    """
    Plot the distributions of each feature of a cluster in the dataset.

    One panel is drawn per distinct label, in sorted label order, and each
    panel overlays one histogram per column of the data matrix.

    Parameters:
    ----------

    data_tuple: a tuple

        ``(X, label)`` where ``X`` is a 2 dimensional data matrix and
        ``label`` holds one cluster label per row of ``X``.

    Return:
    ------

    None

    Notes:
    -----

    The panel size ``f_w`` x ``f_h`` and the per-feature colour table
    ``colors`` (indexed by column number) are read from module level; they
    are defined outside this function.
    """
    X, label = data_tuple
    label = np.asarray(label)  # needed for the element-wise comparison below
    cluster_labels = np.unique(label)
    n_clusters = len(cluster_labels)

    plt.figure(figsize=(f_w * n_clusters, f_h))

    # Panels are placed by position, not by label value: a label such as -1
    # (noise) or a gap in the numbering is not a valid subplot index.
    for position, cluster_num in enumerate(cluster_labels, start=1):
        mask = label == cluster_num
        ax = plt.subplot(1, n_clusters, position)

        for X_dim in range(X.shape[1]):
            ax.hist(X[:, X_dim][mask], color=colors[X_dim],
                    label='Dim' + str(X_dim))

        ax.set_title('Cluster {}'.format(cluster_num))
        ax.legend(prop={'size': 10})

    return None


def visualize_3d_datasets_in_2d(data):
    """
    Display the xy, yz and xz planes of each 3-D dataset, one dataset per row.

    Parameters:
    ----------

    data: a list of tuple of two elements

        Each entry is ``(D, params)`` where ``D[0]`` holds the points and
        ``D[1]`` the labels. ``params`` is unpacked but not used.

        Example:

        data = [(data3d_repeated_D1, {'name': 'Repeated Column'}),
                (data3d_linComb_D1, {'name': 'Linear-Combination'}),
                (data3d_gaussianNoise_D1, {'name': 'Gaussian Noise'})
               ]

    Return:
    ------

    None
    """
    # (column pair, axis names) of the three planes, in plotting order
    planes = (
        ((0, 1), ('X', 'Y')),
        ((1, 2), ('Y', 'Z')),
        ((0, 2), ('X', 'Z')),
    )
    n_planes = 3
    n_rows = len(data)

    # panel size f_w x f_h comes from module-level settings
    plt.figure(figsize=(3 * f_w, n_rows * f_h))
    plt.subplots_adjust(
        top=0.95, bottom=0.08, left=0.10, right=0.95, hspace=0.4, wspace=0.35
    )

    for row, (dataset, _params) in enumerate(data):
        for column, (columns, axis_labels) in enumerate(planes, start=1):
            # 1-based subplot position, filled row by row
            plt.subplot(n_rows, 3, row * n_planes + column)
            plot_dataset(
                data=(dataset[0][:, columns], dataset[1]),
                axis_name=axis_labels,
            )

    return None
