In [1]:
#Loading the modules required
import numpy as my_npy
import pandas as my_pnds
import matplotlib.pyplot as mypltlib
%matplotlib inline
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.preprocessing import normalize

#Loading the data file
df = my_pnds.read_csv("BR_mod.csv")
df.head()

# computing number of rows
rows = len(df.axes[0])
  
# computing number of columns
cols = len(df.axes[1])
print("Number of Rows    : ", rows)
print("Number of Columns : ", cols)

#Defining our function for estimating missing values by mean
def miss_val_esti_mean(updated_df):    
    updated_df['patient.stage_event.pathologic_stage']=updated_df['patient.stage_event.pathologic_stage'].fillna(updated_df['patient.stage_event.pathologic_stage'].mean())
    updated_df['patient.tissue_prospective_collection_indicator']=updated_df['patient.tissue_prospective_collection_indicator'].fillna(updated_df['patient.tissue_prospective_collection_indicator'].mean())
    updated_df['patient.race_list.race']=updated_df['patient.race_list.race'].fillna(updated_df['patient.race_list.race'].mean())
    updated_df['patient.lymph_node_examined_count']=updated_df['patient.lymph_node_examined_count'].fillna(updated_df['patient.lymph_node_examined_count'].mean())
    updated_df['patient.margin_status']=updated_df['patient.margin_status'].fillna(updated_df['patient.margin_status'].mean())
    updated_df['patient.menopause_status']=updated_df['patient.menopause_status'].fillna(updated_df['patient.menopause_status'].mean())
    updated_df['patient.number_of_lymphnodes_positive_by_he']=updated_df['patient.number_of_lymphnodes_positive_by_he'].fillna(updated_df['patient.number_of_lymphnodes_positive_by_he'].mean())
    updated_df['patient.other_dx']=updated_df['patient.other_dx'].fillna(updated_df['patient.other_dx'].mean())
    updated_df['patient.person_neoplasm_cancer_status']=updated_df['patient.person_neoplasm_cancer_status'].fillna(updated_df['patient.person_neoplasm_cancer_status'].mean())
    updated_df['patient.axillary_lymph_node_stage_method_type']=updated_df['patient.axillary_lymph_node_stage_method_type'].fillna(updated_df['patient.axillary_lymph_node_stage_method_type'].mean())
    updated_df['patient.breast_carcinoma_estrogen_receptor_status']=updated_df['patient.breast_carcinoma_estrogen_receptor_status'].fillna(updated_df['patient.breast_carcinoma_estrogen_receptor_status'].mean())
    updated_df['patient.breast_carcinoma_progesterone_receptor_status']=updated_df['patient.breast_carcinoma_progesterone_receptor_status'].fillna(updated_df['patient.breast_carcinoma_progesterone_receptor_status'].mean())
    updated_df['patient.breast_carcinoma_surgical_procedure_name']=updated_df['patient.breast_carcinoma_surgical_procedure_name'].fillna(updated_df['patient.breast_carcinoma_surgical_procedure_name'].mean())
    updated_df['patient.histological_type']=updated_df['patient.histological_type'].fillna(updated_df['patient.histological_type'].mean())
    updated_df['patient.history_of_neoadjuvant_treatment']=updated_df['patient.history_of_neoadjuvant_treatment'].fillna(updated_df['patient.history_of_neoadjuvant_treatment'].mean())
    updated_df['patient.initial_pathologic_diagnosis_method']=updated_df['patient.initial_pathologic_diagnosis_method'].fillna(updated_df['patient.initial_pathologic_diagnosis_method'].mean())
    updated_df['patient.lab_proc_her2_neu_immunohistochemistry_receptor_status']=updated_df['patient.lab_proc_her2_neu_immunohistochemistry_receptor_status'].fillna(updated_df['patient.lab_proc_her2_neu_immunohistochemistry_receptor_status'].mean())
    return updated_df  

#Call fun miss_val_esti_mean() to fill missing values in datasets
df = miss_val_esti_mean(df)

#standardizing the data
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
df_scaled = scaler.fit_transform(df)

# statistics of scaled data
#my_pnda.DataFrame(df_scaled).describe()

#Normalizing the datasets
udf_scaled = normalize(df_scaled)
udf_scaled = my_pnds.DataFrame(udf_scaled, columns=df.columns)
udf_scaled.head()

#-----------------For verifying correctness of the output---------STARTS HERE------
#import scipy.cluster.hierarchy as shc
#mypltlib.figure(figsize=(10, 7)) 
#mypltlib.title("Hierarchical Agglomerative Clustering Dendrogram") 
#dend = shc.dendrogram(shc.linkage(udf_scaled, method='ward'))  
#mypltlib.axhline(y=3, color='r', linestyle='--')
#We have 3 clusters as this line cuts the dendrogram at 3 points. Let’s now apply hierarchical agglomerative 
#clustering for 3 clusters from sklearn.cluster import AgglomerativeClustering
#cluster = AgglomerativeClustering(n_clusters=3, affinity='euclidean', linkage='ward')  
#cluster.fit_predict(udf_scaled)
#-----------------For verifying correctness of the output---------ENDS HERE------

#Implementing our Hierarchical Agglomerative Clustering
class Dist_comp_grd:
    def __init__(self):
        pass    
    
    def comp_dist(self,samples):            
        Dist_matx = my_npy.zeros((len(samples),len(samples)))
        for i in range(Dist_matx.shape[0]):
            for j in range(Dist_matx.shape[0]):
                if i!=j:
                    Dist_matx[i,j] = float(self.dist_calc(samples[i],samples[j]))
                else:
                    Dist_matx[i,j] = 10**5
                    
        return Dist_matx    
    
    def dist_calc(self,sample1,sample2):       
        dist = []
        #dist = dist.astype(np.int32)
        for i in range(len(sample1)):
            for j in range(len(sample2)):
                try:
                    #dist.append(my_npy.linalg.norm(my_npy.array(sample1[i])-my_npy.array(sample2[j])),dtype="object")
                    dist.append(my_npy.linalg.norm(my_npy.array(sample1[i])-my_npy.array(sample2[j])))  
                except:
                    dist.append(self.intersampledist(sample1[i],sample2[j]))
                    
        return min(dist)
    
    def inter_cluster_dist(self,cl,sample):
        if sample[0]!='<class \'list\'>':
            sample = [sample]
        dist   = []
        for i in range(len(cl)):
            for j in range(len(sample)):
                dist.append(my_npy.linalg.norm(my_npy.array(cl[i])-my_npy.array(sample[j])))
                
        return min(dist)

#Starting the process of clustering here
X = df.iloc[0:rows, 0: cols].values
processeing = [[i] for i in range(X.shape[0])]
samples = [[list(X[i])] for i in range(X.shape[0])]
m = len(samples)
distcal  = Dist_comp_grd()

#Iterates untill left with only one cluster
while m>1:
    print('Sample_Size_Before_Clustering    :',m)
    Dist_matx = distcal.comp_dist(samples)
    required_sample_ind = my_npy.where(Dist_matx==Dist_matx.min())[0]
    value_to_add = samples.pop(required_sample_ind[1])
    samples[required_sample_ind[0]].append(value_to_add)    
    print('Cluster_NODE_1                   :',processeing[required_sample_ind[0]])
    print('Cluster_NODE_2                   :',processeing[required_sample_ind[1]])    
    processeing[required_sample_ind[0]].append(processeing[required_sample_ind[1]])
    processeing[required_sample_ind[0]] = [processeing[required_sample_ind[0]]]
    v = processeing.pop(required_sample_ind[1])
    m = len(samples)    
    print('Processeing_With_Current_Sample  :',processeing)
    print('Cluster_Obtained                 :',processeing[required_sample_ind[0]])
    print('Sample_Size_After_Clustering     :',m)
    print('\n')
    
#Showing results of clustering using dendrogram
D = linkage(X, 'single')
mypltlib.figure(figsize=(15, 10))
mypltlib.title("Hierarchical Agglomerative Clustering Dendrogram")  
mypltlib.axhline(y=3, color='r', linestyle='--')
dn = dendrogram(D)

Number of Rows    :  1097
Number of Columns :  24
Sample_Size_Before_Clustering    : 1097
Cluster_NODE_1                   : [194]
Cluster_NODE_2                   : [247]
Processeing_With_Current_Sample  : [[0], [1], [2], [3], [4], [5], [6], [7], [8], [9], [10], [11], [12], [13], [14], [15], [16], [17], [18], [19], [20], [21], [22], [23], [24], [25], [26], [27], [28], [29], [30], [31], [32], [33], [34], [35], [36], [37], [38], [39], [40], [41], [42], [43], [44], [45], [46], [47], [48], [49], [50], [51], [52], [53], [54], [55], [56], [57], [58], [59], [60], [61], [62], [63], [64], [65], [66], [67], [68], [69], [70], [71], [72], [73], [74], [75], [76], [77], [78], [79], [80], [81], [82], [83], [84], [85], [86], [87], [88], [89], [90], [91], [92], [93], [94], [95], [96], [97], [98], [99], [100], [101], [102], [103], [104], [105], [106], [107], [108], [109], [110], [111], [112], [113], [114], [115], [116], [117], [118], [119], [120], [121], [122], [123], [124], [125], [126], [127], [128],

Cluster_NODE_1                   : [908]
Cluster_NODE_2                   : [917]
Processeing_With_Current_Sample  : [[0], [1], [2], [3], [4], [5], [6], [7], [8], [9], [10], [11], [12], [13], [14], [15], [16], [17], [18], [19], [20], [21], [22], [23], [24], [25], [26], [27], [28], [29], [30], [31], [32], [33], [34], [35], [36], [37], [38], [39], [40], [41], [42], [43], [44], [45], [46], [47], [48], [49], [50], [51], [52], [53], [54], [55], [56], [57], [58], [59], [60], [61], [62], [63], [64], [65], [66], [67], [68], [69], [70], [71], [72], [73], [74], [75], [76], [77], [78], [79], [80], [81], [82], [83], [84], [85], [86], [87], [88], [89], [90], [91], [92], [93], [94], [95], [96], [97], [98], [99], [100], [101], [102], [103], [104], [105], [106], [107], [108], [109], [110], [111], [112], [113], [114], [115], [116], [117], [118], [119], [120], [121], [122], [123], [124], [125], [126], [127], [128], [129], [130], [131], [132], [133], [134], [135], [136], [137], [138], [139], [140], [141]

  dist.append(my_npy.linalg.norm(my_npy.array(sample1[i])-my_npy.array(sample2[j])))


AttributeError: 'Dist_comp_grd' object has no attribute 'intersampledist'