# About
Implement clustering algorithms
1. K-Means
1. Heirarchical clustering
1. DBSCAN

## The Team
| Name| Student ID|
|------------|---------------|
|Cynthia Cai | 5625483 |
|Pratyush Kumar | 5359252|


# Imports

// add the imports to the cell below

In [1]:
import numpy as np 
import pandas as pd
import scipy.spatial
from scipy.spatial import ConvexHull, distance_matrix
from sklearn.metrics.pairwise import euclidean_distances as eucDist
import glob
import seaborn as sns
import matplotlib.pyplot as plt
sns.set_theme(style="darkgrid")

# Reading the dataset


From the readme for the xyz files, we know that:

Ground truth labels:
|File range|Label|
|--|--|
|    000 - 099: |building|
|    100 - 199: |car|
|    200 - 299: |fence|
|    300 - 399: |pole|
|    400 - 499: |tree|


workflow:

iterate through the files, and collect them in a dataframe

Use [this link](https://pandas.pydata.org/docs/reference/api/pandas.concat.html#pandas.concat) for concatenating the dataframes

In [2]:
xyzPath = './scene_objects/data/*.xyz'

dataPathsList = glob.glob(xyzPath)

In [3]:
allPointsDF= pd.DataFrame(columns=['x','y','z', 'fileNo', 'groundLabel'])
# featureDF = pd.DataFrame(columns=['Label' , 'convHull', median] )

def df_maker(df1, df2):
    return pd.concat([df1, df2], sort=False, ignore_index=True)

labelToGive = None
for path in dataPathsList:
    indx = int(path.split('\\')[-1][0:3])
    # if else to determine label
    if indx>=0 and indx<100:
        labelToGive = 'building' 
    elif indx>=100 and indx<200:
        labelToGive = 'car' 
    elif indx>=200 and indx<300:
        labelToGive = 'fence' 
    elif indx>=300 and indx<400:
        labelToGive = 'pole' 
    elif indx>=400 and indx<500:
        labelToGive = 'tree' 

    # print(indx, labelToGive)        

    # using pandas to read dataset and make a dataFrame
    tempDF = pd.read_csv(path, delimiter=' ', header=None, dtype=np.float64, names=['x','y','z'])
    tempDF.loc[:,'fileNo'] = indx
    tempDF.loc[:,'groundLabel'] = labelToGive

    # merge with megaDFofPoints
    allPointsDF = df_maker(allPointsDF, tempDF)

# allPointsDF.head()

In [4]:
# save to pickle file
# allPointsDF.to_pickle('./scene_objects/compressedData.pkl')

## Making feature points
Identified feature points: `//add more`
* median height(z)
* convex hull

In [4]:
def label_determiner(indx):
    labelToGive=None
    if indx>=0 and indx<100:
        labelToGive = 'building' 
    elif indx>=100 and indx<200:
        labelToGive = 'car' 
    elif indx>=200 and indx<300:
        labelToGive = 'fence' 
    elif indx>=300 and indx<400:
        labelToGive = 'pole' 
    elif indx>=400 and indx<500:
        labelToGive = 'tree' 
    return labelToGive


featureDF = allPointsDF.groupby('fileNo').var()
featureDF.rename(columns={'x':'varX','y':'varY','z':'varZ'}, inplace=True)
featureDF.loc[:,'median_Z'] = allPointsDF.groupby('fileNo').z.median()
# featureDF.loc[:,'mean_Z'] = allPointsDF.groupby('fileNo').z.mean()

# range of x,y,z
featureDF.loc[:,'range_X'] = allPointsDF.groupby('fileNo').x.max() - allPointsDF.groupby('fileNo').x.min()
featureDF.loc[:,'range_Y'] = allPointsDF.groupby('fileNo').y.max() - allPointsDF.groupby('fileNo').y.min()
featureDF.loc[:,'range_Z'] = allPointsDF.groupby('fileNo').z.max() - allPointsDF.groupby('fileNo').z.min()

featureDF.loc[:,'Volume'] = allPointsDF.set_index('fileNo').loc[:,'x':'z'].groupby('fileNo').apply(ConvexHull).apply(lambda x: x.volume)

# points density
featureDF.loc[:,'footprintDensity'] =  allPointsDF.groupby('fileNo').count().x / (featureDF.range_X * featureDF.range_Y)
featureDF.loc[:,'volumeDensity'] =  allPointsDF.groupby('fileNo').count().x / featureDF.Volume

featureDF.loc[:,'label'] = featureDF.reset_index().fileNo.apply(label_determiner)

# standardize DF
standardFeatureDF = (featureDF.iloc[:,:-1] - featureDF.iloc[:,:-1].mean() ) / featureDF.iloc[:,:-1].std()

# join labels to the feature DF
standardFeatureDF = standardFeatureDF.join(other=featureDF.label ,on='fileNo')

featureDF.to_pickle('./scene_objects/featureData.pkl')
standardFeatureDF.to_pickle('./scene_objects/standardFeatureData.pkl')

featureDF

Unnamed: 0_level_0,varX,varY,varZ,median_Z,range_X,range_Y,range_Z,Volume,footprintDensity,volumeDensity,label
fileNo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,9.024868,1.760537,0.501052,17.92,10.480000,4.649994,5.02,104.189009,37.121449,17.362676,building
1,7.306054,2.628307,0.646622,7.53,10.540009,6.139984,3.60,137.366395,32.078876,15.112867,building
2,19.973520,18.707730,2.108935,13.35,17.039997,16.059998,7.49,1247.880682,32.711848,7.173763,building
3,27.224888,16.674539,2.437923,14.43,21.160004,16.750000,7.07,1326.712538,29.001490,7.747722,building
4,30.802399,22.456995,0.597981,7.80,23.579994,22.090012,5.21,1100.901866,23.277809,11.013697,building
...,...,...,...,...,...,...,...,...,...,...,...
495,2.468837,1.670906,2.625734,7.14,6.969986,5.820001,7.19,138.917876,67.520725,19.716685,tree
496,6.586047,4.938702,7.407274,9.84,10.640015,10.010002,12.44,645.231457,62.822416,10.369922,tree
497,1.379584,2.840469,2.370287,7.38,5.279999,6.920013,6.84,84.578072,31.994384,13.821549,tree
498,0.921677,5.734172,7.069670,12.82,4.089996,9.860001,11.28,200.910666,49.321346,9.899922,tree


### Plotting to see resemblamces and clusters, if any
needed: seaborn

In [5]:
# load df's
featureDF = pd.read_pickle('./scene_objects/featureData.pkl')
standardFeatureDF = pd.read_pickle('./scene_objects/standardFeatureData.pkl')

In [None]:
sns.pairplot(data=featureDF, hue="label")

normalize the feature df </br>
[from stackoverflow we see](https://stackoverflow.com/questions/26414913/normalize-columns-of-pandas-data-frame), that we can just use pandas for a standard scaling, or else, a [standard scaler from sklearn](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html) can also be applied </br>

from [answer here](https://stats.stackexchange.com/questions/417339/data-standardization-vs-normalization-for-clustering-analysis), we see that standard scaler is used for k means , so we are going with that

In [None]:
sns.pairplot(data=standardFeatureDF, hue="label")

# Clustering Algorithms
note: already loaded the featureDF and standardised in the cell above

## K-Means clustering

In [101]:
def kmeans(featureDF, k):
    """
    Using DBScan method to cluster feature points.
    Input parameter:
        featureDF: a DataFrame that stores feature points with label
        k: the number of clusters
    Output:
        cluster: a list of dataframes
        C = [C0,C1,C2,C3,C4]
        Cx is a DataFrame with the same column of featureDF
    """

    #pre-step: setting parameters and format conversion
    MaxInteration, epsilon = 50, 0.0000000000000000000000000000000000000000000000000000000000001

    column_name = standardFeatureDF.columns.values.tolist()
    col_num = len(column_name) # the number of features

    pts_label_array = standardFeatureDF.to_numpy() # has 500 elements
    pts_array = standardFeatureDF.loc[:,'varX':'volumeDensity'].to_numpy()

    cluster = [pd.DataFrame(columns = column_name)] * k # a list of dataframes(cluster)
    # Step1: initialize k centroids (array)
    centroid_idx = np.random.randint(0,len(pts_array),k).tolist()
    centroid_array = pts_array[centroid_idx]

    # step2: assign each point to a cluster
    interation = 0
    centroid_change = []
    while interation <= MaxInteration:
        interation += 1

        cls_0, cls_1, cls_2, cls_3, cls_4 = [], [], [], [], []

        tree = scipy.spatial.cKDTree(centroid_array)
        for j in range(0,len(pts_array)):
            pt = pts_array[j]
            d, i = tree.query(pt) # i is the index of centroid
            
            if i == 0:
                cls_0.append(j)
            elif i == 1:
                cls_1.append(j)
            elif i == 2:
                cls_2.append(j)
            elif i == 3:
                cls_3.append(j)
            elif i == 4:
                cls_4.append(j)
            
        cluster = []
        cluster.append(standardFeatureDF.iloc[cls_0])
        cluster.append(standardFeatureDF.iloc[cls_1])
        cluster.append(standardFeatureDF.iloc[cls_2])
        cluster.append(standardFeatureDF.iloc[cls_3])
        cluster.append(standardFeatureDF.iloc[cls_4])

        # step3: update centroids 更新均值向量
        old_centroid_array = centroid_array
        #print(interation," centroid",old_centroid_array)
        for i in range(0,k):
            centroid_array[i] = cluster[i].mean(axis = 0).to_numpy() # the input is a list of arrays
            #print(centroid_array[i])
            dist = scipy.spatial.distance.euclidean(old_centroid_array[i], centroid_array[i])
            if dist <= epsilon:
                break
    
    print("Iteration:", interation)
    return(cluster)


In [104]:
kmeans_cls = kmeans(standardFeatureDF, 5)

  centroid_array[i] = cluster[i].mean(axis = 0).to_numpy() # the input is a list of arrays


Iteration: 51


In [106]:
kmeans_cls[0]

Unnamed: 0_level_0,varX,varY,varZ,median_Z,range_X,range_Y,range_Z,Volume,footprintDensity,volumeDensity,label
fileNo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
33,-0.140919,-0.147776,-0.182274,2.169188,-0.218046,-0.153202,0.218281,-0.107749,0.409041,-0.630501,building
97,-0.059681,0.030607,-0.098544,2.239731,0.310777,0.475769,0.257681,0.231299,0.193707,-0.592124,building
300,-0.165002,-0.216161,0.714719,0.529665,-0.100528,-0.468210,1.076311,-0.237604,-1.314996,-0.957978,pole
301,-0.179199,-0.230730,1.630480,1.361589,-0.664605,-0.576689,1.459360,-0.283353,-0.110540,-0.680980,pole
302,-0.167816,-0.216766,0.498645,0.485880,-0.586263,-0.586076,1.141976,-0.279513,-0.923486,-0.867523,pole
...,...,...,...,...,...,...,...,...,...,...,...
488,-0.095497,-0.102528,0.699974,0.245060,0.270037,0.423616,1.803010,0.408979,1.476428,-0.491798,tree
491,-0.141534,-0.155401,2.185103,0.838596,-0.073109,0.128425,2.242968,0.202951,1.819516,-0.579737,tree
492,-0.149920,-0.171435,0.510802,-0.105224,-0.140485,-0.005085,0.798327,-0.093082,0.481012,-0.503497,tree
496,-0.124728,-0.144448,0.770711,-0.005490,-0.008866,0.102351,1.430905,0.180814,2.364991,-0.440335,tree


In [107]:
kmeans_cls[1]

Unnamed: 0_level_0,varX,varY,varZ,median_Z,range_X,range_Y,range_Z,Volume,footprintDensity,volumeDensity,label
fileNo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,-0.118543,-0.186223,-0.463418,-0.567404,-0.016701,-0.301320,-0.504040,-0.201540,0.446179,-0.151984,building
12,-0.124613,-0.192550,-0.492498,-0.674435,-0.107581,-0.332611,-0.346443,-0.198423,0.613379,-0.252765,building
18,-0.169340,-0.159683,-0.529466,-0.715788,-0.494600,-0.132341,-0.652882,-0.257462,0.469259,0.006584,building
25,-0.123659,-0.066677,-0.535565,-0.127116,-0.075461,0.342256,-0.827990,-0.196947,0.226089,0.387381,building
27,-0.137809,-0.107130,-0.561964,0.398309,-0.160071,0.178497,-0.628805,-0.198760,-0.092141,-0.125181,building
...,...,...,...,...,...,...,...,...,...,...,...
493,-0.154910,-0.178204,-0.230499,-0.648893,-0.196894,-0.079144,0.824593,-0.140897,1.116610,-0.256698,tree
494,-0.166394,-0.209862,-0.282211,-1.173102,-0.375518,-0.396238,0.135105,-0.241307,1.555613,0.046677,tree
495,-0.160097,-0.203533,-0.102139,-0.662272,-0.296391,-0.334697,0.281758,-0.200372,2.658229,0.127909,tree
497,-0.169454,-0.182386,-0.148770,-0.603891,-0.428791,-0.219957,0.205148,-0.241283,0.440906,-0.230490,tree


In [108]:
kmeans_cls[2]

Unnamed: 0_level_0,varX,varY,varZ,median_Z,range_X,range_Y,range_Z,Volume,footprintDensity,volumeDensity,label
fileNo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
6,-0.113495,-0.118877,-0.549384,-0.560106,-0.064492,-0.013431,-0.928677,-0.245457,0.729352,1.418477,building
65,-0.149758,-0.118020,-0.554631,-0.599026,-0.316758,-0.011346,-0.963699,-0.266881,0.763156,1.602550,building
100,-0.165070,-0.228047,-0.570862,0.967520,-0.446027,-0.693513,-1.018420,-0.300700,-0.778581,0.541225,car
101,-0.178768,-0.198209,-0.568329,0.517503,-0.661473,-0.388938,-0.961510,-0.299167,0.075355,1.457934,car
102,-0.162454,-0.226749,-0.555714,1.091579,-0.424874,-0.692472,-0.922111,-0.296234,0.408744,1.032873,car
...,...,...,...,...,...,...,...,...,...,...,...
243,-0.158055,-0.233331,-0.562280,-1.456492,-0.379434,-0.847888,-0.976832,-0.302716,-0.241752,1.215194,fence
254,-0.117034,-0.206911,-0.568059,-1.128101,-0.122466,-0.493243,-0.902411,-0.300771,-1.268524,0.918189,fence
255,-0.119553,-0.150631,-0.572028,-0.951743,-0.097396,-0.143815,-1.027175,-0.299791,-1.373243,0.815824,fence
268,-0.130296,-0.233366,-0.567330,-1.117154,-0.184356,-0.857276,-0.970265,-0.301264,0.058483,1.109442,fence


In [109]:
kmeans_cls[3]

Unnamed: 0_level_0,varX,varY,varZ,median_Z,range_X,range_Y,range_Z,Volume,footprintDensity,volumeDensity,label
fileNo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,-0.103778,-0.201913,-0.489991,1.959990,-0.021402,-0.456737,-0.193223,-0.226519,0.760904,-0.015205,building
2,-0.009724,0.104511,-0.196479,0.848326,0.492535,0.733408,0.347423,0.634529,0.485685,-0.634648,building
3,0.052568,0.067749,-0.136423,1.111039,0.815313,0.805380,0.255491,0.693879,0.254109,-0.599754,building
4,0.083300,0.172302,-0.472297,-0.501725,1.004905,1.362382,-0.151635,0.523874,-0.103126,-0.401196,building
5,-0.142520,-0.003433,-0.509320,0.597776,-0.167121,0.541483,-0.482151,-0.128204,0.167088,-0.194485,building
...,...,...,...,...,...,...,...,...,...,...,...
453,-0.117948,-0.188763,-0.036365,-0.251175,0.028738,-0.087488,0.870559,-0.063914,0.635660,-0.463527,tree
469,-0.116874,-0.145643,-0.031361,0.697510,0.029520,0.213958,0.557553,0.017690,0.806041,-0.408720,tree
476,-0.159128,-0.172558,0.119175,-0.110089,-0.211778,-0.004044,1.023778,-0.115347,0.846751,-0.398260,tree
477,-0.157005,-0.185252,-0.153279,0.415337,-0.214912,-0.112523,0.553175,-0.153384,1.022719,-0.276300,tree


In [110]:
kmeans_cls[4]

Unnamed: 0_level_0,varX,varY,varZ,median_Z,range_X,range_Y,range_Z,Volume,footprintDensity,volumeDensity,label
fileNo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
200,-0.091978,-0.232055,-0.566039,-1.1804,0.101597,-0.730021,-0.913355,-0.2944,-1.038021,-0.190796,fence
202,-0.180481,0.448594,-0.559859,-0.552808,-0.683409,1.100571,-0.749191,-0.280108,-1.01581,-0.437193,fence
203,-0.180542,0.040656,-0.559776,-0.68903,-0.684977,0.370418,-0.992154,-0.294163,-0.8994,0.056923,fence
209,0.49037,-0.233015,-0.554373,-1.158507,1.545481,-0.833285,-0.563139,-0.269327,-0.293893,-0.247385,fence
210,0.487152,-0.229255,-0.540533,-0.810656,1.905863,-0.501589,-0.652882,-0.212951,-1.208993,-0.661364,fence
218,0.199425,-0.233493,-0.569129,-0.736464,0.773007,-0.838501,-0.985587,-0.297396,-0.791614,0.442141,fence
221,0.028202,-0.229783,-0.562797,-0.791196,0.358566,-0.672653,-0.981209,-0.283504,-0.945273,-0.245191,fence
223,-0.002667,-0.231001,-0.558463,-0.555241,0.123535,-0.750883,-1.003098,-0.293702,-1.149359,-0.4731,fence
225,0.047769,-0.232935,-0.5581,-1.149993,0.713467,-0.738367,-0.965888,-0.288269,-1.143396,-0.368709,fence
227,-0.178443,0.569802,-0.552953,-0.985798,-0.25095,1.388457,-0.887089,-0.234366,-1.397599,-0.793276,fence


## Heirarchical clustering

This [ref was nice](https://www.section.io/engineering-education/hierarchical-clustering-in-python/) for heirarchical clustering understanding
Some other sources:
* [Statquest](https://www.youtube.com/watch?v=7xHsRkOdVwo&ab_channel=StatQuestwithJoshStarmer)
* Penn state [pseudo code](https://online.stat.psu.edu/stat508/lesson/12/12.7)
* pseudo code from [researchgate](https://www.researchgate.net/figure/The-hierarchical-clustering-algorithm-in-pseudocode_fig1_202144697)
* towards data science article to do [step by step](https://towardsdatascience.com/breaking-down-the-agglomerative-clustering-process-1c367f74c7c2) {this is a good one to follow}
* another one [for theory](https://towardsdatascience.com/machine-learning-algorithms-part-12-hierarchical-agglomerative-clustering-example-in-python-1e18e0075019)
* similar [theory as above](https://www.geeksforgeeks.org/ml-hierarchical-clustering-agglomerative-and-divisive-clustering/)
* real good [step by step explaination](https://medium.com/@darkprogrammerpb/agglomerative-hierarchial-clustering-from-scratch-ec50e14c3826), also the [github code](https://github.com/Darkprogrammerpb/DeepLearningProjects/blob/master/Project40/agglomerative_hierarchial_clustering/Hierarchial%20Agglomerative%20clustering.ipynb)

### To Think in heirarchical clustering:
* Which type of heirarchical clustering are we doing: lets begin with agglomerative clustering
* Within the selected type what distance metrics are we using


In [None]:

tempDF = standardFeatureDF.iloc[:,:-1].copy()

def heirarch_clust(dataDF):
    distances = eucDist(standardFeatureDF.drop('label', axis=1))
    
    pass


# calculate distances
# maybe change the distance computation
distMatDF = pd.DataFrame( distance_matrix(tempDF.values, tempDF.values), index = tempDF.index, columns = tempDF.index)
# distMatDF = pd.DataFrame( np.tril(distMatDF),  index = tempDF.index, columns = tempDF.index)
distMatDF = distMatDF.where(distMatDF!=0, np.nan)
distMatDF


devise new distance matrix and then repeat the sequence:
### TODO: 
* linkage between the clusters
* updation of the distance matrix

clusters to be made:
`vals.idxmin()` and `idVals.iloc[vals.idxmin()]`

In [4]:
tempDF = standardFeatureDF.iloc[:,:-1].copy()

distMatDF = pd.DataFrame( distance_matrix(tempDF.values, tempDF.values), index = tempDF.index, columns = tempDF.index)
# distMatDF = pd.DataFrame( np.tril(distMatDF),  index = tempDF.index, columns = tempDF.index)
# replace 0 distances with np.nan
distMatDF = distMatDF.where(distMatDF!=0, np.nan)
    
clusterKeeper = {}
clustDict={}
clusterKeeperList = []
clustCheck = {}
# clustCHECK WILL have two nodes each
iterationCounter=0
play=[]
m=len(distMatDF)
progression = [ [i] for i in range(m) ] 

while m>1: 

    # cluster size
    # print(f"Total sample = {m}")
    # compute distances

    # get indices with min dist
    vals = distMatDF.min(skipna=True)
    idVals = distMatDF.idxmin(skipna=True)

    # print(vals.min(), vals.idxmin()) # GIVES US THE MINIMUM VALUE and the index at which this was found in the vals series
    # print(idVals.iloc[vals.idxmin()])
    
    ind_to_pop = [idVals.loc[vals.idxmin()] , vals.idxmin()]
    # print(f"index {ind_to_pop}")
    play.append(ind_to_pop)
    # update distmatrix at some point
    # add updated new row, col to dist mat  
    # this updated row is basically the minimum of the two eliminated rows
    singleLink_minRow = distMatDF.loc[ind_to_pop].drop(ind_to_pop, axis=1).max()
    singleLink_minRow.rename(f"cluster {iterationCounter}", inplace=True)

    # pop row and col from dist mat
    distMatDF = distMatDF.drop(ind_to_pop, axis=0).drop(ind_to_pop, axis=1)
    # print("row,col ",len(distMatDF),len(distMatDF.columns))

    # min distance from other points

    distMatDF = distMatDF.append(singleLink_minRow)
    distMatDF.loc[:,singleLink_minRow.name] = singleLink_minRow
    # update value of m
    m = len(distMatDF)
    # m-=1
    clusterKeeper[f"iteration {iterationCounter}"] = {'indices_popped':ind_to_pop , "df":distMatDF.copy()}
    clusterKeeperList.append( (iterationCounter, ind_to_pop) )
    clustDict[f"cluster {iterationCounter}"] = ind_to_pop
    
    indPop1, indPop2 = ind_to_pop

    clustCheck[f"cluster {iterationCounter}"] = {'node1':indPop1 , "node2":indPop2, 'fullnodes':ind_to_pop}
    print("before" , clustCheck[f'cluster {iterationCounter}'])
    
    # Case: if first index is a cluster
    if (indPop1 in clustCheck.keys()) and (indPop2 in clustCheck.keys()): #both are clusters
        clustCheck[f"cluster {iterationCounter}"] = {'node1':clustCheck[indPop1]['fullnodes'].copy() , "node2":clustCheck[indPop2]['fullnodes'].copy() }
        tempFull = clustCheck[f"cluster {iterationCounter}"]["node1"].copy()
        # try:
        tempFull.append(clustCheck[f"cluster {iterationCounter}"]["node2"].copy()) #if it is a list
        # except:
        #     tempFull.append(clustCheck[f"cluster {iterationCounter}"]["node2"]) # if it isnt a list and thus can't be copied
        clustCheck[f"cluster {iterationCounter}"]["fullnodes"] = tempFull  


    # Case: if first index is a cluster
    elif indPop1 in clustCheck.keys(): #means first position is cluster
        clustCheck[f"cluster {iterationCounter}"] = {'node1':clustCheck[indPop1]['fullnodes'].copy() , "node2":indPop2 }
        tempFull = clustCheck[f"cluster {iterationCounter}"]["node1"].copy()
        try:
            tempFull.append(clustCheck[f"cluster {iterationCounter}"]["node2"].copy()) #if it is a list
        except:
            tempFull.append(clustCheck[f"cluster {iterationCounter}"]["node2"]) # if it isnt a list and thus can't be copied
        clustCheck[f"cluster {iterationCounter}"]["fullnodes"] = tempFull

    # Case: if second index is a cluster
    elif indPop2 in clustCheck.keys(): #means first position is cluster
        clustCheck[f"cluster {iterationCounter}"] = {'node1':indPop1 , "node2":clustCheck[indPop2]['fullnodes'].copy()}
        tempFull = clustCheck[f"cluster {iterationCounter}"]["node1"].copy()
        try:
            tempFull.append(clustCheck[f"cluster {iterationCounter}"]["node2"].copy())
        except:
            tempFull.append(clustCheck[f"cluster {iterationCounter}"]["node2"])

        clustCheck[f"cluster {iterationCounter}"]["fullnodes"] =  tempFull

    print("after" , clustCheck[f'cluster {iterationCounter}'])

    iterationCounter+=1
distMatDF

before {'node1': 151, 'node2': 142, 'fullnodes': [151, 142]}
after {'node1': 151, 'node2': 142, 'fullnodes': [151, 142]}
before {'node1': 198, 'node2': 123, 'fullnodes': [198, 123]}
after {'node1': 198, 'node2': 123, 'fullnodes': [198, 123]}
before {'node1': 160, 'node2': 136, 'fullnodes': [160, 136]}
after {'node1': 160, 'node2': 136, 'fullnodes': [160, 136]}
before {'node1': 389, 'node2': 370, 'fullnodes': [389, 370]}
after {'node1': 389, 'node2': 370, 'fullnodes': [389, 370]}
before {'node1': 148, 'node2': 135, 'fullnodes': [148, 135]}
after {'node1': 148, 'node2': 135, 'fullnodes': [148, 135]}
before {'node1': 175, 'node2': 167, 'fullnodes': [175, 167]}
after {'node1': 175, 'node2': 167, 'fullnodes': [175, 167]}
before {'node1': 96, 'node2': 83, 'fullnodes': [96, 83]}
after {'node1': 96, 'node2': 83, 'fullnodes': [96, 83]}
before {'node1': 194, 'node2': 103, 'fullnodes': [194, 103]}
after {'node1': 194, 'node2': 103, 'fullnodes': [194, 103]}
before {'node1': 185, 'node2': 104, 'ful

fileNo,cluster 498
fileNo,Unnamed: 1_level_1
cluster 498,


## DBSCAN

In [84]:
import random

In [87]:
def dbscan(featureDF, radius, MinPts):
    """
    Using DBScan method to cluster feature points.
    Input parameter:
        featureDF: a DataFrame that stores feature points with label
        radius: the radius of search circle 
        MinPts: the minimum number of neighbor points required for a core point
    Output:
        cluster: a list of dataframes
        C = [C0,C1,C2,C3,C4]
        Cx is a DataFrame with the same column of featureDF
    """
    # pre-step: data structure and kdtree
    column_name = featureDF.columns.values.tolist()
    pts_label_list = featureDF.to_numpy().tolist()

    pts_DF = featureDF.loc[:,'varX':'volumeDensity']
    pts_array = pts_DF.to_numpy() # an array of arrays (all pts)

    tree = scipy.spatial.cKDTree(pts_array)

    # step1: generate core points
    wait_idx = [*range(0,len(pts_DF))] # a list index (wait pts)
    core_idx = [] # a list of index (core pts)
    for i in wait_idx:
        pt = pts_array[i]
        neighbor = tree.query_ball_point(pt, radius) # a list of indexs of neighbors
        if len(neighbor) >= MinPts:
            core_idx.append(i) 

    # step2: pick a random core point and start the loop (using queue)
    # initialize cluster number and waiting list
    k = 0
    cluster = []

    while len(core_idx) > 0:
        clusterDF = pd.DataFrame(columns=column_name)
        
        start_idx = random.sample(core_idx,1)[0]
        queue = [start_idx]
        core_idx.remove(start_idx)
        wait_idx.remove(start_idx)
        clusterDF.append(pts_label_list[start_idx])

        # step3: process each pt in queue
        # first of all: core or non-core?
        # for a non-core point, assign it to a cluster (done)
        # for a core neighbor point, assign it to a cluster (done) and add its neighbors to queue
        while len(queue) > 0:
            pt = pts_array[queue.pop(0)] 
            neighbor_idx = tree.query_ball_point(pt, radius) # a list of indexs (neighbor pts)       
            
            if len(neighbor_idx) >= MinPts:
                for index in neighbor_idx:
                    if index in wait_idx: # 将 在邻域中 & 未处理 的点 
                        queue.append(index) # 加入队列
                        wait_idx.remove(index) # 移除出 等待集
                        clusterDF.loc[len(clusterDF.index)] = pts_label_list[index] # 加入 聚类簇
                        if index in core_idx:
                            core_idx.remove(index) # 移除出 核心集
        
        # step4: store the new cluster
        k = k + 1
        cluster.append(clusterDF)

        if k > 10:
            break
            print("DBScan clusters out of ten!")

    return(cluster)

In [91]:
dbscan_cluster = dbscan(standardFeatureDF,0.8,4)

len(dbscan_cluster)

5

In [92]:
dbscan_cluster[0]

Unnamed: 0,varX,varY,varZ,median_Z,range_X,range_Y,range_Z,Volume,footprintDensity,volumeDensity,label
0,-0.157509,-0.228395,-0.552459,-0.562538,-0.381000,-0.723763,-0.935244,-0.297704,0.223630,1.143596,car
1,-0.163223,-0.227765,-0.561238,-0.095494,-0.409989,-0.718545,-0.968076,-0.297533,0.409491,1.222249,car
2,-0.162091,-0.226152,-0.569303,-1.066071,-0.435058,-0.677867,-1.007476,-0.298819,0.170877,1.642588,car
3,-0.161111,-0.227507,-0.551441,-1.049044,-0.381001,-0.699771,-0.950565,-0.297975,0.207060,1.459098,car
4,-0.179574,-0.211453,-0.562673,-1.034448,-0.692811,-0.532882,-0.970265,-0.301543,0.036216,1.488650,car
...,...,...,...,...,...,...,...,...,...,...,...
310,0.025439,0.128938,-0.331429,1.597544,0.559128,0.971228,0.347423,0.626987,0.333725,-0.582889,building
311,0.113950,0.080148,-0.157371,1.765388,0.772224,0.816853,0.323346,0.742426,0.368063,-0.602664,building
312,-0.138919,-0.161494,1.005297,0.808190,0.009152,0.104436,2.175114,0.247970,1.843317,-0.579239,tree
313,-0.095497,-0.102528,0.699974,0.245060,0.270037,0.423616,1.803010,0.408979,1.476428,-0.491798,tree


In [93]:
dbscan_cluster[1]

Unnamed: 0,varX,varY,varZ,median_Z,range_X,range_Y,range_Z,Volume,footprintDensity,volumeDensity,label
0,-0.162153,-0.219061,1.360613,0.855624,-0.051172,-0.48177,1.422149,-0.202855,-1.173432,-0.948402,pole
1,-0.130633,-0.221173,1.410574,1.382266,0.154871,-0.488027,1.55348,-0.170876,-1.071325,-0.923996,pole
2,-0.144408,-0.220098,1.260773,1.46862,0.061643,-0.510976,1.450604,-0.183992,-1.037393,-0.920945,pole
3,-0.178908,-0.225196,1.103379,1.264288,-0.544741,-0.462995,0.936224,-0.260755,-0.876087,-0.874048,pole
4,-0.141468,-0.227802,1.235836,1.329966,0.059293,-0.700814,1.465926,-0.222026,-0.713254,-0.872649,pole
5,-0.164046,-0.202627,1.507739,1.115904,-0.32381,-0.59129,1.006267,-0.251589,-1.073353,-0.923273,pole
6,-0.17121,-0.22448,1.339514,0.201274,0.143904,-0.510975,1.082877,-0.207287,-1.381927,-1.002833,pole
7,-0.16603,-0.227921,1.177308,0.489529,-0.181224,-0.503676,0.977812,-0.234713,-1.29539,-0.974348,pole
8,-0.177677,-0.22695,1.043643,0.824001,-0.511052,-0.469253,1.126654,-0.268021,-1.067402,-0.88491,pole
9,-0.145583,-0.228541,1.424821,0.59656,0.066341,-0.612153,1.457171,-0.19875,-1.034955,-0.938911,pole


In [94]:
dbscan_cluster[2]

Unnamed: 0,varX,varY,varZ,median_Z,range_X,range_Y,range_Z,Volume,footprintDensity,volumeDensity,label
0,-0.177057,-0.231774,1.390324,1.283748,-0.604284,-0.722718,1.533781,-0.286766,0.702417,-0.489609,pole
1,-0.180492,-0.229725,1.343595,1.115904,-0.724148,-0.715419,1.32584,-0.294062,1.282131,-0.444953,pole
2,-0.180812,-0.22988,1.15032,1.158473,-0.744518,-0.71646,1.056611,-0.297604,1.541578,-0.236845,pole


In [95]:
dbscan_cluster[3]

Unnamed: 0,varX,varY,varZ,median_Z,range_X,range_Y,range_Z,Volume,footprintDensity,volumeDensity,label
0,0.504128,-0.232086,-0.516639,-1.122019,1.408378,-0.766529,-0.582839,-0.269127,-0.864162,-0.387374,fence
1,0.565331,-0.233199,-0.554604,-1.511223,1.595622,-0.786346,-0.73168,-0.265631,-0.776632,-0.396926,fence
2,0.497318,-0.233244,-0.538195,-1.158507,1.634792,-0.821812,-0.749191,-0.269209,-0.428294,-0.229622,fence
3,0.487152,-0.229255,-0.540533,-0.810656,1.905863,-0.501589,-0.652882,-0.212951,-1.208993,-0.661364,fence


In [96]:
dbscan_cluster[4]

Unnamed: 0,varX,varY,varZ,median_Z,range_X,range_Y,range_Z,Volume,footprintDensity,volumeDensity,label
0,-0.123659,-0.066677,-0.535565,-0.127116,-0.075461,0.342256,-0.82799,-0.196947,0.226089,0.387381,building
1,-0.137331,0.08277,-0.562417,-0.183065,-0.109149,0.608238,-0.519362,-0.191003,0.089068,0.401694,building
2,0.067023,0.14144,-0.53032,-0.409289,0.936747,0.815812,-0.348631,0.204888,0.047161,-0.188392,building


# Validation

In [None]:
def validateModels(classifiedData, originalData):
    pass