# Imports

In [15]:
import glob
import os

import numpy as np 
import pandas as pd

from scipy.spatial import ConvexHull

# scikit learn imports for classification functions
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

# plotting imports
import seaborn as sns
import matplotlib.pyplot as plt
sns.set_theme(style="darkgrid")

# Reading the dataset

From the readme for the xyz files, we know that:

Ground truth labels:
|File range|Label|
|:--|:--|
|    000 - 099: |building|
|    100 - 199: |car|
|    200 - 299: |fence|
|    300 - 399: |pole|
|    400 - 499: |tree|

In the following cell, we iterate through the files and collect their points in a single dataframe.

In [3]:
xyzPath = './scene_objects/data/*.xyz'
# glob returns paths in arbitrary order; sorting keeps the row order of allPointsDF reproducible
dataPathsList = sorted(glob.glob(xyzPath))


def df_maker(df1, df2):
    """Stack two dataframes row-wise and return a new frame with a fresh 0..n-1 index."""
    return pd.concat([df1, df2], sort=False, ignore_index=True)


def read_xyz_file(path):
    """
    Summary: reads one .xyz file and tags every point with its file number and ground truth label
    ===================
    Arguments:
        path (str): path to a space separated x y z file whose name starts with a three digit file number
    Returns:
        pd.DataFrame: columns x, y, z (float64), fileNo (int) and groundLabel
            (str, or None when the file number is outside 000 - 499)
    Raises:
        ValueError: if the first three characters of the file name are not a number
    """
    # one label per block of 100 file numbers, in the order listed in the readme
    labels = ('building', 'car', 'fence', 'pole', 'tree')

    # basename (rather than splitting on '/') also handles Windows path separators
    indx = int(os.path.basename(path)[0:3])
    # the label is decided per file, so an out-of-range number can never
    # inherit the label of the previously read file
    labelToGive = labels[indx // 100] if 0 <= indx < 500 else None

    pointsDF = pd.read_csv(path, delimiter=' ', header=None, dtype=np.float64, names=['x','y','z'])
    pointsDF['fileNo'] = indx
    pointsDF['groundLabel'] = labelToGive
    return pointsDF


# read each file once and concatenate a single time: growing a dataframe with
# concat inside the loop copies all previously read points on every iteration
perFileDFs = [read_xyz_file(path) for path in dataPathsList]
if perFileDFs:
    allPointsDF = pd.concat(perFileDFs, sort=False, ignore_index=True)
else:
    # pd.concat raises on an empty list; keep the expected columns when no file matched
    allPointsDF = pd.DataFrame(columns=['x','y','z', 'fileNo', 'groundLabel'])

allPointsDF.head()


# Making features

Normalize the feature dataframe.<br/>
[As this Stack Overflow question shows](https://stackoverflow.com/questions/26414913/normalize-columns-of-pandas-data-frame), plain pandas arithmetic is enough for standard scaling; alternatively, the [StandardScaler from sklearn](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html) can be applied.<br/>

From [this answer](https://stats.stackexchange.com/questions/417339/data-standardization-vs-normalization-for-clustering-analysis), we see that standard scaling is what is used for k-means, so we are going with that.

In [16]:
def label_determiner(indx):
    """
    Summary: maps a file number to its ground truth label, following the ranges given in the readme
    ===================
    Arguments:
        indx (int): file number
    Returns:
        str or None: 'building', 'car', 'fence', 'pole' or 'tree'; None when indx is outside 0 - 499
    """
    # (first file number of the range, label); every range covers 100 file numbers
    rangeStarts = (
        (0, 'building'),
        (100, 'car'),
        (200, 'fence'),
        (300, 'pole'),
        (400, 'tree'),
    )
    for lowerBound, label in rangeStarts:
        if lowerBound <= indx < lowerBound + 100:
            return label
    return None


# Per-file (per-object) features, indexed by fileNo.
# Only the coordinates take part in the numeric aggregations: the string column
# groundLabel must be left out, otherwise groupby(...).var() raises a TypeError
# on pandas >= 2.0. The coordinates are coerced to float so the aggregations
# also work if they were stored with object dtype.
coordCols = ['x', 'y', 'z']
pointsDF = allPointsDF[['fileNo'] + coordCols].astype({col: np.float64 for col in coordCols})
pointsByFile = pointsDF.groupby('fileNo')  # grouped once and reused below

featureDF = pointsByFile[coordCols].var().rename(columns={'x':'varX','y':'varY','z':'varZ'})
featureDF['median_Z'] = pointsByFile['z'].median()
# featureDF['mean_Z'] = pointsByFile['z'].mean()

# range of x,y,z
coordRange = pointsByFile[coordCols].max() - pointsByFile[coordCols].min()
featureDF['range_X'] = coordRange['x']
featureDF['range_Y'] = coordRange['y']
featureDF['range_Z'] = coordRange['z']

# volume of the 3D convex hull around the points of each file
featureDF['Volume'] = pointsByFile[coordCols].apply(lambda pts: ConvexHull(pts[coordCols].to_numpy()).volume)

# points density
pointCount = pointsByFile['x'].count()
featureDF['footprintDensity'] = pointCount / (featureDF['range_X'] * featureDF['range_Y'])
featureDF['volumeDensity'] = pointCount / featureDF['Volume']

# derive the label from the fileNo index itself; going through reset_index()
# would align a 0..N-1 indexed Series against fileNo and mislabel the rows
# as soon as a file number is missing
featureDF['label'] = featureDF.index.map(label_determiner)

# drop the label by name instead of relying on it being the last column
noLabelFeatureDF = featureDF.drop(columns='label')

# standardize DF
standardFeatureDF = (noLabelFeatureDF - noLabelFeatureDF.mean() ) / noLabelFeatureDF.std()
standardFeatureDF = standardFeatureDF.join(featureDF['label']) # join labels to the DF (both are indexed by fileNo)

# normalize df using min max scaling
minMaxFeatureDF = (noLabelFeatureDF- noLabelFeatureDF.min()) / (noLabelFeatureDF.max() - noLabelFeatureDF.min())
minMaxFeatureDF = minMaxFeatureDF.join(featureDF['label']) # join labels to the DF (both are indexed by fileNo)

# featureDF.to_pickle('./scene_objects/featureData.pkl')
# standardFeatureDF.to_pickle('./scene_objects/standardFeatureData.pkl')
# minMaxFeatureDF.to_pickle('./scene_objects/minMaxFeatureDF.pkl')

### Plotting to see resemblances and clusters, if any

In [None]:
# # load df's

# featureDF = pd.read_pickle('./scene_objects/featureData.pkl')
# standardFeatureDF = pd.read_pickle('./scene_objects/standardFeatureData.pkl')
# minMaxFeatureDF = pd.read_pickle('./scene_objects/minMaxFeatureDF.pkl')

In [None]:
# pairwise scatter plots of the unscaled features, coloured by label;
# the trailing ';' keeps the PairGrid repr out of the cell output
sns.pairplot(data=featureDF, hue="label");

In [None]:
# pairwise scatter plots of the standardized features, coloured by label;
# the trailing ';' keeps the PairGrid repr out of the cell output
sns.pairplot(data=standardFeatureDF, hue="label");

In [None]:
# pairwise scatter plots of the min-max scaled features, coloured by label;
# the trailing ';' keeps the PairGrid repr out of the cell output
sns.pairplot(data=minMaxFeatureDF, hue="label");

# Classification
## Split dataset to train and test

In [50]:
def testTrainSplitter(DataFrame, testSize=0.3, randomState=45):
    """
    Summary: separates the 'label' column from the feature columns and splits both into shuffled train and test sets
    ===================
    Arguments:
        DataFrame (pd.DataFrame): feature dataframe containing a 'label' column; every other column is used as a feature
        testSize (float): passed to train_test_split as test_size
        randomState (int): passed to train_test_split as random_state
    Returns:
        tuple: (X_train, X_test, y_train, y_test)
    """
    featureColumns = DataFrame.drop(columns='label')
    labels = DataFrame['label']
    splits = train_test_split(
        featureColumns,
        labels,
        test_size=testSize,
        random_state=randomState,
        shuffle=True,
    )
    return tuple(splits)

def predictionAccuracyChecker(predictedList, y_testSeries):
    """
    Summary: computes the fraction of predictions that equal the actual labels
    Args:
        predictedList (list): predicted labels obtained from a classifier
        y_testSeries (pd.Series): actual labels, obtained as y_test from train_test_split
    Returns:
        float: number of matching positions divided by the number of predictions;
            the string "error, not same length" is returned instead when the two inputs differ in length
    """
    actualLabels = y_testSeries.tolist()
    sampleCount = len(predictedList)
    if sampleCount != len(actualLabels):
        return "error, not same length"

    # compare position by position and count the matches
    hits = sum(1 for predicted, actual in zip(predictedList, actualLabels) if predicted == actual)
    return hits / sampleCount


In [43]:
# split the min-max scaled features into train and test sets (default test size and random state)
(
    minMax_X_train,
    minMax_X_test,
    minMax_y_train,
    minMax_y_test,
) = testTrainSplitter(DataFrame=minMaxFeatureDF)

## SVM

In [48]:
# linear-kernel SVM fitted on the min-max scaled training split (fit returns the fitted estimator)
svmClassifier = SVC(kernel='linear', decision_function_shape='ovo').fit(minMax_X_train, minMax_y_train)

# accuracy on the held-out test split is the cell output
predList = svmClassifier.predict(minMax_X_test)
predictionAccuracyChecker(predList, minMax_y_test)

0.88

## Random Forest

In [51]:
# random forest of 50 depth-2 trees fitted on the min-max scaled training split (fit returns the fitted estimator)
rfClassifier = RandomForestClassifier(
    n_estimators=50,
    max_depth=2,
    random_state=0,
).fit(minMax_X_train, minMax_y_train)

# accuracy on the held-out test split is the cell output
predList = rfClassifier.predict(minMax_X_test)
predictionAccuracyChecker(predList, minMax_y_test)

0.8266666666666667