Imports

In [1]:
import pandas as pd
from sklearn.metrics import classification_report
from sklearn.feature_selection import SelectFromModel
import pickle
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

Import dataset

In [2]:
with open('shape_features.csv', 'r') as data:
    df = pd.read_csv(data)

Shuffle data

In [3]:
from sklearn.utils import shuffle
data = shuffle(df)

Separate features from labels, and split into train and test sets. Drop useless features


In [4]:
from sklearn.model_selection import train_test_split

X = df.iloc[:,0:-1]
y = df.iloc[:,-1]
columns = X.columns
print(columns)
X = X.drop(columns=['Max speed squared', 'Min speed squared', 'Total duration', 'Bounding box area', 'Movement', 'Bounding box height', 'Bounding box width', 'Length ratio'])

Index(['Angle of BB diagonal', 'Aspect', 'Average curvature',
       'Average squared speed', 'Bounding box area', 'Bounding box diagonal',
       'Bounding box height', 'Bounding box width', 'Convex hull area ratio',
       'Cos of first to last', 'Cos of initial angle', 'Curviness', 'DCR',
       'Density 1', 'Density 2', 'Distance first to last',
       'Enclosing shape5 ratio', 'Entropy', 'Least shape3 error',
       'Length perimeter ratio', 'Length ratio', 'Log BB area', 'Log aspect',
       'Log longest BB side', 'Log total length', 'Max curvature',
       'Max speed squared', 'Min speed squared', 'Movement', 'NDDE',
       'Openness', 'Overtracing', 'Perimeter efficiency', 'Perimeter to area',
       'Point ratio', 'Sharpness', 'Sin of first to last',
       'Sin of initial angle', 'Thinness ratio', 'Total angle',
       'Total angle / total length', 'Total duration', 'Total length',
       'Total length / BB diagonal', 'Width to height ratio'],
      dtype='object')


Feature Selection

In [5]:
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier

selector = SelectFromModel(estimator=RandomForestClassifier(), threshold='mean').fit(X,y)
names = selector.get_feature_names_out(X.columns)
X = selector.transform(X)
X = pd.DataFrame(X, columns=names)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.1)

Dataset Statistics

In [6]:
print(str(len(X_train)) + " samples in the training set")
print(str(len(X_test)) + " samples in the test set")
print(str(len(X_train.columns)) + " features")
print(str(len(set(y_train))) + " labels")
print("Labels are:", set(y_train))
print("Features are:", set(X_train.columns))

189 samples in the training set
21 samples in the test set
12 features
5 labels
Labels are: {'shape1', 'shape5', 'shape2', 'shape4', 'shape3'}
Features are: {'Density 1', 'Convex hull area ratio', 'Openness', 'Length perimeter ratio', 'Distance first to last', 'Total length / BB diagonal', 'Thinness ratio', 'Perimeter efficiency', 'Width to height ratio', 'Angle of BB diagonal', 'Point ratio', 'Log aspect'}


Train Model

In [7]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(n_estimators=10, criterion='gini', verbose=False)
model.fit(X,y)

print("Training Accuracy", model.score(X_train, y_train)*100, "%")
print("------")
print("Test Accuracy", model.score(X_test, y_test)*100, "%")



Training Accuracy 98.4126984126984 %
------
Test Accuracy 100.0 %


Print Classification Report

In [8]:
from sklearn.metrics import classification_report
print(classification_report(model.predict(X_test),y_test))

              precision    recall  f1-score   support

      shape1       1.00      1.00      1.00         6
      shape2       1.00      1.00      1.00         4
      shape3       1.00      1.00      1.00         4
      shape4       1.00      1.00      1.00         6
      shape5       1.00      1.00      1.00         1

    accuracy                           1.00        21
   macro avg       1.00      1.00      1.00        21
weighted avg       1.00      1.00      1.00        21



Save Model

In [9]:
pickle.dump(model, open('Ex5_shape_classifier_random_forest.sav','wb'))