In [1]:
import numpy as np
import pandas as pd
import os, sys
import json
import pprint

In [2]:
# Name of the dataset; used as the stem of every output file name.
dataset_name = 'vehicle_silhouettes'
# File name of the raw CSV, resolved against the input directory when loading.
inp_fname = 'vehicle_silhouettes.csv'

In [3]:
# Directory holding the raw file, and directory receiving every generated artifact.
input_dir = './raw'
output_dir = './processed'

# Create the output directory up front: DataFrame.to_csv and open(..., 'w') do not
# create missing parent directories, so the first write would otherwise fail on a
# fresh checkout. exist_ok=True makes re-running this cell harmless.
os.makedirs(output_dir, exist_ok=True)

# Output artifacts, all named after the dataset.
outp_fname = os.path.join(output_dir, f'{dataset_name}.csv')  # full shuffled dataset
outp_train_fname = os.path.join(output_dir, f'{dataset_name}_train.csv')  # training split
outp_test_fname = os.path.join(output_dir, f'{dataset_name}_test.csv')  # test split without the target
outp_test_key_fname = os.path.join(output_dir, f'{dataset_name}_test_key.csv')  # ID + target for the test split
outp_infer_instances = os.path.join(output_dir, f'{dataset_name}_infer_req.json')  # sample inference request

# Read Data

In [4]:
# Column names applied to the raw CSV, which is read without a header row.
# Order must match the column order in the file: 18 feature columns followed
# by the CLASS label.
cols = [
    'COMPACTNESS',
    'CIRCULARITY',
    'DISTANCE CIRCULARITY',
    'RADIUS RATIO',
    'PR.AXIS ASPECT RATIO',
    'MAX.LENGTH ASPECT RATIO',
    'SCATTER RATIO',
    'ELONGATEDNESS',
    'PR.AXIS RECTANGULARITY',
    'MAX.LENGTH RECTANGULARITY',
    'SCALED VARIANCE ALONG MAJOR AXIS',
    'SCALED VARIANCE ALONG MINOR AXIS',
    'SCALED RADIUS OF GYRATION',
    'SKEWNESS ABOUT MAJOR AXIS',
    'SKEWNESS ABOUT MINOR AXIS',
    'KURTOSIS ABOUT MAJOR AXIS',
    'KURTOSIS ABOUT MINOR AXIS',
    'HOLLOWS RATIO',
    'CLASS'
]

In [5]:
# Load the raw file. It carries no header row, so the column names are
# supplied explicitly from `cols`.
data = pd.read_csv(
    os.path.join(input_dir, inp_fname),
    names=cols,
    header=None,
)
data.head()

Unnamed: 0,COMPACTNESS,CIRCULARITY,DISTANCE CIRCULARITY,RADIUS RATIO,PR.AXIS ASPECT RATIO,MAX.LENGTH ASPECT RATIO,SCATTER RATIO,ELONGATEDNESS,PR.AXIS RECTANGULARITY,MAX.LENGTH RECTANGULARITY,SCALED VARIANCE ALONG MAJOR AXIS,SCALED VARIANCE ALONG MINOR AXIS,SCALED RADIUS OF GYRATION,SKEWNESS ABOUT MAJOR AXIS,SKEWNESS ABOUT MINOR AXIS,KURTOSIS ABOUT MAJOR AXIS,KURTOSIS ABOUT MINOR AXIS,HOLLOWS RATIO,CLASS
0,95,48,83,178,72,10,162,42,20,159,176,379,184,70,6,16,187,197,van
1,91,41,84,141,57,9,149,45,19,143,170,330,158,72,9,14,189,199,van
2,104,50,106,209,66,10,207,32,23,158,223,635,220,73,14,9,188,196,saab
3,93,41,82,159,63,9,144,46,19,143,160,309,127,63,6,10,199,207,van
4,85,44,70,205,103,52,149,45,19,144,241,325,188,127,9,11,180,183,bus


In [6]:
# (rows, columns) of the loaded frame -- a quick sanity check on the read.
data.shape

(846, 19)

In [7]:
# Name of the row-identifier column (inserted later if the data lacks one).
id_col = "ID"
# Name of the label column; removed from the test file and kept in the test key.
target_col = "CLASS"

# Insert Id Column

In [8]:
# Add a sequential row identifier as the first column when the data has none,
# then store the identifiers as strings regardless of where they came from.
if id_col not in data:
    N = len(data)
    data.insert(loc=0, column=id_col, value=np.arange(N))
    print(data.head())
data[id_col] = data[id_col].astype(str)

   ID  COMPACTNESS  CIRCULARITY  DISTANCE CIRCULARITY  RADIUS RATIO  \
0   0           95           48                    83           178   
1   1           91           41                    84           141   
2   2          104           50                   106           209   
3   3           93           41                    82           159   
4   4           85           44                    70           205   

   PR.AXIS ASPECT RATIO  MAX.LENGTH ASPECT RATIO  SCATTER RATIO  \
0                    72                       10            162   
1                    57                        9            149   
2                    66                       10            207   
3                    63                        9            144   
4                   103                       52            149   

   ELONGATEDNESS  PR.AXIS RECTANGULARITY  MAX.LENGTH RECTANGULARITY  \
0             42                      20                        159   
1             45            

# Shuffle Data

In [9]:
# Shuffle all rows: frac=1 samples every row (without replacement by default),
# and the fixed random_state makes the resulting order reproducible.
# The original index labels travel with their rows (no reset_index here).
data = data.sample(frac=1, random_state=42)
data.head()

Unnamed: 0,ID,COMPACTNESS,CIRCULARITY,DISTANCE CIRCULARITY,RADIUS RATIO,PR.AXIS ASPECT RATIO,MAX.LENGTH ASPECT RATIO,SCATTER RATIO,ELONGATEDNESS,PR.AXIS RECTANGULARITY,MAX.LENGTH RECTANGULARITY,SCALED VARIANCE ALONG MAJOR AXIS,SCALED VARIANCE ALONG MINOR AXIS,SCALED RADIUS OF GYRATION,SKEWNESS ABOUT MAJOR AXIS,SKEWNESS ABOUT MINOR AXIS,KURTOSIS ABOUT MAJOR AXIS,KURTOSIS ABOUT MINOR AXIS,HOLLOWS RATIO,CLASS
39,39,81,45,68,169,73,6,151,44,19,146,173,336,186,75,7,0,183,189,bus
250,250,95,38,66,126,52,8,133,52,18,140,158,253,140,78,11,8,184,183,van
314,314,90,42,63,126,55,7,152,45,19,142,173,336,173,81,0,15,180,184,bus
96,96,89,42,80,151,62,6,144,46,19,139,166,308,170,74,17,13,185,189,saab
198,198,81,46,71,130,56,7,153,44,19,149,172,342,191,81,3,14,180,186,bus


# Save Main Data File

In [10]:
# Write the full shuffled dataset (ID, features, target); index=False keeps the
# pandas index out of the file.
data.to_csv(outp_fname, index=False)

# Train Test Split

In [11]:
from sklearn.model_selection import train_test_split

# Fraction of rows held out for the test split.
test_size = 0.1

data_train, data_test = train_test_split(
    data,
    test_size=test_size,
    random_state=42,
)
print(data_train.shape, data_test.shape)

# The training split keeps every column, including the target.
data_train.to_csv(outp_train_fname, index=False)
# The test file is written without the target column ...
data_test.drop(target_col, axis=1).to_csv(outp_test_fname, index=False)
# ... and the answer key holds only the ID and the true class.
data_test.loc[:, [id_col, target_col]].to_csv(outp_test_key_fname, index=False)

(761, 20) (85, 20)


# JSON inference request instance

In [12]:
# Build a sample request from the first test row: NaN is mapped to None
# (serialised as JSON null), the target column is dropped, and the row is
# converted to a {column: value} dict.
instance = (
    data_test
    .replace({np.nan: None})
    .drop(columns=[target_col])
    .reset_index(drop=True)
    .loc[0]
    .to_dict()
)
infer_req_instance_dict = {"instances": [dict(instance)]}
pprint.pprint(infer_req_instance_dict)

# Persist the request; ensure_ascii=False writes non-ASCII characters as-is.
with open(outp_infer_instances, 'w', encoding='utf8') as f:
    json.dump(infer_req_instance_dict, f, indent=2, ensure_ascii=False)

{'instances': [{'CIRCULARITY': 51,
                'COMPACTNESS': 102,
                'DISTANCE CIRCULARITY': 104,
                'ELONGATEDNESS': 32,
                'HOLLOWS RATIO': 197,
                'ID': '736',
                'KURTOSIS ABOUT MAJOR AXIS': 19,
                'KURTOSIS ABOUT MINOR AXIS': 188,
                'MAX.LENGTH ASPECT RATIO': 10,
                'MAX.LENGTH RECTANGULARITY': 162,
                'PR.AXIS ASPECT RATIO': 67,
                'PR.AXIS RECTANGULARITY': 23,
                'RADIUS RATIO': 217,
                'SCALED RADIUS OF GYRATION': 195,
                'SCALED VARIANCE ALONG MAJOR AXIS': 220,
                'SCALED VARIANCE ALONG MINOR AXIS': 621,
                'SCATTER RATIO': 204,
                'SKEWNESS ABOUT MAJOR AXIS': 68,
                'SKEWNESS ABOUT MINOR AXIS': 3}]}
