In [1]:
import json
import os
import pprint

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
dataset_name = "ailerons"  # base name used to build every output file name below

In [3]:
# Locations of the raw input and of every artifact this notebook writes.
input_dir = './raw'
output_dir = './processed'

# File name of the raw CSV, looked up inside input_dir.
inp_fname = 'ailerons.csv'

# Output artifacts, all named after the dataset:
#   <name>.csv            - full data set with the id column
#   <name>_train.csv      - training split (features + target)
#   <name>_test.csv       - test split with the target column removed
#   <name>_test_key.csv   - id/target pairs for the test split
#   <name>_infer_req.json - one sample inference request
outp_fname = os.path.join(output_dir, f'{dataset_name}.csv')
outp_train_fname = os.path.join(output_dir, f'{dataset_name}_train.csv')
outp_test_fname = os.path.join(output_dir, f'{dataset_name}_test.csv')
outp_test_key_fname = os.path.join(output_dir, f'{dataset_name}_test_key.csv')
outp_infer_instances = os.path.join(output_dir, f'{dataset_name}_infer_req.json')

# DataFrame.to_csv and open() do not create missing parent directories, so make
# sure the output directory exists before any later cell writes into it.
# exist_ok=True keeps the cell safe to re-run.
os.makedirs(output_dir, exist_ok=True)

# Read Data

In [4]:
# Load the raw CSV into a DataFrame, then show its dimensions and first rows.
raw_csv_path = os.path.join(input_dir, inp_fname)
data = pd.read_csv(raw_csv_path)
print(data.shape)
data.head()

(13750, 41)


Unnamed: 0,climbRate,Sgz,p,q,curPitch,curRoll,absRoll,diffClb,diffRollRate,diffDiffClb,...,diffSeTime8,diffSeTime9,diffSeTime10,diffSeTime11,diffSeTime12,diffSeTime13,diffSeTime14,alpha,Se,goal
0,2,-56,-0.33,-0.09,0.9,0.2,-11,12,0.004,-0.1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.9,0.032,-0.0009
1,470,-39,0.02,0.12,0.39,-0.6,-12,8,0.009,-1.6,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.9,0.034,-0.0011
2,165,4,0.14,0.14,0.78,0.4,-11,-9,-0.003,-0.2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.034,-0.0012
3,-113,5,-0.12,0.11,1.06,0.6,-10,-7,-0.008,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.9,0.033,-0.0011
4,-411,-21,-0.17,0.07,1.33,-0.6,-11,0,0.002,0.1,...,0.0,0.0,0.0,0.0,0.0,-0.002,0.0,0.9,0.032,-0.0008


In [5]:
# Column holding the row identifier, and column holding the value to predict.
id_col, target_col = "id", "goal"

# Shuffle Data

In [6]:
# Randomly permute the rows: frac=1 samples every row exactly once, and the
# fixed seed makes the resulting order reproducible.
data = data.sample(random_state=42, frac=1)
data.head()

Unnamed: 0,climbRate,Sgz,p,q,curPitch,curRoll,absRoll,diffClb,diffRollRate,diffDiffClb,...,diffSeTime8,diffSeTime9,diffSeTime10,diffSeTime11,diffSeTime12,diffSeTime13,diffSeTime14,alpha,Se,goal
2439,-102,-18,-0.06,0.01,0.63,1.2,-6,-1,-0.014,0.1,...,0.0,0.0,0.0,-0.002,0.0,-0.002,0.0,0.5,0.019,-0.0004
8171,-523,-51,-0.31,-0.35,0.94,-0.8,-11,4,-0.005,8.7,...,0.0,-0.001,0.0,0.0,0.0,0.0,0.0,0.5,0.015,-0.0004
10742,-22,-34,0.33,0.1,0.67,2.4,-11,-5,-0.019,-0.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.6,0.023,-0.0013
3328,53,-46,0.4,0.03,0.47,0.7,-10,3,-0.011,0.0,...,0.0,0.0,0.0,0.001,0.0,0.001,0.0,0.5,0.018,-0.0009
5999,108,9,-0.11,0.03,0.4,1.5,-5,-2,-0.015,-0.2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.018,-0.0005


# Insert Id Column

In [7]:
# Add a sequential identifier as the first column unless one is already there,
# so re-running this cell does not insert a second id column.
id_missing = id_col not in data.columns
if id_missing:
    data.insert(0, id_col, np.arange(len(data)))
    print(data.head())

# Identifiers are stored as strings, whether freshly generated or pre-existing.
data[id_col] = data[id_col].astype(str)

       id  climbRate  Sgz     p     q  curPitch  curRoll  absRoll  diffClb  \
2439    0       -102  -18 -0.06  0.01      0.63      1.2       -6       -1   
8171    1       -523  -51 -0.31 -0.35      0.94     -0.8      -11        4   
10742   2        -22  -34  0.33  0.10      0.67      2.4      -11       -5   
3328    3         53  -46  0.40  0.03      0.47      0.7      -10        3   
5999    4        108    9 -0.11  0.03      0.40      1.5       -5       -2   

       diffRollRate  ...  diffSeTime8  diffSeTime9  diffSeTime10  \
2439         -0.014  ...          0.0        0.000           0.0   
8171         -0.005  ...          0.0       -0.001           0.0   
10742        -0.019  ...          0.0        0.000           0.0   
3328         -0.011  ...          0.0        0.000           0.0   
5999         -0.015  ...          0.0        0.000           0.0   

       diffSeTime11  diffSeTime12  diffSeTime13  diffSeTime14  alpha     Se  \
2439         -0.002           0.0        -0

# Save Main Data File

In [8]:
# Write the shuffled data (with its id column) to disk; the DataFrame index is not saved.
data.to_csv(path_or_buf=outp_fname, index=False)

# Train Test Split

In [9]:
# Hold out a fixed fraction of the rows as the test set. The split is seeded so
# the same rows land in train/test on every run.
test_size = 0.1

data_train, data_test = train_test_split(data, test_size=test_size, random_state=42)
print(data_train.shape, data_test.shape)

# The training file keeps every column, including the target.
data_train.to_csv(outp_train_fname, index=False)
# The test file has the target removed; the matching id/target pairs go to a
# separate key file.
data_test.drop(columns=[target_col]).to_csv(outp_test_fname, index=False)
data_test[[id_col, target_col]].to_csv(outp_test_key_fname, index=False)

(12375, 42) (1375, 42)


# JSON inference request instance

In [10]:
# Build a sample inference request from the first test row: NaN values become
# None (serialized as JSON null) and the target column is left out.
test_features = data_test.replace({np.nan: None}).drop(columns=[target_col])
first_row = test_features.reset_index(drop=True).loc[0]
instance = first_row.to_dict()
infer_req_instance_dict = {"instances": [dict(instance)]}
pprint.pprint(infer_req_instance_dict)

# Persist the request as UTF-8 JSON, keeping non-ASCII characters unescaped.
with open(outp_infer_instances, 'w', encoding='utf8') as out_file:
    json.dump(infer_req_instance_dict, out_file, indent=2, ensure_ascii=False)

{'instances': [{'Se': 0.02,
                'SeTime1': 0.02,
                'SeTime10': 0.02,
                'SeTime11': 0.02,
                'SeTime12': 0.02,
                'SeTime13': 0.02,
                'SeTime14': 0.02,
                'SeTime2': 0.02,
                'SeTime3': 0.02,
                'SeTime4': 0.02,
                'SeTime5': 0.02,
                'SeTime6': 0.02,
                'SeTime7': 0.02,
                'SeTime8': 0.02,
                'SeTime9': 0.02,
                'Sgz': 14,
                'absRoll': -11,
                'alpha': 0.6,
                'climbRate': -41,
                'curPitch': 0.59,
                'curRoll': -0.7,
                'diffClb': -12,
                'diffDiffClb': 0.1,
                'diffRollRate': -0.021,
                'diffSeTime1': 0.0,
                'diffSeTime10': 0.0,
                'diffSeTime11': 0.0,
                'diffSeTime12': 0.0,
                'diffSeTime13': 0.0,
                'diffSe