In [30]:
import numpy as np 
import pandas as pd 
import os 
import pprint
import json

In [31]:
# Base name used to build every output file name in this notebook.
dataset_name = 'iris'

In [32]:
# Locations of the raw input file and of every artifact this notebook writes.
input_dir = './raw'
output_dir = './processed'

# Create the output directory up front. Neither DataFrame.to_csv nor open(..., 'w')
# creates missing parent directories, so on a fresh checkout the write cells
# would otherwise fail. exist_ok=True keeps this cell safe to re-run.
os.makedirs(output_dir, exist_ok=True)

# Name of the raw data file inside input_dir.
inp_fname = 'iris.data'

# Output artifacts, all named after the dataset:
#   <name>.csv            - data with the target column removed
#   <name>_test_key.csv   - id / target pairs
#   <name>_infer_req.json - a single-instance inference request
outp_fname = os.path.join(output_dir, f'{dataset_name}.csv')
outp_test_key_fname = os.path.join(output_dir, f'{dataset_name}_test_key.csv')
outp_infer_instances = os.path.join(output_dir, f'{dataset_name}_infer_req.json')

# Read Data

In [33]:
# Column labels applied to the raw file when it is read, in file order:
# four measurement columns followed by the label column.
col_names = ["sepal_length", "sepal_width", "petal_length", "petal_width", "class"]

In [34]:
# The raw file is read without a header row (header=None); column labels
# come from col_names instead.
data = pd.read_csv(
    os.path.join(input_dir, inp_fname),
    names=col_names,
    header=None,
)
data.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [35]:
# id_col: name of the row-identifier column (added later if it is missing).
# target_col: name of the label column.
id_col, target_col = "id", "class"

# Shuffle Data

In [36]:
# Shuffle all rows reproducibly (frac=1 keeps every row, fixed seed).
# Sorting by the index first restores the as-loaded row order, so re-running
# this cell always produces the same permutation instead of shuffling an
# already shuffled frame. On a first run the index is already in order, so the
# sort changes nothing and the result matches a plain sample().
data = data.sort_index().sample(frac=1, random_state=42)
data.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,class
73,6.1,2.8,4.7,1.2,Iris-versicolor
18,5.7,3.8,1.7,0.3,Iris-setosa
118,7.7,2.6,6.9,2.3,Iris-virginica
78,6.0,2.9,4.5,1.5,Iris-versicolor
76,6.8,2.8,4.8,1.4,Iris-versicolor


# Insert Id Column

In [37]:
# Add a sequential id as the first column, but only when the frame does not
# already have one (which also keeps this cell safe to re-run).
if id_col not in data.columns:
    N = len(data)
    data.insert(loc=0, column=id_col, value=np.arange(N))
    print(data.head())

# Ids are stored as strings whether they were just created or already present.
data[id_col] = data[id_col].astype(str)

     id  sepal_length  sepal_width  petal_length  petal_width            class
73    0           6.1          2.8           4.7          1.2  Iris-versicolor
18    1           5.7          3.8           1.7          0.3      Iris-setosa
118   2           7.7          2.6           6.9          2.3   Iris-virginica
78    3           6.0          2.9           4.5          1.5  Iris-versicolor
76    4           6.8          2.8           4.8          1.4  Iris-versicolor


# Save Main Data File

In [38]:
# Write every column except the target; the DataFrame index is not written.
data.drop(columns=[target_col]).to_csv(outp_fname, index=False)

# Test Key

In [39]:
# The key file holds only the id and the label, with the label column
# renamed to "__target__".
data_key = data.loc[:, [id_col, target_col]].rename(
    columns={target_col: "__target__"}
)
data_key.to_csv(outp_test_key_fname, index=False)

# JSON inference request instance

In [40]:
# Take the first row of the shuffled data (target removed) as a sample
# instance. The index is reset so that label 0 refers to that first row.
instance = (
    data
    .drop(columns=[target_col])
    .reset_index(drop=True)
    .loc[0]
    .to_dict()
)

# Wrap the single instance in the request payload, show it, then save it.
infer_req_instance_dict = {"instances": [dict(instance)]}
pprint.pprint(infer_req_instance_dict)

with open(outp_infer_instances, 'w', encoding='utf8') as f:
    json.dump(infer_req_instance_dict, f, indent=2, ensure_ascii=False)

{'instances': [{'id': '0',
                'petal_length': 4.7,
                'petal_width': 1.2,
                'sepal_length': 6.1,
                'sepal_width': 2.8}]}
