In [63]:
import numpy as np
import pandas as pd
import os
import pprint
import json

In [64]:
# Dataset identifier; the file names used by this notebook are built from it.
dataset_name: str = "penguins"

In [65]:
# Directory layout: the raw CSV is read from `input_dir`; every file this
# notebook produces is written under `output_dir`.
input_dir = './raw'
output_dir = './processed'

# Input file name. This must be built from `dataset_name` -- the previously
# used `data_name` is never defined and raises NameError on a fresh kernel.
inp_fname = f'{dataset_name}.csv'

# Output artifacts, all named after the dataset:
#   outp_fname            - data table with the target column removed
#   outp_test_key_fname   - id -> target lookup table
#   outp_infer_instances  - sample JSON inference request
outp_fname = os.path.join(output_dir, f'{dataset_name}.csv')
outp_test_key_fname = os.path.join(output_dir, f'{dataset_name}_test_key.csv')
outp_infer_instances = os.path.join(output_dir, f'{dataset_name}_infer_req.json')

# Read Data

In [66]:
# Load the raw CSV from the input directory and preview the first rows.
raw_csv_path = os.path.join(input_dir, inp_fname)
data = pd.read_csv(raw_csv_path)
data.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,MALE
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,FEMALE
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,FEMALE
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,FEMALE


In [67]:
# Print a summary of the frame: column names, non-null counts, and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 344 entries, 0 to 343
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   species            344 non-null    object 
 1   island             344 non-null    object 
 2   bill_length_mm     342 non-null    float64
 3   bill_depth_mm      342 non-null    float64
 4   flipper_length_mm  342 non-null    float64
 5   body_mass_g        342 non-null    float64
 6   sex                333 non-null    object 
dtypes: float64(4), object(3)
memory usage: 18.9+ KB


# Fields

In [68]:
# Column roles: the row-identifier column and the label (target) column.
id_col, target_col = "id", "species"

# Shuffle Data

In [69]:
# Randomly permute the rows: frac=1 samples every row, and the fixed
# random_state makes the resulting order reproducible across runs.
data = data.sample(random_state=42, frac=1)
data.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
194,Chinstrap,Dream,50.9,19.1,196.0,3550.0,MALE
157,Chinstrap,Dream,45.2,17.8,198.0,3950.0,FEMALE
225,Gentoo,Biscoe,46.5,13.5,210.0,4550.0,FEMALE
208,Chinstrap,Dream,45.2,16.6,191.0,3250.0,FEMALE
318,Gentoo,Biscoe,48.4,14.4,203.0,4625.0,FEMALE


# Insert Id Column

In [70]:
# Add a sequential integer id (0..N-1) as the first column, unless a column
# with that name is already present.
if id_col not in data.columns:
    N = len(data)
    data.insert(loc=0, column=id_col, value=np.arange(N))
    print(data.head())

# Store ids as strings, whether the column was just created or already existed.
data[id_col] = data[id_col].astype(str)

     id    species  island  bill_length_mm  bill_depth_mm  flipper_length_mm  \
194   0  Chinstrap   Dream            50.9           19.1              196.0   
157   1  Chinstrap   Dream            45.2           17.8              198.0   
225   2     Gentoo  Biscoe            46.5           13.5              210.0   
208   3  Chinstrap   Dream            45.2           16.6              191.0   
318   4     Gentoo  Biscoe            48.4           14.4              203.0   

     body_mass_g     sex  
194       3550.0    MALE  
157       3950.0  FEMALE  
225       4550.0  FEMALE  
208       3250.0  FEMALE  
318       4625.0  FEMALE  


# Retain Fields



In [71]:
# Display the current column labels before choosing which ones to retain.
data.columns

Index(['id', 'species', 'island', 'bill_length_mm', 'bill_depth_mm',
       'flipper_length_mm', 'body_mass_g', 'sex'],
      dtype='object')

In [72]:
# Numeric measurement columns to keep as features; the categorical `island`
# and `sex` columns are left out.
feature_cols = ['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g']

# Refer to the id and target through id_col / target_col instead of repeating
# their literal names, so this selection stays in sync with the rest of the
# notebook. Take an explicit copy so that later modifications operate on an
# independent frame rather than on a subset of the original.
data = data[[id_col, target_col, *feature_cols]].copy()
data.head()

Unnamed: 0,id,species,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g
194,0,Chinstrap,50.9,19.1,196.0,3550.0
157,1,Chinstrap,45.2,17.8,198.0,3950.0
225,2,Gentoo,46.5,13.5,210.0,4550.0
208,3,Chinstrap,45.2,16.6,191.0,3250.0
318,4,Gentoo,48.4,14.4,203.0,4625.0


# Drop NA

In [73]:
# Show (rows, columns) before rows with missing values are removed.
data.shape

(344, 6)

In [74]:
# Remove every row that has a missing value in any of the retained columns.
# Reassign the result instead of using inplace=True: `data` was derived by
# selecting columns from another frame, and in-place modification of such a
# subset is the pattern pandas reports via SettingWithCopyWarning.
data = data.dropna()
print(data.shape)

(342, 6)


# Save Main Data File

In [75]:
# Make sure the output directory exists first; to_csv does not create missing
# parent directories and would fail on a fresh checkout.
os.makedirs(output_dir, exist_ok=True)

# Write the data without the target column and without the pandas index.
data.drop(columns=[target_col]).to_csv(outp_fname, index=False)

# Test Key

In [76]:
# Build the answer key: one row per id with its label, where the label column
# is renamed to "__target__", then write it without the pandas index.
key_columns = [id_col, target_col]
data_key = data[key_columns].rename(columns={target_col: "__target__"})
data_key.to_csv(outp_test_key_fname, index=False)

# JSON inference request instance

In [77]:
# Use the first row of the data (target column removed) as a sample instance.
features_only = data.drop(columns=[target_col]).reset_index(drop=True)
instance = features_only.loc[0].to_dict()

# Wrap the instance in the {"instances": [...]} request envelope and show it.
infer_req_instance_dict = {"instances": [dict(instance)]}
pprint.pprint(infer_req_instance_dict)

# Persist the request as UTF-8 JSON, keeping non-ASCII characters unescaped.
with open(outp_infer_instances, 'w', encoding='utf8') as fh:
    json.dump(infer_req_instance_dict, fh, indent=2, ensure_ascii=False)

{'instances': [{'bill_depth_mm': 19.1,
                'bill_length_mm': 50.9,
                'body_mass_g': 3550.0,
                'flipper_length_mm': 196.0,
                'id': '0'}]}
