In [1]:
import numpy as np
import pandas as pd
import os
import json
import pprint

In [2]:
# Short dataset identifier; it prefixes every output file name built below.
dataset_name = "abalone"

In [3]:
# Folder holding the raw download, and folder receiving every generated file.
input_dir = './raw'
output_dir = './processed'

# Neither DataFrame.to_csv nor open(..., 'w') creates missing parent
# directories, so make sure the output folder exists before any cell writes.
os.makedirs(output_dir, exist_ok=True)

# Name of the raw data file inside input_dir.
inp_fname = 'abalone.data'

# Output artifacts, all prefixed with the dataset name:
#   full dataset, train split, test split without the target,
#   test answer key (Id + target), and a sample JSON inference request.
outp_fname = os.path.join(output_dir, f'{dataset_name}.csv')
outp_train_fname = os.path.join(output_dir, f'{dataset_name}_train.csv')
outp_test_fname = os.path.join(output_dir, f'{dataset_name}_test.csv')
outp_test_key_fname = os.path.join(output_dir, f'{dataset_name}_test_key.csv')
outp_infer_instances = os.path.join(output_dir, f'{dataset_name}_infer_req.json')

# Read Data

In [4]:
# Column labels applied to the raw file when it is read, in file order.
col_names = [
    'Sex',
    'Length', 'Diameter', 'Height',
    'Whole weight', 'Shucked weight', 'Viscera weight', 'Shell weight',
    'Rings',
]

In [5]:
# The file is read with header=None, so the column labels are supplied explicitly.
raw_path = os.path.join(input_dir, inp_fname)
data = pd.read_csv(raw_path, names=col_names, header=None)
data.head()

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7


In [6]:
# Names of the row-identifier column and of the prediction target column.
id_col, target_col = "Id", "Rings"

# Shuffle Data

In [7]:
# Randomly permute all rows (frac=1 keeps every row); the fixed seed makes
# the resulting order reproducible across runs.
shuffle_seed = 42
data = data.sample(frac=1, random_state=shuffle_seed)
data.head()

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings
866,M,0.605,0.455,0.16,1.1035,0.421,0.3015,0.325,9
1483,M,0.59,0.44,0.15,0.8725,0.387,0.215,0.245,8
599,F,0.56,0.445,0.195,0.981,0.305,0.2245,0.335,16
1702,F,0.635,0.49,0.17,1.2615,0.5385,0.2665,0.38,9
670,M,0.475,0.385,0.145,0.6175,0.235,0.108,0.215,14


# Insert Id Column

In [8]:
# Add a sequential Id as the first column unless the data already has one.
id_missing = id_col not in data.columns
if id_missing:
    N = len(data)
    data.insert(loc=0, column=id_col, value=np.arange(N))
    print(data.head())

# Ids are stored as strings whether they were just generated or pre-existing.
data[id_col] = data[id_col].astype(str)

      Id Sex  Length  Diameter  Height  Whole weight  Shucked weight  \
866    0   M   0.605     0.455   0.160        1.1035          0.4210   
1483   1   M   0.590     0.440   0.150        0.8725          0.3870   
599    2   F   0.560     0.445   0.195        0.9810          0.3050   
1702   3   F   0.635     0.490   0.170        1.2615          0.5385   
670    4   M   0.475     0.385   0.145        0.6175          0.2350   

      Viscera weight  Shell weight  Rings  
866           0.3015         0.325      9  
1483          0.2150         0.245      8  
599           0.2245         0.335     16  
1702          0.2665         0.380      9  
670           0.1080         0.215     14  


# Save Main Data File

In [9]:
# Write the complete shuffled, Id-tagged dataset; the DataFrame index is omitted.
data.to_csv(path_or_buf=outp_fname, index=False)

# Train Test Split

In [10]:
from sklearn.model_selection import train_test_split

# Fraction of rows held out for the test split.
test_size = 0.1

data_train, data_test = train_test_split(
    data,
    test_size=test_size,
    random_state=42,
)
print(data_train.shape, data_test.shape)

# The training file keeps the target column.
data_train.to_csv(outp_train_fname, index=False)

# The test split is written twice: features only (target dropped), and a
# separate key file holding just the Id and the target.
test_features = data_test.drop(columns=[target_col])
test_key = data_test[[id_col, target_col]]
test_features.to_csv(outp_test_fname, index=False)
test_key.to_csv(outp_test_key_fname, index=False)

(3759, 10) (418, 10)


# JSON inference request instance

In [11]:
# Take the first test record, with NaN mapped to None and the target column
# removed, as a sample inference request.
first_test_row = (
    data_test
    .replace({np.nan: None})
    .drop(columns=[target_col])
    .reset_index(drop=True)
    .loc[0]
)
instance = first_test_row.to_dict()

# The request wraps a copy of that record in a one-element "instances" list.
infer_req_instance_dict = {"instances": [dict(instance)]}
pprint.pprint(infer_req_instance_dict)

# ensure_ascii=False writes non-ASCII characters as-is instead of \u escapes.
with open(outp_infer_instances, 'w', encoding='utf8') as f:
    json.dump(infer_req_instance_dict, f, indent=2, ensure_ascii=False)

{'instances': [{'Diameter': 0.485,
                'Height': 0.15,
                'Id': '866',
                'Length': 0.645,
                'Sex': 'F',
                'Shell weight': 0.293,
                'Shucked weight': 0.5935,
                'Viscera weight': 0.2315,
                'Whole weight': 1.151}]}
