In [1]:
import json
import os
import pprint

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# Dataset identifier; used as the stem of every output file name built below.
dataset_name = 'diamond'

In [3]:
# Locations of the raw input and of every artifact this notebook writes.
input_dir = './raw'
output_dir = './processed'

inp_fname = 'diamond.csv'

# Create the output directory up front: DataFrame.to_csv and open(..., 'w')
# do not create missing parent directories, so the write cells further down
# would fail on a fresh checkout where ./processed does not exist yet.
os.makedirs(output_dir, exist_ok=True)

# Full dataset (shuffled, with Id column).
outp_fname = os.path.join(output_dir, f'{dataset_name}.csv')
# Training split, target column included.
outp_train_fname = os.path.join(output_dir, f'{dataset_name}_train.csv')
# Test split with the target column removed.
outp_test_fname = os.path.join(output_dir, f'{dataset_name}_test.csv')
# Id + target pairs for the test split.
outp_test_key_fname = os.path.join(output_dir, f'{dataset_name}_test_key.csv')
# Sample JSON inference request built from one test row.
outp_infer_instances = os.path.join(output_dir, f'{dataset_name}_infer_req.json')

# Read Data

In [4]:
# Load the raw CSV into a DataFrame and preview the first rows.
inp_path = os.path.join(input_dir, inp_fname)
data = pd.read_csv(inp_path)
data.head()

Unnamed: 0,Carat Weight,Cut,Color,Clarity,Polish,Symmetry,Report,Price
0,1.1,Ideal,H,SI1,VG,EX,GIA,5169
1,0.83,Ideal,H,VS1,ID,ID,AGSL,3470
2,0.85,Ideal,H,SI1,EX,EX,GIA,3183
3,0.91,Ideal,E,SI1,VG,VG,GIA,4370
4,0.83,Ideal,G,SI1,EX,EX,GIA,3171


In [5]:
# Name of the per-row identifier column; a later cell creates it when the data lacks one.
id_col = "Id"
# Target column: removed from the test file and the inference request, kept in the test key file.
target_col = "Price"

# Shuffle Data

In [6]:
# frac=1 draws every row exactly once, i.e. a full random permutation of the rows;
# the fixed random_state makes the resulting order reproducible across runs.
# The original index labels are kept (the index is not reset).
data = data.sample(frac=1, random_state=42)
data.head()

Unnamed: 0,Carat Weight,Cut,Color,Clarity,Polish,Symmetry,Report,Price
1782,1.58,Ideal,H,VS2,EX,VG,GIA,11419
3917,1.63,Ideal,G,VS1,EX,EX,GIA,16241
221,1.33,Ideal,G,VVS2,EX,EX,GIA,12345
2135,1.06,Very Good,I,SI1,VG,EX,GIA,4507
5224,1.01,Very Good,H,VVS2,EX,VG,GIA,5908


# Insert Id Column

In [7]:
# Add a sequential identifier as the first column when the dataset does not
# already provide one; the guard keeps the cell safe to re-run.
has_id = id_col in data.columns
if not has_id:
    N = len(data)
    data.insert(loc=0, column=id_col, value=np.arange(N))
    print(data.head())
# Ids are stored as strings whether they were generated above or came with the data.
data[id_col] = data[id_col].astype(str)

      Id  Carat Weight        Cut Color Clarity Polish Symmetry Report  Price
1782   0          1.58      Ideal     H     VS2     EX       VG    GIA  11419
3917   1          1.63      Ideal     G     VS1     EX       EX    GIA  16241
221    2          1.33      Ideal     G    VVS2     EX       EX    GIA  12345
2135   3          1.06  Very Good     I     SI1     VG       EX    GIA   4507
5224   4          1.01  Very Good     H    VVS2     EX       VG    GIA   5908


# Save Main Data File

In [8]:
# Write the full shuffled dataset (Id column included); index=False leaves the DataFrame index out of the file.
data.to_csv(outp_fname, index=False)

# Train Test Split

In [9]:
# Hold out a fixed fraction of the rows as the test set; the fixed
# random_state keeps the split reproducible across runs.
test_size = 0.1

data_train, data_test = train_test_split(data, test_size=test_size, random_state=42)
print(data_train.shape, data_test.shape)

# Training file keeps the target column.
data_train.to_csv(outp_train_fname, index=False)
# Test file is written without the target column ...
data_test.drop(columns=[target_col]).to_csv(outp_test_fname, index=False)
# ... and the matching id/target pairs go to a separate key file.
data_test[[id_col, target_col]].to_csv(outp_test_key_fname, index=False)

(5400, 9) (600, 9)


# JSON inference request instance

In [10]:
# Build a one-record inference request from the first row of the test split:
# NaN values become None (serialized as JSON null) and the target column is left out.
features_test = data_test.replace({np.nan: None}).drop(columns=[target_col])
instance = features_test.reset_index(drop=True).loc[0].to_dict()
infer_req_instance_dict = {"instances": [dict(instance)]}
pprint.pprint(infer_req_instance_dict)

# ensure_ascii=False writes non-ASCII characters as-is instead of \u escapes.
with open(outp_infer_instances, 'w', encoding='utf8') as fp:
    json.dump(infer_req_instance_dict, fp, indent=2, ensure_ascii=False)

{'instances': [{'Carat Weight': 0.91,
                'Clarity': 'SI1',
                'Color': 'F',
                'Cut': 'Very Good',
                'Id': '1782',
                'Polish': 'VG',
                'Report': 'GIA',
                'Symmetry': 'VG'}]}
