In [1]:
import numpy as np
import pandas as pd
import os, sys
import json
import pprint

In [2]:
# Dataset identifier (prefix of every generated file name) and the name of
# the raw CSV file that is read from the input directory.
dataset_name, inp_fname = 'steel_plate_fault', 'php5s7Ep8.csv'


In [3]:
# Raw inputs are read from input_dir; every generated artifact goes to output_dir.
input_dir = './raw'
output_dir = './processed'


def _processed_path(file_name):
    """Return the path of `file_name` inside `output_dir`."""
    return os.path.join(output_dir, file_name)


# All outputs share the dataset name as prefix and differ only by suffix.
outp_fname = _processed_path(f'{dataset_name}.csv')                          # full shuffled data
outp_train_fname = _processed_path(f'{dataset_name}_train.csv')              # training split
outp_test_fname = _processed_path(f'{dataset_name}_test.csv')                # test split without target
outp_test_key_fname = _processed_path(f'{dataset_name}_test_key.csv')        # id + target of test split
outp_infer_instances = _processed_path(f'{dataset_name}_infer_req.json')     # sample inference request

# Read Data

In [4]:
# Load the raw CSV from the input directory and preview the first rows.
raw_csv_path = os.path.join(input_dir, inp_fname)
data = pd.read_csv(raw_csv_path)
data.head()

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V19,V20,V21,V22,V23,V24,V25,V26,V27,target
0,42,50,270900,270944,267,17,44,24220,76,108,...,0.4706,1.0,1.0,2.4265,0.9031,1.6435,0.8182,-0.2913,0.5822,Pastry
1,645,651,2538079,2538108,108,10,30,11397,84,123,...,0.6,0.9667,1.0,2.0334,0.7782,1.4624,0.7931,-0.1756,0.2984,Pastry
2,829,835,1553913,1553931,71,8,19,7972,99,125,...,0.75,0.9474,1.0,1.8513,0.7782,1.2553,0.6667,-0.1228,0.215,Pastry
3,853,860,369370,369415,176,13,45,18996,99,126,...,0.5385,1.0,1.0,2.2455,0.8451,1.6532,0.8444,-0.1568,0.5212,Pastry
4,1289,1306,498078,498335,2409,60,260,246930,37,126,...,0.2833,0.9885,1.0,3.3818,1.2305,2.4099,0.9338,-0.1992,1.0,Pastry


In [5]:
# Column names used below: the row identifier and the label column.
id_col, target_col = "id", "target"

# Insert Id Column

In [6]:
# Add a sequential id as the first column, but only when the data does not
# already carry one (this guard keeps the cell safe to re-run).
if id_col not in data.columns:
    N = len(data.index)
    data.insert(loc=0, column=id_col, value=np.arange(N))
    print(data.head())

# Ids are stored as strings regardless of where they came from.
data[id_col] = data[id_col].astype(str)

   id    V1    V2       V3       V4    V5  V6   V7      V8  V9  ...     V19  \
0   0    42    50   270900   270944   267  17   44   24220  76  ...  0.4706   
1   1   645   651  2538079  2538108   108  10   30   11397  84  ...  0.6000   
2   2   829   835  1553913  1553931    71   8   19    7972  99  ...  0.7500   
3   3   853   860   369370   369415   176  13   45   18996  99  ...  0.5385   
4   4  1289  1306   498078   498335  2409  60  260  246930  37  ...  0.2833   

      V20  V21     V22     V23     V24     V25     V26     V27  target  
0  1.0000  1.0  2.4265  0.9031  1.6435  0.8182 -0.2913  0.5822  Pastry  
1  0.9667  1.0  2.0334  0.7782  1.4624  0.7931 -0.1756  0.2984  Pastry  
2  0.9474  1.0  1.8513  0.7782  1.2553  0.6667 -0.1228  0.2150  Pastry  
3  1.0000  1.0  2.2455  0.8451  1.6532  0.8444 -0.1568  0.5212  Pastry  
4  0.9885  1.0  3.3818  1.2305  2.4099  0.9338 -0.1992  1.0000  Pastry  

[5 rows x 29 columns]


# Shuffle Data

In [7]:
# Randomise the row order with a fixed seed so the result is repeatable from
# a fresh kernel. NOTE: `data` is overwritten, so re-running only this cell
# shuffles the already-shuffled frame again.
shuffle_seed = 42
data = data.sample(frac=1.0, random_state=shuffle_seed)
data.head()

Unnamed: 0,id,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V19,V20,V21,V22,V23,V24,V25,V26,V27,target
1605,1605,1117,1154,1194359,1194414,1055,69,56,110262,72,...,0.5362,0.9821,1.0,3.0233,1.5682,1.7404,0.3273,-0.1835,1.0,Other_Faults
1502,1502,1572,1610,337559,337645,1570,176,128,128770,29,...,0.2159,0.6719,1.0,3.1959,1.5798,1.9345,0.5581,-0.3592,1.0,Other_Faults
70,70,1617,1628,2095494,2095507,89,14,13,7804,77,...,0.7857,1.0,1.0,1.9494,1.0414,1.1139,0.1539,-0.315,0.257,Pastry
976,976,1317,1324,2166062,2166078,78,12,16,9381,110,...,0.5833,1.0,1.0,1.8921,0.8451,1.2041,0.5625,-0.0604,0.2195,Bumps
1052,1052,386,394,1304617,1304626,51,10,9,5795,100,...,0.8,1.0,1.0,1.7076,0.9031,0.9542,0.1111,-0.1123,0.1773,Bumps


# Save Main Data File

In [8]:
# `to_csv` does not create missing folders, so make sure the output
# directory exists before the first file is written (no-op if it already does).
os.makedirs(output_dir, exist_ok=True)

# Persist the full shuffled dataset; the pandas index is not written.
data.to_csv(outp_fname, index=False)

# Train Test Split

In [9]:
from sklearn.model_selection import train_test_split

# Fraction of rows held out for testing.
test_size = 0.1

# Seeded split so the same rows land in train/test on every run.
data_train, data_test = train_test_split(
    data,
    test_size=test_size,
    random_state=42,
)
print(data_train.shape, data_test.shape)

# Training file keeps every column, including the target.
data_train.to_csv(outp_train_fname, index=False)

# Test file is written without the target column ...
test_features = data_test.drop(columns=[target_col])
test_features.to_csv(outp_test_fname, index=False)

# ... and the id/target pairs go to a separate key file.
test_key = data_test.loc[:, [id_col, target_col]]
test_key.to_csv(outp_test_key_fname, index=False)

(1746, 29) (195, 29)


# JSON inference request instance

In [10]:
# Build a single-record inference request from the first test row.
# NaN is mapped to None so that it is serialised as JSON null.
test_without_nan = data_test.replace({np.nan: None})
test_features_only = test_without_nan.drop(columns=[target_col]).reset_index(drop=True)
instance = test_features_only.loc[0].to_dict()

infer_req_instance_dict = {"instances": [dict(instance)]}
pprint.pprint(infer_req_instance_dict)

# Write the request payload next to the other processed outputs.
with open(outp_infer_instances, 'w', encoding='utf8') as out_file:
    json.dump(infer_req_instance_dict, out_file, indent=2, ensure_ascii=False)

{'instances': [{'V1': 41,
                'V10': 143,
                'V11': 1400,
                'V12': 0,
                'V13': 1,
                'V14': 40,
                'V15': 0.0586,
                'V16': 0.8429,
                'V17': 0.2594,
                'V18': 0.4957,
                'V19': 0.8807,
                'V2': 735,
                'V20': 0.3371,
                'V21': 0.0,
                'V22': 4.2929,
                'V23': 2.8414,
                'V24': 2.2553,
                'V25': -0.7406,
                'V26': -0.0717,
                'V27': 1.0,
                'V3': 581949,
                'V4': 582129,
                'V5': 19629,
                'V6': 788,
                'V7': 534,
                'V8': 2332320,
                'V9': 22,
                'id': '403'}]}
