In [1]:
import json
import os
import pprint

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# Prefix shared by every generated output file, followed by the two raw
# input file names that are read and concatenated further down.
dataset_name = 'landsat_satellite'
inp_fname1, inp_fname2 = 'sat.trn', 'sat.tst'

In [3]:
# Raw files are read from input_dir; every generated file is written to output_dir.
input_dir = './raw'
output_dir = './processed'

# Create the output directory up front: the to_csv / open(..., 'w') calls later in
# the notebook do not create missing parent directories and would otherwise fail
# on a fresh checkout. exist_ok=True keeps this cell safe to re-run.
os.makedirs(output_dir, exist_ok=True)

# Output paths, all prefixed with the dataset name.
outp_fname = os.path.join(output_dir, f'{dataset_name}.csv')                        # full dataset
outp_train_fname = os.path.join(output_dir, f'{dataset_name}_train.csv')            # training split
outp_test_fname = os.path.join(output_dir, f'{dataset_name}_test.csv')              # test split without the target
outp_test_key_fname = os.path.join(output_dir, f'{dataset_name}_test_key.csv')      # id + target for the test split
outp_infer_instances = os.path.join(output_dir, f'{dataset_name}_infer_req.json')   # sample inference request

# Read Data

In [4]:
# Column labels applied to the raw data: v_0 .. v_35 followed by "class".
cols = [*(f"v_{i}" for i in range(36)), "class"]
cols

['v_0',
 'v_1',
 'v_2',
 'v_3',
 'v_4',
 'v_5',
 'v_6',
 'v_7',
 'v_8',
 'v_9',
 'v_10',
 'v_11',
 'v_12',
 'v_13',
 'v_14',
 'v_15',
 'v_16',
 'v_17',
 'v_18',
 'v_19',
 'v_20',
 'v_21',
 'v_22',
 'v_23',
 'v_24',
 'v_25',
 'v_26',
 'v_27',
 'v_28',
 'v_29',
 'v_30',
 'v_31',
 'v_32',
 'v_33',
 'v_34',
 'v_35',
 'class']

In [5]:
# Both raw files are parsed the same way: single-space delimited, no header row.
raw_read_opts = dict(sep=" ", header=None)
data1 = pd.read_csv(os.path.join(input_dir, inp_fname1), **raw_read_opts)
data2 = pd.read_csv(os.path.join(input_dir, inp_fname2), **raw_read_opts)

# Stack the two parts into one frame with a fresh 0..N-1 index, then label the columns.
data = pd.concat([data1, data2], ignore_index=True)
data.columns = cols
data.head()

Unnamed: 0,v_0,v_1,v_2,v_3,v_4,v_5,v_6,v_7,v_8,v_9,...,v_27,v_28,v_29,v_30,v_31,v_32,v_33,v_34,v_35,class
0,92,115,120,94,84,102,106,79,84,102,...,104,88,121,128,100,84,107,113,87,3
1,84,102,106,79,84,102,102,83,80,102,...,100,84,107,113,87,84,99,104,79,3
2,84,102,102,83,80,102,102,79,84,94,...,87,84,99,104,79,84,99,104,79,3
3,80,102,102,79,84,94,102,79,80,94,...,79,84,99,104,79,84,103,104,79,3
4,84,94,102,79,80,94,98,76,80,102,...,79,84,103,104,79,79,107,109,87,3


In [6]:
# Names of the row-identifier column and of the label column.
id_col, target_col = "id", "class"

# Insert Id Column

In [7]:
# Prepend a sequential integer id (0 .. number of rows - 1) as the first column.
# The membership check makes this a no-op when the column is already there,
# e.g. when the cell is run a second time.
if id_col not in data.columns:
    data.insert(0, id_col, np.arange(len(data)))
    print(data.head())

# Store the identifier as a string rather than a number.
data[id_col] = data[id_col].astype(str)

   id  v_0  v_1  v_2  v_3  v_4  v_5  v_6  v_7  v_8  ...  v_27  v_28  v_29  \
0   0   92  115  120   94   84  102  106   79   84  ...   104    88   121   
1   1   84  102  106   79   84  102  102   83   80  ...   100    84   107   
2   2   84  102  102   83   80  102  102   79   84  ...    87    84    99   
3   3   80  102  102   79   84   94  102   79   80  ...    79    84    99   
4   4   84   94  102   79   80   94   98   76   80  ...    79    84   103   

   v_30  v_31  v_32  v_33  v_34  v_35  class  
0   128   100    84   107   113    87      3  
1   113    87    84    99   104    79      3  
2   104    79    84    99   104    79      3  
3   104    79    84   103   104    79      3  
4   104    79    79   107   109    87      3  

[5 rows x 38 columns]


# Shuffle Data

In [8]:
# Shuffle all rows: frac=1 samples every row without replacement, and the fixed
# random_state makes the resulting order repeatable from a fresh kernel.
# NOTE: this rebinds `data`, so re-running only this cell permutes the
# already-shuffled frame again; re-run from the read cell to get the same order.
data = data.sample(frac=1, random_state=42)
data.head()

Unnamed: 0,id,v_0,v_1,v_2,v_3,v_4,v_5,v_6,v_7,v_8,...,v_27,v_28,v_29,v_30,v_31,v_32,v_33,v_34,v_35,class
2436,2436,68,69,74,57,64,66,71,54,64,...,58,63,68,67,58,67,72,70,62,7
3361,3361,71,112,122,96,71,112,122,96,71,...,96,66,113,122,96,70,113,117,100,1
233,233,82,96,100,81,82,100,104,78,82,...,83,82,102,105,83,82,102,114,87,3
3667,3667,56,68,83,74,64,83,100,85,64,...,70,52,71,80,74,52,71,84,70,1
5011,5011,55,51,77,67,55,54,77,62,59,...,53,56,51,68,60,56,51,75,68,5


# Save Main Data File

In [9]:
# Write the complete shuffled dataset (id, v_* columns and class) to outp_fname;
# index=False leaves the pandas row index out of the file.
data.to_csv(outp_fname, index=False)

# Train Test Split

In [10]:
# Fraction of rows held out for testing.
test_size = 0.1

# Random split of the rows; the fixed random_state keeps the partition reproducible.
data_train, data_test = train_test_split(data, test_size=test_size, random_state=42)
print(data_train.shape, data_test.shape)

# The training file keeps every column, including the target.
data_train.to_csv(outp_train_fname, index=False)
# The test file omits the target column; the id -> target pairs for the same rows
# are written to a separate key file.
data_test.drop(columns=[target_col]).to_csv(outp_test_fname, index=False)
data_test[[id_col, target_col]].to_csv(outp_test_key_fname, index=False)

(5791, 38) (644, 38)


# JSON inference request instance

In [11]:
# Build a sample inference payload from the first row of the test split.
# NaN values are swapped for None (written as JSON null) and the target column is dropped.
test_inputs = data_test.replace({np.nan: None}).drop(columns=[target_col])
instance = test_inputs.reset_index(drop=True).loc[0].to_dict()

# Wrap a copy of the row under the "instances" key and show it.
infer_req_instance_dict = {"instances": [dict(instance)]}
pprint.pprint(infer_req_instance_dict)

# Save the payload as indented UTF-8 JSON, leaving non-ASCII characters unescaped.
with open(outp_infer_instances, 'w', encoding='utf8') as f:
    json.dump(infer_req_instance_dict, f, indent=2, ensure_ascii=False)

{'instances': [{'id': '4631',
                'v_0': 88,
                'v_1': 107,
                'v_10': 113,
                'v_11': 85,
                'v_12': 84,
                'v_13': 107,
                'v_14': 109,
                'v_15': 92,
                'v_16': 88,
                'v_17': 107,
                'v_18': 113,
                'v_19': 92,
                'v_2': 113,
                'v_20': 84,
                'v_21': 103,
                'v_22': 109,
                'v_23': 87,
                'v_24': 82,
                'v_25': 104,
                'v_26': 112,
                'v_27': 89,
                'v_28': 86,
                'v_29': 109,
                'v_3': 92,
                'v_30': 112,
                'v_31': 92,
                'v_32': 86,
                'v_33': 109,
                'v_34': 112,
                'v_35': 89,
                'v_4': 92,
                'v_5': 112,
                'v_6': 122,
                'v_7': 92,
         