In [3]:
import numpy as np
import pandas as pd
import os
import json
import pprint

In [4]:
dataset_name = 'segment'

In [5]:
input_dir = './raw'
output_dir = './processed'

inp_fname = 'segment.csv'

outp_fname = os.path.join(output_dir, f'{dataset_name}.csv')
outp_train_fname = os.path.join(output_dir, f'{dataset_name}_train.csv')
outp_test_fname = os.path.join(output_dir, f'{dataset_name}_test.csv')
outp_test_key_fname = os.path.join(output_dir, f'{dataset_name}_test_key.csv')
outp_infer_instances = os.path.join(output_dir, f'{dataset_name}_infer_req.json')

# Read Data

In [6]:
data = pd.read_csv(os.path.join(input_dir, inp_fname))
data.head()

Unnamed: 0,region-centroid-col,region-centroid-row,region-pixel-count,short-line-density-5,short-line-density-2,vedge-mean,vegde-sd,hedge-mean,hedge-sd,intensity-mean,rawred-mean,rawblue-mean,rawgreen-mean,exred-mean,exblue-mean,exgreen-mean,value-mean,saturation-mean,hue-mean,binaryClass
0,218,178,9,0.111111,0.0,0.833333,0.547722,1.11111,0.544331,59.6296,52.4444,75.2222,51.2222,-21.5556,46.7778,-25.2222,75.2222,0.318996,-2.04055,N
1,113,130,9,0.0,0.0,0.277778,0.250924,0.333333,0.365148,0.888889,0.0,2.55556,0.111111,-2.66667,5.0,-2.33333,2.55556,1.0,-2.12325,N
2,202,41,9,0.0,0.0,0.944448,0.772202,1.11111,1.0256,123.037,111.889,139.778,117.444,-33.4444,50.2222,-16.7778,139.778,0.199347,-2.29992,N
3,32,173,9,0.0,0.0,1.72222,1.78159,9.0,6.74949,43.5926,39.5556,52.8889,38.3333,-12.1111,27.8889,-15.7778,52.8889,0.266914,-1.99886,N
4,61,197,9,0.0,0.0,1.44444,1.51535,2.61111,1.92546,49.5926,44.2222,61.5556,43.0,-16.1111,35.8889,-19.7778,61.5556,0.302925,-2.02227,N


In [7]:
id_col = "id"
target_col = "binaryClass"

# Shuffle Data

In [8]:
# shuffle data
data = data.sample(frac=1, random_state=42)
data.head()

Unnamed: 0,region-centroid-col,region-centroid-row,region-pixel-count,short-line-density-5,short-line-density-2,vedge-mean,vegde-sd,hedge-mean,hedge-sd,intensity-mean,rawred-mean,rawblue-mean,rawgreen-mean,exred-mean,exblue-mean,exgreen-mean,value-mean,saturation-mean,hue-mean,binaryClass
1644,64,74,9,0.0,0.0,1.33333,0.577777,1.38889,0.907407,21.1481,21.0,26.8889,15.5556,-0.444444,17.2222,-16.7778,26.8889,0.419204,-1.55423,P
509,216,17,9,0.0,0.0,0.666665,0.5164,1.27778,1.02017,126.148,115.0,141.889,121.556,-33.4444,47.2222,-13.7778,141.889,0.189464,-2.34842,N
1410,242,183,9,0.0,0.0,1.5,0.936898,2.16667,1.79815,15.3704,12.6667,12.4444,21.0,-8.11111,-8.77778,16.8889,21.0,0.420244,2.07587,N
44,118,125,9,0.0,0.0,0.333333,0.298142,0.888889,0.344265,1.14815,0.0,3.11111,0.333333,-3.44444,5.88889,-2.44444,3.11111,1.0,-2.17565,N
1584,217,45,9,0.111111,0.0,0.888888,0.207408,1.55556,0.829628,121.519,111.111,137.333,116.111,-31.2222,47.4444,-16.2222,137.333,0.190912,-2.29267,N


# Insert Id Column

In [9]:
# insert Id column 
if id_col not in data.columns:
    N = data.shape[0]
    data.insert(0, id_col, np.arange(N))
    print(data.head())

      id  region-centroid-col  region-centroid-row  region-pixel-count  \
1644   0                   64                   74                   9   
509    1                  216                   17                   9   
1410   2                  242                  183                   9   
44     3                  118                  125                   9   
1584   4                  217                   45                   9   

      short-line-density-5  short-line-density-2  vedge-mean  vegde-sd  \
1644              0.000000                   0.0    1.333330  0.577777   
509               0.000000                   0.0    0.666665  0.516400   
1410              0.000000                   0.0    1.500000  0.936898   
44                0.000000                   0.0    0.333333  0.298142   
1584              0.111111                   0.0    0.888888  0.207408   

      hedge-mean  hedge-sd  ...  rawred-mean  rawblue-mean  rawgreen-mean  \
1644    1.388890  0.907407  ...  

# Save Main Data File

In [10]:
data.to_csv(outp_fname, index=False)

# Train Test Split

In [11]:
from sklearn.model_selection import train_test_split
test_size = 0.1

data_train, data_test = train_test_split(data, test_size=test_size, random_state=42)
print(data_train.shape, data_test.shape)

data_train.to_csv(outp_train_fname, index=False)
data_test.drop(columns=[target_col]).to_csv(outp_test_fname, index=False)
data_test[[id_col, target_col]].to_csv(outp_test_key_fname, index=False)

(2079, 21) (231, 21)


# JSON inference request instance

In [12]:
instance = data_test.drop(columns=[target_col]).reset_index(drop=True).loc[0].to_dict()
infer_req_instance_dict = {  "instances": [ {**instance}, ] }
pprint.pprint(infer_req_instance_dict)
   
with open(outp_infer_instances, 'w', encoding='utf8') as f:
    json.dump(infer_req_instance_dict, f, indent=2, ensure_ascii=False)

{'instances': [{'exblue-mean': 43.1111,
                'exgreen-mean': -24.2222,
                'exred-mean': -18.8889,
                'hedge-mean': 1.0,
                'hedge-sd': 0.666664,
                'hue-mean': -2.01531,
                'id': 1644.0,
                'intensity-mean': 113.185,
                'rawblue-mean': 127.556,
                'rawgreen-mean': 105.111,
                'rawred-mean': 106.889,
                'region-centroid-col': 9.0,
                'region-centroid-row': 94.0,
                'region-pixel-count': 9.0,
                'saturation-mean': 0.176789,
                'short-line-density-2': 0.0,
                'short-line-density-5': 0.0,
                'value-mean': 127.556,
                'vedge-mean': 0.611111,
                'vegde-sd': 0.551852}]}
