In [1]:
import numpy as np
import pandas as pd
import os, sys
import json
import pprint

In [2]:
# Dataset identifier (used as the prefix of every output file) and the raw input file name.
dataset_name, inp_fname = 'page_blocks', 'page_blocks.csv'

In [3]:
# The raw input is read from `input_dir`; every generated artifact is written under `output_dir`.
input_dir = './raw'
output_dir = './processed'

# Output artifacts, in order: the full shuffled dataset, the train split, the test
# split without the target column, the test ids with their targets, and a sample
# JSON inference request.
(
    outp_fname,
    outp_train_fname,
    outp_test_fname,
    outp_test_key_fname,
    outp_infer_instances,
) = (
    os.path.join(output_dir, artifact_name)
    for artifact_name in (
        f'{dataset_name}.csv',
        f'{dataset_name}_train.csv',
        f'{dataset_name}_test.csv',
        f'{dataset_name}_test_key.csv',
        f'{dataset_name}_infer_req.json',
    )
)

# Read Data

In [4]:
# Column names applied to the raw CSV, which is read without a header row.
# The first ten entries are the feature columns; the last one is the label.
cols = [
    'height',
    'length',
    'area',
    'eccen',
    'p_black',
    'p_and',
    'mean_tr',
    'blackpix',
    'blackand',
    'wb_trans',
    'class',
]

In [5]:
# The raw file is read with header=None, so the column names are supplied explicitly.
raw_path = os.path.join(input_dir, inp_fname)
data = pd.read_csv(raw_path, names=cols, header=None)
data.head()

Unnamed: 0,height,length,area,eccen,p_black,p_and,mean_tr,blackpix,blackand,wb_trans,class
0,5,7,35,1.4,0.4,0.657,2.33,14,23,6,1
1,6,7,42,1.167,0.429,0.881,3.6,18,37,5,1
2,6,18,108,3.0,0.287,0.741,4.43,31,80,7,1
3,5,7,35,1.4,0.371,0.743,4.33,13,26,3,1
4,6,3,18,0.5,0.5,0.944,2.25,9,17,4,1


In [6]:
# Name of the row-identifier column and of the label column.
id_col, target_col = "id", "class"

In [7]:
# Number of rows per class label, most frequent first.
class_counts = data[target_col].value_counts()
class_counts

1    4913
2     329
5     115
4      88
3      28
Name: class, dtype: int64

# Insert Id Column

In [8]:
# Add a sequential id as the first column, unless the data already has one.
# The guard keeps this cell safe to re-run.
if id_col not in data.columns:
    N = len(data)
    data.insert(loc=0, column=id_col, value=np.arange(N))
    print(data.head())

# Ids are stored as strings, whether they were generated here or already present.
data[id_col] = data[id_col].astype(str)

   id  height  length  area  eccen  p_black  p_and  mean_tr  blackpix  \
0   0       5       7    35  1.400    0.400  0.657     2.33        14   
1   1       6       7    42  1.167    0.429  0.881     3.60        18   
2   2       6      18   108  3.000    0.287  0.741     4.43        31   
3   3       5       7    35  1.400    0.371  0.743     4.33        13   
4   4       6       3    18  0.500    0.500  0.944     2.25         9   

   blackand  wb_trans  class  
0        23         6      1  
1        37         5      1  
2        80         7      1  
3        26         3      1  
4        17         4      1  


# Shuffle Data

In [9]:
# Randomly permute all rows; the fixed seed makes the order reproducible.
# The original index labels are kept (no reset).
data = data.sample(frac=1.0, random_state=42)
data.head()

Unnamed: 0,id,height,length,area,eccen,p_black,p_and,mean_tr,blackpix,blackand,wb_trans,class
5442,5442,9,1,9,0.111,1.0,1.0,9.0,9,9,1,4
4962,4962,6,96,576,16.0,0.262,0.842,1.51,151,485,100,1
1084,1084,9,107,963,11.889,0.357,0.875,2.14,344,843,161,1
896,896,11,26,286,2.364,0.283,0.888,2.19,81,254,37,1
3966,3966,9,124,1116,13.778,0.367,0.705,2.85,410,787,144,1


# Save Main Data File

In [10]:
# Create the output directory if it is missing: `DataFrame.to_csv` does not create
# parent directories, so this first write would otherwise fail on a fresh checkout.
os.makedirs(output_dir, exist_ok=True)

# Persist the full shuffled dataset (id column included, pandas index omitted).
data.to_csv(outp_fname, index=False)

# Train Test Split

In [11]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows as the test set. The split is random with a fixed seed
# and is not stratified by class.
test_size = 0.1
data_train, data_test = train_test_split(
    data,
    random_state=42,
    test_size=test_size,
)
print(data_train.shape, data_test.shape)

# Training file keeps every column, including the target.
data_train.to_csv(outp_train_fname, index=False)

# Test file drops the target; the labels go to a separate key file keyed by id.
test_features = data_test.drop(columns=[target_col])
test_features.to_csv(outp_test_fname, index=False)

test_key = data_test[[id_col, target_col]]
test_key.to_csv(outp_test_key_fname, index=False)

(4925, 12) (548, 12)


# JSON Inference Request Instance

In [12]:
# Build a sample inference request from the first test row: NaN is replaced with
# None, the target column is dropped, and the row is converted to a plain dict.
instance = (
    data_test
    .replace({np.nan: None})
    .drop(columns=[target_col])
    .reset_index(drop=True)
    .loc[0]
    .to_dict()
)
infer_req_instance_dict = {"instances": [dict(instance)]}
pprint.pprint(infer_req_instance_dict)

# Write the request as indented JSON, leaving non-ASCII characters unescaped.
with open(outp_infer_instances, 'w', encoding='utf8') as out_file:
    json.dump(infer_req_instance_dict, out_file, indent=2, ensure_ascii=False)

{'instances': [{'area': 195,
                'blackand': 172,
                'blackpix': 66,
                'eccen': 7.8,
                'height': 5,
                'id': '975',
                'length': 39,
                'mean_tr': 1.74,
                'p_and': 0.882,
                'p_black': 0.338,
                'wb_trans': 38}]}
