In [1]:
import numpy as np
import pandas as pd
import os
import json
import pprint

In [2]:
# Base name of the dataset; it is embedded in every output file name
# built in the path-configuration cell.
dataset_name = "energy"

In [3]:
# Directory layout: the raw CSV is read from `input_dir`; every generated
# artifact is written under `output_dir`.
input_dir = './raw'
output_dir = './processed'

# Create the output directory up front so the later to_csv / open(..., 'w')
# calls do not fail when it does not exist yet (e.g. on a fresh checkout).
# exist_ok=True makes re-running this cell harmless.
os.makedirs(output_dir, exist_ok=True)

# File name of the raw CSV inside `input_dir`.
inp_fname = 'energy.csv'

# Output artifact paths, all derived from `dataset_name`:
#   outp_fname           - full dataset (shuffled, with Id column)
#   outp_train_fname     - training split, target included
#   outp_test_fname      - test split with the target column dropped
#   outp_test_key_fname  - test-split Id and target columns only
#   outp_infer_instances - sample JSON inference request
outp_fname = os.path.join(output_dir, f'{dataset_name}.csv')
outp_train_fname = os.path.join(output_dir, f'{dataset_name}_train.csv')
outp_test_fname = os.path.join(output_dir, f'{dataset_name}_test.csv')
outp_test_key_fname = os.path.join(output_dir, f'{dataset_name}_test_key.csv')
outp_infer_instances = os.path.join(output_dir, f'{dataset_name}_infer_req.json')

# Read Data

In [4]:
# Load the raw CSV from the input directory into a DataFrame.
raw_csv_path = os.path.join(input_dir, inp_fname)
data = pd.read_csv(raw_csv_path)

# Preview the first rows.
data.head()

Unnamed: 0,Relative Compactness,Surface Area,Wall Area,Roof Area,Overall Height,Orientation,Glazing Area,Glazing Area Distribution,Heating Load
0,0.98,514.5,294.0,110.25,7.0,2,0.0,0,15.55
1,0.98,514.5,294.0,110.25,7.0,3,0.0,0,15.55
2,0.98,514.5,294.0,110.25,7.0,4,0.0,0,15.55
3,0.98,514.5,294.0,110.25,7.0,5,0.0,0,15.55
4,0.9,563.5,318.5,122.5,7.0,2,0.0,0,20.84


In [5]:
# Column roles used by the rest of the notebook:
#   id_col     - row identifier column (inserted later if the data lacks it)
#   target_col - the value to be predicted
id_col, target_col = "Id", "Heating Load"

# Shuffle Data

In [6]:
# Randomly reorder the rows: frac=1 draws every row, and the fixed seed makes
# the resulting order reproducible. Rows keep their original index labels.
data = data.sample(
    frac=1,
    random_state=42,
)

# Preview the shuffled frame.
data.head()

Unnamed: 0,Relative Compactness,Surface Area,Wall Area,Roof Area,Overall Height,Orientation,Glazing Area,Glazing Area Distribution,Heating Load
668,0.62,808.5,367.5,220.5,3.5,2,0.4,3,16.47
324,0.66,759.5,318.5,220.5,3.5,2,0.25,1,13.17
624,0.98,514.5,294.0,110.25,7.0,2,0.4,3,32.82
690,0.79,637.0,343.0,147.0,7.0,4,0.4,4,41.32
473,0.64,784.0,343.0,220.5,3.5,3,0.25,4,16.69


# Insert Id Column

In [7]:
# Make sure the frame carries an identifier column. When it is missing, a new
# first column is added holding 0..N-1 in the current (shuffled) row order.
if id_col not in data:
    N = len(data.index)
    data.insert(loc=0, column=id_col, value=np.arange(N))
    print(data.head())

# Ids are stored as strings regardless of how the column was obtained.
data[id_col] = data[id_col].astype(str)

     Id  Relative Compactness  Surface Area  Wall Area  Roof Area  \
668   0                  0.62         808.5      367.5     220.50   
324   1                  0.66         759.5      318.5     220.50   
624   2                  0.98         514.5      294.0     110.25   
690   3                  0.79         637.0      343.0     147.00   
473   4                  0.64         784.0      343.0     220.50   

     Overall Height  Orientation  Glazing Area  Glazing Area Distribution  \
668             3.5            2          0.40                          3   
324             3.5            2          0.25                          1   
624             7.0            2          0.40                          3   
690             7.0            4          0.40                          4   
473             3.5            3          0.25                          4   

     Heating Load  
668         16.47  
324         13.17  
624         32.82  
690         41.32  
473         16.69  


# Save Main Data File

In [8]:
# Write the complete dataset (shuffled, with Id column) to a single CSV;
# the DataFrame index is left out of the file.
data.to_csv(path_or_buf=outp_fname, index=False)

# Train Test Split

In [9]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows as the test set; the fixed seed keeps the split
# reproducible across runs.
test_size = 0.1
data_train, data_test = train_test_split(
    data,
    test_size=test_size,
    random_state=42,
)
print(data_train.shape, data_test.shape)

# Training file: every column, target included.
data_train.to_csv(outp_train_fname, index=False)

# Test file: same rows as the test split but with the target removed.
data_test.drop(target_col, axis="columns").to_csv(outp_test_fname, index=False)

# Test key: only the Id and the target, for scoring predictions later.
data_test.loc[:, [id_col, target_col]].to_csv(outp_test_key_fname, index=False)

(691, 10) (77, 10)


# JSON inference request instance

In [10]:
# Build a sample inference request from the first row of the test split:
# NaN values become None (serialized as JSON null), the target column is
# removed, and the index is reset so the first row is addressable as label 0.
first_test_row = (
    data_test
    .replace({np.nan: None})
    .drop(columns=[target_col])
    .reset_index(drop=True)
    .loc[0]
)
instance = first_test_row.to_dict()

# Request payload: a list of instances under the "instances" key.
infer_req_instance_dict = {"instances": [dict(instance)]}
pprint.pprint(infer_req_instance_dict)

# Write the payload as indented UTF-8 JSON without escaping non-ASCII text.
with open(outp_infer_instances, 'w', encoding='utf8') as out_file:
    json.dump(infer_req_instance_dict, out_file, indent=2, ensure_ascii=False)

{'instances': [{'Glazing Area': 0.1,
                'Glazing Area Distribution': 2,
                'Id': '668',
                'Orientation': 4,
                'Overall Height': 3.5,
                'Relative Compactness': 0.64,
                'Roof Area': 220.5,
                'Surface Area': 784.0,
                'Wall Area': 343.0}]}
