In [1]:
import numpy as np
import pandas as pd
import os
import json
import pprint

In [2]:
# Base name of the dataset; the output file names are derived from it.
dataset_name = "electrical_grid"

In [3]:
# Directory holding the raw input file and directory receiving all generated files.
input_dir = './raw'
output_dir = './processed'

# Name of the raw CSV inside `input_dir`.
inp_fname = 'electrical_grid.csv'


def _outp_path(fname):
    """Return the path of `fname` inside the output directory."""
    return os.path.join(output_dir, fname)


# Full dataset, train split, test split (no target), test key (id + target),
# and a sample JSON inference request.
outp_fname = _outp_path(f'{dataset_name}.csv')
outp_train_fname = _outp_path(f'{dataset_name}_train.csv')
outp_test_fname = _outp_path(f'{dataset_name}_test.csv')
outp_test_key_fname = _outp_path(f'{dataset_name}_test_key.csv')
outp_infer_instances = _outp_path(f'{dataset_name}_infer_req.json')

# Read Data

In [4]:
# Load the raw CSV into a DataFrame and preview the first rows.
inp_path = os.path.join(input_dir, inp_fname)
data = pd.read_csv(inp_path)
data.head()

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4,stabf
0,2.95906,3.079885,8.381025,9.780754,3.763085,-0.782604,-1.257395,-1.723086,0.650456,0.859578,0.887445,0.958034,unstable
1,9.304097,4.902524,3.047541,1.369357,5.067812,-1.940058,-1.872742,-1.255012,0.413441,0.862414,0.562139,0.78176,stable
2,8.971707,8.848428,3.046479,1.214518,3.405158,-1.207456,-1.27721,-0.920492,0.163041,0.766689,0.839444,0.109853,unstable
3,0.716415,7.6696,4.486641,2.340563,3.963791,-1.027473,-1.938944,-0.997374,0.446209,0.976744,0.929381,0.362718,unstable
4,3.134112,7.608772,4.943759,9.857573,3.525811,-1.125531,-1.845975,-0.554305,0.79711,0.45545,0.656947,0.820923,unstable


In [5]:
# Names of the row-identifier column and of the label column.
id_col, target_col = "id", "stabf"

# Shuffle Data

In [6]:
# Randomly permute every row (frac=1 keeps all of them); the fixed seed makes
# the resulting order reproducible. Original index labels are preserved.
data = data.sample(random_state=42, frac=1)
data.head()

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4,stabf
6252,1.953136,9.692422,6.570391,7.737644,5.128952,-1.860248,-1.380687,-1.888018,0.487235,0.934229,0.433606,0.328522,unstable
4684,4.61369,9.423079,1.424423,7.372338,4.655631,-1.716394,-1.182286,-1.756951,0.395106,0.772211,0.911199,0.328193,unstable
1731,2.738001,1.341172,7.986504,9.842747,5.00761,-1.275363,-1.877241,-1.855006,0.16231,0.175994,0.897505,0.601356,unstable
4742,2.006512,2.31177,4.410704,2.726592,4.396237,-1.532751,-1.327943,-1.535543,0.053894,0.57344,0.061032,0.775629,stable
4521,3.993497,3.742481,8.975171,1.182813,2.875057,-1.170108,-1.117862,-0.587086,0.100005,0.753533,0.437366,0.611471,stable


# Insert Id Column

In [7]:
# Add a sequential id as the first column unless the data already has one.
# Ids follow the current row order: 0, 1, ..., N-1.
id_missing = id_col not in data.columns
if id_missing:
    N = len(data)
    data.insert(loc=0, column=id_col, value=np.arange(N))
    print(data.head())

# Ids are kept as strings, whether they were just generated or already present.
data[id_col] = data[id_col].astype(str)

      id      tau1      tau2      tau3      tau4        p1        p2  \
6252   0  1.953136  9.692422  6.570391  7.737644  5.128952 -1.860248   
4684   1  4.613690  9.423079  1.424423  7.372338  4.655631 -1.716394   
1731   2  2.738001  1.341172  7.986504  9.842747  5.007610 -1.275363   
4742   3  2.006512  2.311770  4.410704  2.726592  4.396237 -1.532751   
4521   4  3.993497  3.742481  8.975171  1.182813  2.875057 -1.170108   

            p3        p4        g1        g2        g3        g4     stabf  
6252 -1.380687 -1.888018  0.487235  0.934229  0.433606  0.328522  unstable  
4684 -1.182286 -1.756951  0.395106  0.772211  0.911199  0.328193  unstable  
1731 -1.877241 -1.855006  0.162310  0.175994  0.897505  0.601356  unstable  
4742 -1.327943 -1.535543  0.053894  0.573440  0.061032  0.775629    stable  
4521 -1.117862 -0.587086  0.100005  0.753533  0.437366  0.611471    stable  


# Save Main Data File

In [8]:
# `to_csv` does not create missing parent directories, so make sure the output
# directory exists before the first file is written (no-op if it already does).
os.makedirs(output_dir, exist_ok=True)

# Write the full (shuffled, id-tagged) dataset without the DataFrame index.
data.to_csv(outp_fname, index=False)

# Train Test Split

In [9]:
from sklearn.model_selection import train_test_split

# Fraction of rows held out for the test set.
test_size = 0.1

# Seeded random split so the same rows land in each partition on every run.
data_train, data_test = train_test_split(
    data,
    test_size=test_size,
    random_state=42,
)
print(data_train.shape, data_test.shape)

# Training file keeps every column, including the target.
data_train.to_csv(outp_train_fname, index=False)

# Test file carries everything except the target ...
test_features = data_test.drop(columns=[target_col])
test_features.to_csv(outp_test_fname, index=False)

# ... and the key file pairs each test id with its true target.
test_key = data_test[[id_col, target_col]]
test_key.to_csv(outp_test_key_fname, index=False)

(9000, 14) (1000, 14)


# JSON inference request instance

In [10]:
# Build a sample inference request from the first test row: NaNs become None
# (serialized as JSON null) and the target column is left out.
test_inputs = data_test.replace({np.nan: None}).drop(columns=[target_col])
instance = test_inputs.iloc[0].to_dict()

infer_req_instance_dict = {"instances": [dict(instance)]}
pprint.pprint(infer_req_instance_dict)

# Persist the request as pretty-printed UTF-8 JSON.
with open(outp_infer_instances, 'w', encoding='utf8') as f:
    json.dump(infer_req_instance_dict, f, indent=2, ensure_ascii=False)

{'instances': [{'g1': 0.124902737,
                'g2': 0.86646372,
                'g3': 0.135315907,
                'g4': 0.078401342,
                'id': '6252',
                'p1': 3.925026138,
                'p2': -1.096750364,
                'p3': -1.080586212,
                'p4': -1.747689562,
                'tau1': 4.348282297,
                'tau2': 3.932069895,
                'tau3': 4.24333351,
                'tau4': 7.213924232}]}
