In [1]:
import numpy as np
import pandas as pd
import os
import json
import pprint

In [2]:
dataset_name = 'credit_card'

In [13]:
input_dir = './raw'
output_dir = './processed'

inp_fname = 'dataset_29_credit-a.csv'

outp_fname = os.path.join(output_dir, f'{dataset_name}.csv')
outp_train_fname = os.path.join(output_dir, f'{dataset_name}_train.csv')
outp_test_fname = os.path.join(output_dir, f'{dataset_name}_test.csv')
outp_test_key_fname = os.path.join(output_dir, f'{dataset_name}_test_key.csv')
outp_infer_instances = os.path.join(output_dir, f'{dataset_name}_infer_req.json')

# Read Data

In [4]:
data = pd.read_csv(os.path.join(input_dir, inp_fname))
data.head()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,class
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202,0,+
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43,560,+
2,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280,824,+
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100,3,+
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120,0,+


In [5]:
id_col = "id"
target_col = "class"

# Insert Id Column

In [6]:
# insert Id column 
if id_col not in data.columns:
    N = data.shape[0]
    data.insert(0, id_col, np.arange(N))
    print(data.head())

   id A1     A2     A3 A4 A5 A6 A7    A8 A9 A10  A11 A12 A13    A14  A15 class
0   0  b  30.83  0.000  u  g  w  v  1.25  t   t    1   f   g  00202    0     +
1   1  a  58.67  4.460  u  g  q  h  3.04  t   t    6   f   g  00043  560     +
2   2  a  24.50  0.500  u  g  q  h  1.50  t   f    0   f   g  00280  824     +
3   3  b  27.83  1.540  u  g  w  v  3.75  t   t    5   t   g  00100    3     +
4   4  b  20.17  5.625  u  g  w  v  1.71  t   f    0   f   s  00120    0     +


# Shuffle Data

In [7]:
# shuffle data
data = data.sample(frac=1, random_state=42)
data.head()

Unnamed: 0,id,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,class
286,286,a,?,1.5,u,g,ff,ff,0.0,f,t,2,t,g,200,105,-
511,511,a,46.00,4.0,u,g,j,j,0.0,t,f,0,f,g,100,960,+
257,257,b,20.00,0.0,u,g,d,v,0.5,f,f,0,f,g,144,0,-
336,336,b,47.33,6.5,u,g,c,v,1.0,f,f,0,t,g,0,228,-
318,318,b,19.17,0.0,y,p,m,bb,0.0,f,f,0,t,s,500,1,+


# Replace null indicators

In [8]:
data = data.replace('?', np.nan)

# Update Class labels

In [9]:
mapping = {
    "+": "positive", 
    "-": "negative"
}

data['class'] = data['class'].map(mapping)
data.head()

Unnamed: 0,id,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,class
286,286,a,,1.5,u,g,ff,ff,0.0,f,t,2,t,g,200,105,negative
511,511,a,46.0,4.0,u,g,j,j,0.0,t,f,0,f,g,100,960,positive
257,257,b,20.0,0.0,u,g,d,v,0.5,f,f,0,f,g,144,0,negative
336,336,b,47.33,6.5,u,g,c,v,1.0,f,f,0,t,g,0,228,negative
318,318,b,19.17,0.0,y,p,m,bb,0.0,f,f,0,t,s,500,1,positive


# Save Main Data File

In [10]:
data.to_csv(outp_fname, index=False)

# Train Test Split

In [11]:
from sklearn.model_selection import train_test_split
test_size = 0.1

data_train, data_test = train_test_split(data, test_size=test_size, random_state=42)
print(data_train.shape, data_test.shape)

data_train.to_csv(outp_train_fname, index=False)
data_test.drop(columns=[target_col]).to_csv(outp_test_fname, index=False)
data_test[[id_col, target_col]].to_csv(outp_test_key_fname, index=False)

(621, 17) (69, 17)


# JSON inference request instance

In [14]:
instance = data_test.drop(columns=[target_col]).reset_index(drop=True).loc[0].to_dict()
infer_req_instance_dict = {  "instances": [ {**instance}, ] }
pprint.pprint(infer_req_instance_dict)
   
with open(outp_infer_instances, 'w', encoding='utf8') as f:
    json.dump(infer_req_instance_dict, f, indent=2, ensure_ascii=False)

{'instances': [{'A1': 'b',
                'A10': 'f',
                'A11': 0,
                'A12': 'f',
                'A13': 'g',
                'A14': '00120',
                'A15': 0,
                'A2': '32.42',
                'A3': 2.165,
                'A4': 'y',
                'A5': 'p',
                'A6': 'k',
                'A7': 'ff',
                'A8': 0.0,
                'A9': 'f',
                'id': 633}]}
