In [1]:
import json
import os
import pprint

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# Stem used for every generated file name, and the raw CSV file to read.
dataset_name, inp_fname = 'uci_churn', 'uci_churn.csv'


In [3]:
# Raw inputs are read from input_dir; every generated artifact goes to output_dir.
input_dir = './raw'
output_dir = './processed'


def _outp_path(suffix):
    """Return the path of `<dataset_name><suffix>` inside the output directory."""
    return os.path.join(output_dir, f'{dataset_name}{suffix}')


outp_fname = _outp_path('.csv')                       # full shuffled dataset
outp_train_fname = _outp_path('_train.csv')           # training split
outp_test_fname = _outp_path('_test.csv')             # test split without the target
outp_test_key_fname = _outp_path('_test_key.csv')     # id + target of the test split
outp_infer_instances = _outp_path('_infer_req.json')  # sample inference request

# Read Data

In [4]:
# Load the raw CSV into a DataFrame and preview its first rows.
inp_path = os.path.join(input_dir, inp_fname)
data = pd.read_csv(inp_path)
data.head()

Unnamed: 0,state,account length,area code,phone number,international plan,voice mail plan,number vmail messages,total day minutes,total day calls,total day charge,...,total eve calls,total eve charge,total night minutes,total night calls,total night charge,total intl minutes,total intl calls,total intl charge,customer service calls,churn
0,KS,128,415,382-4657,no,yes,25,265.1,110,45.07,...,99,16.78,244.7,91,11.01,10.0,3,2.7,1,False
1,OH,107,415,371-7191,no,yes,26,161.6,123,27.47,...,103,16.62,254.4,103,11.45,13.7,3,3.7,1,False
2,NJ,137,415,358-1921,no,no,0,243.4,114,41.38,...,110,10.3,162.6,104,7.32,12.2,5,3.29,0,False
3,OH,84,408,375-9999,yes,no,0,299.4,71,50.9,...,88,5.26,196.9,89,8.86,6.6,7,1.78,2,False
4,OK,75,415,330-6626,yes,no,0,166.7,113,28.34,...,122,12.61,186.9,121,8.41,10.1,3,2.73,3,False


In [5]:
# Identifier column and target column; the target is withheld from the
# test file and stored next to the id in the test key file.
id_col, target_col = "phone number", "churn"

# Insert Id Column

In [6]:
# When the dataset has no id column, add a synthetic 0..N-1 id as the first column.
id_missing = id_col not in data.columns
if id_missing:
    N = len(data)
    data.insert(loc=0, column=id_col, value=np.arange(N))
    print(data.head())

# Store ids as strings regardless of how they were read or generated.
data[id_col] = data[id_col].astype(str)

# Shuffle Data

In [7]:
# shuffle data
data = data.sample(frac=1, random_state=42)
data.head()

Unnamed: 0,state,account length,area code,phone number,international plan,voice mail plan,number vmail messages,total day minutes,total day calls,total day charge,...,total eve calls,total eve charge,total night minutes,total night calls,total night charge,total intl minutes,total intl calls,total intl charge,customer service calls,churn
438,WY,113,510,352-6573,no,no,0,155.0,93,26.35,...,106,28.1,189.4,123,8.52,13.5,3,3.65,1,False
2674,IL,67,415,369-4377,no,no,0,109.1,117,18.55,...,124,18.48,188.4,141,8.48,12.8,6,3.46,0,False
1345,SD,98,415,392-2555,no,no,0,0.0,0,0.0,...,130,13.57,167.1,88,7.52,6.8,1,1.84,4,True
1957,KY,147,408,396-2945,no,no,0,212.8,79,36.18,...,91,17.35,156.2,113,7.03,10.2,2,2.75,1,False
2148,WY,96,408,329-2045,no,no,0,144.0,102,24.48,...,73,19.1,227.7,91,10.25,10.0,7,2.7,1,False


# Save Main Data File

In [8]:
# Create the output directory before the first write: to_csv does not create
# missing parent directories, so on a fresh checkout the save would fail.
os.makedirs(output_dir, exist_ok=True)

# Write the shuffled dataset without the pandas index column.
data.to_csv(outp_fname, index=False)

# Train Test Split

In [9]:
# Fraction of rows held out for the test split.
test_size = 0.1

# train_test_split is imported in the imports cell at the top of the notebook.
# The fixed random_state keeps the split reproducible between runs.
data_train, data_test = train_test_split(data, test_size=test_size, random_state=42)

# Sanity check: the two splits together must account for every input row.
assert len(data_train) + len(data_test) == len(data), "train/test split lost or duplicated rows"
print(data_train.shape, data_test.shape)

# Training file keeps every column, including the target.
data_train.to_csv(outp_train_fname, index=False)
# Test file has the target column removed.
data_test.drop(columns=[target_col]).to_csv(outp_test_fname, index=False)
# Key file holds only the id and the target for the test rows.
data_test[[id_col, target_col]].to_csv(outp_test_key_fname, index=False)

(2999, 21) (334, 21)


# JSON inference request instance

In [10]:
# Build a sample inference request from the first test row. NaN values are
# replaced with None and the target column is dropped before the row is taken.
test_no_nan = data_test.replace({np.nan: None})
test_features = test_no_nan.drop(columns=[target_col]).reset_index(drop=True)
instance = test_features.loc[0].to_dict()

infer_req_instance_dict = {"instances": [dict(instance)]}
pprint.pprint(infer_req_instance_dict)

# Persist the request as indented JSON, keeping non-ASCII characters as-is.
with open(outp_infer_instances, 'w', encoding='utf8') as f:
    f.write(json.dumps(infer_req_instance_dict, indent=2, ensure_ascii=False))

{'instances': [{'account length': 54,
                'area code': 408,
                'customer service calls': 2,
                'international plan': 'no',
                'number vmail messages': 0,
                'phone number': '356-1420',
                'state': 'KY',
                'total day calls': 99,
                'total day charge': 27.17,
                'total day minutes': 159.8,
                'total eve calls': 64,
                'total eve charge': 22.44,
                'total eve minutes': 264.0,
                'total intl calls': 7,
                'total intl charge': 2.62,
                'total intl minutes': 9.7,
                'total night calls': 70,
                'total night charge': 5.21,
                'total night minutes': 115.7,
                'voice mail plan': 'no'}]}
