In [1]:
import json
import os
import pprint

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# Dataset identifier; the output file names below are built from it.
dataset_name = "telco_churn"
# File name of the raw CSV, resolved against the input directory when read.
inp_fname = "telco_churn.csv"


In [3]:
# Directory layout: the raw CSV is read from `input_dir`; every generated
# artifact is written below `output_dir`.
input_dir = './raw'
output_dir = './processed'

# Create the output directory up front. Without this, the later
# DataFrame.to_csv / open(..., 'w') calls fail when the directory does not
# exist yet (e.g. on a fresh checkout). `exist_ok=True` keeps re-runs safe.
os.makedirs(output_dir, exist_ok=True)

# Output artifacts, all named after the dataset:
#   full shuffled data, train split, test split (no target),
#   test key (id + target) and a sample JSON inference request.
outp_fname = os.path.join(output_dir, f'{dataset_name}.csv')
outp_train_fname = os.path.join(output_dir, f'{dataset_name}_train.csv')
outp_test_fname = os.path.join(output_dir, f'{dataset_name}_test.csv')
outp_test_key_fname = os.path.join(output_dir, f'{dataset_name}_test_key.csv')
outp_infer_instances = os.path.join(output_dir, f'{dataset_name}_infer_req.json')

# Read Data

In [4]:
# Column names assigned to the raw CSV, in file order. The file is read
# with header=None, so these names are the only source of column labels.
cols = [
    "customerID",  # used as the id column
    "gender", "SeniorCitizen", "Partner", "Dependents",
    "tenure",
    "PhoneService", "MultipleLines",
    "InternetService", "OnlineSecurity", "OnlineBackup",
    "DeviceProtection", "TechSupport",
    "StreamingTV", "StreamingMovies",
    "Contract", "PaperlessBilling", "PaymentMethod",
    "MonthlyCharges", "TotalCharges",
    "Churn",  # used as the target column
]

In [5]:
# Load the raw file. It is read as header-less, so the column labels come
# from `cols` rather than from the first line of the file.
# NOTE(review): the displayed values keep literal single quotes
# (e.g. 'No phone service'); presumably the raw file quotes multi-word
# values with ' -- confirm whether quotechar="'" should be used instead.
raw_path = os.path.join(input_dir, inp_fname)
data = pd.read_csv(raw_path, names=cols, header=None)
data.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,'No phone service',DSL,No,...,No,No,No,No,Month-to-month,Yes,'Electronic check',29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,'One year',No,'Mailed check',56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,'Mailed check',53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,'No phone service',DSL,Yes,...,Yes,Yes,No,No,'One year',No,'Bank transfer (automatic)',42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,'Fiber optic',No,...,No,No,No,No,Month-to-month,Yes,'Electronic check',70.7,151.65,Yes


In [6]:
# Names of the identifier column and of the target (label) column.
id_col, target_col = "customerID", "Churn"

# Insert Id Column

In [7]:
# Add a sequential id (0..n-1) as the first column, but only when the
# dataset does not already carry the id column.
has_id_column = id_col in data.columns
if not has_id_column:
    row_ids = np.arange(len(data))
    data.insert(loc=0, column=id_col, value=row_ids)
    print(data.head())

# Ids are stored as strings, whether they came from the file or were generated.
data[id_col] = data[id_col].astype(str)

# Shuffle Data

In [8]:
# Randomly reorder all rows (frac=1 keeps every row); the fixed seed makes
# the resulting order reproducible. Original index labels are preserved.
# NOTE: this rebinds `data`, so re-running only this cell shuffles the
# already shuffled frame again -- run the notebook top to bottom.
data = data.sample(random_state=42, frac=1)
data.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
185,1024-GUALD,Female,0,Yes,No,1,No,'No phone service',DSL,No,...,No,No,No,No,Month-to-month,Yes,'Electronic check',24.8,24.8,Yes
2715,0484-JPBRU,Male,0,No,No,41,Yes,Yes,No,'No internet service',...,'No internet service','No internet service','No internet service','No internet service',Month-to-month,Yes,'Bank transfer (automatic)',25.25,996.45,No
3825,3620-EHIMZ,Female,0,Yes,Yes,52,Yes,No,No,'No internet service',...,'No internet service','No internet service','No internet service','No internet service','Two year',No,'Mailed check',19.35,1031.7,No
1807,6910-HADCM,Female,0,No,No,1,Yes,No,'Fiber optic',No,...,Yes,No,No,No,Month-to-month,No,'Electronic check',76.35,76.35,Yes
132,8587-XYZSF,Male,0,No,No,67,Yes,No,DSL,No,...,No,Yes,No,No,'Two year',No,'Bank transfer (automatic)',50.55,3260.1,No


# Save Main Data File

In [9]:
# Persist the full shuffled dataset; the DataFrame index is not written.
data.to_csv(path_or_buf=outp_fname, index=False)

# Train Test Split

In [10]:
# `train_test_split` is imported in the notebook's import cell so that all
# dependencies are visible in one place.

# Fraction of rows held out as the test set.
test_size = 0.1

# Fixed random_state makes the split reproducible across runs.
data_train, data_test = train_test_split(data, test_size=test_size, random_state=42)
print(data_train.shape, data_test.shape)

# Sanity check: the split must not drop or duplicate rows in total.
assert len(data_train) + len(data_test) == len(data), "train/test sizes do not add up"

# Train file keeps every column, including the target.
data_train.to_csv(outp_train_fname, index=False)
# Test file is written without the target column ...
data_test.drop(columns=[target_col]).to_csv(outp_test_fname, index=False)
# ... and the true targets go to a separate key file alongside the ids.
data_test[[id_col, target_col]].to_csv(outp_test_key_fname, index=False)

(6338, 21) (705, 21)


# JSON inference request instance

In [11]:
# Build a sample inference request from the first row of the test split:
# NaN is mapped to None (written as JSON null), the target column is
# removed, and the index is reset so that label 0 is the first row.
first_test_row = (
    data_test
    .replace({np.nan: None})
    .drop(columns=[target_col])
    .reset_index(drop=True)
    .loc[0]
)
instance = first_test_row.to_dict()
infer_req_instance_dict = {"instances": [dict(instance)]}
pprint.pprint(infer_req_instance_dict)

# Write the request as UTF-8 JSON, keeping non-ASCII characters unescaped.
with open(outp_infer_instances, 'w', encoding='utf8') as f:
    json.dump(infer_req_instance_dict, f, indent=2, ensure_ascii=False)

{'instances': [{'Contract': "'Two year'",
                'Dependents': 'Yes',
                'DeviceProtection': "'No internet service'",
                'InternetService': 'No',
                'MonthlyCharges': 25.4,
                'MultipleLines': 'Yes',
                'OnlineBackup': "'No internet service'",
                'OnlineSecurity': "'No internet service'",
                'PaperlessBilling': 'No',
                'Partner': 'Yes',
                'PaymentMethod': "'Mailed check'",
                'PhoneService': 'Yes',
                'SeniorCitizen': 0,
                'StreamingMovies': "'No internet service'",
                'StreamingTV': "'No internet service'",
                'TechSupport': "'No internet service'",
                'TotalCharges': 1782.05,
                'customerID': '5854-KSRBJ',
                'gender': 'Male',
                'tenure': 70}]}
