In [23]:
import numpy as np
import pandas as pd
import os
import json
import pprint

In [24]:
# Dataset identifier; reused further down to build the output folder and the CSV file name.
dataset_name = 'credit_card'

In [25]:
# Raw input: folder and file name of the ARFF file to convert.
# source: https://www.openml.org/search?type=data&sort=runs&id=31&status=active
input_dir = './raw'
inp_fname = "dataset_31_credit-g.arff"

# Processed output: one folder per dataset, holding a CSV named after the dataset.
output_dir = f'./../../processed/{dataset_name}/'
outp_fname = os.path.join(output_dir, dataset_name + '.csv')

# Read Data

In [26]:
# Column names for the data rows. The list is handed to read_csv as `names=`,
# so its order has to follow the column order of the rows being parsed.
columns = [
    "checking_status", "duration", "credit_history", "purpose", "credit_amount",
    "savings_status", "employment", "installment_rate", "personal_status", "other_parties",
    "residence_since", "property", "age", "other_payment_plans", "housing",
    "existing_credits", "job", "num_dependents", "own_telephone", "foreign_worker",
    "class",
]
columns

['checking_status',
 'duration',
 'credit_history',
 'purpose',
 'credit_amount',
 'savings_status',
 'employment',
 'installment_rate',
 'personal_status',
 'other_parties',
 'residence_since',
 'property',
 'age',
 'other_payment_plans',
 'housing',
 'existing_credits',
 'job',
 'num_dependents',
 'own_telephone',
 'foreign_worker',
 'class']

In [27]:
# Read the ARFF file: locate the '@data' marker, then parse the rows below it as CSV.
arff_path = os.path.join(input_dir, inp_fname)

# ARFF keywords are case-insensitive and the marker line may carry trailing
# whitespace or a '\r\n' ending, so compare a normalised copy of each line
# rather than requiring an exact match with '@data\n'. Scanning stops at the
# marker, so the data rows are not loaded into memory a second time.
data_start = None
with open(arff_path, 'r') as f:
    for line_no, line in enumerate(f, start=1):
        if line.strip().lower() == '@data':
            # 1-based position of the marker == number of leading lines to skip
            data_start = line_no
            break

if data_start is None:
    raise ValueError(f"No '@data' marker found in {arff_path}")

data = pd.read_csv(
    arff_path,
    header=None,          # ARFF files don't have a CSV header row
    delimiter=',',        # data rows are comma-separated
    skiprows=data_start,  # skip the metadata section and the '@data' line itself
    names=columns,
)
data.head()

Unnamed: 0,checking_status,duration,credit_history,purpose,credit_amount,savings_status,employment,installment_rate,personal_status,other_parties,...,property,age,other_payment_plans,housing,existing_credits,job,num_dependents,own_telephone,foreign_worker,class
0,'<0',6,'critical/other existing credit',radio/tv,1169,'no known savings','>=7',4,'male single',none,...,'real estate',67,none,own,2,skilled,1,yes,yes,good
1,'0<=X<200',48,'existing paid',radio/tv,5951,'<100','1<=X<4',2,'female div/dep/mar',none,...,'real estate',22,none,own,1,skilled,1,none,yes,bad
2,'no checking',12,'critical/other existing credit',education,2096,'<100','4<=X<7',2,'male single',none,...,'real estate',49,none,own,1,'unskilled resident',2,none,yes,good
3,'<0',42,'existing paid',furniture/equipment,7882,'<100','4<=X<7',2,'male single',guarantor,...,'life insurance',45,none,'for free',1,skilled,2,none,yes,good
4,'<0',24,'delayed previously','new car',4870,'<100','1<=X<4',3,'male single',none,...,'no known property',53,none,'for free',2,skilled,2,none,yes,bad


# Remove single quotes

In [28]:
# Strip the surrounding single quotes from every object-dtype (text) column.
# select_dtypes already restricts the selection to object columns, so no
# per-column dtype check is needed inside the loop.
object_cols = data.select_dtypes(include=['object']).columns
for name in object_cols:
    data[name] = data[name].str.strip("'")
data.head()

Unnamed: 0,checking_status,duration,credit_history,purpose,credit_amount,savings_status,employment,installment_rate,personal_status,other_parties,...,property,age,other_payment_plans,housing,existing_credits,job,num_dependents,own_telephone,foreign_worker,class
0,<0,6,critical/other existing credit,radio/tv,1169,no known savings,>=7,4,male single,none,...,real estate,67,none,own,2,skilled,1,yes,yes,good
1,0<=X<200,48,existing paid,radio/tv,5951,<100,1<=X<4,2,female div/dep/mar,none,...,real estate,22,none,own,1,skilled,1,none,yes,bad
2,no checking,12,critical/other existing credit,education,2096,<100,4<=X<7,2,male single,none,...,real estate,49,none,own,1,unskilled resident,2,none,yes,good
3,<0,42,existing paid,furniture/equipment,7882,<100,4<=X<7,2,male single,guarantor,...,life insurance,45,none,for free,1,skilled,2,none,yes,good
4,<0,24,delayed previously,new car,4870,<100,1<=X<4,3,male single,none,...,no known property,53,none,for free,2,skilled,2,none,yes,bad


# Insert Id Column

In [29]:
# Name of the row-identifier column that is inserted into the frame further down.
id_col = "id"
# Name of the label column (same string as the last entry of `columns`);
# it is not referenced by any of the cells visible in this part of the notebook.
target_col = "class"

In [30]:
# Prepend a sequential row identifier (0, 1, 2, ...) as the first column.
# The membership guard keeps the cell re-runnable: a second run must not try
# to insert the same column again.
if id_col not in data.columns:
    data.insert(0, id_col, np.arange(len(data)))

# Store identifiers as strings rather than integers.
data[id_col] = data[id_col].astype(str)
data.head()

Unnamed: 0,id,checking_status,duration,credit_history,purpose,credit_amount,savings_status,employment,installment_rate,personal_status,...,property,age,other_payment_plans,housing,existing_credits,job,num_dependents,own_telephone,foreign_worker,class
0,0,<0,6,critical/other existing credit,radio/tv,1169,no known savings,>=7,4,male single,...,real estate,67,none,own,2,skilled,1,yes,yes,good
1,1,0<=X<200,48,existing paid,radio/tv,5951,<100,1<=X<4,2,female div/dep/mar,...,real estate,22,none,own,1,skilled,1,none,yes,bad
2,2,no checking,12,critical/other existing credit,education,2096,<100,4<=X<7,2,male single,...,real estate,49,none,own,1,unskilled resident,2,none,yes,good
3,3,<0,42,existing paid,furniture/equipment,7882,<100,4<=X<7,2,male single,...,life insurance,45,none,for free,1,skilled,2,none,yes,good
4,4,<0,24,delayed previously,new car,4870,<100,1<=X<4,3,male single,...,no known property,53,none,for free,2,skilled,2,none,yes,bad


# Save Main Data File

In [31]:
# to_csv does not create missing parent folders and fails when the target
# directory is absent, so make sure it exists before writing.
os.makedirs(output_dir, exist_ok=True)
data.to_csv(outp_fname, index=False)

In [32]:
# Turn '?' placeholder cells into NaN.
# NOTE(review): this runs after the CSV has already been written, so the saved
# file keeps any '?' values — confirm whether this step should precede the save.
data = data.replace(to_replace='?', value=np.nan)

In [33]:
# Preview the first rows of the frame in its current state.
data.head()

Unnamed: 0,id,checking_status,duration,credit_history,purpose,credit_amount,savings_status,employment,installment_rate,personal_status,...,property,age,other_payment_plans,housing,existing_credits,job,num_dependents,own_telephone,foreign_worker,class
0,0,<0,6,critical/other existing credit,radio/tv,1169,no known savings,>=7,4,male single,...,real estate,67,none,own,2,skilled,1,yes,yes,good
1,1,0<=X<200,48,existing paid,radio/tv,5951,<100,1<=X<4,2,female div/dep/mar,...,real estate,22,none,own,1,skilled,1,none,yes,bad
2,2,no checking,12,critical/other existing credit,education,2096,<100,4<=X<7,2,male single,...,real estate,49,none,own,1,unskilled resident,2,none,yes,good
3,3,<0,42,existing paid,furniture/equipment,7882,<100,4<=X<7,2,male single,...,life insurance,45,none,for free,1,skilled,2,none,yes,good
4,4,<0,24,delayed previously,new car,4870,<100,1<=X<4,3,male single,...,no known property,53,none,for free,2,skilled,2,none,yes,bad
