This file creates the training and test datasets for all other notebooks and scripts to reuse. This is the file I would typically use for an exploratory analysis.

In [1]:
import pickle

import helpsk as hlp
import numpy as np
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import label_binarize

%matplotlib inline

# Load Data

In [2]:
# Source: https://www.openml.org/d/31 (OpenML dataset "credit-g", version 1).
# as_frame=True pins the pandas return type: the cells below rely on
# DataFrame/Series behaviour (column assignment, .drop(columns=...)), and the
# default value of `as_frame` has differed between scikit-learn releases.
credit_g = fetch_openml('credit-g', version=1, as_frame=True)
credit_data = credit_g['data']
# Keep the label next to the features so the summaries below cover it too.
credit_data['target'] = credit_g['target']
print(credit_data.shape)

# Only the combined frame is used from here on.
del credit_g

(1000, 21)


In [None]:
## Create Missing Values
# Inject artificial missing values (and zeros) into a few columns.
# Each assignment goes through a single .loc call on the frame itself: the
# chained form `credit_data[col].iloc[a:b] = value` writes to an intermediate
# Series, which raises SettingWithCopyWarning and silently leaves the frame
# unchanged when pandas Copy-on-Write is enabled.
# credit_data.index[a:b] keeps the row selection positional, like .iloc did.
credit_data.loc[credit_data.index[0:46], 'duration'] = np.nan
credit_data.loc[credit_data.index[25:75], 'checking_status'] = np.nan
# credit_amount is set to 0 rather than NaN.
credit_data.loc[credit_data.index[10:54], 'credit_amount'] = 0

In [4]:
# Per-column summary of the numeric features (null/zero counts, moments, quantiles).
hlp.pandas.numeric_summary(credit_data)

Unnamed: 0,# of Non-Nulls,# of Nulls,% Nulls,# of Zeros,% Zeros,Mean,St Dev.,Coef of Var,Skewness,Kurtosis,Min,10%,25%,50%,75%,90%,Max
duration,954,46,5.0%,0,0.0%,20.9,12.0,0.6,1.1,1.0,4.0,9.0,12.0,18.0,24.0,36.0,72.0
credit_amount,1000,0,0.0%,44,4.0%,3132.9,2853.4,0.9,1.9,4.3,0.0,740.0,1287.8,2224.0,3873.5,7119.8,18424.0
installment_commitment,1000,0,0.0%,0,0.0%,3.0,1.1,0.4,-0.5,-1.2,1.0,1.0,2.0,3.0,4.0,4.0,4.0
residence_since,1000,0,0.0%,0,0.0%,2.9,1.1,0.4,-0.3,-1.4,1.0,1.0,2.0,3.0,4.0,4.0,4.0
age,1000,0,0.0%,0,0.0%,35.5,11.4,0.3,1.0,0.6,19.0,23.0,27.0,33.0,42.0,52.0,75.0
existing_credits,1000,0,0.0%,0,0.0%,1.4,0.6,0.4,1.3,1.6,1.0,1.0,1.0,1.0,2.0,2.0,4.0
num_dependents,1000,0,0.0%,0,0.0%,1.2,0.4,0.3,1.9,1.6,1.0,1.0,1.0,1.0,1.0,2.0,2.0


In [5]:
# Per-column summary of the non-numeric features (null counts, most frequent value, cardinality).
hlp.pandas.non_numeric_summary(credit_data)

Unnamed: 0,# of Non-Nulls,# of Nulls,% Nulls,Most Freq. Value,# of Unique,% Unique
checking_status,950,50,5.0%,no checking,4,0.4%
credit_history,1000,0,0.0%,existing paid,5,0.5%
purpose,1000,0,0.0%,radio/tv,10,1.0%
savings_status,1000,0,0.0%,<100,5,0.5%
employment,1000,0,0.0%,1<=X<4,5,0.5%
personal_status,1000,0,0.0%,male single,4,0.4%
other_parties,1000,0,0.0%,none,3,0.3%
property_magnitude,1000,0,0.0%,car,4,0.4%
other_payment_plans,1000,0,0.0%,none,3,0.3%
housing,1000,0,0.0%,own,3,0.3%


# Training and Test Data

In [6]:
# Separate the feature matrix from the label column; the combined frame is
# deleted afterwards so only X_full / y_full remain in scope.
X_full = credit_data.drop(columns=['target'])
y_full = credit_data['target']

del credit_data

In [7]:
# Class balance of the target before it is encoded as integers.
hlp.pandas.value_frequency(series=y_full)

Unnamed: 0,Frequency,Percent
good,700,0.7
bad,300,0.3


In [8]:
# First ten raw labels (still a categorical Series at this point).
y_full.head(10)

0    good
1     bad
2    good
3    good
4     bad
5    good
6    good
7    good
8    good
9     bad
Name: target, dtype: category
Categories (2, object): ['good', 'bad']

In [9]:
# Encode the target as integers: 0 = 'good' (no default) and 1 = 'bad'
# (default), so the outcome we want to detect is the positive class.
# `classes` fixes that ordering explicitly. For a two-class problem
# label_binarize returns a single column, which flatten() turns into a 1-D array.
# (label_binarize is imported with the other dependencies in the first cell.)
y_full = label_binarize(y_full, classes=['good', 'bad']).flatten()

In [10]:
# Check the encoding: first ten labels, now a 0/1 NumPy array.
y_full[0:10]

array([0, 1, 0, 0, 1, 0, 0, 0, 0, 1])

In [11]:
# save the full dataset for creating the fine model
with open('X_full.pkl', 'wb') as handle:
    pickle.dump(X_full, handle)

with open('y_full.pkl', 'wb') as handle:
    pickle.dump(y_full, handle)

In [12]:
# Hold out 20% of the rows as the test set; random_state fixes the shuffle so
# the split is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X_full,
    y_full,
    test_size=0.2,
    random_state=42,
)
# The un-split objects are not used after this point.
del X_full, y_full

In [13]:
# Sanity check: feature shape followed by label count, for each split.
print(X_train.shape, len(y_train), sep='\n')
print(X_test.shape, len(y_test), sep='\n')

(800, 20)
800
(200, 20)
200


In [14]:
# Distinct labels in the training split and how many times each occurs.
np.unique(y_train, return_counts=True)

(array([0, 1]), array([559, 241]))

In [15]:
# Class proportions in the training split (label counts normalised to sum to 1).
train_class_counts = np.unique(y_train, return_counts=True)[1]
train_class_counts / train_class_counts.sum()

array([0.69875, 0.30125])

In [16]:
# Class proportions in the test split (label counts normalised to sum to 1).
test_class_counts = np.unique(y_test, return_counts=True)[1]
test_class_counts / test_class_counts.sum()

array([0.705, 0.295])

In [17]:
# Persist the train/test split, one pickle file per object.
# `pickle` is already imported in the first cell, so it is not re-imported
# here, and a single loop replaces four copy-pasted open/dump blocks.
split_artifacts = {
    'X_train.pkl': X_train,
    'X_test.pkl': X_test,
    'y_train.pkl': y_train,
    'y_test.pkl': y_test,
}
for filename, artifact in split_artifacts.items():
    with open(filename, 'wb') as handle:
        pickle.dump(artifact, handle)

# Exploratory

Typically I would do an exploratory analysis here.

In [18]:
# Peek at the first rows of the training features.
X_train.head()

Unnamed: 0,checking_status,duration,credit_history,purpose,credit_amount,savings_status,employment,installment_commitment,personal_status,other_parties,residence_since,property_magnitude,age,other_payment_plans,housing,existing_credits,job,num_dependents,own_telephone,foreign_worker
29,,,delayed previously,business,0.0,<100,>=7,3.0,male single,none,4.0,no known property,63.0,none,own,2.0,skilled,1.0,yes,yes
535,>=200,21.0,critical/other existing credit,education,2319.0,<100,<1,2.0,male div/sep,none,1.0,car,33.0,none,rent,1.0,skilled,1.0,none,yes
695,no checking,6.0,existing paid,used car,1236.0,500<=X<1000,1<=X<4,2.0,male single,none,4.0,life insurance,50.0,none,rent,1.0,skilled,1.0,none,yes
557,no checking,21.0,no credits/all paid,new car,5003.0,no known savings,1<=X<4,1.0,female div/dep/mar,none,4.0,life insurance,29.0,bank,own,2.0,skilled,1.0,yes,yes
836,no checking,12.0,existing paid,radio/tv,886.0,no known savings,1<=X<4,4.0,female div/dep/mar,none,2.0,car,21.0,none,own,1.0,skilled,1.0,none,yes


In [19]:
# First ten training labels (0 = 'good', 1 = 'bad').
y_train[0:10]

array([1, 1, 0, 1, 0, 1, 0, 1, 1, 0])

In [20]:
# Mean of the 0/1 labels, i.e. the share of 'bad' (positive) cases in the training split.
y_train.mean()

0.30125

In [21]:
# Mean of the 0/1 labels, i.e. the share of 'bad' (positive) cases in the test split.
y_test.mean()

0.295