In [38]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [39]:
df = pd.read_csv("data/CreditScoring.csv")
df.head()

Unnamed: 0,Status,Seniority,Home,Time,Age,Marital,Records,Job,Expenses,Income,Assets,Debt,Amount,Price
0,1,9,1,60,30,2,1,3,73,129,0,0,800,846
1,1,17,1,60,58,3,1,1,48,131,0,0,1000,1658
2,2,10,2,36,46,2,2,3,90,200,3000,0,2000,2985
3,1,0,1,60,24,1,1,1,63,182,2500,0,900,1325
4,1,0,1,36,26,1,1,1,46,107,0,0,310,910


### Data Description

* status: whether the customer managed to pay back the loan (1) or not (2)
* seniority: job experience in years
* home: type of homeownership: renting (1), a homeowner (2), and others
* time: period planned for the loan (in months)
* age: age of the client
* marital [status]: single (1), married (2), and others
* records: whether the client has any previous records: no (1), yes (2) (It’s not
clear from the dataset description what kind of records we have in this column.
For the purposes of this project, we may assume that it’s about records in the
bank’s database.)
* job: type of job: full-time (1), part-time (2), and others
* expenses: how much the client spends per month
* income: how much the client earns per month
* assets: total worth of all the assets of the client
* debt: amount of credit debt
* amount: requested amount of the loan
* price: price of an item the client wants to buy

### Data cleaning

In [40]:
df.columns = df.columns.str.lower()

In [41]:
status_values = {
    1: "ok",
    2: 'default',
    0: 'unk'
}

df.status = df.status.map(status_values)
df.head()

Unnamed: 0,status,seniority,home,time,age,marital,records,job,expenses,income,assets,debt,amount,price
0,ok,9,1,60,30,2,1,3,73,129,0,0,800,846
1,ok,17,1,60,58,3,1,1,48,131,0,0,1000,1658
2,default,10,2,36,46,2,2,3,90,200,3000,0,2000,2985
3,ok,0,1,60,24,1,1,1,63,182,2500,0,900,1325
4,ok,0,1,36,26,1,1,1,46,107,0,0,310,910


In [42]:
home_values = {
1: 'rent',
2: 'owner',
3: 'private',
4: 'ignore',
5: 'parents',
6: 'other',
0: 'unk'
}

df.home = df.home.map(home_values)

marital_values = {
1: 'single',
2: 'married',
3: 'widow',
4: 'separated',
5: 'divorced',
0: 'unk'
}

df.marital = df.marital.map(marital_values)

records_values = {
1: 'no',
2: 'yes',
0: 'unk'
}

df.records = df.records.map(records_values)

job_values = {
1: 'fixed',
2: 'parttime',
3: 'freelance',
4: 'others',
0: 'unk'
}
df.job = df.job.map(job_values)

df.head()

Unnamed: 0,status,seniority,home,time,age,marital,records,job,expenses,income,assets,debt,amount,price
0,ok,9,rent,60,30,married,no,freelance,73,129,0,0,800,846
1,ok,17,rent,60,58,widow,no,fixed,48,131,0,0,1000,1658
2,default,10,owner,36,46,married,yes,freelance,90,200,3000,0,2000,2985
3,ok,0,rent,60,24,single,no,fixed,63,182,2500,0,900,1325
4,ok,0,rent,36,26,single,no,fixed,46,107,0,0,310,910


In [43]:
# Check summary stats

df.describe().round().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
seniority,4455.0,8.0,8.0,0.0,2.0,5.0,12.0,48.0
time,4455.0,46.0,15.0,6.0,36.0,48.0,60.0,72.0
age,4455.0,37.0,11.0,18.0,28.0,36.0,45.0,68.0
expenses,4455.0,56.0,20.0,35.0,35.0,51.0,72.0,180.0
income,4455.0,763317.0,8703625.0,0.0,80.0,120.0,166.0,99999999.0
assets,4455.0,1060341.0,10217569.0,0.0,0.0,3500.0,6000.0,99999999.0
debt,4455.0,404382.0,6344253.0,0.0,0.0,0.0,0.0,99999999.0
amount,4455.0,1039.0,475.0,100.0,700.0,1000.0,1300.0,5000.0
price,4455.0,1463.0,628.0,105.0,1118.0,1400.0,1692.0,11140.0


In [44]:
# Fixing missing values (99999999)
# Replacing with nan
for c in ['income', 'assets', 'debt']:
    df[c] = df[c].replace(to_replace = 99999999, value = np.nan)

In [45]:
df.describe().round().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
seniority,4455.0,8.0,8.0,0.0,2.0,5.0,12.0,48.0
time,4455.0,46.0,15.0,6.0,36.0,48.0,60.0,72.0
age,4455.0,37.0,11.0,18.0,28.0,36.0,45.0,68.0
expenses,4455.0,56.0,20.0,35.0,35.0,51.0,72.0,180.0
income,4421.0,131.0,86.0,0.0,80.0,120.0,165.0,959.0
assets,4408.0,5403.0,11573.0,0.0,0.0,3000.0,6000.0,300000.0
debt,4437.0,343.0,1246.0,0.0,0.0,0.0,0.0,30000.0
amount,4455.0,1039.0,475.0,100.0,700.0,1000.0,1300.0,5000.0
price,4455.0,1463.0,628.0,105.0,1118.0,1400.0,1692.0,11140.0


In [46]:
df.status.value_counts()

ok         3200
default    1254
unk           1
Name: status, dtype: int64

In [47]:
df = df.loc[df.status != 'unk', :]

In [48]:
df.status.value_counts()

ok         3200
default    1254
Name: status, dtype: int64

In [49]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4454 entries, 0 to 4454
Data columns (total 14 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   status     4454 non-null   object 
 1   seniority  4454 non-null   int64  
 2   home       4454 non-null   object 
 3   time       4454 non-null   int64  
 4   age        4454 non-null   int64  
 5   marital    4454 non-null   object 
 6   records    4454 non-null   object 
 7   job        4454 non-null   object 
 8   expenses   4454 non-null   int64  
 9   income     4420 non-null   float64
 10  assets     4407 non-null   float64
 11  debt       4436 non-null   float64
 12  amount     4454 non-null   int64  
 13  price      4454 non-null   int64  
dtypes: float64(3), int64(6), object(5)
memory usage: 522.0+ KB


In [50]:
cat_cols = df.columns[(df.dtypes == "object")].to_list()
cat_cols

['status', 'home', 'marital', 'records', 'job']

In [51]:
num_cols = df.columns[(df.dtypes == "int64")|(df.dtypes == "float64")].to_list()
num_cols

['seniority',
 'time',
 'age',
 'expenses',
 'income',
 'assets',
 'debt',
 'amount',
 'price']

### Dataset preparation

In [52]:
from sklearn.model_selection import train_test_split

In [53]:
from random import random


df_train_full, df_test = train_test_split(df,
                    test_size=0.2,
                    random_state=11)

df_train, df_val = train_test_split(df_train_full,
                    test_size = 0.25,
                    random_state = 11)

In [54]:
len(df_train), len(df_val), len(df_test)

(2672, 891, 891)

In [55]:
y_train = (df_train.status == "default").values
y_test = (df_test.status == "default").values
y_val = (df_val.status == "default").values

In [56]:
del df_train["status"]
del df_val["status"]

In [57]:
df_train.isna().sum()

seniority     0
home          0
time          0
age           0
marital       0
records       0
job           0
expenses      0
income       25
assets       30
debt         11
amount        0
price         0
dtype: int64

In [58]:
df_train = df_train.fillna(0)
df_val = df_val.fillna(0)

### One hot encoding

In [59]:
dict_train = df_train.to_dict(orient = "records")
dict_val = df_val.to_dict(orient = "records")

dict_train

[{'seniority': 10,
  'home': 'owner',
  'time': 36,
  'age': 36,
  'marital': 'married',
  'records': 'no',
  'job': 'freelance',
  'expenses': 75,
  'income': 0.0,
  'assets': 10000.0,
  'debt': 0.0,
  'amount': 1000,
  'price': 1400},
 {'seniority': 6,
  'home': 'parents',
  'time': 48,
  'age': 32,
  'marital': 'single',
  'records': 'yes',
  'job': 'fixed',
  'expenses': 35,
  'income': 85.0,
  'assets': 0.0,
  'debt': 0.0,
  'amount': 1100,
  'price': 1330},
 {'seniority': 1,
  'home': 'parents',
  'time': 48,
  'age': 40,
  'marital': 'married',
  'records': 'no',
  'job': 'fixed',
  'expenses': 75,
  'income': 121.0,
  'assets': 0.0,
  'debt': 0.0,
  'amount': 1320,
  'price': 1600},
 {'seniority': 1,
  'home': 'parents',
  'time': 48,
  'age': 23,
  'marital': 'single',
  'records': 'no',
  'job': 'parttime',
  'expenses': 35,
  'income': 72.0,
  'assets': 0.0,
  'debt': 0.0,
  'amount': 1078,
  'price': 1079},
 {'seniority': 5,
  'home': 'owner',
  'time': 36,
  'age': 46,
  '

In [60]:
from sklearn.feature_extraction import DictVectorizer

dv = DictVectorizer(sparse = False)
X_train = dv.fit_transform(dict_train)
X_tval = dv.transform(dict_val)

## Decision trees