In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
import os
# Print the full path of every file found anywhere below the Kaggle input directory.
input_files = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [3]:
# Load the credit-scoring data and normalise the column names to lower case.
df = pd.read_csv("/kaggle/input/creditscoring-course6/CreditScoring.csv")
df.columns = df.columns.str.lower()

# Convert integer codes to string categories.
# Columns are assigned with df[...] rather than attribute access, so a missing
# column cannot silently turn into a plain Python attribute on the frame.
status_values = {1: "ok", 2: "default", 0: "unk"}
df["status"] = df["status"].map(status_values)

home_values = {1: 'rent', 2: 'owner', 3: 'private', 4: 'ignore', 5: 'parents', 6: 'other', 0: 'unk'}
df["home"] = df["home"].map(home_values)

marital_values = {1: 'single', 2: 'married', 3: 'widow', 4: 'separated', 5: 'divorced', 0: 'unk'}
df["marital"] = df["marital"].map(marital_values)

records_values = {1: 'no', 2: 'yes', 0: 'unk'}
df["records"] = df["records"].map(records_values)

job_values = {1: 'fixed', 2: 'partime', 3: 'freelance', 4: 'others', 0: 'unk'}
df["job"] = df["job"].map(job_values)

# 99999999 is treated as a placeholder in these numeric columns: replace it with NaN.
for c in ['income', 'assets', 'debt']:
    df[c] = df[c].replace(to_replace=99999999, value=np.nan)

# Drop rows with unknown status.
df = df[df["status"] != 'unk'].reset_index(drop=True)

# Only the last expression of a cell is displayed, so show the summary statistics here.
df.describe().round()


In [4]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing, then 25% of the remaining 80% for
# validation, i.e. a 60/20/20 train/validation/test split.
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=11)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=11)

# Give each part a fresh 0..n-1 index.
df_train, df_val, df_test = (
    part.reset_index(drop=True) for part in (df_train, df_val, df_test)
)

# Binary target: 1 where status is 'default', 0 otherwise.
# pop() returns the column and removes it from the feature frame in one step.
y_train = (df_train.pop("status") == 'default').astype('int').values
y_val = (df_val.pop("status") == 'default').astype('int').values
y_test = (df_test.pop("status") == 'default').astype('int').values

df_train

In [5]:
# Decision Trees
from sklearn.feature_extraction import DictVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score

# Missing values are filled with 0, then each row becomes a dict that
# DictVectorizer turns into a dense feature matrix.
train_dict = df_train.fillna(0).to_dict(orient='records')
dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(train_dict)

# random_state is fixed so the fitted tree (and therefore the AUC) is the same on every run.
dt = DecisionTreeClassifier(max_depth=3, random_state=1)
dt.fit(X_train, y_train)

# The validation rows are encoded with the vectorizer fitted on the training rows.
val_dict = df_val.fillna(0).to_dict(orient='records')
X_val = dv.transform(val_dict)
y_pred = dt.predict_proba(X_val)[:, 1]

# Validation AUC of the depth-3 tree (displayed as the cell output).
roc_auc_score(y_val, y_pred)


In [6]:
# Print the decision tree as text.
from sklearn.tree import export_text

# get_feature_names_out() is the current DictVectorizer API; the array it returns
# is converted to a plain list of str before being handed to export_text.
print(export_text(dt, feature_names=dv.get_feature_names_out().tolist()))


In [7]:
# Validation AUC for different tree sizes.
# random_state is fixed in every fit so that differences in AUC come from the
# hyper-parameters being compared and are identical on every run.

# find best depth:
for d in [1, 2, 3, 4, 5, 6, 10, 20, None]:
    dt = DecisionTreeClassifier(max_depth=d, random_state=1)
    dt.fit(X_train, y_train)

    y_pred = dt.predict_proba(X_val)[:, 1]
    auc = roc_auc_score(y_val, y_pred)

    print("auc: %.3f, depth: %4s" % (auc, d))
print()

# find min_samples_leaf for the depths 4, 5 and 6:
for d in [4, 5, 6]:
    for s in [1, 2, 5, 10, 15, 20, 100, 200, 500]:
        dt = DecisionTreeClassifier(max_depth=d, min_samples_leaf=s, random_state=1)
        dt.fit(X_train, y_train)

        y_pred = dt.predict_proba(X_val)[:, 1]
        auc = roc_auc_score(y_val, y_pred)

        print("auc: %.3f, depth: %4s, min_samples_leaf: %4s" % (auc, d, s))

    

In [8]:
# Random Forest of decision trees
from sklearn.feature_extraction import DictVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

# Encode the training rows (missing values filled with 0) as a dense feature matrix.
train_dict = df_train.fillna(0).to_dict(orient='records')
dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(train_dict)

# random_state is fixed so this baseline AUC is reproducible and consistent with
# the seeded forests trained in the grid searches.
rf = RandomForestClassifier(n_estimators=10, random_state=1)
rf.fit(X_train, y_train)

# The validation rows are encoded with the vectorizer fitted on the training rows.
val_dict = df_val.fillna(0).to_dict(orient='records')
X_val = dv.transform(val_dict)
y_pred = rf.predict_proba(X_val)[:, 1]

# Validation AUC of the 10-tree forest (displayed as the cell output).
roc_auc_score(y_val, y_pred)


In [9]:
from IPython.display import display
import matplotlib.pyplot as plt

# Train one random forest per (max_depth, n_estimators) combination and record
# its AUC on the validation set.

scores = []
for d in (5, 10, 15):
    for n in range(10, 201, 20):
        rf = RandomForestClassifier(random_state=1, max_depth=d, n_estimators=n)
        rf.fit(X_train, y_train)

        y_pred = rf.predict_proba(X_val)[:, 1]
        auc = roc_auc_score(y_val, y_pred)
        scores.append((d, n, auc))

score_columns = ['max_depth', 'n_estimators', 'auc']
df_scores = pd.DataFrame(scores, columns=score_columns)
display(df_scores)

# Plot one AUC-vs-n_estimators curve per max_depth value.
for depth, curve in df_scores.groupby('max_depth'):
    plt.plot(curve['n_estimators'], curve['auc'], label=f'max_depth={depth}')

plt.legend()

In [10]:
best_max_depth = 10  # from previous graph


from IPython.display import display
import matplotlib.pyplot as plt

# With max_depth fixed at best_max_depth, train one random forest per
# (min_samples_leaf, n_estimators) combination and record its validation AUC.

scores = []
for s in [1, 3, 5, 10, 50]:
    for n in range(10, 201, 20):
        rf = RandomForestClassifier(n_estimators=n,
                                    max_depth=best_max_depth,
                                    min_samples_leaf=s,
                                    random_state=1)
        rf.fit(X_train, y_train)

        y_pred = rf.predict_proba(X_val)[:, 1]
        auc = roc_auc_score(y_val, y_pred)
        scores.append((s, n, auc))

df_scores = pd.DataFrame(scores, columns=['min_samples_leaf', 'n_estimators', 'auc'])
display(df_scores)

# Plot one AUC-vs-n_estimators curve per min_samples_leaf value.
for s in [1, 3, 5, 10, 50]:
    df_scores_subset = df_scores[df_scores.min_samples_leaf == s]
    plt.plot(df_scores_subset.n_estimators, df_scores_subset.auc, label='min_samples_leaf=%d' % s)

# Label the axes so the figure can be read on its own.
plt.xlabel('n_estimators')
plt.ylabel('validation AUC')
plt.legend()

In [11]:
# Hyper-parameters picked by eye from the plots above.
best_max_depth = 10         # from previous graph
best_min_samples_leaf = 3   # from previous graph
best_n_estimators = 200     # from all graphs above

# Fit the final forest with the chosen settings; fit() returns the estimator itself.
rf_final = RandomForestClassifier(
    random_state=1,
    n_estimators=best_n_estimators,
    max_depth=best_max_depth,
    min_samples_leaf=best_min_samples_leaf,
).fit(X_train, y_train)
rf_final



In [12]:
# XGboost
# !pip install xgboost
import xgboost as xgb

# Prepare the XGBoost data structures.
# get_feature_names_out() is the current DictVectorizer API; the array it returns
# is converted to a plain list of str for DMatrix.
features = dv.get_feature_names_out().tolist()
dtrain = xgb.DMatrix(X_train, label=y_train, feature_names=features)
dval = xgb.DMatrix(X_val, label=y_val, feature_names=features)

# Default xgboost params.
# The tree-depth key is 'max_depth'; the previous spelling 'max_debth' is not an
# XGBoost parameter, so the depth was never actually being set from this dict.
xgb_params = {
    'eta': 0.3,
    'max_depth': 6,
    'min_child_weight': 1,
    'objective': 'binary:logistic',
    'nthread': 8,
    'seed': 1,
    'verbosity': 1
}
xgb_model = xgb.train(xgb_params, dtrain, num_boost_round=10)

# With the 'binary:logistic' objective, predict() returns probabilities.
y_pred = xgb_model.predict(dval)

# Validation AUC after 10 boosting rounds (displayed as the cell output).
roc_auc_score(y_val, y_pred)




In [13]:
%%capture output
# captures stdout ^

# xbgoost auc:

watchlist = [(dtrain, 'train'), (dval, 'val')]

xgb_params = {
    'eta': 0.3,
    'max_debth': 6,
    'min_child_weight': 1,
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'nthread': 8,
    'seed':1,
    'verbosity':0
}
xgb_model = xgb.train(xgb_params, 
                      dtrain, 
                      evals=watchlist,
                      verbose_eval=5,
                      num_boost_round=200)


In [14]:
# parse xgboost AUC values from stdout
def parse_xgb_output(output):
    """Parse captured XGBoost evaluation lines into a DataFrame.

    A metric line has three tab-separated fields: the iteration number in
    square brackets, then two ``name:value`` fields, e.g.
    ``[5]<TAB>train-auc:0.92863<TAB>val-auc:0.80606``.

    Parameters
    ----------
    output : object with a ``stdout`` string attribute
        Typically the object produced by the ``%%capture`` cell magic.

    Returns
    -------
    pandas.DataFrame
        Columns ``num_iter`` (int), ``train_auc`` (float) and ``val_auc``
        (float), one row per metric line. Blank lines and lines that are not
        metric lines (for example warnings) are skipped, so empty output gives
        an empty frame with the same columns instead of raising.
    """
    columns = ["num_iter", "train_auc", "val_auc"]
    result = []

    for line in output.stdout.splitlines():
        fields = line.strip().split('\t')
        # Only lines of the form "[<iter>]\t<name>:<value>\t<name>:<value>" carry metrics.
        if len(fields) != 3 or not fields[0].startswith('['):
            continue

        num_iter, train_auc, val_auc = fields
        num_iter = int(num_iter.strip("[]"))
        train_auc = float(train_auc.split(":")[1])
        val_auc = float(val_auc.split(":")[1])
        result.append((num_iter, train_auc, val_auc))

    return pd.DataFrame(result, columns=columns)

# Turn the captured training log into a table of AUC per boosting round.
df_model_score = parse_xgb_output(output)

# Draw the train and validation AUC curves against the iteration number.
for column, label in (("train_auc", "train data"), ("val_auc", "val data")):
    plt.plot(df_model_score.num_iter, df_model_score[column], label=label)
plt.legend()



In [27]:
# best model: refit one model of each family with its tuned settings and
# compare their AUC on the validation set.

# decision tree (seeded so the reported AUC is the same on every run)
dt = DecisionTreeClassifier(max_depth=6, min_samples_leaf=15, random_state=1)
dt.fit(X_train, y_train)
y_pred = dt.predict_proba(X_val)[:, 1]
auc = roc_auc_score(y_val, y_pred)
display("DecisionTreeClassifier: %s" % (auc))

# random forest
rf = RandomForestClassifier(n_estimators=200,
                            max_depth=best_max_depth,
                            min_samples_leaf=best_min_samples_leaf,
                            random_state=1)
rf.fit(X_train, y_train)
y_pred = rf.predict_proba(X_val)[:, 1]
auc = roc_auc_score(y_val, y_pred)
display("RandomForestClassifier: %s" % (auc))

# gradient boosting
# The tree-depth key is 'max_depth'; the previous spelling 'max_debth' is not an
# XGBoost parameter, so the depth was never actually being set from this dict.
xgb_params = {
    'eta': 0.3,
    'max_depth': 6,
    'min_child_weight': 1,
    'objective': 'binary:logistic',
    'nthread': 8,
    'seed': 1,
    'verbosity': 0
}
xgb_model = xgb.train(xgb_params, dtrain, num_boost_round=10)
y_pred = xgb_model.predict(dval)
auc = roc_auc_score(y_val, y_pred)
display("Xgboost: %s" % (auc))