# Default of Credit Card Clients

# Analysis with decision-tree-based models

In [2]:
from joblib import Memory #cache purpose
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
# Folder for the default-of-credit-card-clients material; the joblib cache
# directory is placed directly inside it.
pwd = "./default/"
cache_dir = f'{pwd}cache_dir'
# joblib Memory object backed by that on-disk cache directory.
mem = Memory(cache_dir)
# Folder that receives every CSV written by this notebook.
output_path = './output/lgbm_grd_search/'
# makedirs creates missing parent folders and tolerates an existing directory,
# so no blanket try/except is needed and real errors (e.g. permissions) surface.
os.makedirs(output_path, exist_ok=True)
from data_loader_default import load_data_default

def to_csv(arr,name):
    """Write ``arr`` to ``<output_path>/<name>.csv`` through a pandas DataFrame.

    Parameters
    ----------
    arr : array-like
        Anything ``pd.DataFrame`` accepts (e.g. a 1-D array of predictions).
    name : str
        File name without the ``.csv`` suffix. It may include sub-folders
        (e.g. ``'0/pred_val'``); a missing parent folder is created first.

    Notes
    -----
    Reads the notebook-level ``output_path`` as the root folder.
    """
    target = os.path.join(output_path, f'{name}.csv')
    parent = os.path.dirname(target)
    if parent:
        # Create the destination folder so a nested ``name`` does not fail.
        os.makedirs(parent, exist_ok=True)
    pd.DataFrame(arr).to_csv(target)
from pyts.transformation import BagOfPatterns as BOP

#extractors
from mods_defaults import ElementaryExtractor,BasicTransformer,MyMulPyts, NonTsPass,Passer #for pickle (cache)purpose

from sklearn.pipeline import Pipeline
#classifiers
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from lightgbm import LGBMClassifier
# pipeline
from sklearn.pipeline import make_union
# evaluation modules
from sklearn.metrics import confusion_matrix , ConfusionMatrixDisplay,RocCurveDisplay
from sklearn.metrics import precision_score, recall_score,f1_score, accuracy_score
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import PrecisionRecallDisplay
from sklearn.model_selection import GridSearchCV
def my_score(y_train,y_pred):
    """Print and return accuracy, precision, recall and F1 for one prediction set.

    Parameters
    ----------
    y_train : true labels (despite the name, any label set can be passed).
    y_pred : predicted labels aligned with ``y_train``.

    Returns
    -------
    list
        ``[accuracy, precision, recall, f1]``, each rounded to 4 decimals.
    """
    # (printed prefix, metric function) pairs, in the order they are reported.
    labelled_metrics = (
        ('accuracy : ', accuracy_score),
        ('precision : ', precision_score),
        ('recall : ', recall_score),
        ('f1_score: ', f1_score),
    )
    rounded = []
    for prefix, metric in labelled_metrics:
        value = metric(y_train,y_pred)
        print(f'{prefix}{value}')
        rounded.append(round(value,4))

    return rounded

def plot_precision_recall_vs_threshold(precisions, recalls, thresholds,ax=None):
    """Draw precision and recall as functions of the decision threshold.

    Parameters
    ----------
    precisions, recalls : sequences one element longer than ``thresholds``;
        the last element of each is dropped before plotting.
    thresholds : x values shared by both curves.
    ax : axes to draw on; when ``None`` the current pyplot axes are used.
    """
    target = plt.gca() if ax is None else ax
    # (values, line format, legend label) for each curve, precision first.
    curves = (
        (precisions, 'b--', "Precision"),
        (recalls, 'g-', "recall"),
    )
    for values, fmt, legend_label in curves:
        target.plot(thresholds, values[:-1], fmt, label=legend_label)
    target.set_xlabel("thresholds")
    target.legend(loc="upper left")
    target.set_ylim([0,1])

In [5]:
# First pipeline stage: the project's BasicTransformer, built with an empty string argument.
my_base_transform = BasicTransformer('')
# LightGBM classifier with a fixed seed; its other settings are left at their defaults.
clf = LGBMClassifier(random_state=42)
# Every raw column except the three demographic ones.
columns = [
    column
    for column in my_base_transform.raw_columns
    if column not in ('SEX', 'EDUCATION', 'MARRIAGE')
]
# Two feature extractors combined side by side with make_union.
# NOTE(review): presumably NonTsPass handles the non-time-series columns and
# Passer forwards the listed ones unchanged - confirm in mods_defaults.
extractors = [NonTsPass(), Passer(columns)]
extractor = make_union(*extractors)
# The step name 'transfrom' (sic) is part of the pipeline's parameter names,
# so it is kept exactly as spelled.
pipe = Pipeline(steps=[
    ('transfrom', my_base_transform),
    ('ext', extractor),
    ('clf', clf),
])

```python
def my_function(pred_train,pred_val,score_val,name):
    """Print train/validation scores and draw the diagnostic figure for one model.

    The true labels are read from the notebook-level variables ``y_train`` and
    ``y_val``, so those must hold the labels matching the predictions passed in.

    Parameters
    ----------
    pred_train : predicted labels for the training set.
    pred_val : predicted labels for the validation set.
    score_val : validation scores used for the ROC, PR and threshold curves.
    name : text used as the figure's overall title.

    The figure is a 2x3 grid: PR curve, ROC curve and precision/recall versus
    threshold on the top row; train and validation confusion matrices below.
    """
    cf_matrix_t = confusion_matrix(y_train,pred_train)
    cf_matrix_v = confusion_matrix(y_val,pred_val)
    print(cf_matrix_v)
    print('\n'+'train scores')
    my_score(y_train,pred_train)
    print('\n'+'Val scores')
    # The header used to be printed with no scores after it.
    my_score(y_val,pred_val)

    fig, axes = plt.subplots(2,3,figsize=(30,20))
    RocCurveDisplay.from_predictions(y_val,score_val,ax=axes[0,1])
    PrecisionRecallDisplay.from_predictions(y_val,score_val,ax=axes[0,0])
    ConfusionMatrixDisplay(cf_matrix_t).plot(cmap=plt.cm.Blues,ax=axes[1,0])
    ConfusionMatrixDisplay(cf_matrix_v).plot(cmap=plt.cm.Blues,ax=axes[1,1])
    axes[0,1].set_title('ROC')
    axes[0,0].set_title('PR_curve')
    axes[1,0].set_title('train data')
    axes[1,1].set_title('validation data')
    precisions , recalls , thresholds = precision_recall_curve(y_val, score_val)
    plot_precision_recall_vs_threshold(precisions, recalls, thresholds,ax=axes[0,2])
    axes[0,2].set_title('PR vs threshold')
    # The sixth panel is never drawn on; hide its empty frame.
    axes[1,2].axis('off')
    fig.suptitle(f'{name}')
    plt.show()
    # Close this specific figure so repeated calls do not accumulate figures.
    plt.close(fig)
```

```python
# List every parameter of the pipeline; the 'clf__*' keys shown here are the
# names the grid search's param_grid refers to.
pipe.get_params()
```

```python
 'clf__boosting_type': 'gbdt',
 'clf__class_weight': None,
 'clf__colsample_bytree': 1.0,
 'clf__importance_type': 'split',
 'clf__learning_rate': 0.1,
 'clf__max_depth': -1,
 'clf__min_child_samples': 20,
 'clf__min_child_weight': 0.001,
 'clf__min_split_gain': 0.0,
 'clf__n_estimators': 100,
 'clf__n_jobs': -1,
 'clf__num_leaves': 31,
 'clf__objective': None,
 'clf__random_state': 42,
 'clf__reg_alpha': 0.0,
 'clf__reg_lambda': 0.0,
 'clf__silent': 'warn',
 'clf__subsample': 1.0,
 'clf__subsample_for_bin': 200000,
 'clf__subsample_freq': 0}
```

In [11]:
# Hyper-parameter grid for the LightGBM step of the pipeline ('clf__' prefix).
# NOTE(review): the grid is a full cross product of several hundred thousand
# candidates, each cross-validated - confirm this run time is intended.
# NOTE(review): 'rf' boosting normally needs bagging settings
# (subsample / subsample_freq); confirm those candidates actually fit.
# NOTE(review): random_state is searched like a hyper-parameter, which selects
# a lucky seed rather than a better model - confirm this is deliberate.
param_grid = {
 'clf__boosting_type': ['gbdt','dart','rf'],
 'clf__learning_rate' : [1,0.5,0.1,0.01,0.001],
 'clf__max_depth':[1,2,3,6,10,-1],
 'clf__min_child_samples': [20,30,50,100],
 'clf__n_estimators': [300],
 'clf__num_leaves': [31,20,10,7,3],
 'clf__random_state':[42+i for i in range(0,100,7)],
 'clf__reg_alpha': [0.0,0.1,0.05,0.3],
 'clf__reg_lambda': [0.0,0.1,0.05,0.3],
}
search = GridSearchCV(pipe,
                      param_grid=param_grid,
                      scoring='accuracy',
                      n_jobs=-1)


load_data = load_data_default()
# One grid search per split yielded by the loader. y_train / y_val stay bound
# at notebook level after the loop, which is what my_function reads.
for num,(X_train ,y_train, X_val, y_val) in enumerate(load_data.yield_data()):
    search.fit(X_train,y_train)
    #pd.DataFrame(search.cv_results_).to_csv(output_path+'grd_cv_results.csv')
    # With the default refit=True, best_estimator_ has already been fitted on
    # the whole of X_train, so it is used directly without a second fit.
    pipe_new = search.best_estimator_
    print(f'{num}-th fold \nbest estimator is \n{pipe_new}')
    #         predictions
    pred_train = pipe_new.predict(X_train)
    pred_val = pipe_new.predict(X_val)
    # Score of the second class column, used for ROC / PR curves.
    score_val = pipe_new.predict_proba(X_val)[:,1]
    # Per-fold output folder; exist_ok makes re-runs safe without hiding errors.
    os.makedirs(os.path.join(output_path, str(num)), exist_ok=True)
    # Write into the fold's own folder (previously the literal folder 'num').
    to_csv(pred_train,f'{num}/pred_train')
    to_csv(pred_val,f'{num}/pred_val')
    to_csv(score_val,f'{num}/score_val')

# my_function(pred_train,pred_val,score_val,'GradientBoost')