# Default of Credit Card Clients

# Decision tree type analysis

In [1]:
from joblib import Memory #cache purpose
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
# joblib cache rooted at ./default/cache_dir. `mem` is not referenced by the
# cells visible in this notebook export.
pwd = "./default/"
cache_dir = pwd + 'cache_dir'
mem = Memory(cache_dir)

# Every CSV written by this notebook goes below this directory.
# os.makedirs(..., exist_ok=True) creates all missing parents
# ('./output/' and './output/exclude/' included) and does nothing when the
# directory already exists, so the run does not depend on any part of the
# tree being present beforehand and no unrelated error is silently swallowed.
output_path = './output/exclude/DA_gboost_t_series/'
os.makedirs(output_path, exist_ok=True)


from data_loader_exclude import load_data_default

def to_csv(arr,name):
    """Save ``arr`` as ``<output_path>/<name>.csv``.

    ``arr`` is wrapped in a ``pandas.DataFrame`` before writing, so the
    file carries the default integer index and column labels.
    ``output_path`` is the module-level output directory; ``name`` is
    interpolated verbatim and may contain sub-directories (the callers in
    this file create those directories before calling).
    """
    pd.DataFrame(arr).to_csv(f'{output_path}/{name}.csv')
from pyts.transformation import BagOfPatterns as BOP

#extractors
from mods_defaults import PositivePerturb,ElementaryExtractor,BasicTransformer,MyMulPyts, NonTsPass,Passer #for pickle (cache)purpose

from sklearn.pipeline import Pipeline
#classifiers
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from lightgbm import LGBMClassifier
# pipeline
from sklearn.pipeline import make_union
#Gridsearch
from sklearn.model_selection import GridSearchCV

In [2]:
# Build the project data loader. Its yield_data() method supplies the
# (X_train, y_train, X_val, y_val) folds iterated over in the cells below.
# NOTE(review): the meaning of exclude=True is defined in
# data_loader_exclude and is not visible here — confirm there.
load_data = load_data_default(exclude=True)

In [3]:
# Module-level aliases for the loader's column-group attributes; they are
# handed to the extractors below as channel lists.
# NOTE(review): presumably each one is a list of column names — confirm
# against the loader.
L_BILL, L_PAY, L_USAGE = load_data.L_BILL, load_data.L_PAY, load_data.L_USAGE
L_DIFF, D_PAY = load_data.L_DIFF, load_data.D_PAY

In [4]:
# --- Augmentation and base transform ------------------------------------
# my_perturb is applied to the training folds in the augmented run only.
my_perturb = PositivePerturb(random_state=42, dup_num=3)
my_base_transform = BasicTransformer()

# --- Bag-of-patterns extractors -----------------------------------------
# Both use the same BOP settings and differ only in the channels they read.
# NOTE(review): the 5/6 in the names presumably refers to the series length
# of each channel group — confirm against the loader.
length_5_bop = MyMulPyts(
    estimator=BOP(
        window_size=3,
        word_size=3,
        n_bins=3,
        sparse=False,
        strategy='uniform',
    ),
    channel_list=[L_USAGE, L_DIFF],
)
length_6_bop = MyMulPyts(
    estimator=BOP(
        window_size=3,
        word_size=3,
        sparse=False,
        n_bins=3,
        strategy='uniform',
    ),
    channel_list=[D_PAY, L_BILL, L_PAY],
)

# --- Elementary extractors over the same channel groups -----------------
my_elementary_trans_6 = ElementaryExtractor([D_PAY, L_BILL, L_PAY])
my_elementary_trans_5 = ElementaryExtractor([L_USAGE, L_DIFF])

# --- Classifier ----------------------------------------------------------
clf = GradientBoostingClassifier(
    n_estimators=300,
    learning_rate=0.01,
    max_depth=2,
    max_features=1.0,
    min_samples_split=0.01,
    random_state=555,
)

# Log columns of the base transformer, without SEX, EDUCATION and MARRIAGE.
columns = [
    name
    for name in my_base_transform.log_columns
    if name not in ('SEX', 'EDUCATION', 'MARRIAGE')
]

# --- Full pipeline: base transform -> feature union -> classifier -------
extractors = [
    NonTsPass(),
    Passer(columns),
    length_5_bop,
    length_6_bop,
    my_elementary_trans_5,
    my_elementary_trans_6,
]
extractor = make_union(*extractors)
# The step name 'transfrom' (sic) is a runtime key and is kept unchanged.
pipe = Pipeline(steps=[
    ('transfrom', my_base_transform),
    ('ext', extractor),
    ('clf', clf),
])

In [11]:
# Augmented run: for every fold, fit the pipeline on the perturbed training
# data and store predictions under <output_path>/aug_3/<fold>/.
# os.makedirs(..., exist_ok=True) creates the directory (and any missing
# parent) and is a no-op when it already exists.
os.makedirs(output_path + '/aug_3/', exist_ok=True)
for num, (X_train, y_train, X_val, y_val) in enumerate(load_data.yield_data()):
    print(X_val.shape)
    # Keep the un-augmented training set: train predictions are made on it,
    # not on the perturbed data the model is fitted on.
    X_train_original = X_train.copy()
    X_train, y_train = my_perturb.transform(X_train, y_train)
    pipe.fit(X_train, y_train)

    # Hard predictions for train/validation plus the validation score taken
    # from column 1 of predict_proba.
    pred_train = pipe.predict(X_train_original)
    pred_val = pipe.predict(X_val)
    score_val = pipe.predict_proba(X_val)[:, 1]

    os.makedirs(output_path + '/aug_3/' + str(num), exist_ok=True)
    to_csv(pred_train, f'/aug_3/{num}/pred_train')
    to_csv(pred_val, f'/aug_3/{num}/pred_val')
    to_csv(score_val, f'/aug_3/{num}/score_val')

# my_function(pred_train,pred_val,score_val,'GradientBoost')

In [12]:
# Non-augmented run: for every fold, fit the pipeline on the training data
# as yielded by the loader (no perturbation) and store predictions under
# <output_path>/non_aug/<fold>/.
# os.makedirs(..., exist_ok=True) creates the directory (and any missing
# parent) and is a no-op when it already exists.
os.makedirs(output_path + '/non_aug/', exist_ok=True)
for num, (X_train, y_train, X_val, y_val) in enumerate(load_data.yield_data()):
    # Separate copy of the training set, used for the train predictions.
    X_train_original = X_train.copy()
    pipe.fit(X_train, y_train)

    # Hard predictions for train/validation plus the validation score taken
    # from column 1 of predict_proba.
    pred_train = pipe.predict(X_train_original)
    pred_val = pipe.predict(X_val)
    score_val = pipe.predict_proba(X_val)[:, 1]

    os.makedirs(output_path + '/non_aug/' + str(num), exist_ok=True)
    to_csv(pred_train, f'/non_aug/{num}/pred_train')
    to_csv(pred_val, f'/non_aug/{num}/pred_val')
    to_csv(score_val, f'/non_aug/{num}/score_val')

# my_function(pred_train,pred_val,score_val,'GradientBoost')

In [13]:
# evaluation modules
from sklearn.metrics import confusion_matrix , ConfusionMatrixDisplay,RocCurveDisplay
from sklearn.metrics import precision_score, recall_score,f1_score, accuracy_score,roc_auc_score
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import PrecisionRecallDisplay
from data_loader_default import load_data_default


def my_score(y_train,y_pred):
    """Print and return accuracy, precision, recall and F1 for predictions.

    Args:
        y_train: Ground-truth labels (despite the name, callers in this
            file pass validation labels).
        y_pred: Predicted labels to compare against ``y_train``.

    Returns:
        ``[accuracy, precision, recall, f1]``, each rounded to 4 decimals.
        The unrounded values are printed, one per line, as a side effect.
    """
    # (printed prefix, metric function), evaluated and printed in this order.
    metrics = (
        ('accuracy : ', accuracy_score),
        ('precision : ', precision_score),
        ('recall : ', recall_score),
        ('f1_score: ', f1_score),
    )
    rounded = []
    for prefix, metric in metrics:
        value = metric(y_train, y_pred)
        print(f'{prefix}{value}')
        rounded.append(round(value, 4))
    return rounded



def plot_precision_recall_vs_threshold(precisions, recalls, thresholds,ax=None):
    """Plot precision and recall as functions of the decision threshold.

    Args:
        precisions: Precision values; the last element is dropped so the
            sequence lines up with ``thresholds``.
        recalls: Recall values; the last element is dropped likewise.
        thresholds: Threshold values used as the x axis.
        ax: Axes-like object to draw on. When ``None`` the current pyplot
            axes (``plt.gca()``) are used, which is what the pyplot-level
            ``plt.plot`` / ``plt.xlabel`` / ``plt.legend`` / ``plt.ylim``
            calls operate on as well.

    Returns:
        None. The curves, x label, legend and y limits are set on the axes.
    """
    # Resolve the target once instead of duplicating every call for the
    # pyplot and the explicit-axes case.
    if ax is None:
        ax = plt.gca()
    ax.plot(thresholds, precisions[:-1], 'b--', label="Precision")
    ax.plot(thresholds, recalls[:-1], 'g-', label="recall")
    ax.set_xlabel("thresholds")
    ax.legend(loc="upper left")
    ax.set_ylim([0, 1])


# Score the augmented run: reload each fold's stored predictions and compare
# them with that fold's validation labels.
# NOTE(review): this assumes yield_data() yields the same folds, in the same
# order, as when the predictions were written — confirm in the loader.

# Placeholder bindings; the loop below rebinds all four names.
X_train, y_train, X_val, y_val = 0, 0, 0, 0
score_dataframes = []
for num, (X_train, y_train, X_val, y_val) in enumerate(load_data.yield_data()):
    pred_train = pd.read_csv(f'{output_path}/aug_3/{num}/pred_train.csv', index_col=0)
    pred_val = pd.read_csv(f'{output_path}/aug_3/{num}/pred_val.csv', index_col=0)
    score_val = pd.read_csv(f'{output_path}/aug_3/{num}/score_val.csv', index_col=0)
    # One Series per fold: the four metrics from my_score plus ROC AUC.
    scores = pd.Series(
        [*my_score(y_val, pred_val), round(roc_auc_score(y_val, score_val), 4)],
        index=['acc', 'prec', 'rec', 'f1', 'roc_auc'],
    )
    score_dataframes.append(scores)

# Rows = folds, columns = metrics.
evals_frame = pd.concat(score_dataframes, axis=1).T
evals_frame.to_csv(f'{output_path}/evals.csv')


accuracy : 0.7788888888888889
precision : 0.0
recall : 0.0
f1_score: 0.0
accuracy : 0.7788888888888889
precision : 0.0
recall : 0.0
f1_score: 0.0
accuracy : 0.7788888888888889
precision : 0.0
recall : 0.0
f1_score: 0.0
accuracy : 0.7786666666666666
precision : 0.0
recall : 0.0
f1_score: 0.0
accuracy : 0.7786666666666666
precision : 0.0
recall : 0.0
f1_score: 0.0


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [14]:
# Display the per-fold metrics table built from the aug_3 predictions.
evals_frame

Unnamed: 0,acc,prec,rec,f1,roc_auc
0,0.7789,0.0,0.0,0.0,0.5
1,0.7789,0.0,0.0,0.0,0.5
2,0.7789,0.0,0.0,0.0,0.5
3,0.7787,0.0,0.0,0.0,0.5
4,0.7787,0.0,0.0,0.0,0.5


In [15]:
# Score the non-augmented run: reload each fold's stored predictions and
# compare them with that fold's validation labels.
# NOTE(review): this assumes yield_data() yields the same folds, in the same
# order, as when the predictions were written — confirm in the loader.
score_dataframes = []
for num, (X_train, y_train, X_val, y_val) in enumerate(load_data.yield_data()):
    pred_train = pd.read_csv(f'{output_path}/non_aug/{num}/pred_train.csv', index_col=0)
    pred_val = pd.read_csv(f'{output_path}/non_aug/{num}/pred_val.csv', index_col=0)
    score_val = pd.read_csv(f'{output_path}/non_aug/{num}/score_val.csv', index_col=0)
    # One Series per fold: the four metrics from my_score plus ROC AUC.
    scores = pd.Series(
        [*my_score(y_val, pred_val), round(roc_auc_score(y_val, score_val), 4)],
        index=['acc', 'prec', 'rec', 'f1', 'roc_auc'],
    )
    score_dataframes.append(scores)

# Rows = folds, columns = metrics.
evals_frame = pd.concat(score_dataframes, axis=1).T
evals_frame.to_csv(f'{output_path}raw_evals.csv')


accuracy : 0.7788888888888889
precision : 0.0
recall : 0.0
f1_score: 0.0
accuracy : 0.7788888888888889
precision : 0.0
recall : 0.0
f1_score: 0.0
accuracy : 0.7788888888888889
precision : 0.0
recall : 0.0
f1_score: 0.0
accuracy : 0.7786666666666666
precision : 0.0
recall : 0.0
f1_score: 0.0
accuracy : 0.7786666666666666
precision : 0.0
recall : 0.0
f1_score: 0.0


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# Display the per-fold metrics table built from the non_aug predictions.
evals_frame