In [12]:
import os
import pandas as pd
import numpy as np
import pickle
from sklearn.preprocessing import label_binarize
from sklearn.metrics import confusion_matrix,accuracy_score, recall_score, precision_score, f1_score,classification_report 
from sklearn.metrics import roc_curve, auc


In [13]:
def make_auc_dict(model_name, clf, labels, y_test, y_pred, y_pred_prob):

    dict_auc={'model':[], 'class':[], 'auc':[]}

    # 1이면, 1/2이면 각각의 auc score

    if len(labels)==2:

        #class_probabilities = model.predict_proba(X_test)
#        preds = y_pred_prob[:, 1]

        fpr, tpr, threshold = roc_curve(y_test, y_pred)
        roc_auc = auc(fpr, tpr)

        dict_auc['model'].append(model_name)
        dict_auc['class'].append(0)
        dict_auc['auc'].append(roc_auc)

        dict_auc['model'].append(model_name)
        dict_auc['class'].append(1)
        dict_auc['auc'].append(roc_auc)

    else:
        #def plot_multiclass_roc(clf, X_test, y_test, n_classes, figsize=(17, 6)):
       ## y_score = ### model.predict_proba(X_test)

        #Binarize the output
        y_test_bin = label_binarize(y_test, classes=sorted(labels))
        n_classes = y_test_bin.shape[1]

        fpr = dict()
        tpr = dict()
        roc_auc = dict()
                                    
        for i in range(n_classes):
            fpr[i], tpr[i], _ = roc_curve(y_test_bin[:, i], y_pred_prob[:, i])  #y_score[:, i]
            dict_auc['model'].append(model_name)
            dict_auc['class'].append(i)
            dict_auc['auc'].append(auc(fpr[i], tpr[i]))

    return pd.DataFrame(dict_auc)

In [14]:
def make_finalreport():

    folders=[x for x in os.listdir("../result") if'ft' in x] 

    for i, folder in enumerate(folders):
    #    print(folder)
        reports=[x for x in os.listdir("../result/{}".format(folder)) if 'creport' in x]
        models=[x for x in os.listdir("../result/{}".format(folder)) if 'clf' in x]
        ft_criteria=folder.split('_')[1]
        ft_threshold=folder.split('_')[2]


        path='../result/{}'.format(folder)

        for k, report in enumerate(reports):
            with open (path+'/{}'.format(report), 'rb') as f:
                df=pickle.load(f)
            if k==0:
                df_report=df
            else:    
                df_report=pd.concat([df_report, df])

        df_report['ft_criteria']=ft_criteria    
        df_report['ft_threshold']=ft_threshold

        for j, model in enumerate(models):
    #        print(model)
            model_name=model.split('.')[0]
            dtype=model.split('_')[4]
            y_type=model.split('_')[2]+'_'+model.split('_')[3]
            test_year=5

            with open (path+'/{}'.format(model), 'rb') as f:
                [clf, df_cvresult, y_pred, y_pred_prob, clf_report]=pickle.load(f)


            if dtype=='tr':
                X=pd.read_csv('../data/X_data_tr.csv', index_col='date', parse_dates=True)
                y=pd.read_csv('../data/y_data_tr.csv', index_col='date', parse_dates=True)
            else:
                X=pd.read_csv('../data/X_data.csv', index_col='date', parse_dates=True)
                y=pd.read_csv('../data/y_data.csv', index_col='date', parse_dates=True)

            labels=list(y[y_type].unique())    
            y_test=y[y_type][-(test_year*12):]

    ##        print(labels, y_type)

            df_tmp=make_auc_dict(model_name, clf, labels, y_test, y_pred, y_pred_prob)
    ##        print('auc df completed')
    ##        print(df_tmp.head())

            if j==0:
                df_auc=df_tmp
            else:
                df_auc=pd.concat([df_auc, df_tmp])

        df_report=df_report.merge(df_auc, left_on=['class','model'], right_on=['class','model'])

        if i==0:
            df_final_report=df_report
        else:
            df_final_report=pd.concat([df_final_report, df_report])
            
    return df_final_report            

In [15]:
df_final_report=make_finalreport()

In [20]:

df=df_final_report.groupby(['ft_criteria','ft_threshold','model']).mean() ##[['auc']].reset_index()
df_selection=df.reset_index().sort_values(['recall','precision','auc'], ascending=[False, False, False])[:10]
df_selection.head(10)
# df3=df_final_report.groupby(['ft_criteria','ft_threshold','model']).mean() ##[['auc']].reset_index()
# df3.reset_index().sort_values(['recall','f1_score','precision','accuracy'], ascending=[False, False, False, False])[:50]

# df1=df_final_report.groupby(['ft_criteria','ft_threshold','model']).max('class') ##[['auc']].reset_index()
# df1.reset_index().sort_values(['auc','recall','precision','accuracy'], ascending=[False, False, False, False])[:30]


Unnamed: 0,ft_criteria,ft_threshold,model,class,precision,recall,f1_score,support,accuracy,auc
10,,0.2,clf_rf_y_agg_tr_block_t5_spl3,1.0,0.566667,0.94,0.633333,20.0,0.83,0.955229
38,,0.4,clf_knn_y_oecd_tr_block_t5_spl3,0.5,0.645,0.91,0.67,30.0,0.83,0.910714
39,,0.4,clf_knn_y_oecd_tr_tss_t5_spl3,0.5,0.645,0.91,0.67,30.0,0.83,0.910714
54,,0.5,clf_knn_y_oecd_tr_block_t5_spl3,0.5,0.635,0.9,0.655,30.0,0.82,0.901786
55,,0.5,clf_knn_y_oecd_tr_tss_t5_spl3,0.5,0.635,0.9,0.655,30.0,0.82,0.901786
77,cum,0.3,clf_rf_y_oecd_ntr_tss_t5_spl3,0.5,0.865,0.865,0.865,30.0,0.97,0.866071
62,,0.5,clf_rf_y_oecd_tr_block_t5_spl3,0.5,0.595,0.85,0.57,30.0,0.72,0.848214
63,,0.5,clf_rf_y_oecd_tr_tss_t5_spl3,0.5,0.595,0.85,0.57,30.0,0.72,0.848214
29,,0.3,clf_rf_y_oecd_ntr_tss_t5_spl3,0.5,0.575,0.795,0.5,30.0,0.62,0.794643
45,,0.4,clf_rf_y_oecd_ntr_tss_t5_spl3,0.5,0.575,0.795,0.5,30.0,0.62,0.794643


In [31]:
df_selection.iloc[:5][['ft_criteria','ft_threshold','model']]

Unnamed: 0,ft_criteria,ft_threshold,model
10,,0.2,clf_rf_y_agg_tr_block_t5_spl3
38,,0.4,clf_knn_y_oecd_tr_block_t5_spl3
39,,0.4,clf_knn_y_oecd_tr_tss_t5_spl3
54,,0.5,clf_knn_y_oecd_tr_block_t5_spl3
55,,0.5,clf_knn_y_oecd_tr_tss_t5_spl3


In [38]:
criteria=df_selection.iloc[0].ft_criteria
threshold=df_selection.iloc[0].ft_threshold
model=df_selection.iloc[0].model

with open('../result/ft_{}_{}/{}.pkl'.format(criteria, threshold, best_model), 'rb') as f:
    [clf, df_cvresult, y_pred, y_pred_prob, clf_report]=pickle.load(f)

best_model=clf.best_estimator_

{'class_weight': None,
 'max_features': 0.2,
 'n_estimators': 50,
 'warm_start': True,
 'model': 'RF',
 'data': 'tr',
 'y': 'y_agg',
 'cv': 'block',
 'mean_test_score': 0.6325757575757577,
 'rank_test_score': 1}