In [14]:
import os
import pandas as pd
import numpy as np
import pickle
from sklearn.preprocessing import label_binarize
from sklearn.metrics import confusion_matrix,accuracy_score, recall_score, precision_score, f1_score,classification_report 
from sklearn.metrics import roc_curve, auc

import model_team14 
from model_team14 import *

In [25]:
def make_auc_dict(model_name, clf, labels, y_test, y_pred, y_pred_prob):
    """Build a per-class AUC table for a single fitted model.

    Parameters
    ----------
    model_name : str
        Identifier written into the ``model`` column of every output row.
    clf : object
        Not used by this function; kept so existing callers keep working.
    labels : sequence
        Distinct class labels of the target. Exactly two labels selects the
        binary branch, anything else the one-vs-rest branch.
    y_test : array-like
        True labels of the evaluation window.
    y_pred : array-like
        Predicted labels; only read in the binary branch.
    y_pred_prob : 2-D array or tensor
        Per-class scores indexed as ``y_pred_prob[:, i]``; only read in the
        one-vs-rest branch. Objects exposing ``detach()`` (e.g. torch tensors)
        are converted to NumPy first.

    Returns
    -------
    pandas.DataFrame
        Columns ``model``, ``class`` and ``auc``, one row per class.
    """
    rows = {'model': [], 'class': [], 'auc': []}

    def _record(class_id, score):
        # Append one (model, class, auc) row to the accumulator.
        rows['model'].append(model_name)
        rows['class'].append(class_id)
        rows['auc'].append(score)

    if len(labels) == 2:
        # Binary target: a single ROC curve, reported for both class 0 and 1.
        # NOTE(review): the curve is built from the hard predictions `y_pred`,
        # not from `y_pred_prob[:, 1]`, so the value only reflects one
        # operating point. Left unchanged on purpose -- confirm whether
        # score-based AUC is wanted here.
        fpr, tpr, _ = roc_curve(y_test, y_pred)
        binary_auc = auc(fpr, tpr)
        for class_id in (0, 1):
            _record(class_id, binary_auc)
    else:
        # Multiclass target: one-vs-rest AUC for each binarized label column.
        if hasattr(y_pred_prob, 'detach'):
            # Tensor-like scores: detect by capability instead of by the
            # substring 'lstm' in the model name, and convert once up front.
            y_pred_prob = y_pred_prob.detach().cpu().numpy()

        y_test_bin = label_binarize(y_test, classes=sorted(labels))
        for class_id in range(y_test_bin.shape[1]):
            fpr, tpr, _ = roc_curve(y_test_bin[:, class_id], y_pred_prob[:, class_id])
            # `class` holds the column position in the sorted label list.
            _record(class_id, auc(fpr, tpr))

    return pd.DataFrame(rows)

In [26]:
def _load_pickle(path):
    """Return the object unpickled from ``path``."""
    # NOTE: unpickling can execute arbitrary code -- only load trusted artefacts.
    with open(path, 'rb') as fh:
        return pickle.load(fh)


def make_finalreport(result_dir='../result', data_dir='../data', test_year=5):
    """Merge stored classification reports with per-class AUC for every run.

    Every directory under ``result_dir`` whose name contains ``'ft'`` is
    expected to be named ``ft_<criteria>_<threshold>[_nn]`` and to hold
    pickled report frames (file name contains ``'creport'``) and pickled
    model bundles (file name contains ``'clf'``) of the form
    ``[clf, df_cvresult, y_pred, y_pred_prob, clf_report]``.

    Model files are parsed as ``clf_<algo>_<y part>_<y part>_<dtype>_...``;
    the two ``y`` parts joined by ``'_'`` name the target column. Models in
    ``*_nn`` folders, and models whose fifth name part is ``'tr'``, are
    scored against ``y_data_tr.csv``; all others against ``y_data.csv``.

    Parameters
    ----------
    result_dir : str
        Directory holding the ``ft_*`` result folders.
    data_dir : str
        Directory holding ``y_data_tr.csv`` and ``y_data.csv``.
    test_year : int
        Length of the evaluation window; the last ``test_year * 12`` rows of
        the target column are used as ``y_test``.

    Returns
    -------
    pandas.DataFrame
        The per-folder reports (with ``ft_criteria``, ``ft_threshold`` and
        ``auc`` columns added) stacked together. Folders lacking either
        reports or models are skipped; an empty frame is returned when
        nothing could be collected.
    """
    target_cache = {}

    def _targets(transformed):
        # Read each target CSV at most once instead of once per model.
        if transformed not in target_cache:
            file_name = 'y_data_tr.csv' if transformed else 'y_data.csv'
            target_cache[transformed] = pd.read_csv(
                os.path.join(data_dir, file_name), index_col='date', parse_dates=True
            )
        return target_cache[transformed]

    folder_reports = []
    for folder in os.listdir(result_dir):
        path = os.path.join(result_dir, folder)
        if 'ft' not in folder or not os.path.isdir(path):
            continue

        entries = os.listdir(path)
        report_files = [name for name in entries if 'creport' in name]
        model_files = [name for name in entries if 'clf' in name]
        if not report_files or not model_files:
            # Nothing to merge; previously this either raised on unbound
            # names or silently reused the frames of the preceding folder.
            continue

        folder_parts = folder.split('_')
        ft_criteria, ft_threshold = folder_parts[1], folder_parts[2]
        is_nn = folder_parts[-1] == 'nn'

        df_report = pd.concat(
            [_load_pickle(os.path.join(path, name)) for name in report_files]
        ).assign(ft_criteria=ft_criteria, ft_threshold=ft_threshold)

        auc_frames = []
        for model in model_files:
            name_parts = model.split('_')
            model_name = model.split('.')[0]
            y_type = name_parts[2] + '_' + name_parts[3]
            # Neural-network runs always use the transformed targets; for the
            # rest the fifth name part states which data set was used.
            transformed = is_nn or name_parts[4] == 'tr'

            clf, _cvresult, y_pred, y_pred_prob, _clf_report = _load_pickle(
                os.path.join(path, model)
            )

            y = _targets(transformed)
            labels = list(y[y_type].unique())
            y_test = y[y_type][-(test_year * 12):]

            auc_frames.append(
                make_auc_dict(model_name, clf, labels, y_test, y_pred, y_pred_prob)
            )

        df_auc = pd.concat(auc_frames)
        folder_reports.append(df_report.merge(df_auc, on=['class', 'model']))

    if not folder_reports:
        return pd.DataFrame()
    return pd.concat(folder_reports)

In [27]:
# Build the combined report: stored classification reports merged with the
# per-class AUC table for every result folder found by make_finalreport().
df_final_report=make_finalreport()

In [30]:
# Rank (feature-selection setting, model) combinations by their class-averaged
# metrics: recall first, then precision, then AUC (all descending).
group_keys = ['ft_criteria', 'ft_threshold', 'model']
rank_by = ['recall', 'precision', 'auc']

# Ranking averaged over every class. Kept under its own name so it is no
# longer computed and then immediately overwritten by the ranking below.
df_selection_all = (
    df_final_report
    .groupby(group_keys)
    .mean(numeric_only=True)  # explicit: pandas >= 2 raises on non-numeric columns
    .reset_index()
    .sort_values(rank_by, ascending=False)
    .head(10)
)

# Ranking restricted to rows with class > 0; this is the selection the
# following cells work with.
df = (
    df_final_report[df_final_report['class'] > 0]
    .groupby(group_keys)
    .mean(numeric_only=True)
)
df_selection = df.reset_index().sort_values(rank_by, ascending=False).head(10)
df_selection



# df3=df_final_report.groupby(['ft_criteria','ft_threshold','model']).mean() ##[['auc']].reset_index()
# df3.reset_index().sort_values(['recall','f1_score','precision','accuracy'], ascending=[False, False, False, False])[:50]

# df1=df_final_report.groupby(['ft_criteria','ft_threshold','model']).max('class') ##[['auc']].reset_index()
# df1.reset_index().sort_values(['auc','recall','precision','accuracy'], ascending=[False, False, False, False])[:30]


Unnamed: 0,ft_criteria,ft_threshold,model,class,precision,recall,f1_score,support,accuracy,auc
17,,0.2,clf_lstm_y_oecd_18_b5_ep300_h50_n1_dr0,1.0,0.4,1.0,0.57,4.0,0.9,0.946429
20,,0.2,clf_lstm_y_oecd_24_b5_ep300_h100_n1_dr0,1.0,0.36,1.0,0.53,4.0,0.88,0.9375
26,,0.2,clf_rf_y_agg_tr_block_t5_spl3,1.5,0.35,1.0,0.5,2.0,0.83,0.974138
18,,0.2,clf_lstm_y_oecd_18_b5_ep500_h100_n1_dr0,1.0,0.33,1.0,0.5,4.0,0.87,0.928571
19,,0.2,clf_lstm_y_oecd_18_b5_ep500_h50_n1_dr0,1.0,0.33,1.0,0.5,4.0,0.87,0.928571
48,,0.3,clf_lstm_y_oecd_18_b5_ep300_h100_n1_dr0,1.0,0.33,1.0,0.5,4.0,0.87,0.928571
23,,0.2,clf_lstm_y_oecd_24_b5_ep500_h50_n1_dr0,1.0,0.29,1.0,0.44,4.0,0.83,0.910714
70,,0.4,clf_knn_y_oecd_tr_block_t5_spl3,1.0,0.29,1.0,0.44,4.0,0.83,0.910714
71,,0.4,clf_knn_y_oecd_tr_tss_t5_spl3,1.0,0.29,1.0,0.44,4.0,0.83,0.910714
82,,0.4,clf_lstm_y_oecd_18_b5_ep500_h100_n1_dr0,1.0,0.29,1.0,0.44,4.0,0.83,0.910714


In [32]:
# Tag each candidate with its algorithm family, i.e. the second
# underscore-separated token of the model name (e.g. "lstm", "rf", "knn").
# `assign` builds a new frame instead of writing a column into the sliced
# selection, which avoids SettingWithCopyWarning and keeps the cell re-runnable.
df_selection = df_selection.assign(
    model_type=df_selection['model'].str.split('_').str[1]
)
df_selection.head(10)

Unnamed: 0,ft_criteria,ft_threshold,model,class,precision,recall,f1_score,support,accuracy,auc,model_type
17,,0.2,clf_lstm_y_oecd_18_b5_ep300_h50_n1_dr0,1.0,0.4,1.0,0.57,4.0,0.9,0.946429,lstm
20,,0.2,clf_lstm_y_oecd_24_b5_ep300_h100_n1_dr0,1.0,0.36,1.0,0.53,4.0,0.88,0.9375,lstm
26,,0.2,clf_rf_y_agg_tr_block_t5_spl3,1.5,0.35,1.0,0.5,2.0,0.83,0.974138,rf
18,,0.2,clf_lstm_y_oecd_18_b5_ep500_h100_n1_dr0,1.0,0.33,1.0,0.5,4.0,0.87,0.928571,lstm
19,,0.2,clf_lstm_y_oecd_18_b5_ep500_h50_n1_dr0,1.0,0.33,1.0,0.5,4.0,0.87,0.928571,lstm
48,,0.3,clf_lstm_y_oecd_18_b5_ep300_h100_n1_dr0,1.0,0.33,1.0,0.5,4.0,0.87,0.928571,lstm
23,,0.2,clf_lstm_y_oecd_24_b5_ep500_h50_n1_dr0,1.0,0.29,1.0,0.44,4.0,0.83,0.910714,lstm
70,,0.4,clf_knn_y_oecd_tr_block_t5_spl3,1.0,0.29,1.0,0.44,4.0,0.83,0.910714,knn
71,,0.4,clf_knn_y_oecd_tr_tss_t5_spl3,1.0,0.29,1.0,0.44,4.0,0.83,0.910714,knn
82,,0.4,clf_lstm_y_oecd_18_b5_ep500_h100_n1_dr0,1.0,0.29,1.0,0.44,4.0,0.83,0.910714,lstm


In [None]:
# NOTE(review): `model_` is not defined anywhere in the visible notebook; unless
# the star import from model_team14 provides it, this cell raises NameError on
# Restart & Run All. Looks like a leftover from tab completion -- confirm and delete.
model_

In [None]:
# Show the identifying columns of the five best-ranked candidates.
df_selection[['ft_criteria', 'ft_threshold', 'model']].head(5)

In [None]:
# Reload the stored bundle of the top-ranked candidate and pull out its
# tuned estimator.
top_row = df_selection.iloc[0]
criteria = top_row['ft_criteria']
threshold = top_row['ft_threshold']
model = top_row['model']

# The file name must come from `model`; the original formatted the path with
# `best_model`, which is only assigned further down in this cell.
# NOTE(review): the folder is rebuilt as ft_<criteria>_<threshold>, so results
# that were read from a folder carrying an extra suffix (e.g. "_nn") will not
# be found here -- confirm the folder naming.
model_path = '../result/ft_{}_{}/{}.pkl'.format(criteria, threshold, model)

# NOTE: unpickling can execute arbitrary code -- only load trusted artefacts.
with open(model_path, 'rb') as f:
    [clf, df_cvresult, y_pred, y_pred_prob, clf_report] = pickle.load(f)

# NOTE(review): assumes `clf` is a fitted search object exposing
# `best_estimator_` -- confirm this also holds for the LSTM bundles.
best_model = clf.best_estimator_