In [1]:
import pandas as pd
import numpy as np

# Load Data and Overview

In [2]:
X_col_names = np.load('data/X_col_names.npy' )
X_train = np.load('data/X_train.npy' )
X_test = np.load('data/X_test.npy' )
y_train = np.load('data/y_train.npy' )
X_train_ = np.load('data/X_train_.npy' )
X_val = np.load('data/X_val.npy' )
y_train_ = np.load('data/y_train_.npy' )
y_val = np.load('data/y_val.npy' )
#print(X_col_names)
X_train.shape, X_test.shape, X_train_.shape, X_val.shape

((1521787, 369), (421665, 369), (1065250, 369), (456537, 369))

In [3]:
key_train = np.load('data/key_train.npy' )
key_test = np.load('data/key_test.npy' )
key_train_ = np.load('data/key_train_.npy' )
key_val = np.load('data/key_val.npy' )
key_val[:5]

array([1143723,  724559, 1312059, 1670608, 1770025])

# Model - Utils

In [4]:
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score, recall_score
from  datetime import datetime
import pickle

def score_n_save_out(model, X, y,  
                     model_name = "",
                     postfix="0", 
                     print_score=True,
                     save_model=True, 
                     save_csv=False):
    
    y_pred = model.predict(X)
    y_pred = (y_pred>0.5).astype(int).flatten()
    y = y.astype(int).flatten()
    print( "X_val y_pred==1 ratio : {:.2f}%".format( sum(y_pred == 1)/len(y_pred)*100 ) )
    
    if print_score:
        f1, p, r = ( f1_score(y, y_pred), precision_score(y, y_pred),  recall_score(y, y_pred))
        print("model: {}, F1-score: {:.4f}, precision: {:.4f}, recall: {:.4f}".format(model_name, f1, p, r))
        
    t_str = datetime.strftime(datetime.now(), "%Y%m%d%H%M%S")
    
    if save_model: 
        pth = f'model/{model_name}_model_{postfix}_{t_str}.h5'
        pickle.dump(model, open(pth, 'wb'))
        print(f'save model: {pth}')
        
    if save_csv:
        y_pred = model.predict(X_test)
        y_pred = (y_pred>0.5).astype(int).flatten()
        print( "X_test y_pred==1 ratio : {:.2f}%".format( sum(y_pred == 1)/len(y_pred)*100 ) )
        df_op = pd.DataFrame({'txkey':key_test, 'fraud_ind': y_pred})
        pth = f'submit/{model_name}_submission_{postfix}_{t_str}.csv'
        df_op.to_csv(pth, index=False) 
        print(f'save scv: {pth}')
        
    return 

# Model - Machine Learning

In [5]:
from sklearn.ensemble import \
ExtraTreesClassifier, GradientBoostingClassifier, \
RandomForestClassifier, AdaBoostClassifier

  from numpy.core.umath_tests import inner1d


In [6]:
model_classes = [
                 ExtraTreesClassifier(n_estimators=100, class_weight='balanced'), 
                 ExtraTreesClassifier(n_estimators=100, class_weight={0:0.005, 1:0.995}),
                 ExtraTreesClassifier(n_estimators=1000, class_weight='balanced'), 
                 ExtraTreesClassifier(n_estimators=1000, class_weight={0:0.005, 1:0.995}), 
                ]
model_names = [
               "ExtraTrees_n=100",  
               "ExtraTrees_n=100_cw=0.005", 
               "ExtraTrees_n=1000",  
               "ExtraTrees_n=1000_cw=0.005", 
              ]

In [None]:
import time
models = []
for model, model_name in zip(model_classes, model_names):
    print('*'*64)
    st = time.time()
    model.fit(X_train_, y_train_)
    models.append(model)
    score_n_save_out(model, X_val, y_val, model_name=model_name, save_csv=True, )
    print(f"time cost: {round((time.time()-st),2)} sec")
    

****************************************************************
X_val y_pred==1 ratio : 0.94%
model: ExtraTrees_n=100, F1-score: 0.7210, precision: 0.8762, recall: 0.6125
save model: model/ExtraTrees_n=100_model_0_20190924134330.h5
X_test y_pred==1 ratio : 0.95%
save scv: submit/ExtraTrees_n=100_submission_0_20190924134330.csv
time cost: 1042.89 sec
****************************************************************
X_val y_pred==1 ratio : 0.94%
model: ExtraTrees_n=100_cw=0.005, F1-score: 0.7183, precision: 0.8742, recall: 0.6096
save model: model/ExtraTrees_n=100_cw=0.005_model_0_20190924140333.h5
X_test y_pred==1 ratio : 0.94%
save scv: submit/ExtraTrees_n=100_cw=0.005_submission_0_20190924140333.csv
time cost: 1209.81 sec
****************************************************************
X_val y_pred==1 ratio : 0.94%
model: ExtraTrees_n=1000, F1-score: 0.7231, precision: 0.8746, recall: 0.6163
save model: model/ExtraTrees_n=1000_model_0_20190924170415.h5
X_test y_pred==1 ratio : 0.94%
s