In [1]:
import pandas as pd
import numpy as np

# Load Data and Overview

In [2]:
key_train = np.load('data/key_train.npy' )
key_test = np.load('data/key_test.npy' )
key_train.shape, key_test.shape

((1521787,), (421665,))

In [3]:
X_col_names = np.load('data/X_col_names.npy' )
X_train = np.load('data/X_train.npy' )
X_test = np.load('data/X_test.npy' )
y_train = np.load('data/y_train.npy' )
n_train = len(y_train)
ind_to_be_del  = np.load('data/ind_to_be_del.npy' )
X_train.shape, X_test.shape

((1521787, 243), (421665, 243))

In [4]:
sample_weight_train = np.load('data/sample_weight.npy' )
sample_weight_train.shape

(1521787,)

In [5]:
# optional 
X_pca = np.load("data/X_pca5.npy")
X = np.concatenate((np.concatenate((X_train,X_test),0), X_pca), 1)
X_train, X_test = X[:n_train, :], X[n_train:, :]
X_train.shape, X_test.shape

((1521787, 248), (421665, 248))

In [6]:
# optional 
X_train = np.concatenate((X_train, np.load("data_201910_1/X_train_CONTSP.npy")),1)
X_test = np.concatenate((X_test, np.load("data_201910_1/X_test_CONTSP.npy")),1)
X = np.concatenate((X_train,X_test), 0)
X_train, X_test = X[:n_train, :], X[n_train:, :]
X_train.shape, X_test.shape

((1521787, 328), (421665, 328))

# Delete y==1 nerghbors

In [7]:
print( "y==1的比率: ", (y_train>0).sum()/len(y_train) )

y==1的比率:  0.013375722095142093


In [8]:
# del some samples
X_train = X_train[~ind_to_be_del]
key_train = key_train[~ind_to_be_del]
y_train = y_train[~ind_to_be_del]
X_train.shape, key_train.shape, y_train.shape

((1488766, 328), (1488766,), (1488766,))

In [9]:
print( "y==1的比率: ", (y_train>0).sum()/len(y_train) )

y==1的比率:  0.01367239713964451


# Down/Up Sampling

In [10]:
X_train_y0, X_train_y1 = X_train[y_train==0], X_train[y_train==1]
X_train_y0.shape[0], X_train_y1.shape[0]

(1468411, 20355)

In [11]:
def sampling(X, n=1):
    assert type(X) == np.ndarray and len(X.shape) >= 2
    rand_idx = np.random.randint(0, X.shape[0], n)
    return X[rand_idx], rand_idx

In [12]:
n_sample = int(X_train_y0.shape[0]/2) #int(np.mean((n_y0, n_y1)))
n_sample

734205

In [13]:
X_train_y0, idx_train_y0 = sampling(X_train_y0, n=n_sample) # downsampling
X_train_y1, idx_train_y1 = sampling(X_train_y1, n=X_train_y1.shape[0]*2) # upsampling
X_train_y0.shape[0], X_train_y1.shape[0]

(734205, 40710)

In [14]:
X_train = np.concatenate((X_train_y0, X_train_y1),0)
y_train = np.concatenate((np.zeros(X_train_y0.shape[0]), np.ones(X_train_y1.shape[0])),0)
idx_train = np.concatenate((idx_train_y0, idx_train_y1),0)

In [15]:
key_train = key_train[idx_train]
sample_weight_train = sample_weight_train[idx_train]
key_train.shape, sample_weight_train.shape

((774915,), (774915,))

In [16]:
X_train.shape, y_train.shape

((774915, 328), (774915,))

# Split Train/Test

In [17]:
from sklearn.model_selection import train_test_split
X_train_, X_val, y_train_, y_val, \
key_train_, key_val, sample_weight_train_, sample_weight_val =  \
train_test_split(X_train, y_train, key_train, sample_weight_train, 
                 test_size=0.3, random_state=42)
X_train_.shape, y_train_.shape, X_val.shape, y_val.shape, 

((542440, 328), (542440,), (232475, 328), (232475,))

# Model - Utils

In [18]:
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score, recall_score
from  datetime import datetime
import pickle

def score_n_save_out(model, X, y,  
                     model_name = "",
                     postfix="0", 
                     print_score=True,
                     save_model=True, 
                     save_csv=False):
    
    y_pred = model.predict(X)
    y_pred = (y_pred>0.5).astype(int).flatten()
    y = y.astype(int).flatten()
    print( "X_val y_pred==1 ratio : {:.2f}%".format( sum(y_pred == 1)/len(y_pred)*100 ) )
    
    if print_score:
        f1, p, r = ( f1_score(y, y_pred), precision_score(y, y_pred),  recall_score(y, y_pred))
        print("model: {}, F1-score: {:.4f}, precision: {:.4f}, recall: {:.4f}".format(model_name, f1, p, r))
        
    t_str = datetime.strftime(datetime.now(), "%Y%m%d%H%M%S")
    
    if save_model: 
        pth = f'model/{model_name}_model_{postfix}_{t_str}.pkl'
        pickle.dump(model, open(pth, 'wb'))
        print(f'save model: {pth}')
        
    if save_csv:
        y_pred = model.predict(X_test)
        y_pred = (y_pred>0.5).astype(int).flatten()
        print( "X_test y_pred==1 ratio : {:.2f}%".format( sum(y_pred == 1)/len(y_pred)*100 ) )
        df_op = pd.DataFrame({'txkey':key_test, 'fraud_ind': y_pred})
        pth = f'submit/{model_name}_submission_{postfix}_{t_str}.csv'
        df_op.to_csv(pth, index=False) 
        print(f'save scv: {pth}')
        
    return 

# Model - Machine Learning

In [19]:
from sklearn.ensemble import \
ExtraTreesClassifier, GradientBoostingClassifier, \
RandomForestClassifier, AdaBoostClassifier

  from numpy.core.umath_tests import inner1d


In [21]:
model_classes = [
                 ExtraTreesClassifier(n_estimators=10, class_weight='balanced'),
                 ExtraTreesClassifier(n_estimators=100, class_weight='balanced'), 
                 #ExtraTreesClassifier(n_estimators=1000, class_weight='balanced'),  
                ]
model_names = [
               "ExtraTrees_n=10",  
               "ExtraTrees_n=100", 
               #"ExtraTrees_n=1000",  
              ]

In [22]:
import time
models = []
for model, model_name in zip(model_classes, model_names):
    print('*'*64)
    st = time.time()
    model.fit(X_train_, y_train_, sample_weight=sample_weight_train_)
    models.append(model)
    score_n_save_out(model, X_val, y_val, model_name=model_name, save_csv=True, )
    print(f"time cost: {round((time.time()-st),2)} sec")

****************************************************************
X_val y_pred==1 ratio : 5.18%
model: ExtraTrees_n=10, F1-score: 0.9477, precision: 0.9561, recall: 0.9395
save model: model/ExtraTrees_n=10_model_0_20191005111400.pkl
X_test y_pred==1 ratio : 2.25%
save scv: submit/ExtraTrees_n=10_submission_0_20191005111400.csv
time cost: 34.82 sec
****************************************************************
X_val y_pred==1 ratio : 5.21%
model: ExtraTrees_n=100, F1-score: 0.9497, precision: 0.9555, recall: 0.9441
save model: model/ExtraTrees_n=100_model_0_20191005111901.pkl
X_test y_pred==1 ratio : 2.39%
save scv: submit/ExtraTrees_n=100_submission_0_20191005111901.csv
time cost: 311.48 sec



- submit/ExtraTrees_n=100_submission_0_20191004123336.csv -> 55.5%


# DL Model - Utils

In [23]:
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score, recall_score
from  datetime import datetime
def score_n_save_out_dl(model, X, y,  
                     model_name = "DNN",
                     postfix="0", 
                     print_score=True,
                     save_model=True, 
                     save_csv=False):
    
    y_pred = np.argmax( model.predict(X) , 1)
    print(y_pred.shape)

    y = y.astype(int).flatten()
    print( "X_val y_pred==1 ratio : {:.2f}%".format( sum(y_pred == 1)/len(y_pred)*100 ) )
    
    if print_score:
        f1, p, r = ( f1_score(y, y_pred), precision_score(y, y_pred),  recall_score(y, y_pred))
        print("model: {}, F1-score: {:.4f}, precision: {:.4f}, recall: {:.4f}".format(model_name, f1, p, r))
        
    t_str = datetime.strftime(datetime.now(), "%Y%m%d%H%M%S")
    
    if save_model: 
        pth = f'model/model_{postfix}_{t_str}.h5'
        model.save_weights(pth)
        print(f'save model: {pth}')
        
    if save_csv:
        y_pred = np.argmax( model.predict(X_test), 1)
        print( "X_test y_pred==1 ratio : {:.2f}%".format( sum(y_pred == 1)/len(y_pred)*100 ) )
        df_op = pd.DataFrame({'txkey':key_test, 'fraud_ind': y_pred})
        pth = f'submit/{model_name}_submission_{postfix}_{t_str}.csv'
        df_op.to_csv(pth, index=False) 
        print(f'save scv: {pth}')
        
    return 

In [24]:
from tensorflow.keras.utils import to_categorical
y_train__categorical = to_categorical(y_train_, num_classes=2)
y_train__categorical.shape

  from ._conv import register_converters as _register_converters


(542440, 2)

# Model - Deep Learning

In [25]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, BatchNormalization, Dropout, LeakyReLU
from tensorflow.keras.optimizers import Adam

In [26]:
model = Sequential()
model.add(Dense(units=2048,                    
           input_dim=X_train.shape[1],                     
           kernel_initializer='he_normal',  
           activation=LeakyReLU(0.1))) 
model.add(BatchNormalization())
model.add(Dense(units=256,                                          
           kernel_initializer='he_normal',  
           activation=LeakyReLU(0.1))) 
model.add(BatchNormalization())
model.add(Dense(units=64,                                        
           kernel_initializer='he_normal', 
           activation=LeakyReLU(0.1))) 
model.add(Dense(units=2,                                        
           kernel_initializer='he_normal', 
           activation='softmax'))

Instructions for updating:
`normal` is a deprecated alias for `truncated_normal`


In [27]:
model.compile(loss='categorical_crossentropy',  
              optimizer=Adam(lr=3e-3),  
              weighted_metrics=['accuracy'])
train_history=model.fit(x=X_train_,    
                        y=y_train__categorical, 
                        sample_weight=sample_weight_train_,
                        epochs=3,                     
                        batch_size=2048, 
                        class_weight={0:0.7, 1:0.3},
                        shuffle=True,
                        #callbacks = callbacks_list,
                        verbose=1)
score_n_save_out_dl(model, X_val, y_val, 
                 postfix="10", save_csv=True, ) # 上繳分數 0.464427

Epoch 1/3
Epoch 2/3
Epoch 3/3
(232475,)
X_val y_pred==1 ratio : 4.62%
model: DNN, F1-score: 0.7517, precision: 0.8044, recall: 0.7054
save model: model/model_10_20191005111937.h5
X_test y_pred==1 ratio : 2.69%
save scv: submit/DNN_submission_10_20191005111937.csv
