In [1]:
import pandas as pd
import numpy as np

In [2]:
! mkdir data model submit

mkdir: cannot create directory ‘data’: File exists
mkdir: cannot create directory ‘model’: File exists
mkdir: cannot create directory ‘submit’: File exists


# Load Data and Overview

In [3]:
df = pd.read_csv('train.csv').set_index('txkey')
df.shape

(1521787, 22)

In [4]:
y_train = df['fraud_ind']
del df['fraud_ind']

In [5]:
df_test = pd.read_csv('test.csv').set_index('txkey')
df_test.shape

(421665, 21)

In [6]:
df = pd.concat((df, df_test), 0)
df.shape

(1943452, 21)

In [7]:
del df['locdt'] #授權日期 

In [8]:
time2val = lambda x: np.sin((x/12-1)*np.pi)
def fn(x): 
    x_str = str(int(x)).zfill(6)
    h, m, s = float(x_str[:2]), float(x_str[2:4]), float(x_str[4:])
    v = h + m/60 + s/3600
    return time2val(v)
df['loctm']=  df['loctm'].apply(fn)# 授權時間

In [9]:
df['mchno'] =  (df['mchno']!=0).astype(int) #特店代號 

In [10]:
df['ovrlt'] =  (df['ovrlt']=="Y").astype(int) # 超額註記碼  
df['flbmk'] =  (df['flbmk']=="Y").astype(int) # Fallback註記 
df['flg_3dsmk'] =  (df['flg_3dsmk']=="Y").astype(int) # 超額註記碼 
df['insfg'] =  (df['insfg']=="Y").astype(int) # 分期交易註記 
df['ecfg'] =  (df['ecfg']=="Y").astype(int) # 網路交易註記 

In [11]:
df.keys()

Index(['acqic', 'bacno', 'cano', 'conam', 'contp', 'csmcu', 'ecfg', 'etymd',
       'flbmk', 'flg_3dsmk', 'hcefg', 'insfg', 'iterm', 'loctm', 'mcc',
       'mchno', 'ovrlt', 'scity', 'stocn', 'stscd'],
      dtype='object')

In [12]:
# 欄位類別判定
col_names_cont_ = ['loctm','mchno','iterm','ovrlt','flbmk','flg_3dsmk','insfg','ecfg'] # 數值型資料
col_names_cont = [] # 數值型資料
col_names_disc = [] # 類別型資料
col_has_na = [] # 待補NA資料
for c in df.keys():
    print("*"*64)
    if c in col_names_cont_:
        print(c, ": cont")
        continue
    uni = df[c].unique()
    n_na = pd.isna(df[c]).sum() # NA 數量
    if n_na > 0:
        col_has_na.append(c)       
    if len(uni) < 200 :
        print(f"{c}: uni={uni}")
        if not (c in col_names_disc): 
            col_names_disc.append(c)
            print(c, ": disc")
    else:
        info = [ df[c].max(), df[c].min(), df[c].mean(), df[c].std()]
        info = [ x.round(2) for x in info ]
        offset = 1 if n_na else 0
        diversity = (len(uni)-offset)/(len(df)-n_na)
        print(f"{c}: dtype={df[c].dtype}, n_na={n_na}")
        print("       max={}, min={}, mean={}, std={}, diversity={:.2f}%".format(*info, diversity*100 ) )
        if diversity == 1.0:
            print(f"       Delete col [{c}] due to diversity is 100% ")
        else:
            if not (c in col_names_cont): 
                col_names_cont.append(c)
                print(c, ": cont")

****************************************************************
acqic: dtype=int64, n_na=0
       max=6884, min=0, mean=6004.54, std=1505.74, diversity=0.35%
acqic : cont
****************************************************************
bacno: dtype=int64, n_na=0
       max=163886, min=0, mean=82087.41, std=47323.48, diversity=8.40%
bacno : cont
****************************************************************
cano: dtype=int64, n_na=0
       max=213575, min=0, mean=109045.28, std=61103.56, diversity=10.93%
cano : cont
****************************************************************
conam: dtype=float64, n_na=0
       max=7208.77, min=0.0, mean=651.62, std=403.22, diversity=4.44%
conam : cont
****************************************************************
contp: uni=[5 3 6 2 4 0 1]
contp : disc
****************************************************************
csmcu: uni=[ 0 62 74 61 13 67 60 26 49 10 71 56 20 38 22  4 47 54 75 32 66 29 28 14
 50 30 31 45 24 40 35 59  3 48 63 52 41 23 17

In [13]:
col_names_cont

['acqic', 'bacno', 'cano', 'conam', 'mcc', 'scity']

In [14]:
col_names_disc

['contp', 'csmcu', 'etymd', 'hcefg', 'stocn', 'stscd']

# Data Preprocessing

## 增加群組資訊

In [15]:
%%time
from scipy import stats
q_cnames = ['bacno', 'cano' ] 
k_cnames = [ 'csmcu', #消費地幣別 
            'contp', #交易類別 
            'etymd', #交易型態 
            'stocn', #消費地國別 
            'scity', #消費城市
            'hcefg', #支付形態 
            'acqic' #收單行代碼 
           ]
col_names_new = []
for q_cname in q_cnames:
    for k_cname in k_cnames:
        v_table = df[[k_cname, q_cname]].groupby(q_cname).agg(lambda x: stats.mode(x)[0][0])
        s = df[q_cname].apply(lambda q: v_table.loc[q, k_cname])
        new_cname = "issame_"+k_cname
        df[new_cname] = (s == df[k_cname]).astype(int)
        col_names_new.append(new_cname)
    
del df['acqic']; del df['bacno']; del df['cano']
col_names_cont.remove('acqic'); col_names_cont.remove('bacno'); col_names_cont.remove('cano')

CPU times: user 8min 39s, sys: 4.52 s, total: 8min 43s
Wall time: 8min 43s


In [16]:
col_names_new

['issame_csmcu',
 'issame_contp',
 'issame_etymd',
 'issame_stocn',
 'issame_scity',
 'issame_hcefg',
 'issame_acqic',
 'issame_csmcu',
 'issame_contp',
 'issame_etymd',
 'issame_stocn',
 'issame_scity',
 'issame_hcefg',
 'issame_acqic']

In [17]:
df_new = df[col_names_new].copy()

## 數值型

In [18]:
col_names_cont += col_names_cont_

In [19]:
df_cont = df[col_names_cont].copy()
df_cont.head()

Unnamed: 0_level_0,conam,mcc,scity,loctm,mchno,iterm,ovrlt,flbmk,flg_3dsmk,insfg,ecfg
txkey,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
516056,513.8,457,0,0.989568,1,0,0,0,0,0,0
4376,465.62,451,5817,-0.295569,0,0,0,0,0,0,0
483434,513.8,457,0,0.779793,1,0,0,0,0,0,0
1407164,1016.11,247,3281,0.991311,1,0,0,0,0,0,0
1051004,713.66,263,5817,0.99561,1,0,0,0,0,0,0


In [20]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_cont = scaler.fit_transform(df_cont)
X_cont.shape, type(X_cont)

((1943452, 11), numpy.ndarray)

In [21]:
df_cont = pd.DataFrame(data=X_cont, index=df.index, columns=col_names_cont)
df_cont.head()

Unnamed: 0_level_0,conam,mcc,scity,loctm,mchno,iterm,ovrlt,flbmk,flg_3dsmk,insfg,ecfg
txkey,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
516056,-0.341794,2.028863,-2.398364,1.076462,0.184114,-0.133894,-0.114425,-0.057458,-0.213247,-0.166861,-0.587517
4376,-0.461282,1.952344,0.536031,-1.027989,-5.431423,-0.133894,-0.114425,-0.057458,-0.213247,-0.166861,-0.587517
483434,-0.341794,2.028863,-2.398364,0.73295,0.184114,-0.133894,-0.114425,-0.057458,-0.213247,-0.166861,-0.587517
1407164,0.903959,-0.649316,-0.743258,1.079317,0.184114,-0.133894,-0.114425,-0.057458,-0.213247,-0.166861,-0.587517
1051004,0.153869,-0.445264,0.536031,1.086356,0.184114,-0.133894,-0.114425,-0.057458,-0.213247,-0.166861,-0.587517


## 類別型

In [22]:
df_disc = df[col_names_disc].copy()
df_disc.head()

Unnamed: 0_level_0,contp,csmcu,etymd,hcefg,stocn,stscd
txkey,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
516056,5,0,0,5,102,0
4376,5,0,2,0,102,0
483434,5,0,0,5,102,0
1407164,5,62,5,5,102,0
1051004,5,62,4,5,102,0


In [23]:
df_disc.shape

(1943452, 6)

In [24]:
from sklearn.preprocessing import LabelEncoder
les = {}
for c in col_names_disc:
    le = LabelEncoder()
    df_disc.loc[:,c] = le.fit_transform(df_disc.loc[:,c])
    les.update({c:le})
df_disc.head()

Unnamed: 0_level_0,contp,csmcu,etymd,hcefg,stocn,stscd
txkey,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
516056,5,0,0,5,102,0
4376,5,0,2,0,102,0
483434,5,0,0,5,102,0
1407164,5,62,5,5,102,0
1051004,5,62,4,5,102,0


In [25]:
from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder(sparse=False)
X_disc = ohe.fit_transform(df_disc)
cut_point = ohe.feature_indices_
print("feature cut point: ", cut_point)
X_disc.shape, type(X_disc)

feature cut point:  [  0   7  83  94 104 213 218]


((1943452, 218), numpy.ndarray)

In [26]:
new_col_names_disc = []
for c in col_names_disc: 
    le = les[c]
    new_col_names_disc += [ c+'_'+str(cl) for cl in le.classes_ ]
assert len(new_col_names_disc) == X_disc.shape[1]

In [27]:
df_disc = pd.DataFrame(data=X_disc, index=df.index, columns=new_col_names_disc)
df_disc.head()

Unnamed: 0_level_0,contp_0,contp_1,contp_2,contp_3,contp_4,contp_5,contp_6,csmcu_0,csmcu_1,csmcu_2,...,stocn_104,stocn_105,stocn_106,stocn_107,stocn_108,stscd_0,stscd_1,stscd_2,stscd_3,stscd_4
txkey,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
516056,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4376,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
483434,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1407164,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1051004,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [28]:
df_disc.keys()

Index(['contp_0', 'contp_1', 'contp_2', 'contp_3', 'contp_4', 'contp_5',
       'contp_6', 'csmcu_0', 'csmcu_1', 'csmcu_2',
       ...
       'stocn_104', 'stocn_105', 'stocn_106', 'stocn_107', 'stocn_108',
       'stscd_0', 'stscd_1', 'stscd_2', 'stscd_3', 'stscd_4'],
      dtype='object', length=218)

## 合併

In [29]:
df = pd.concat((df_cont, df_disc, df_new ), 1)
del df_cont
del df_disc
del df_new
df.shape

(1943452, 243)

# Data Tree Encoding

# Save data

In [30]:
n_train = len(y_train)
y_train = np.array(y_train)

In [31]:
X_train, X_test = df.iloc[:n_train, :].values, df.iloc[n_train:, :].values
key_train, key_test = np.array(df.index[:n_train]), np.array(df.index[n_train:])
X_train.shape, X_test.shape, y_train.shape

((1521787, 243), (421665, 243), (1521787,))

In [32]:
np.save('data/X_train', X_train)
np.save('data/X_test', X_test)
np.save('data/y_train', y_train)
np.save('data/X_col_names', np.array(df.columns))

In [33]:
# txkey
np.save('data/key_train', key_train) 
np.save('data/key_test', key_test) 

# Delete y==1 Neighbors

## Train PCA

In [34]:
%%time
# 資料 normalize
X_train_  = X_train / np.linalg.norm( X_train, axis=1, keepdims=True)
X_test_  = X_test / np.linalg.norm(X_test, axis=1, keepdims=True)

CPU times: user 1.56 s, sys: 2.49 s, total: 4.05 s
Wall time: 4.05 s


In [35]:
%%time
from sklearn.decomposition import PCA
pca = PCA(5)
X_pca = pca.fit_transform( np.concatenate((X_train_,X_test_),0) )
np.save("data/X_pca5", X_pca)
X_train_pca, X_test_pca = X_pca[:n_train, :], X_pca[n_train:, :]
print( X_train_pca.shape, X_test_pca.shape )

(1521787, 5) (421665, 5)
CPU times: user 36.6 s, sys: 23.5 s, total: 1min
Wall time: 46.5 s


In [36]:
key_train_y1 = key_train[y_train>0]
print( "原本 y==1 的數量:", key_train_y1.shape ) # 原本 y==1 的數量
print( "原本 y==1 的位置:" , np.where(y_train )[0]) # 原本 y==1 的位置

原本 y==1 的數量: (20355,)
原本 y==1 的位置: [     70     349    1133 ... 1521718 1521751 1521766]


## Build Search NN Tree

In [37]:
k = 5 # 設定:搜尋鄰居數

In [38]:
%%time
from sklearn.neighbors import KDTree #建立搜尋樹
tree = KDTree(X_train_pca, leaf_size=10)   
distance, ind = tree.query(X_train_pca, k=k)                
print(ind.shape)  # (1521787, k) indices of 3 closest neighbors

(1521787, 5)
CPU times: user 24.3 s, sys: 159 ms, total: 24.5 s
Wall time: 24.5 s


In [39]:
y_train_new = np.sum( y_train[ind], 1) 
print( "鄰居有 y==1 的數量:", np.sum( y_train_new>0 ) ) # 鄰居有 y==1 的數量
ind_to_be_del = np.logical_and( y_train==0, y_train_new>0 ) # 紀錄要刪掉的index
print( "要刪掉的index為, 原本y==0, 但鄰居有 y==1 的數量: ", ind_to_be_del.sum() ) # 原本y==0, 但鄰居有 y==1 的數量

鄰居有 y==1 的數量: 53376
要刪掉的index為, 原本y==0, 但鄰居有 y==1 的數量:  33021


In [40]:
np.save("data/ind_to_be_del",ind_to_be_del)

# Get Sample weight

## Re-Train PCA with Test

In [41]:
%%time
from sklearn.decomposition import PCA
pca = PCA(5)
X_pca_test = pca.fit_transform( X_test_ )

CPU times: user 8.25 s, sys: 5.82 s, total: 14.1 s
Wall time: 10.2 s


In [42]:
%%time
X_pca_train = pca.transform( X_train_ )

CPU times: user 1.85 s, sys: 1.24 s, total: 3.09 s
Wall time: 3.09 s


In [43]:
np.save("data/X_pca5_train_byTest", X_pca_train)
np.save("data/X_pca5_test_byTest", X_pca_test)

In [44]:
X_pca_test[0:5]

array([[ 0.48099684, -0.36072041,  0.1215331 , -0.11239425, -0.20731387],
       [ 0.4891532 , -0.35281456,  0.09903807, -0.1196086 , -0.20807677],
       [ 0.44471283, -0.38910039,  0.21580922, -0.0779264 , -0.19938505],
       [ 0.44480564, -0.3890408 ,  0.21557916, -0.07801912, -0.19941402],
       [ 0.4809123 , -0.3607992 ,  0.12176357, -0.11231837, -0.20730385]])

## Find X_test Neighbors

In [45]:
k = 10 # 設定:搜尋鄰居數

In [46]:
%%time
from sklearn.neighbors import KDTree #建立搜尋樹
tree = KDTree(X_pca_test, leaf_size=10)   
distance, ind = tree.query(X_pca_train, k=k)                
print(distance.shape)  # (1521787, k) indices of 3 closest neighbors

(1521787, 10)
CPU times: user 24.4 s, sys: 252 ms, total: 24.6 s
Wall time: 24.6 s


In [47]:
distance[0:5]

array([[1.15731285e-05, 1.67958490e-05, 1.96172733e-05, 2.86168577e-05,
        3.14642492e-05, 3.14642492e-05, 3.71637784e-05, 3.93387836e-05,
        5.42985278e-05, 6.28856757e-05],
       [2.16970413e-02, 2.20837352e-02, 2.31202028e-02, 2.32524815e-02,
        2.33246054e-02, 2.34567811e-02, 2.35048285e-02, 2.35768829e-02,
        2.35768829e-02, 2.36129027e-02],
       [7.80727526e-07, 1.27236836e-05, 1.27236836e-05, 1.30814699e-05,
        6.42107969e-05, 1.16154316e-04, 1.41939756e-04, 1.80020128e-04,
        1.80627737e-04, 2.18600907e-04],
       [5.53016239e-03, 1.40491381e-02, 1.50324284e-02, 1.52318860e-02,
        1.72769380e-02, 1.74969056e-02, 1.83946505e-02, 1.94781149e-02,
        1.98372340e-02, 1.99291204e-02],
       [4.66439852e-04, 2.01507260e-03, 2.04588592e-03, 3.20599273e-03,
        6.29108082e-03, 7.08110840e-03, 7.25627581e-03, 7.72822133e-03,
        8.18831140e-03, 9.49623959e-03]])

In [48]:
# train與test距離越近, sample_weight就越高
distance_mean = np.mean(distance, 1)
distance_mean[:5]

array([3.33218372e-05, 2.31206344e-02, 9.40863206e-05, 1.62256578e-02,
       5.37746285e-03])

In [49]:
sample_weight  = distance_mean / np.linalg.norm( distance_mean )
sample_weight

array([8.22670902e-07, 5.70817060e-04, 2.32286346e-06, ...,
       9.91645610e-05, 1.94114699e-04, 9.72874154e-04])

In [50]:
np.save("data/sample_weight",sample_weight)