In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

train_identity = pd.read_csv('/Users/isuhyeon/IEE-CIS_Fraud_Detection/train_identity.csv')
train_transaction = pd.read_csv('/Users/isuhyeon/IEE-CIS_Fraud_Detection/train_transaction.csv')
test_identity = pd.read_csv('/Users/isuhyeon/IEE-CIS_Fraud_Detection/test_identity.csv')
test_transaction = pd.read_csv('/Users/isuhyeon/IEE-CIS_Fraud_Detection/test_transaction.csv')
test_identity.columns = [col.replace('-','_')for col in test_identity.columns]

train = pd.merge(train_transaction, train_identity, on='TransactionID', how='left')
test = pd.merge(test_transaction, test_identity, on='TransactionID', how='left')

In [2]:
del train_identity,test_identity,train_transaction,test_transaction 

# 전처리 1 [컬럼drop(null_80%이상)]

In [3]:
dff_column_to_drop = train.columns[(train.isna().sum() / len(train) > 0.2)]

In [4]:
def preprocess(df) :
    df = df.drop(columns=dff_column_to_drop).copy()
    # object 범주형 자료 결측치 - 최빈값으로 채워 넣기
    # df.ProductCD.value_counts().index # ['W', 'C', 'R', 'H', 'S']
    df.ProductCD.fillna(df.ProductCD.mode(), inplace=True)
    from sklearn.preprocessing import LabelEncoder
    lbl_enc = LabelEncoder()
    lbl_enc.fit(df[['ProductCD']])
    df['ProductCD'] = lbl_enc.transform(df[['ProductCD']])
    # df.card4.value_counts().index # ['visa', 'mastercard', 'american express', 'discover']
    df.card4.fillna(df.card4.mode(), inplace=True)
    lbl_enc.fit(df[['card4']])
    df['card4'] = lbl_enc.transform(df[['card4']])
    # df.card6.value_counts().index # ['debit', 'credit', 'debit or credit', 'charge card']
    df.card6.fillna(df.card6.mode(), inplace=True)
    lbl_enc.fit(df[['card6']])
    df['card6'] = lbl_enc.transform(df[['card6']])
    # df.P_emaildomain.value_counts().index    # [ gmail yahoo etc   ]
    df.P_emaildomain.fillna(df.P_emaildomain.mode(), inplace=True)
    lbl_enc.fit(df[['P_emaildomain']])
    df['P_emaildomain'] = lbl_enc.transform(df[['P_emaildomain']])

    #NULL값이 있는지 한눈에 보기
    display(np.sum(df.isnull() ).to_frame().transpose() )
    display(f'지금 {np.sum(np.sum(df.isnull()))} 개의 null값이 있습니다.' )


    # continuous 연속형 자료 결측치 추가
    # 방법 1 - 중앙값으로 채워 넣기
    temp_list_1 = df.columns[df.isnull().sum() > 0]  # 결측치가 있는 컬럼을 골라서 리스트에 넣기
    for aa in temp_list_1 :
        df[aa].fillna(df[aa].median(), inplace=True)  # 중앙값으로 채워 넣기
    # df.info(verbose=True, show_counts=True)

    print("Null :", df.isnull().sum().sum() )

    return df

In [5]:
train = preprocess(train)

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,V312,V313,V314,V315,V316,V317,V318,V319,V320,V321
0,0,0,0,0,0,0,8933,1565,0,4259,...,12,1269,1269,1269,12,12,12,12,12,12


'지금 5569489 개의 null값이 있습니다.'

Null : 0


In [6]:
test = preprocess(test)

Unnamed: 0,TransactionID,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,...,V312,V313,V314,V315,V316,V317,V318,V319,V320,V321
0,0,0,0,0,0,8654,3002,0,4547,0,...,3,6031,6031,6031,3,3,3,3,3,3


'지금 1064235 개의 null값이 있습니다.'

Null : 0


In [7]:
x_train = train.drop('isFraud',axis=1)
y_train = train['isFraud']

In [8]:
from sklearn.model_selection import train_test_split
X_train, x_val, Y_train, y_val = train_test_split(x_train,y_train, test_size=0.2, random_state=0xC0FFEE)  # 42, 0xC0FFEE

In [9]:
from sklearn.metrics import accuracy_score, precision_score,recall_score,confusion_matrix

def get_clf_eval(y_test,pred):
    confusion = confusion_matrix(y_test,pred)
    accuracy = accuracy_score(y_test,pred)
    precision  = precision_score(y_test,pred)
    recall = recall_score(y_test,pred)
    print('오차 행렬')
    print(confusion)
    print('정확도:{0:.4f}, 정밀도:{1:.4f}, 재현율:{2:.4f}'.format(accuracy,precision,recall))

In [10]:
def get_model_train_eval(model,ftr_train=None,ftr_test=None,tgt_train=None,tgt_test=None):
    model.fit(ftr_train,tgt_train)
    pred = model.predict(ftr_test)
    pred_proba = model.predict_proba(ftr_test)[:,1]
    get_clf_eval(tgt_test,pred)

In [11]:
from lightgbm import LGBMClassifier

lgbm_clf = LGBMClassifier(n_estimators=1000,num_leaves=64,n_jobs=-1,boost_from_average=False)
get_model_train_eval(lgbm_clf,ftr_train=X_train,ftr_test=x_val,tgt_train=Y_train,tgt_test=y_val)

오차 행렬
[[113693    142]
 [  1590   2683]]
정확도:0.9853, 정밀도:0.9497, 재현율:0.6279


# over sampling

In [12]:
from imblearn.over_sampling import SMOTE

def smote(x,y):
    smote = SMOTE(random_state=0)
    x_over ,y_over = smote.fit_resample(x,y)
    print('smote 적용 후 레이블 값 분포:\n',pd.Series(y_over).value_counts())
    return x_over, y_over

In [13]:
x_over ,y_over = smote(x_train,y_train)

smote 적용 후 레이블 값 분포:
 0    569877
1    569877
Name: isFraud, dtype: int64


In [14]:
from lightgbm import LGBMClassifier

lgbm_clf = LGBMClassifier(n_estimators=1000,num_leaves=64,n_jobs=-1,boost_from_average=False)
get_model_train_eval(lgbm_clf,ftr_train=x_over,ftr_test=x_val,tgt_train=y_over,tgt_test=y_val)

오차 행렬
[[113801     34]
 [   734   3539]]
정확도:0.9935, 정밀도:0.9905, 재현율:0.8282


# feature engineering

In [15]:
#TransactionDT -> DaysFromStart, D1-DaysFromStart 만들기
import datetime
start_date = datetime.datetime.strptime('2017-11-30', '%Y-%m-%d')
for df in [train,test]:
    #다른 항목을 만들기 위해 필요한 항목들
    df['DaysFromStart'] = np.floor(df['TransactionDT']/(60*60*24)) - 1
    df['D1-DaysFromStart'] = df['D1'] - df['DaysFromStart']

In [16]:
#uid 생성
for df in [train,test]:
    df['uid'] = df['ProductCD'].astype(str) + '_' + df['card1'].astype(str) + '_' + df['card2'].astype(str)
    df['uid'] = df['uid'] + '_' + df['card3'].astype(str) + '_' + df['card4'].astype(str)
    df['uid'] = df['uid'] + '_' + df['card5'].astype(str) + '_' + df['card6'].astype(str)
    df['uid'] = df['uid'] + '_' + df['addr1'].astype(str) + '_' + df['D1-DaysFromStart'].astype(str)

In [17]:
#transactionAmt
# 클립하여 0부터 5000사이의 값으로 봅니다
train['TransactionAmt'] = train['TransactionAmt'].clip(0,5000)
test['TransactionAmt']  = test['TransactionAmt'].clip(0,5000)

# 거래 액수가 일반적인지 아닌지를 봅니다
train['TransactionAmt_check'] = np.where(train['TransactionAmt'].isin(test['TransactionAmt']), 1, 0)
test['TransactionAmt_check']  = np.where(test['TransactionAmt'].isin(train['TransactionAmt']), 1, 0)

print('train')
print(train.TransactionAmt_check.value_counts())
print(' ')
print('test')
print(test.TransactionAmt_check.value_counts())

train
1    527684
0     62856
Name: TransactionAmt_check, dtype: int64
 
test
1    490910
0     15781
Name: TransactionAmt_check, dtype: int64


In [18]:
#다른 유용한 항목들

for df in [train, test]:
    df['ProductCD_card1'] = df['ProductCD'].astype(str) + '_' + df['card1'].astype(str)
    df['card1_addr1'] = df['card1'].astype(str) + '_' + df['addr1'].astype(str)
    df['card3_card5'] = df['card3'].astype(str) + '_' + df['card5'].astype(str)
    df['ProductCD_TransactionAmt'] = df['ProductCD'].astype(str) + '_' + df['TransactionAmt'].astype(str)
    df['cents'] = np.round(df['TransactionAmt'] - np.floor(df['TransactionAmt']), 3)
    df['ProductCD_cents'] = df['ProductCD'].astype(str) + '_' + df['cents'].astype(str)
    df['TransactionAmt'] = np.log1p(df['TransactionAmt'])

In [19]:
# 아래 항목들은 다른 것을 만드는데만 유용하니 이제 없애도록 하겠습니다
train = train.drop(['DaysFromStart','D1-DaysFromStart'], axis=1)
test = test.drop(['DaysFromStart','D1-DaysFromStart'], axis=1)

In [20]:
def freq_encode_full(df1, df2, col, normalize=True):
    df = pd.concat([df1[col], df2[col]])
    freq_dict = df.value_counts(dropna=False, normalize=normalize).to_dict()
    col_name = col + '_freq_enc_full'
    return col_name, freq_dict

In [21]:
i_cols = ['ProductCD_TransactionAmt', 'ProductCD_cents', 'cents', 'card1', 'card2', 'card3', 'card5', 'ProductCD_card1','card1_addr1']

for col in i_cols:
    col_name, freq_dict = freq_encode_full(train, test, col)
    train[col_name] = train[col].map(freq_dict).astype('float32')
    test[col_name] = test[col].map(freq_dict).astype('float32')

In [22]:
#label encoding
from sklearn import preprocessing
def label_encoding(df):
    for f in df.columns:
        if df[f].dtype == 'object':
            le = preprocessing.LabelEncoder()
            le.fit(list(df[f].values))
            df[f] = le.transform(list(df[f].values))
        else :
            pass

In [23]:
label_encoding(train)
label_encoding(test)

In [33]:
# v's pca 
from sklearn.preprocessing import StandardScaler

v_train = train.loc[:,'V12':'V321']
v_train = v_train.fillna(-999)
v_train = StandardScaler().fit_transform(v_train)
v_test = test.loc[:,'V12':'V321']
v_test = v_test.fillna(-999)
v_test = StandardScaler().fit_transform(v_test)


columns = ['PCA_V1','PCA_V2','PCA_V3','PCA_V4','PCA_V5','PCA_V6','PCA_V7','PCA_V8','PCA_V9','PCA_V10','PCA_V11','PCA_V12','PCA_V13','PCA_V14','PCA_V15','PCA_V16','PCA_V17','PCA_V18','PCA_V19','PCA_V20','PCA_V21','PCA_V22','PCA_V23','PCA_V24','PCA_V25','PCA_V26','PCA_V27','PCA_V28','PCA_V29','PCA_V30']
from sklearn.decomposition import PCA

pca = PCA(n_components=30)
principalComponents_train = pca.fit_transform(v_train)
principalDF_train = pd.DataFrame(data=principalComponents_train,columns=columns)
principalComponents_test = pca.fit_transform(v_test)
principalDF_test = pd.DataFrame(data=principalComponents_test,columns=columns)

train = train.drop(train.loc[:,'V12':'V321'].columns,axis=1)
test = test.drop(test.loc[:,'V12':'V321'].columns,axis=1)

train = pd.concat([train,principalDF_train],axis=1)
test = pd.concat([test,principalDF_test],axis=1)

KeyError: 'V12'

In [25]:
x_train = train.drop('isFraud',axis=1)
y_train = train['isFraud']

In [26]:
from sklearn.model_selection import train_test_split
X_train, x_val, Y_train, y_val = train_test_split(x_train,y_train, test_size=0.2, random_state=0xC0FFEE)  # 42, 0xC0FFEE

In [27]:
lgbm_clf = LGBMClassifier(n_estimators=1000,num_leaves=64,n_jobs=-1,boost_from_average=False)
get_model_train_eval(lgbm_clf,ftr_train=x_train,ftr_test=x_val,tgt_train=y_train,tgt_test=y_val)

오차 행렬
[[113826      9]
 [   435   3838]]
정확도:0.9962, 정밀도:0.9977, 재현율:0.8982


# CV 세트 기반의 스태킹

In [57]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

knn_clf  = KNeighborsClassifier(n_neighbors=4)
rf_clf = RandomForestClassifier(n_estimators=100, random_state=0)
dt_clf = DecisionTreeClassifier()
ada_clf = AdaBoostClassifier(n_estimators=100)


In [58]:
x_train = train.drop('isFraud',axis=1)
y_train = train['isFraud']

In [83]:
x_train.reset_index(drop=True)
test.reset_index(drop=True)

Unnamed: 0,TransactionID,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,...,PCA_V21,PCA_V22,PCA_V23,PCA_V24,PCA_V25,PCA_V26,PCA_V27,PCA_V28,PCA_V29,PCA_V30
0,3663549,18403224,3.494991,4,10409,111.0,150.0,3,226.0,2,...,-2.541706,-4.413899,-1.368068,2.588336,2.059592,-2.214132,0.375738,-1.243150,0.073273,0.130684
1,3663550,18403263,3.912023,4,4272,111.0,150.0,3,226.0,2,...,-0.462993,-0.047533,0.213201,0.321371,0.175268,-0.463016,0.263122,-0.267762,-0.424245,0.017900
2,3663551,18403310,5.147494,4,4476,574.0,150.0,3,226.0,2,...,0.367833,-0.275383,-0.998033,1.839403,0.740493,-1.230587,0.501257,-0.640235,0.628991,0.099645
3,3663552,18403310,5.655817,4,10989,360.0,150.0,3,166.0,2,...,0.519898,0.236236,-0.340207,0.090790,-0.202748,-0.111730,0.368394,0.048229,-0.490259,-0.003657
4,3663553,18403317,4.233382,4,18018,452.0,150.0,2,117.0,2,...,1.951943,0.170897,0.130699,-0.294763,-0.740462,1.061017,-0.559347,0.526107,1.678918,0.005227
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
506686,4170235,34214279,4.560999,0,13832,375.0,185.0,2,224.0,2,...,-3.174117,-0.387888,0.366156,-0.061504,-0.057962,0.135373,-0.057691,-0.029002,0.046685,0.035598
506687,4170236,34214287,2.578169,0,3154,408.0,185.0,2,224.0,2,...,-0.107834,-0.391294,-0.627349,-0.315318,1.121349,-1.063174,-1.208455,-0.898189,0.611823,-0.167170
506688,4170237,34214326,3.912023,4,16661,490.0,150.0,3,226.0,2,...,0.417915,0.134416,0.028321,0.029680,0.141157,-0.068056,0.001012,0.016266,0.100591,0.013396
506689,4170238,34214337,5.313206,4,16621,516.0,150.0,2,224.0,2,...,0.417915,0.134416,0.028321,0.029680,0.141157,-0.068056,0.001012,0.016266,0.100591,0.013396


In [84]:
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error

# 개별 기반 모델에서 최종 메타 모델이 사용할 학습 및 테스트용 데이터를 생성하기 위한 함수. 
def get_stacking_base_datasets(model, X_train_n, y_train_n, X_test_n, n_folds ):
    # 지정된 n_folds값으로 KFold 생성.
    kf = KFold(n_splits=n_folds, shuffle=False)
    #추후에 메타 모델이 사용할 학습 데이터 반환을 위한 넘파이 배열 초기화 
    train_fold_pred = np.zeros((X_train_n.shape[0] ,1 ))
    test_pred = np.zeros((X_test_n.shape[0],n_folds))
    print(model.__class__.__name__ , ' model 시작 ')
    
    for folder_counter , (train_index, valid_index) in enumerate(kf.split(X_train_n)):
        #입력된 학습 데이터에서 기반 모델이 학습/예측할 폴드 데이터 셋 추출 
        print('\t 폴드 세트: ',folder_counter,' 시작 ')
        
        X_tr = X_train_n[train_index] 
        y_tr = y_train_n[train_index] 
        X_te = X_train_n[valid_index]  
        
        #폴드 세트 내부에서 다시 만들어진 학습 데이터로 기반 모델의 학습 수행.
        model.fit(X_tr , y_tr)       
        #폴드 세트 내부에서 다시 만들어진 검증 데이터로 기반 모델 예측 후 데이터 저장.
        train_fold_pred[valid_index, :] = model.predict(X_te).reshape(-1,1)
        #입력된 원본 테스트 데이터를 폴드 세트내 학습된 기반 모델에서 예측 후 데이터 저장. 
        test_pred[:, folder_counter] = model.predict(X_test_n)
            
    # 폴드 세트 내에서 원본 테스트 데이터를 예측한 데이터를 평균하여 테스트 데이터로 생성 
    test_pred_mean = np.mean(test_pred, axis=1).reshape(-1,1)    
    
    #train_fold_pred는 최종 메타 모델이 사용하는 학습 데이터, test_pred_mean은 테스트 데이터
    return train_fold_pred , test_pred_mean

In [85]:
knn_train, knn_test = get_stacking_base_datasets(knn_clf, x_train, y_train, test, 7)
rf_train, rf_test = get_stacking_base_datasets(rf_clf, x_train, y_train, test, 7)
dt_train, dt_test = get_stacking_base_datasets(dt_clf, x_train, y_train, test,  7)    
ada_train, ada_test = get_stacking_base_datasets(ada_clf, x_train, y_train, test, 7)

KNeighborsClassifier  model 시작 
	 폴드 세트:  0  시작 


KeyError: "None of [Int64Index([ 84363,  84364,  84365,  84366,  84367,  84368,  84369,  84370,\n             84371,  84372,\n            ...\n            590530, 590531, 590532, 590533, 590534, 590535, 590536, 590537,\n            590538, 590539],\n           dtype='int64', length=506177)] are in the [columns]"

In [None]:
Stack_final_X_train = np.concatenate((knn_train, rf_train, dt_train, ada_train), axis=1)
Stack_final_X_test = np.concatenate((knn_test, rf_test, dt_test, ada_test), axis=1)
print('원본 학습 피처 데이터 Shape:',X_train.shape, '원본 테스트 피처 Shape:',X_test.shape)
print('스태킹 학습 피처 데이터 Shape:', Stack_final_X_train.shape,
      '스태킹 테스트 피처 데이터 Shape:',Stack_final_X_test.shape)

In [None]:
lr_final.fit(Stack_final_X_train, y_train)
stack_final = lr_final.predict(Stack_final_X_test)

print('최종 메타 모델의 예측 정확도: {0:.4f}'.format(accuracy_score(y_test, stack_final)))