In [None]:
import os, gc, sys, warnings, datetime
warnings.filterwarnings('ignore')
from tqdm import tqdm
from IPython.display import FileLink

import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import GroupKFold
import xgboost as xgb

import matplotlib.pyplot as plt
%matplotlib inline

# Matplotlib renamed its bundled seaborn style sheets to 'seaborn-v0_8*' and
# later dropped the old names; fall back so the notebook runs on both.
try:
    plt.style.use('seaborn')
except OSError:
    plt.style.use('seaborn-v0_8')
import seaborn as sns
import random

# Seed every RNG used in this notebook exactly once.
seed = 10
np.random.seed(seed)
random.seed(seed)
# NOTE(review): setting PYTHONHASHSEED here is only inherited by child
# processes; the hash seed of the running kernel is fixed at startup.
os.environ['PYTHONHASHSEED'] = str(seed)
import eli5

In [None]:
%%time
def _read_indexed_csv(path):
    """Read one competition CSV, using the TransactionID column as the index."""
    return pd.read_csv(path, index_col='TransactionID')


# Transaction tables (train / test)
train_transaction = _read_indexed_csv('../input/ieee-fraud-detection/train_transaction.csv')
test_transaction = _read_indexed_csv('../input/ieee-fraud-detection/test_transaction.csv')

# Identity tables (train / test)
train_identity = _read_indexed_csv('../input/ieee-fraud-detection/train_identity.csv')
test_identity = _read_indexed_csv('../input/ieee-fraud-detection/test_identity.csv')

# Submission template
sample_submission = _read_indexed_csv('../input/ieee-fraud-detection/sample_submission.csv')



In [None]:
# Left-join the identity table onto the transaction table by index, so every
# transaction row is kept whether or not it has an identity record.
index_left_join = dict(how='left', left_index=True, right_index=True)
train = pd.merge(train_transaction, train_identity, **index_left_join)
test = pd.merge(test_transaction, test_identity, **index_left_join)

# The four source tables are no longer needed once merged.
del train_transaction, test_transaction, train_identity, test_identity

print ('data length after merging')
for merged in (train, test):
    print(merged.shape)

gc.collect()

## Initial setup

In [None]:
# Name of the label column.
col_target = 'isFraud'

# Master switch for the exploratory plots produced by later cells.
plot = True

# Registries that later cells append to: columns to drop before modelling and
# columns queued for each encoding scheme.
drop_cols, count_encode_list, one_hot_encode_list, target_encode_list = [], [], [], []

### Helper function

In [None]:
def reduce_mem_usage(df):
    """Downcast the columns of ``df`` in place to reduce memory usage.

    * signed-integer NumPy columns -> narrowest of int8/int16/int32 whose range
      strictly contains the column's min and max (otherwise left unchanged);
    * float NumPy columns -> float16 or float32 by the same rule, else float64;
    * object / string columns -> ``category``;
    * every other dtype (bool, datetime, unsigned, category, nullable
      extension types) is left untouched. Previously these fell into the float
      branch, which turned bool columns into float16 and raised on datetimes.

    WARNING: float16 keeps only about three significant decimal digits, so
    this conversion can lose precision.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink; it is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    int_candidates = (np.int8, np.int16, np.int32)
    float_candidates = (np.float16, np.float32)

    for col in df.columns:
        col_type = df[col].dtype

        # Text columns become categoricals.
        if col_type == object or str(col_type).startswith('str'):
            df[col] = df[col].astype('category')
            continue

        # Only plain NumPy signed ints / floats can be downcast safely.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'if':
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind == 'i':
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if c_min > limits.min and c_max < limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if c_min > limits.min and c_max < limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Includes all-NaN columns, whose min/max compare False.
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    if start_mem > 0:
        print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

def Num_EDA(data, column):
    """Plot a numeric column of ``data`` against the target.

    Left panel: distribution of ``column`` for negative vs positive rows
    (missing values excluded). Right panel: mean of ``column`` per target
    class, with the overall mean drawn as a dashed red line.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing both ``column`` and the ``col_target`` column.
    column : str
        Name of the numeric column to explore.
    """
    # Build the masks from `data` itself (not the global `train`) so the
    # function works for any frame it is handed.
    present = ~data[column].isna()

    plt.figure(figsize=(12, 5))
    plt.subplot(121)
    sns.distplot(data[present & (data[col_target] == 0)][column], label='negative')
    sns.distplot(data[present & (data[col_target] == 1)][column], label='positive')
    plt.title(column + ' distplot')
    plt.legend()

    plt.subplot(122)
    # Series.plot must receive the plot type as a keyword argument.
    data.groupby(col_target)[column].mean().plot(kind='bar')
    plt.title(column + ' average')
    plt.axhline(data[column].mean(), color='r', linestyle='-.')

    plt.tight_layout()
    plt.show()

# By default only the top 30 categories are shown in each panel.
def Cat_EDA(data, column, n_show=30):
    """Plot a categorical column of ``data`` against the target.

    Left panel: row counts of the ``n_show`` most frequent categories.
    Right panel: mean target value of the ``n_show`` categories with the
    highest mean, with the overall target mean as a dashed red line.
    Both panels are ordered by category label.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing ``column`` and the ``col_target`` column.
    column : str
        Name of the categorical column to explore.
    n_show : int, default 30
        Maximum number of categories drawn per panel.
    """
    plt.figure(figsize=(12, 5))
    plt.subplot(121)
    temp_data = data[column].value_counts().sort_values(ascending=False).iloc[:n_show]
    # Series.plot must receive the plot type as a keyword argument.
    temp_data.sort_index().plot(kind='bar')
    plt.title('Number of %s in data'%column)

    plt.subplot(122)
    temp_data = data.groupby(column)[col_target].mean().sort_values(ascending=False).iloc[:n_show]
    temp_data.sort_index().plot(kind='bar')
    plt.axhline(data[col_target].mean(), color='r', linestyle='-.')
    plt.title('Average fraud percentage per %s'%column)

    plt.tight_layout()
    plt.show()

def distribution(column, method='original'):
    """Compare the distribution of ``column`` across train/test and fraud/non-fraud.

    Reads the global ``train`` and ``test`` frames. Draws two figures from the
    non-missing values of ``column``: train vs test, then positive vs negative
    rows of ``train``.

    Parameters
    ----------
    column : str
        Column present in both ``train`` and ``test``.
    method : {'original', 'log1p'}, default 'original'
        Plot the raw values or ``np.log1p`` of them. Any other value prints
        'Wrong method' and returns without plotting.

    Notes
    -----
    This function used to be defined twice, byte-identically; this is the
    single remaining definition.
    """
    transforms = {'original': lambda values: values, 'log1p': np.log1p}
    if method not in transforms:
        print ('Wrong method')
        return
    transform = transforms[method]

    train_col = train[column].dropna()
    test_col = test[column].dropna()
    # Target values aligned to the rows that survived dropna().
    target = train.loc[train_col.index, col_target]

    plt.figure(figsize=(12, 4))
    sns.distplot(transform(train_col), label='train')
    sns.distplot(transform(test_col), label='test')
    plt.legend()
    plt.title('%s distribution in train and test'%column)
    plt.show()

    plt.figure(figsize=(12, 4))
    sns.distplot(transform(train_col[target == 1]), label='positive')
    sns.distplot(transform(train_col[target == 0]), label='negative')
    plt.legend()
    plt.title('%s distribution in positive and negative'%column)
    plt.show()


def make_count_full(col, dropna_encode=False):
    """Add ``<col>_count_full`` to the global ``train`` and ``test`` frames.

    The new column holds, for each row, how many times that row's value of
    ``col`` occurs in train and test combined.

    Parameters
    ----------
    col : str
        Column present in both frames.
    dropna_encode : bool, default False
        Passed to ``value_counts(dropna=...)``. With False, missing values
        get their own count; with True they are left out of the table.
    """
    temp_data = pd.concat([train[col], test[col]], ignore_index=True)
    # Build the frequency table once and reuse it for both frames.
    counts = temp_data.value_counts(dropna=dropna_encode)

    train['%s_count_full'%col] = train[col].map(counts)
    test['%s_count_full'%col] = test[col].map(counts)

## datetime : TransactionDT

In [None]:
def make_datetime_features(df):
    """Add calendar features derived from ``TransactionDT`` to ``df`` in place.

    ``TransactionDT`` is treated as a number of seconds after 2017-11-30.
    # NOTE(review): that reference date is an assumption of this notebook.

    Columns added: ``DT`` (timestamp); ``DT_M`` / ``DT_W`` / ``DT_D``
    (month, week and day counters starting from 2017); ``DT_day_week``,
    ``DT_day_month``, ``DT_Hour``, ``Month``; the four time-of-day flags
    ``early_morning`` (0-5h), ``morning`` (6-11h), ``afternoon`` (12-17h),
    ``evening`` (18-23h); and ``is_holiday`` (US federal holiday).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a numeric ``TransactionDT`` column.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    from pandas.tseries.holiday import USFederalHolidayCalendar as calendar
    start_date = pd.to_datetime('2017-11-30')
    us_holidays = calendar().holidays(start=pd.Timestamp('2017-01-01'),
                                      end=pd.Timestamp('2019-01-01'))

    # Vectorised replacement for a per-row datetime.timedelta apply.
    df['DT'] = start_date + pd.to_timedelta(df['TransactionDT'], unit='s')
    dt = df['DT'].dt

    # `.dt.weekofyear` no longer exists in pandas 2; `.dt.isocalendar()` is
    # its replacement. Keep the old accessor as a fallback for old pandas.
    if hasattr(dt, 'isocalendar'):
        week = dt.isocalendar().week.astype(np.int64)
    else:
        week = dt.weekofyear

    df['DT_M'] = ((dt.year-2017)*12 + dt.month).astype(np.int8)
    df['DT_W'] = ((dt.year-2017)*52 + week).astype(np.int8)
    df['DT_D'] = ((dt.year-2017)*365 + dt.dayofyear).astype(np.int16)

    df['DT_day_week'] = dt.dayofweek.astype(np.int8)
    df['DT_day_month'] = dt.day.astype(np.int8)
    df['DT_Hour'] = dt.hour.astype(np.int8)

    df['Month'] = dt.month.astype(np.int8)
    # Time-of-day flags; Series.between is inclusive on both ends.
    df['early_morning'] = df['DT_Hour'].between(0, 5).astype(int)
    df['morning'] = df['DT_Hour'].between(6, 11).astype(int)
    df['afternoon'] = df['DT_Hour'].between(12, 17).astype(int)
    df['evening'] = df['DT_Hour'].between(18, 23).astype(int)

    # Compare midnight-normalised timestamps with the holiday dates; the
    # unit-less astype('datetime64') used before is rejected by pandas 2.
    df['is_holiday'] = dt.normalize().isin(us_holidays).astype(np.int8)
    return df

# Derive the calendar features on both frames (each is mutated in place).
for df in [train, test]:
    df = make_datetime_features(df)
    
# Total transactions per timeblock (month / week / day), counted over
# train and test together.
for col in ['DT_M','DT_W','DT_D']:
    temp_df = pd.concat([train[[col]], test[[col]]])
    fq_encode = temp_df[col].value_counts().to_dict()
            
    train[col+'_total'] = train[col].map(fq_encode)
    test[col+'_total']  = test[col].map(fq_encode)

# Register the raw timestamp for removal; the guard keeps this cell
# idempotent when it is re-run.
if 'DT' not in drop_cols:
    drop_cols.append('DT')
del temp_df

if plot:
    # Series.plot must receive the plot type as a keyword argument.
    train.groupby('DT_M')[col_target].mean().plot(kind='bar')
    plt.title('Fraud proportion through months')
    plt.show()

    train.groupby(['DT_Hour'])[col_target].mean().plot(figsize=(15, 6))
    # Red lines mark the boundaries between the four time-of-day buckets.
    for i in [5, 11, 17]:
        plt.axvline(i, color='r')
    plt.show()


In [None]:
# Mean fraud rate for every (month, hour) pair, drawn as one continuous line.

if plot:
    t = train.groupby(['DT_M', 'DT_Hour'])[col_target].mean()
    t.plot(figsize=(15, 6))
    plt.title('Average fraud rate per hour per month')

    # One dashed separator after every block of 24 points.
    for boundary in range(24, 24 * (len(t) // 24), 24):
        plt.axvline(boundary, color='r', linestyle='--')

    plt.show()

In [None]:
if plot:
    cols = ['TransactionAmt']
    periods = ['DT_M', 'DT_D']

    temp_df = pd.concat([train[cols+periods], test[cols+periods]], axis=0)

    for period in periods:
        for col in cols:
            plt.figure(figsize=(12, 4))

            if period == 'DT_D':
                # Daily scatter: amounts above 5000 are highlighted in red.
                temp_df[temp_df[col]<5000].set_index(period)[col].plot(style='.', title=col)
                temp_df[temp_df[col]>5000].set_index(period)[col].plot(style='.', title=col, color='r')
                plt.axhline(5000, ls='--', c='r')

            else:
                # Monthly view: mean amount per month.
                temp_df.groupby(period)[col].mean().plot(kind='bar', title=col)

            plt.show()

    # Count directly from the boolean mask; the old code sorted the column
    # first, which only forced pandas to realign the mask to the frame.
    print ('number of train data which exceed 5000:', int((train['TransactionAmt'] > 5000).sum()))
    print ('number of test data which exceed 5000:', int((test['TransactionAmt'] > 5000).sum()))

    del temp_df

In [None]:
if plot:
    # TransactionAmt distribution
    distribution('TransactionAmt', method='log1p')

    def _overlay_distplots(labelled_series, title=None):
        """Draw several distplots on one 12x4 figure, then legend/title/show."""
        plt.figure(figsize=(12, 4))
        for label, series in labelled_series:
            sns.distplot(series, label=label)
        plt.legend()
        if title is not None:
            plt.title(title)
        plt.show()

    amt_train = train['TransactionAmt']
    amt_test = test['TransactionAmt']
    amt_pos = train.loc[train[col_target] == 1, 'TransactionAmt']
    amt_neg = train.loc[train[col_target] == 0, 'TransactionAmt']

    # Distance of the amount to its nearest integer, e.g. abs(1.7 - 2).
    _overlay_distplots([
        ('train', abs(amt_train - np.round(amt_train))),
        ('test', abs(amt_test - np.round(amt_test))),
    ])
    _overlay_distplots([
        ('positive', abs(amt_pos - np.round(amt_pos))),
        ('negative', abs(amt_neg - np.round(amt_neg))),
    ])

    # Decimal part of the amount.
    _overlay_distplots([
        ('train', amt_train - amt_train.astype(int)),
        ('test', amt_test - amt_test.astype(int)),
    ], title='TransactionAmt decimal distribution in train and test')
    _overlay_distplots([
        ('positive', 1000*abs(amt_pos - amt_pos.astype(int))),
        ('negative', 1000*abs(amt_neg - amt_neg.astype(int))),
    ], title='TransactionAmt decimal distribution in positive and negative data')

## money : TransactionAmt

In [None]:
if plot:
    cols = ['TransactionAmt']
    periods = ['DT_M', 'DT_D', 'DT_Hour']

    temp_df = pd.concat([train[cols+periods], test[cols+periods]], axis=0)

    for period in periods:
        for col in cols:
            if period == 'DT_D':
                # Daily scatter: amounts above 5000 are highlighted in red.
                plt.figure(figsize=(12, 4))
                temp_df[temp_df[col]<5000].set_index(period)[col].plot(style='.', title=col)
                temp_df[temp_df[col]>5000].set_index(period)[col].plot(style='.', title=col, color='r')
                plt.axhline(5000, ls='--', c='r')
                plt.title('%s Scatter plot and outlier'%period)
                plt.show()

            else:
                # Fraud rate per period, overlaid (scaled by 1000) on the
                # per-period mean and then std of the amount.
                temp_2 = train.groupby(period)[col_target].mean()
                for stat_label, stat in [('Mean', 'mean'), ('Std', 'std')]:
                    temp_1 = train.groupby(period)[col].agg(stat)

                    plt.figure(figsize=(12, 4))
                    temp_1.plot(kind='bar')
                    (1000*temp_2).plot()
                    plt.title('%s %s and fraud rate, coef %s'%(period, stat_label, np.corrcoef(temp_1, temp_2)[0, 1]))

                    plt.show()

    # Count directly from the boolean mask; sorting first only forced pandas
    # to realign the mask to the frame.
    print ('number of train data which exceed 5000:', int((train['TransactionAmt'] > 5000).sum()))
    print ('number of test data which exceed 5000:', int((test['TransactionAmt'] > 5000).sum()))

    del temp_df, temp_1, temp_2

In [None]:
# # remove outliers
# train[cols] = train[cols].clip(0, 5000)
# test[cols] = test[cols].clip(0, 5000)

for df in [train, test]:
    # Distance of the amount to its nearest integer.
    df['TransactionAmt_charge'] = abs(df['TransactionAmt'] - np.round(df['TransactionAmt']))

    # Third decimal digit of the amount, kept as a one-character string.
    # (A numeric "decimal part" used to be written to this column first and
    # was immediately overwritten; that dead assignment has been removed.)
    # Original author's note: foreign currency conversion produces the
    # extra decimal places.
    df['TransactionAmt_decimal'] = df['TransactionAmt'].apply(lambda x:str('%.3f'%x)[-1])

# Mean / std of TransactionAmt per time block and per card / address key,
# computed over train and test together.

for t in ['DT_M', 'DT_Hour', 'card1','card2','card3', 'card5','addr1']:
    temp_df = pd.concat([train[[t, 'TransactionAmt']], test[[t, 'TransactionAmt']]], axis=0)
    # Aggregate once per key instead of once per frame and statistic.
    amt_stats = temp_df.groupby(t)['TransactionAmt'].agg(['mean', 'std'])
    for df in [train, test]:
        df['VAL_MEAN_TransactionAmt_BY_%s'%t] = df[t].map(amt_stats['mean'])
        df['VAL_STD_TransactionAmt_BY_%s'%t] = df[t].map(amt_stats['std'])
    del temp_df
    
gc.collect()

## product code : ProductCD

In [None]:
if plot:
    Cat_EDA(train, 'ProductCD')

# 0/1 flag for product code 'C'. An explicit cast replaces the former
# `sum([mask])`, which only worked because `0 + bool_series` yields ints.
train['is_code_C'] = (train['ProductCD'] == 'C').astype(int)
test['is_code_C'] = (test['ProductCD'] == 'C').astype(int)

for product_col in ['ProductCD']:
    make_count_full(product_col)
    if plot:
        distribution('%s_count_full'%product_col)


## card : card1~6

In [None]:
if plot:
    # EDA for the card columns with few unique values
    for col in ['card4', 'card6']:
        Cat_EDA(train, col)

# 0/1 flags; an explicit cast replaces the former `sum([mask])` idiom.
train['is_card4_discover'] = (train['card4'] == 'discover').astype(int)
test['is_card4_discover'] = (test['card4'] == 'discover').astype(int)

# NOTE(review): despite its name this column flags card6 == 'credit'. The
# name is kept because later cells may reference it; consider renaming.
train['is_card6_discover'] = (train['card6'] == 'credit').astype(int)
test['is_card6_discover'] = (test['card6'] == 'credit').astype(int)


In [None]:
if plot:
    # Plotting method used for each numeric card column.
    card_plot_methods = {'card1': 'original', 'card2': 'original', 'card3': 'log1p', 'card5': 'original'}
    for i, m in card_plot_methods.items():
        distribution(i, method=m)


In [None]:
# Count-encode selected card features.
# Explained in this kernel: https://www.kaggle.com/nroman/eda-for-cis-fraud-detection

# card3 is skipped here: the EDA did not show any clue for it.
# NOTE(review): card3 is nevertheless registered in count_encode_list below.
card_count_cols = ('card1', 'card2', 'card5')
for card_col in card_count_cols:
    make_count_full(card_col)
    if plot:
        distribution('%s_count_full'%card_col)

count_encode_list += ['card1', 'card2', 'card3', 'card5']

## addr : addr1~2

In [None]:
for col in ('addr1', 'addr2'):
    # Cardinality of the column in each frame.
    for frame_label, frame in (('train', train), ('test', test)):
        print ('nunique in %s' % frame_label, col, frame[col].nunique())

    # Train rows whose value does / does not also occur in test.
    seen_in_test = train[col].isin(test[col])
    print('No intersection in Train', col, len(train[~seen_in_test]))
    print('Intersection in Train', col, len(train[seen_in_test]))

    if plot:
        Cat_EDA(train, col, 30)
        distribution(col)

    print('#'*20)
    

In [None]:
# TODO: decide whether missing values should be filled automatically when
# they are mapped.
# Count-encode the addr features.
# Explained in this kernel: https://www.kaggle.com/nroman/eda-for-cis-fraud-detection

addr_count_cols = ('addr1', 'addr2')
for addr_col in addr_count_cols:
    make_count_full(addr_col)
    if plot:
        distribution('%s_count_full'%addr_col)

count_encode_list += ['addr1', 'addr2']

In [None]:
def make_uid(data):
    """Add composite "uid" key columns built from card1/card2/card5/addr1/P_emaildomain.

    Each part is cast to ``str`` and joined with ``'_'``. The separator is
    what keeps distinct tuples distinct: without it ``('1', '23')`` and
    ``('12', '3')`` would both give the key ``'123'``. Missing values take
    part as the text that ``astype(str)`` produces for them.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with the five source columns; it is modified in place.

    Returns
    -------
    (pandas.DataFrame, pandas.Index)
        The same frame, and the names of the columns this call added.
    """
    former_cols = data.columns

    card1 = data['card1'].astype(str)
    card2 = data['card2'].astype(str)
    card5 = data['card5'].astype(str)
    addr1 = data['addr1'].astype(str)
    p_email = data['P_emaildomain'].astype(str)
    sep = '_'

    # card-only keys (4)
    data['uid_card1_2'] = card1 + sep + card2
    data['uid_card1_5'] = card1 + sep + card5
    data['uid_card1_2_5'] = data['uid_card1_2'] + sep + card5

    data['uid_card2_5'] = card2 + sep + card5

    # card + addr1 keys (7)
    data['uid_card1_addr1'] = card1 + sep + addr1
    data['uid_card2_addr1'] = card2 + sep + addr1
    data['uid_card5_addr1'] = card5 + sep + addr1

    data['uid_card1_2_addr1'] = data['uid_card1_2'] + sep + addr1
    data['uid_card1_5_addr1'] = data['uid_card1_5'] + sep + addr1
    data['uid_card2_5_addr1'] = data['uid_card2_5'] + sep + addr1

    data['uid_card1_2_5_addr1'] = data['uid_card1_2_5'] + sep + addr1

    # card + purchaser e-mail domain keys (3)
    # NOTE(review): the original comments marked uid_card1_P and
    # uid_card1_2_P as "week after count encode" (presumably "weak").
    data['uid_card1_P'] = card1 + sep + p_email
    data['uid_card2_P'] = card2 + sep + p_email
    data['uid_card1_2_P'] = data['uid_card1_2'] + sep + p_email

    after_cols = data.columns

    new_cols = after_cols.difference(former_cols)
    return data, new_cols

# Build the uid keys on both frames; only train's list of new column names
# is kept.
train, new_cols = make_uid(train)
test = make_uid(test)[0]

gc.collect()

In [None]:
# TODO: decide whether missing values should be filled automatically when
# they are mapped.
# Count-encode the uid features.
# Explained in this kernel: https://www.kaggle.com/nroman/eda-for-cis-fraud-detection

# TODO: 'uid_card1_P' and 'uid_card1_2_P' still need to be confirmed.

uid_cols = [
    'uid_card1_2', 'uid_card1_5', 'uid_card1_2_5', 'uid_card2_5',
    'uid_card1_addr1', 'uid_card2_addr1', 'uid_card5_addr1',
    'uid_card1_2_addr1', 'uid_card1_5_addr1', 'uid_card2_5_addr1',
    'uid_card1_2_5_addr1',
    'uid_card2_P', 'uid_card1_P', 'uid_card1_2_P',
]

for uid_col in uid_cols:
    make_count_full(uid_col)
    if not plot:
        continue
    distribution('%s_count_full'%uid_col)

In [None]:
%%time
def make_card_features(data, target_cols, groupby_cols):
    """Add group-relative ratio features to ``data`` in place.

    For every (groupby_col, target_col) pair two columns are written:

    * ``VAL_MEAN_<target>_BY_<group>``: value divided by its group mean;
    * ``VAL_STD_<target>_BY_<group>``: value divided by its group standard
      deviation, with infinite results (zero std) replaced by NaN.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing all listed columns.
    target_cols : list of str
        Numeric columns to normalise.
    groupby_cols : list of str
        Columns that define the groups.
    """
    for groupby_col in groupby_cols:
        for target_col in target_cols:
            mean_name = 'VAL_MEAN_%s_BY_%s'%(target_col, groupby_col)
            std_name = 'VAL_STD_%s_BY_%s'%(target_col, groupby_col)
            grouped = data.groupby([groupby_col])[target_col]

            data[mean_name] = data[target_col] / grouped.transform('mean')

            # Clean the ratio before storing it. The previous chained
            # `data[col].loc[mask] = nan` wrote through a temporary object,
            # which pandas does not guarantee reaches `data`.
            std_ratio = data[target_col] / grouped.transform('std')
            data[std_name] = std_ratio.replace([np.inf, -np.inf], np.nan)
            
# TransactionAmt has already been handled in an earlier cell.
groupby_cols = ['card%d' % n for n in (1, 2, 3, 4, 5)]
target_cols = ['D15', 'id_02']

for df in (train, test):
    make_card_features(df, target_cols, groupby_cols)

# Drop the loop variable and the two helper lists from the namespace.
del df, groupby_cols, target_cols
gc.collect()


## dist : dist1~2

In [None]:
if plot:
    # dist1 / dist2: log1p distributions plus the numeric EDA panels.
    for col in ('dist1', 'dist2'):
        distribution(col, method='log1p')
        Num_EDA(train, col)

    #     print (col, '-'*30)
    #     print (train[col].value_counts(dropna=False))

## emaildomain : P_emaildomain, R_emaildomain

In [None]:
# email information features

def _add_email_domain_features(df, prefix):
    """Fill ``<prefix>_emaildomain`` and derive its country/type/domain/len columns."""
    source = '%s_emaildomain' % prefix
    df[source] = df[source].fillna('Unknown')

    is_unknown = df[source] == 'Unknown'
    parts = df[source].str.split('.')

    # A single capture group returns the first matching alternative as a
    # Series; rows without a match stay NaN.
    country = df[source].str.extract(r'(\.de|\.ds|\.mx|\.es|\.fr|\.uk|\.jp|Unknown)', expand=False)
    df['%s_country_emaildomain' % prefix] = country.str.replace('.', '', regex=False)

    # First label of the domain; NaN when the domain is unknown.
    df['%s_type_emaildomain' % prefix] = parts.str[0].where(~is_unknown)

    domain = df[source].str.extract(r'(\.com|\.net|\.co|Unknown)', expand=False)
    df['%s_domain_emaildomain' % prefix] = domain.str.replace('.', '', regex=False)

    # Number of dot-separated labels; 0 when the domain is unknown.
    df['%s_len_emaildomain' % prefix] = parts.str.len().where(~is_unknown, 0)


def email_domain(df):
    """Add e-mail-domain features for purchaser (P) and recipient (R) in place.

    Missing domains are filled with ``'Unknown'``. For each of the two
    prefixes the columns ``<p>_country_emaildomain``, ``<p>_type_emaildomain``,
    ``<p>_domain_emaildomain`` and ``<p>_len_emaildomain`` are added, followed
    by ``same_eamildomain`` (P equals R) and ``is_es`` (P country is 'es').

    Fixes over the previous version:

    * the 'Unknown' checks compared a *list* (the result of ``str.split``)
      to a string and never fired, so the type was 'Unknown' rather than NaN
      and the length 1 rather than 0;
    * the R domain pattern used ``\\.Unknown``, which can never match the
      filled value; it now matches ``Unknown`` like the P pattern;
    * the patterns are raw strings.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in (previously ``None``).
    """
    for prefix in ('P', 'R'):
        _add_email_domain_features(df, prefix)

    # The column name keeps its historical spelling because other cells
    # reference it.
    df["same_eamildomain"] = (df["P_emaildomain"] == df["R_emaildomain"]).astype(int)
    df['is_es'] = (df['P_country_emaildomain'] == 'es').astype(int)
    return df
        
# Add the e-mail-domain features to both frames (each is mutated in place).
for df in (train, test):
    df = email_domain(df)

if plot:
    email_eda_cols = ('P_country_emaildomain', 'P_type_emaildomain', 'P_domain_emaildomain',
                      'P_len_emaildomain', 'same_eamildomain')
    for col in email_eda_cols:
        Cat_EDA(train, col)

# Register the new categorical columns for the encoding stage.
one_hot_encode_list += ['P_country_emaildomain', 'P_domain_emaildomain']
target_encode_list += ['P_type_emaildomain']


In [None]:
if plot:
    # Compare the share of each e-mail provider in train (bars) and test
    # (red line).
    plt.figure(figsize=(12, 5))
    v = train['P_type_emaildomain'].value_counts(normalize=True)
    # Series.plot must receive the plot type as a keyword argument.
    v.plot(kind='bar')
    # reindex (not .loc) so a provider missing from test shows as a gap
    # rather than raising KeyError.
    test['P_type_emaildomain'].value_counts(normalize=True).reindex(v.index).plot(color='r')
    plt.xticks(rotation=90)
    plt.show()


## C : C1~14

In [None]:
# Names of the fourteen C columns, C1..C14.
c_cols = ['C%s'%i for i in range(1, 15)]

# Row-wise mean and standard deviation across the C columns.
for df in (train, test):
    c_block = df[c_cols]
    df['C_mean'] = c_block.mean(axis=1)
    df['C_std'] = c_block.std(axis=1)

if plot:
    for col in c_cols + ['C_mean', 'C_std']:
        distribution(col, method='log1p')

# Count-encode every C column over train and test combined.
for c_col in c_cols:
    make_count_full(c_col)
    if plot:
        distribution('%s_count_full'%c_col)
    

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

# Number of principal components kept from the C block.
n_pc = 4

# impute -> standardise -> PCA, fitted on the train C1..C14 columns only.
C_pca_encoder = Pipeline(steps=[
    ('SimpleImputer', SimpleImputer()),
    ('normalize', StandardScaler()),
    ('pca', PCA(n_components=n_pc)),
])
C_pca_encoder.fit(train[['C%d' % k for k in range(1, 15)]])

def C_pca(df):
    """Return ``df`` with the fitted pipeline's ``C_pca0..`` component columns appended."""
    c_cols = ['C%d' % k for k in range(1, 15)]
    component_names = ['C_pca%d' % k for k in range(n_pc)]
    components = pd.DataFrame(C_pca_encoder.transform(df[c_cols]),
                              index=df.index,
                              columns=component_names)
    return pd.concat([df, components], axis=1)

train = C_pca(train)
test = C_pca(test)

In [None]:
%%time
col1 = ['card1','card2','card5','addr1']
col2 = ['C%s'%i for i in range(1, 15)]

# For every (key, C column) pair: per key value, the number of non-zero C
# entries divided by the number of zero entries (+0.01 in the denominator),
# counted over train and test together.
for col_1 in col1:
    for col_2 in col2:
        temp_df = pd.concat([train[[col_1, col_2]], test[[col_1, col_2]]], ignore_index=True)
        is_zero = temp_df[col_2] == 0
        col_count0 = temp_df[is_zero].groupby(col_1)[col_2].count()
        col_count1 = temp_df[~is_zero].groupby(col_1)[col_2].count()

        ratio_name = 'VAL_RATIO_zero_nonzero_%s_BY_%s'%(col_2, col_1)
        for df in (train, test):
            nonzero_count = df[col_1].map(col_count1)
            zero_count = df[col_1].map(col_count0)
            df[ratio_name] = nonzero_count / (zero_count + 0.01)

        del temp_df
        gc.collect()
        
        

## D : D1~15

In [None]:
# Names of the fifteen D columns, D1..D15.
d_cols = ['D%s'%i for i in range(1, 16)]

# Per-row count of missing D values and mean of the D values.
for df in (train, test):
    d_block = df[d_cols]
    df['D_isna_num'] = d_block.isna().sum(axis=1)
    df['D_mean'] = d_block.mean(axis=1)

if plot:
    for col in d_cols + ['D_isna_num', 'D_mean']:
        distribution(col)

## M : M1~9

In [None]:
# Names of the nine M columns, M1..M9.
m_cols = ['M%s'%i for i in range(1, 10)]

for df in (train, test):
    # Count the missing M values first, then fill them with 'Unknown'.
    df['M_isna_num'] = df[m_cols].isna().sum(axis=1)
    df[m_cols] = df[m_cols].fillna('Unknown')
    # Flag rows whose M4 value is 'M2'.
    df['is_M2'] = (df['M4'] == 'M2').astype(int)

if plot:
    for col in m_cols:
        Cat_EDA(train, col)

    distribution('M_isna_num')
    
    

## V : V1~339

In [None]:
### V : 1~11, 12~34, 35~52, 53~94, 95~106, 107~125, 126~130, 131~134, 135~137, 138~278, 279~305, 306~315, \
#    316~318, 319~321, 322~339

def v_variable(df):
    """Add missing-value counts and V257/V258 interaction features in place.

    Columns added:

    * ``v_isna_num``: missing values across V1..V339 (the previous
      ``range(1, 339)`` silently left V339 out);
    * ``v_<start>_<end>_isna_num``: missing values within each V block
      listed in the comment above;
    * ``V257_258_sum``: ``V257 + V258``;
    * ``V257_258_equal``: 1 when V257 and V258 are equal (two missing values
      count as equal), else 0. This used to be computed from the global
      ``train`` for every frame, and with ``!=`` despite its name.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in (previously ``None``).
    """
    df['v_isna_num'] = df[['V%s'%k for k in range(1, 340)]].isna().sum(axis=1)

    block_starts = [1, 12, 35, 53, 95, 107, 126, 131, 135, 138, 279, 306, 316, 319, 322]
    block_ends = [11, 34, 52, 94, 106, 125, 130, 134, 137, 278, 305, 315, 318, 321, 339]
    for start, end in zip(block_starts, block_ends):
        block_cols = ['V%s'%k for k in range(start, end+1)]
        df['v_%s_%s_isna_num'%(start, end)] = df[block_cols].isna().sum(axis=1)

    df['V257_258_sum'] = df['V257'] + df['V258']
    # -999 is a sentinel so that two missing values compare equal.
    df['V257_258_equal'] = (df['V257'].fillna(-999) == df['V258'].fillna(-999)).astype(int)
    return df
    
# Add the V-block features to both frames (each is mutated in place).
for df in (train, test):
    df = v_variable(df)

## id : id_01~38

In [None]:
def id_variable(df):
    """Add identity-derived features to ``df`` in place.

    * ``id_isna_num``: number of missing values across id_01..id_38;
    * ``OS``: first space-separated token of ``id_30``;
    * ``good_browser``: 1 when the text of ``id_31`` contains any of
      'edge', 'google', 'ie', 'safari' or 'nan', else 0.
      NOTE(review): the match is an unanchored substring search, so 'ie'
      also matches inside longer words; confirm that is intended.
    """
    id_cols = ['id_%02d' % k for k in range(1, 39)]
    df['id_isna_num'] = df[id_cols].isna().sum(axis=1)

    os_tokens = df["id_30"].str.split(' ', expand = True)
    df['OS'] = os_tokens[[0]]

    browser_hits = df['id_31'].astype(str).str.extract('(edge)|(google)|(ie)|(safari)|(nan)')
    df['good_browser'] = browser_hits.any(axis = 'columns').astype(int)

# Add the identity features to both frames (each is mutated in place).
for df in (train, test):
    df = id_variable(df)

## Device : DeviceInfo, DeviceType

In [None]:
# The 15 most frequent DeviceInfo values over train and test combined,
# computed before missing values are filled below.
popular_device_list = pd.concat([train['DeviceInfo'], test['DeviceInfo']]).value_counts().index[:15].values

def device(df):
    """Add ``popular_deviceinfo`` and ``good_deviceinfo`` flags to ``df`` in place.

    * ``popular_deviceinfo``: 1 when DeviceInfo is in ``popular_device_list``;
    * ``good_deviceinfo``: 1 when the DeviceInfo text matches any alternative
      of the pattern below.
      NOTE(review): ``[^(5\\d)(6\\d)]`` is a character class, not a negated
      group; it excludes single characters only. Confirm the intent.
    """
    info = df["DeviceInfo"]
    df['popular_deviceinfo'] = info.isin(popular_device_list).astype(int)

    pattern_hits = info.astype(str).str.extract('(Trident)|(iOS)|(MacOS)|(Windows)|(rv:[^(5\d)(6\d)])|(nan)')
    df['good_deviceinfo'] = pattern_hits.any(axis = 'columns').astype(int)

for df in (train, test):
    # Fill missing device fields first, then derive the flags.
    filled = df[['DeviceInfo', 'DeviceType']].fillna('Unknown')
    df[['DeviceInfo', 'DeviceType']] = filled
    df = device(df)

## others

In [None]:
def same_tx_in_radius(df,radius = 5):
    """Add ``same_TransactionAmt_in_radius`` to ``df`` in place.

    For each row, counts how many rows at positional offsets
    ``-radius..+radius`` (offset 0, the row itself, included) have the same
    ``TransactionAmt``.
    """
    amounts = df['TransactionAmt']
    same_count = pd.Series(0, index=df.index)
    for offset in range(-radius, radius + 1):
        # shift() leaves NaN at the edges, which never compares equal.
        same_count = same_count + (amounts == amounts.shift(offset))
    df['same_TransactionAmt_in_radius'] = same_count
    
# Add the neighbouring-amount count to both frames (each is mutated in place).
for df in (train, test):
    df = same_tx_in_radius(df)

In [None]:
# Per-row count of missing values across all columns present at this point.
train['nulls'] = train.isna().sum(axis='columns')
test['nulls'] = test.isna().sum(axis='columns')

if plot:
    distribution('nulls')

In [None]:
%%time

cols = ['card1', 'card2', 'card5', 'addr1', 'addr2', 'uid_card1_2', 'uid_card1_5', 'uid_card1_2_5', 'uid_card2_5',\
        'uid_card1_addr1', 'uid_card2_addr1', 'uid_card5_addr1', 'uid_card1_2_addr1', 'uid_card1_5_addr1',\
        'uid_card2_5_addr1', 'uid_card1_2_5_addr1', 'uid_card2_P', 'uid_card1_P', 'uid_card1_2_P']

def make_transactionAmt_groupby(columns):
    """Add per-(ProductCD, key) TransactionAmt statistics to train and test.

    For every key column ``c`` in ``columns`` two features are computed over the
    rows of the notebook-level ``train`` and ``test`` frames taken together:

    * ``ProductCD_<c>_TransactionAmt``      - mean ``TransactionAmt`` of the
      row's ``(ProductCD, c)`` group; ``c`` is compared as text.
    * ``ProductCD_<c>_TransactionAmt_MAPE`` - ``|TransactionAmt - mean| / mean``.

    Parameters
    ----------
    columns : list of str
        Key columns present in both ``train`` and ``test``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, test)`` with the new feature columns appended on the right. The
        global frames themselves are not modified; the caller rebinds them.

    Notes
    -----
    The columns to keep are derived from the ``columns`` argument; earlier the
    return statement consulted the global ``cols`` and broke for any other list.
    Group means are broadcast with ``transform`` rather than repeated inner
    merges, so a row with a missing ``ProductCD`` gets NaN features instead of
    being dropped (which previously made the final row lookup fail).
    """
    columns = list(columns)
    base_cols = columns + ['ProductCD', 'TransactionAmt']
    n_train = train.shape[0]

    combined = pd.concat([train[base_cols], test[base_cols]], axis=0)
    # Keys are compared as text, so a missing key forms its own 'nan' group.
    combined[columns] = combined[columns].astype(str)

    new_features = {}
    for col in columns:
        print (col, ' processing...')
        mean_name = 'ProductCD_%s_TransactionAmt'%col
        group_mean = combined.groupby(['ProductCD', col])['TransactionAmt'].transform('mean')
        new_features[mean_name] = group_mean
        new_features['ProductCD_%s_TransactionAmt_MAPE'%col] = \
            (combined['TransactionAmt'] - group_mean).abs() / group_mean

    features = pd.DataFrame(new_features, index=combined.index)

    # The concat above kept train rows first, so split the features back by position.
    return pd.concat([train, features.iloc[:n_train]], axis=1), \
        pd.concat([test, features.iloc[n_train:]], axis=1)
    

# Rebind train/test to the widened frames returned by the helper: two new columns
# per entry of `cols` (the group mean and its relative deviation).
train, test = make_transactionAmt_groupby(cols)


In [None]:

# col_del = []
# for i in range(339):
#     col = "V" + str(i+1)
#     s = train[col].fillna(0).map(lambda x:0 if x%1 == 0 else 1).sum()
#     if s > 100:
#         print(col,s)
#         col_del.append(col)
# #         del test_transaction[col],train_transaction[col]



## Make train & test data and save memory

In [None]:
# Separate the target and remove the columns collected in drop_cols
y_train = train['isFraud']
X_train = train.drop(columns=drop_cols + ['isFraud'])
X_test = test.drop(columns=drop_cols).copy()

# The merged frames are not referenced by name after this point
del train, test

# From kernel https://www.kaggle.com/gemartin/load-data-reduce-memory-usage
# WARNING! THIS CAN DAMAGE THE DATA
X_train, X_test = reduce_mem_usage(X_train), reduce_mem_usage(X_test)


## Encoding

In [None]:
# Encoding cell: applies, in this order, one-hot, count, target and label encoding
# to X_train / X_test, driven by the notebook-level *_encode_list lists.

# One Hot Encoding
print ('one-hot encoding...')
if one_hot_encode_list:
    # Build the indicator columns once over train+test together so that both
    # matrices end up with the same set of dummy columns.
    n_train = X_train.shape[0]
    temp = pd.get_dummies(
        pd.concat([X_train[one_hot_encode_list], X_test[one_hot_encode_list]], axis=0),
        columns=one_hot_encode_list,
    )
    # Rebind the notebook-level names. Assigning the concatenated frame to a loop
    # variable (as before) left X_train / X_test without any of the new columns.
    X_train = pd.concat([X_train.drop(one_hot_encode_list, axis=1), temp.iloc[:n_train]], axis=1)
    X_test = pd.concat([X_test.drop(one_hot_encode_list, axis=1), temp.iloc[n_train:]], axis=1)
    del n_train

# Count Encoding
# Frequencies come from the training rows only; values unseen there map to NaN.
print ('count encoding...')
for C in count_encode_list:
    temp = X_train[C].value_counts()
    X_train[C] = X_train[C].map(temp)
    X_test[C] = X_test[C].map(temp)

# Target Encoding
# Mean of the target per category, taken over the training rows. Grouping y_train
# by the column directly avoids copying the whole feature matrix on every pass.
print ('target encoding...')
for T in target_encode_list:
    temp = y_train.groupby(X_train[T]).mean()

    X_train[T] = X_train[T].map(temp)
    X_test[T] = X_test[T].map(temp)

# Label Encoding
# Every column that is 'category'-typed in either matrix is encoded on the string
# form of the train+test values, so both share one label space.
print ('label encoding...')
for L in sorted(set(list(X_train.select_dtypes('category').columns)+list(X_test.select_dtypes('category').columns))):
    lbl = preprocessing.LabelEncoder()
    lbl.fit(np.array(list(X_train[L].values) + list(X_test[L].values)).astype(str))
    X_train[L] = lbl.transform(list(X_train[L].values.astype(str)))
    X_test[L] = lbl.transform(list(X_test[L].values.astype(str)))

# A loop variable only exists if its loop ran at least once, so a plain
# `del temp, C, T, L, lbl` raises NameError whenever one of the lists is empty.
for _name in ('temp', 'C', 'T', 'L', 'lbl'):
    globals().pop(_name, None)
del _name
gc.collect()

### Drop columns selected by LGBM feature importance in local tests

In [None]:
# drop_cols_FI = []

# Columns removed from both matrices before modelling
drop_cols_FI = [
    'V122', 'V89', 'id_27', 'V325', 'V196', 'v_95_106_isna_num', 'V252', 'id_22', 'V297', 'V299',
    'V191', 'V334', 'v_126_130_isna_num', 'V113', 'M1', 'V240', 'V138', 'V88', 'id_35', 'V104',
    'V117', 'V302', 'V84', 'id_28', 'V119', 'V241', 'V8', 'V322', 'V330', 'V167', 'id_24', 'V50',
    'V181', 'V305', 'V14', 'V1', 'V118', 'v_1_11_isna_num', 'v_53_94_isna_num', 'V247',
    'V257_258_equal', 'v_131_134_isna_num', 'V27', 'V106', 'id_29', 'v_135_137_isna_num', 'V142',
    'V116', 'V153', 'V9',
    # Disabled entries - these columns are currently kept:
    # 'V21', 'id_12', 'V110', 'V114', 'v_138_278_isna_num', 'V269', 'addr2_count_full', 'V121',
    # 'is_es','addr2', 'V28', 'v_322_339_isna_num', 'V120', 'V107', 'v_306_315_isna_num',
    # 'v_279_305_isna_num', 'V68', 'v_35_52_isna_num', 'v_12_34_isna_num', 'V41', 'is_code_C',
    # 'v_316_318_isna_num', 'V65', 'v_319_321_isna_num', 'v_107_125_isna_num'
]

# Remove the listed columns from both matrices in place.
# With the default errors='raise', DataFrame.drop raises KeyError if any listed
# column is absent, so every name in drop_cols_FI must exist in both frames.
X_train.drop(drop_cols_FI, axis=1, inplace=True)
X_test.drop(drop_cols_FI, axis=1, inplace=True)

In [None]:
# Second downcast pass: the encoding cell replaced several columns with newly
# computed values since reduce_mem_usage was last applied.
X_train = reduce_mem_usage(X_train)
X_test = reduce_mem_usage(X_test)

**Set `debug = True` to hold out the last fifth of the training rows as a labelled test set and see the validation metric**

In [None]:
debug = False
if debug:
    # Positional hold-out: the first four fifths of the rows stay in training and
    # the remaining fifth replaces X_test, with y_test holding its labels.
    split_pos = len(X_train) * 4 // 5
    y_train, y_test = y_train.iloc[:split_pos], y_train.iloc[split_pos:]
    X_train, X_test = X_train.iloc[:split_pos, :], X_train.iloc[split_pos:, :]

## Xgboost

In [None]:
# %%time
# from sklearn.metrics import roc_auc_score
# folds = 6
# gkf = GroupKFold(n_splits = folds)
# split_groups = X_train['DT_M']

# xgb_y_preds = np.zeros(X_test.shape[0])
# count = 0
# for tr_idx, val_idx in gkf.split(X_train, y_train, groups=split_groups):
#     count+=1
#     if count != 1:
#         print ('fold %s ------------------------------------'%count)
        
#         clf = xgb.XGBClassifier(
#             n_estimators=10000,
#             max_depth=9,
#             learning_rate=0.01,
#             subsample=0.9,
#             colsample_bytree=0.9,
#             tree_method='gpu_hist'
#         )

#         X_tr = X_train.iloc[tr_idx, :]
#         y_tr = y_train.iloc[tr_idx]
        
#         X_val = X_train.iloc[val_idx, :]
#         y_val = y_train.iloc[val_idx]
        
#         clf.fit(X_tr, y_tr, eval_set=[(X_tr, y_tr), (X_val, y_val)], early_stopping_rounds=50)
#         del X_tr, X_val
#         xgb_y_preds+= clf.predict_proba(X_test)[:,1] / folds
#         if debug:    
#             print("debug:",roc_auc_score(y_test, clf.predict_proba(X_test)[:,1] / folds)) 
#         del clf
#         gc.collect()
    

# if debug:    
#     print("debug:",roc_auc_score(y_test, xgb_y_preds))  

# gc.collect()

In [None]:
# if not debug:   
#     sample_submission['isFraud'] = xgb_y_preds
#     sample_submission.to_csv('xgb_y_preds.csv')

#     FileLink('xgb_y_preds.csv')

## Catboost

In [None]:
# import catboost as cb
# from catboost import CatBoostClassifier,Pool

# cate = one_hot_encode_list+count_encode_list+target_encode_list+list(set(list(X_train.select_dtypes('category').columns)+list(X_test.select_dtypes('category').columns)))

# print(cate)
# verbose_eval = 500
# num_rounds = 10000

# folds = 6
# gkf = GroupKFold(n_splits = folds)
# split_groups = X_train['DT_M']

# feature_importance_df = pd.DataFrame()

# catb_y_preds = np.zeros(X_test.shape[0])
# count = 0
# for tr_idx, val_idx in gkf.split(X_train, y_train, groups=split_groups):
#     count+=1
#     if count != 1:
#         print ('fold %s ------------------------------------'%count)
        
#         X_tr = X_train.iloc[tr_idx, :].fillna(-1)
#         y_tr = y_train.iloc[tr_idx]

#         model=cb.CatBoostClassifier(iterations=num_rounds,depth=14,learning_rate=0.04,loss_function='Logloss',eval_metric='AUC'
#                                     )#,task_type = "GPU"
#         if debug:
#             model.fit(X_tr,y_tr,cat_features=cate,verbose_eval = verbose_eval)
#         else:
#             model.fit(X_tr,y_tr,cat_features=cate,verbose_eval = verbose_eval)


#         del X_tr
#         catb_y_preds+= model.predict_proba(X_test.fillna(-1))[:,1] / folds


#         if debug:    
#             print("debug:",roc_auc_score(y_test, model.predict_proba(X_test.fillna(-1))[:,1] / folds))  
        
#         gc.collect()
            
# if debug:    
#     print("debug:",roc_auc_score(y_test, catb_y_preds))  

# gc.collect()

In [None]:
# if debug:    
#     print("debug:",roc_auc_score(y_test, y_preds))
#     print("debug:",roc_auc_score(y_test, y_preds2))
#     print("debug:",roc_auc_score(y_test, y_preds3))  
#     print("debug:",roc_auc_score(y_test, (y_preds + y_preds3)*0.5))  
#     print("debug:",roc_auc_score(y_test, (y_preds + y_preds2 + y_preds3*0.5)*0.33))
#     print("debug:",roc_auc_score(y_test, (y_preds11*0.5 + y_preds*0.5 + y_preds2 + y_preds3*0.5)*0.33))

In [None]:
# if not debug:   
#     sample_submission['isFraud'] = catb_y_preds
#     sample_submission.to_csv('catb_y_preds.csv')

#     FileLink('catb_y_preds.csv')