#### My own functions

In [None]:
def plot_hist(df, col, label):
    """Draw a histogram of ``df[col]`` with a legend entry and display it.

    df    : DataFrame holding the data
    col   : name of the column to plot
    label : text used for the legend entry
    """
    values = df[col]
    plt.hist(values, label=label)
    plt.legend()
    plt.show()

### 1. Data Preparation

In [None]:
# General imports
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import os, sys, gc, warnings, random, datetime
from sklearn import metrics
from sklearn.model_selection import train_test_split, KFold
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

import math

warnings.filterwarnings('ignore')

In [None]:
########################### Helpers
#################################################################################
## -------------------
## Seeder
# :seed to make all processes deterministic     # type: int
def seed_everything(seed=0):
    """Seed Python's ``random`` and NumPy's global RNG, and export PYTHONHASHSEED.

    seed : integer seed shared by all three settings
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)
## ------------------- 

## -------------------
## Memory Reducer
# :df pandas dataframe to reduce size             # type: pd.DataFrame()
# :verbose                                        # type: bool
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns to the smallest dtype that can hold their range.

    NOTE: ``df`` is modified in place; the same object is returned for
    convenience. Pass ``df.copy()`` if the original must stay untouched.

    df      : pandas DataFrame to shrink
    verbose : when True, print the final size and the percentage saved
    returns : the (mutated) input DataFrame
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        is_int = str(col_type)[:3] == 'int'
        if is_int:
            candidates, limits = (np.int8, np.int16, np.int32, np.int64), np.iinfo
        else:
            candidates, limits = (np.float16, np.float32), np.finfo
        for candidate in candidates:
            info = limits(candidate)
            # Inclusive bounds: a value exactly equal to the dtype's min/max
            # still fits (the strict comparison needlessly picked a wider type).
            if c_min >= info.min and c_max <= info.max:
                df[col] = df[col].astype(candidate)
                break
        else:
            # Nothing matched (e.g. an all-NaN float column): floats fall back
            # to float64, integer columns are left as they are.
            if not is_int:
                df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the ratio so a zero-sized frame does not divide by zero.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df
## -------------------

In [None]:
########################### Vars
#################################################################################
# Notebook-wide settings, read by the cells below.
SEED = 42             # seed handed to seed_everything and to the model/K-fold code
LOCAL_TEST = False    # flag checked by the validation / training cells
TARGET = 'isFraud'    # name of the label column

# Seed the RNGs once the constants exist.
seed_everything(SEED)

In [None]:
########################### DATA LOAD
#################################################################################
print('Load Data')

# Training tables: transactions and the matching identity records.
train_df = pd.read_csv('../input/ieee-fraud-detection/train_transaction.csv')
train_identity = pd.read_csv('../input/ieee-fraud-detection/train_identity.csv')

# Test tables; the transaction table gets a placeholder label column of zeros
# so that both transaction frames share the same columns.
test_df = pd.read_csv('../input/ieee-fraud-detection/test_transaction.csv').assign(isFraud=0)
test_identity = pd.read_csv('../input/ieee-fraud-detection/test_identity.csv')

In [None]:
########################### Base check
#################################################################################

if LOCAL_TEST:
    for df2 in [train_df, test_df, train_identity, test_identity]:
        # reduce_mem_usage downcasts its argument in place and returns the same
        # object, so checking its result against the original frame compared
        # the frame with itself. Run it on a copy to keep `df2` as the reference.
        df = reduce_mem_usage(df2.copy())

        for col in list(df):
            # Cast back to the original dtype before comparing: Series.equals
            # also compares dtypes, and the question here is whether the
            # downcast changed any *value* (e.g. float16 precision loss).
            if not df[col].astype(df2[col].dtype).equals(df2[col]):
                print('Bad transformation', col)

In [None]:
########################### Base Minification
#################################################################################

# Downcast the two transaction tables, then the two identity tables.
train_df, test_df = reduce_mem_usage(train_df), reduce_mem_usage(test_df)
train_identity, test_identity = reduce_mem_usage(train_identity), reduce_mem_usage(test_identity)

In [None]:
########################### Columns
#################################################################################
## Main Data
# 'TransactionID',
# 'isFraud',
# 'TransactionDT',
# 'TransactionAmt',
# 'ProductCD',
# 'card1' - 'card6',
# 'addr1' - 'addr2',
# 'dist1' - 'dist2',
# 'P_emaildomain' - 'R_emaildomain',
# 'C1' - 'C14'
# 'D1' - 'D15'
# 'M1' - 'M9'
# 'V1' - 'V339'

## Identity Data
# 'TransactionID'
# 'id_01' - 'id_38'
# 'DeviceType',
# 'DeviceInfo'

### 2. Data Preprocessing

**Dataset Columns**

**Transaction Data**

'TransactionID'

'isFraud'

'TransactionDT'

'TransactionAmt'

'ProductCD' **Categorical Features**

'card1' - 'card6' **Categorical Features**

'addr1' - 'addr2' **Categorical Features**

'dist1' - 'dist2'

'P_emaildomain' **Categorical Features**

'R_emaildomain' **Categorical Features**

'C1' - 'C14'

'D1' - 'D15'

'M1' - 'M9' **Categorical Features**

'V1' - 'V339'




**Identity Data**

'TransactionID'

'id_01' - 'id_38' **Categorical Features id_12 - id_38**

'DeviceType' **Categorical Features**

'DeviceInfo' **Categorical Features**

#### 2.1 Product CD

In [None]:
plt.figure(figsize=(12,6))
total = len(train_df)

# Left panel: number of transactions per product code, each bar annotated
# with its share of all training rows.
plt.subplot(121)
g = sns.countplot(x='ProductCD', data=train_df)
g.set_title('ProductCD Distribution', fontsize=15)
g.set_xlabel("Product Code", fontsize=15)
g.set_ylabel("Count", fontsize=15)
for bar in g.patches:
    bar_height = bar.get_height()
    bar_center = bar.get_x() + bar.get_width()/2.
    g.text(bar_center, bar_height + 3,
           '{:1.2f}%'.format(bar_height/total*100),
           ha="center", fontsize=14)

# Right panel: the same counts split by the fraud label.
plt.subplot(122)
g1 = sns.countplot(x='ProductCD', hue='isFraud', data=train_df)
g1.set_title('ProductCD by Fraud', fontsize=15)
g1.set_xlabel("Product Code", fontsize=15)
g1.set_ylabel("Count", fontsize=15)
plt.legend(title='Fraud', loc='best', labels=['No', 'Yes'])

In [None]:
# Share of each product code among the fraudulent transactions only.
train_df.loc[train_df['isFraud'] == 1, 'ProductCD'].value_counts(normalize=True)

In [None]:
# Fraud / non-fraud proportion within each product code.
temp = train_df.groupby('ProductCD')['isFraud'].value_counts(normalize = True).unstack()
# pandas creates its own figure for .plot, so a preceding plt.figure(figsize=...)
# only produced an empty figure; pass the size to the plot call instead.
a = temp.plot.bar(stacked = True, figsize = (12,12))
a.set_title('Rate of Fraud by Product Code', fontsize = 15)
plt.xticks(rotation = 'horizontal')

These results suggest that, for the product code, we need to do two things:
* Encode categorical levels using Frequency Encoding
* Add the target mean for each categorical level. Product C is found to have the highest fraud rate compared to the others

In [None]:
# ProductCD frequency encoding: replace each level by its number of
# occurrences, counted over the train and test rows together.
col = 'ProductCD'
temp_df = pd.concat([frame[[col]] for frame in (train_df, test_df)])
col_encoded = temp_df[col].value_counts().to_dict()
for frame in (train_df, test_df):
    frame[col] = frame[col].map(col_encoded)

In [None]:
## ProductCD target mean: mean of TARGET per level, computed on train only
## and mapped onto both frames as '<col>_target_mean'.
for col in ['ProductCD']:
    temp_dict = train_df.groupby(col)[TARGET].mean().to_dict()
    for frame in (train_df, test_df):
        frame[col+'_target_mean'] = frame[col].map(temp_dict)

In [None]:
# Preview the encoded column next to its target-mean feature.
train_df.loc[:, ['ProductCD', 'ProductCD_target_mean']].head()

### 2.2 Card1 - Card6

#### a. Card 1, 2, 3, 5

Cards 1, 2, 3 and 5 are represented as numerical values, tempting us to plot histograms. However, we need to remember that the card columns are classified as categorical variables, meaning these numbers are most likely codes for categorical levels.

In [None]:
# Slice the card columns first so describe() summarises only those columns
# instead of every column of the frame; the displayed table is the same.
train_df.loc[:, 'card1':'card5'].describe()

In [None]:
# Number of distinct values in each of the card1..card5 columns.
card_cols = train_df.loc[:, 'card1':'card5']
card_cols.nunique()

Card 1 contains 13553 unique values, suggesting card 1 may be the ID of the card. Cards 2, 3 and 5 have fewer unique values, so perhaps they are expiration dates, or components that together generate the card identity? Since we don't know how this information was scrambled, we might pick up patterns generated by the encryption algorithm instead of the data. We can calculate the target mean grouped by 'card1' - 'card5' (except 'card4').

In [None]:
## card1 / card2 / card3 / card5 target mean: mean of TARGET per level,
## computed on train only and mapped onto both frames.
for col in ['card1', 'card2', 'card3', 'card5']:
    temp_dict = train_df.groupby(col)[TARGET].mean().to_dict()
    for frame in (train_df, test_df):
        frame[col+'_target_mean'] = frame[col].map(temp_dict)

#### b. Card4

In [None]:
plt.figure(figsize=(12,6))
total = len(train_df)

# Left panel: number of transactions per card4 level, each bar annotated
# with its share of all training rows.
plt.subplot(121)
g = sns.countplot(x='card4', data=train_df)
g.set_title('Card Network Distribution', fontsize=15)
g.set_xlabel("Card Issuers", fontsize=15)
g.set_ylabel("Count", fontsize=15)
for bar in g.patches:
    bar_height = bar.get_height()
    bar_center = bar.get_x() + bar.get_width()/2.
    g.text(bar_center, bar_height + 3,
           '{:1.2f}%'.format(bar_height/total*100),
           ha="center", fontsize=14)

# Right panel: the same counts split by the fraud label.
plt.subplot(122)
g1 = sns.countplot(x='card4', hue='isFraud', data=train_df)
g1.set_title('Card Network by Fraud', fontsize=15)
g1.set_xlabel("Card Issuers", fontsize=15)
g1.set_ylabel("Count", fontsize=15)
plt.legend(title='Fraud', loc='best', labels=['No', 'Yes'])

In [None]:
# Fraud / non-fraud proportion within each card4 level.
# pandas creates its own figure for .plot, so a preceding plt.figure(figsize=...)
# only produced an empty figure; pass the size to the plot call instead.
b = train_df.groupby('card4')['isFraud'].value_counts(normalize = True).unstack().plot.bar(stacked = True, figsize = (12,12))
b.set_title('Rate of Fraud by Card Network', fontsize = 15)
plt.xticks(rotation='horizontal')

In [None]:
# card4 frequency encoding: replace each level by its number of
# occurrences, counted over the train and test rows together.
col = 'card4'
temp_df = pd.concat([frame[[col]] for frame in (train_df, test_df)])
col_encoded = temp_df[col].value_counts().to_dict()
for frame in (train_df, test_df):
    frame[col] = frame[col].map(col_encoded)

## card4 target mean (train only), keyed by the frequency-encoded value.
for col in ['card4']:
    temp_dict = train_df.groupby(col)[TARGET].mean().to_dict()
    for frame in (train_df, test_df):
        frame[col+'_target_mean'] = frame[col].map(temp_dict)

#### c. Card6

In [None]:
plt.figure(figsize=(12,6))
total = len(train_df)

# Left panel: number of transactions per card6 level, each bar annotated
# with its share of all training rows.
plt.subplot(121)
g = sns.countplot(x='card6', data=train_df)
g.set_title('Card Type Distribution', fontsize=15)
g.set_xlabel("Card Type", fontsize=15)
g.set_ylabel("Count", fontsize=15)
for bar in g.patches:
    bar_height = bar.get_height()
    bar_center = bar.get_x() + bar.get_width()/2.
    g.text(bar_center, bar_height + 3,
           '{:1.2f}%'.format(bar_height/total*100),
           ha="center", fontsize=14)

# Right panel: the same counts split by the fraud label.
plt.subplot(122)
g1 = sns.countplot(x='card6', hue='isFraud', data=train_df)
g1.set_title('Card Type by Fraud', fontsize=15)
g1.set_xlabel("Card Type", fontsize=15)
g1.set_ylabel("Count", fontsize=15)
plt.legend(title='Fraud', loc='best', labels=['No', 'Yes'])

In [None]:
# Fraud / non-fraud proportion within each card6 level.
# pandas creates its own figure for .plot, so a preceding plt.figure(figsize=...)
# only produced an empty figure; pass the size to the plot call instead.
c = train_df.groupby('card6')['isFraud'].value_counts(normalize = True).unstack().plot.bar(stacked = True, figsize = (12,12))
c.set_title('Rate of Fraud by Card Type', fontsize = 15)
plt.xticks(rotation='horizontal')

Credit cards have a much higher fraud rate than the other card types.

In [None]:
# card6 frequency encoding: replace each level by its number of
# occurrences, counted over the train and test rows together.
col = 'card6'
temp_df = pd.concat([frame[[col]] for frame in (train_df, test_df)])
col_encoded = temp_df[col].value_counts().to_dict()
for frame in (train_df, test_df):
    frame[col] = frame[col].map(col_encoded)

## card6 target mean (train only), keyed by the frequency-encoded value.
for col in ['card6']:
    temp_dict = train_df.groupby(col)[TARGET].mean().to_dict()
    for frame in (train_df, test_df):
        frame[col+'_target_mean'] = frame[col].map(temp_dict)

### 2.3 Addr1 - Addr2

In [None]:
# Distribution of addr1 in the training data.
plot_hist(df=train_df, col='addr1', label='addr1')

In [None]:
# Distribution of addr2 in the training data.
plot_hist(df=train_df, col='addr2', label='addr2')

In [None]:
# addr1 value against the fraud label, one point per training row.
addr1_scatter_args = dict(x='addr1', y='isFraud', data=train_df)
plt.scatter(**addr1_scatter_args)
plt.show()

In [None]:
# addr1 frequency encoding: replace each value by its number of
# occurrences, counted over the train and test rows together.
col = 'addr1'
temp_df = pd.concat([frame[[col]] for frame in (train_df, test_df)])
col_encoded = temp_df[col].value_counts().to_dict()
for frame in (train_df, test_df):
    frame[col] = frame[col].map(col_encoded)

## addr1 / addr2 target mean (train only); addr1 is keyed by its
## frequency-encoded value, addr2 by its raw value.
for col in ['addr1', 'addr2']:
    temp_dict = train_df.groupby(col)[TARGET].mean().to_dict()
    for frame in (train_df, test_df):
        frame[col+'_target_mean'] = frame[col].map(temp_dict)

### 2.4 Email domain

In [None]:
# Keep only the part of each e-mail domain before the first dot.
email_cols = (("P_emaildomain", "P_parent_emaildomain"),
              ("R_emaildomain", "R_parent_emaildomain"))
for frame in (train_df, test_df):
    for source_col, parent_col in email_cols:
        frame[parent_col] = frame[source_col].str.split('.', expand = True)[0]

In [None]:
# Parent e-mail domain frequency encoding: replace each level by its number
# of occurrences, counted over the train and test rows together.
for col in ['P_parent_emaildomain', 'R_parent_emaildomain']:
    temp_df = pd.concat([frame[[col]] for frame in (train_df, test_df)])
    col_encoded = temp_df[col].value_counts().to_dict()
    for frame in (train_df, test_df):
        frame[col] = frame[col].map(col_encoded)

## Parent e-mail domain target mean (train only), keyed by the encoded value.
for col in ['P_parent_emaildomain', 'R_parent_emaildomain']:
    temp_dict = train_df.groupby(col)[TARGET].mean().to_dict()
    for frame in (train_df, test_df):
        frame[col+'_target_mean'] = frame[col].map(temp_dict)

### 2.5 'M1' - 'M9'

In [None]:
#temp = train_df[train_df['isFraud']==True].groupby('M4')['isFraud'].value_counts(normalize=True)
# Fraud / non-fraud proportion within each M4 level.
m4_groups = train_df.groupby('M4')
temp = m4_groups['isFraud'].value_counts(normalize = True)
temp

In [None]:
# First few raw M4 values.
train_df.loc[:, 'M4'].head()

In [None]:
## M4 target mean: mean of TARGET per level, computed on train only and
## mapped onto both frames.
for col in ['M4']:
    temp_dict = train_df.groupby(col)[TARGET].mean().to_dict()
    for frame in (train_df, test_df):
        frame[col+'_target_mean'] = frame[col].map(temp_dict)

In [None]:
# Turn the 'T'/'F' flag columns into 1/0; any other value becomes NaN.
flag_map = {'T':1, 'F':0}
for col in ['M1','M2','M3','M5','M6','M7','M8','M9']:
    for frame in (train_df, test_df):
        frame[col] = frame[col].map(flag_map)

In [None]:
# let's combine the data and work with the whole dataset

#train = pd.merge(train_df, train_identity, on='TransactionID', how='left')
#test = pd.merge(test_df, test_identity, on='TransactionID', how='left')
#del train_identity, train_df, test_identity, test_df


In [None]:
## Reduce Memory Usage

#train = reduce_mem_usage(train)
#test = reduce_mem_usage(test)

### 2.6 'id_12' - 'id_38'

In [None]:
#train['id_23'].value_counts()

In [None]:
# Calculate Target Mean grouped by 'id_23'
## 'id_23' Target mean
#for col in ['id_23']:
#    temp_dict = train.groupby([col])[TARGET].agg(['mean']).reset_index().rename(
#                                                        columns={'mean': col+'_target_mean'})
#    temp_dict.index = temp_dict[col].values
#    temp_dict = temp_dict[col+'_target_mean'].to_dict()

#    train[col] = train[col].map(temp_dict)
#    test[col]  = test[col].map(temp_dict)

In [None]:
########################### Identity columns
#################################################################################

def minify_identity_df(df):
    """Encode the string-valued identity columns of ``df`` as numbers.

    The frame is modified in place and returned. Values that are not listed
    in a mapping become NaN. Two new columns, ``id_33_0`` and ``id_33_1``,
    hold the two integers on either side of the 'x' in ``id_33``.

    df      : identity DataFrame containing the id_* and DeviceType columns
    returns : the same DataFrame
    """
    found_map = {'Found':1, 'NotFound':0}
    flag_map = {'T':1, 'F':0}

    df['id_12'] = df['id_12'].map(found_map)
    df['id_15'] = df['id_15'].map({'New':2, 'Found':1, 'Unknown':0})
    df['id_16'] = df['id_16'].map(found_map)

    # NOTE(review): confirm these keys match the raw id_23 values in the data;
    # anything not listed here silently becomes NaN.
    df['id_23'] = df['id_23'].map({'TRANSPARENT':4, 'IP_PROXY':3, 'IP_PROXY:ANONYMOUS':2, 'IP_PROXY:HIDDEN':1})

    df['id_27'] = df['id_27'].map(found_map)
    df['id_28'] = df['id_28'].map({'New':2, 'Found':1})

    df['id_29'] = df['id_29'].map(found_map)

    for col in ['id_35', 'id_36', 'id_37', 'id_38']:
        df[col] = df[col].map(flag_map)

    # id_34: keep the integer after the ':'; missing values are temporarily
    # encoded as ':0' so the split works, then turned back into NaN.
    df['id_34'] = df['id_34'].fillna(':0')
    df['id_34'] = df['id_34'].apply(lambda x: x.split(':')[1]).astype(np.int8)
    df['id_34'] = np.where(df['id_34']==0, np.nan, df['id_34'])

    # id_33: split '<a>x<b>' into two integer columns; missing values use the
    # '0x0' placeholder (giving 0 / 0) and id_33 itself goes back to NaN.
    df['id_33'] = df['id_33'].fillna('0x0')
    df['id_33_0'] = df['id_33'].apply(lambda x: x.split('x')[0]).astype(int)
    df['id_33_1'] = df['id_33'].apply(lambda x: x.split('x')[1]).astype(int)
    df['id_33'] = np.where(df['id_33']=='0x0', np.nan, df['id_33'])

    # The mapped Series was previously computed and thrown away, leaving
    # DeviceType as strings; assign it back like every other column.
    df['DeviceType'] = df['DeviceType'].map({'desktop':1, 'mobile':0})
    return df

# Encode the identity tables (train first, then test).
train_identity, test_identity = minify_identity_df(train_identity), minify_identity_df(test_identity)

# Label-encode id_33, with missing values sharing one placeholder label.
for col in ['id_33']:
    for frame in (train_identity, test_identity):
        frame[col] = frame[col].fillna('unseen_before_label')

    # Fit on the labels of both tables so each of them can be transformed.
    le = LabelEncoder()
    all_labels = list(train_identity[col])
    all_labels += list(test_identity[col])
    le.fit(all_labels)
    for frame in (train_identity, test_identity):
        frame[col] = le.transform(frame[col])

In [None]:
# `train` (the merged frame) is never created because the merge cell is
# commented out; DeviceInfo is a column of the identity table.
train_identity['DeviceInfo'].head()

In [None]:
# Columns that receive an extra '<col>_fq_enc' frequency feature.
i_cols = (
    ['card1','card2','card3','card5']
    + ['C' + str(n) for n in range(1, 15)]
    + ['D' + str(n) for n in range(1, 10)]
    + ['addr1','addr2',
       'dist1','dist2',
       'P_emaildomain', 'R_emaildomain']
)

# For each column, count every value over train and test together and store
# the count of the row's value in a new column (the source column is kept).
for col in i_cols:
    temp_df = pd.concat([frame[[col]] for frame in (train_df, test_df)])
    fq_encode = temp_df[col].value_counts().to_dict()
    for frame in (train_df, test_df):
        frame[col+'_fq_enc'] = frame[col].map(fq_encode)

### 3. Feature Engineering

#### 3.1 TransactionDT

In [None]:
# Reference date added to the TransactionDT offsets (treated as seconds).
START_DATE = datetime.datetime.strptime('2018-01-01', '%Y-%m-%d')

for df in [train_df, test_df]:
    # Vectorised conversion of the offsets to timestamps. The result is kept
    # in a local Series, so no temporary 'DT' column has to be added and
    # deleted again.
    dt = START_DATE + pd.to_timedelta(df['TransactionDT'], unit='s')
    df['DT_M'] = dt.dt.month
    # Series.dt.weekofyear was removed in pandas 2.0; isocalendar().week is
    # its replacement (cast from the nullable UInt32 back to a plain integer).
    # Fall back to weekofyear on pandas versions that predate isocalendar().
    if hasattr(dt.dt, 'isocalendar'):
        df['DT_W'] = dt.dt.isocalendar().week.astype('int64')
    else:
        df['DT_W'] = dt.dt.weekofyear
    df['DT_D'] = dt.dt.dayofyear

    df['DT_hour'] = dt.dt.hour
    df['DT_day_week'] = dt.dt.dayofweek
    df['DT_day'] = dt.dt.day

#### 3.2 Transaction Amount

In [None]:
# Ratio of each transaction amount to the mean / std of its group, where the
# group statistics always come from the training data.
#
# The statistics are computed once per group and mapped through each frame's
# own group key. Dividing test_df by `train_df.groupby(...).transform(...)`
# aligned the two frames on row index, so a test row was divided by the
# statistic of whatever training row happened to share its index.
for stat in ['mean', 'std']:
    for group_col, suffix in [('ProductCD', 'ProductCD'), ('DT_hour', 'Hour')]:
        group_stat = train_df.groupby(group_col)['TransactionAmt'].agg(stat)
        feature = 'TransactionAmt_to_' + stat + '_' + suffix
        train_df[feature] = train_df['TransactionAmt'] / train_df[group_col].map(group_stat)
        # Groups that never occur in train map to NaN.
        test_df[feature] = test_df['TransactionAmt'] / test_df[group_col].map(group_stat)

### 7. Model Running

In [None]:
# Downcast the engineered frames again, then release freed memory.
train_df, test_df = reduce_mem_usage(train_df), reduce_mem_usage(test_df)
gc.collect()

In [None]:
# Print the columns still stored with object dtype.
# (The undefined name `train` raised NameError; the frame is `train_df`.)
for col in list(train_df):
    if train_df[col].dtype == 'O':
        print(col)

In [None]:
## Model Features 
## We can use set().difference() but order matters
# Columns that must not be used as model features.
rm_cols = [
    'TransactionID','TransactionDT',TARGET, 'M4', 'P_emaildomain', 'R_emaildomain'
    #'TransactionID','TransactionDT',TARGET,'DeviceType', 'DeviceInfo', 'M4', 'P_emaildomain', 'R_emaildomain', 'id_30', 'id_31', 'id_34'
]
# Keep the frame's column order, minus the excluded columns.
features_columns = [col for col in list(train_df) if col not in rm_cols]

# Report any remaining object-dtype feature. The frame is `train_df` (`train`
# is undefined), and the message must name the column being tested (`icol`),
# not the last value of the previous loop variable.
for icol in features_columns:
    if train_df[icol].dtype == 'O':
        print('bad col: ', icol)

In [None]:
## Model params
# Parameter dictionary handed to lgb.train by make_predictions.
lgb_params = dict(
    objective='binary',
    boosting_type='gbdt',
    metric='auc',
    n_jobs=-1,
    learning_rate=0.01,
    num_leaves=2**8,
    max_depth=-1,
    tree_learner='serial',
    colsample_bytree=0.7,
    subsample_freq=1,
    subsample=1,
    n_estimators=800,
    max_bin=255,
    verbose=-1,
    seed=SEED,
    early_stopping_rounds=100,
)

In [None]:
########################### Model
import lightgbm as lgb

def make_predictions(tr_df, tt_df, features_columns, target, lgb_params, NFOLDS):
    """Train one LightGBM model per K-fold split and average their predictions.

    tr_df            : training DataFrame (features + target)
    tt_df            : DataFrame to predict on; must contain 'TransactionID'
                       and the target column
    features_columns : list of feature column names
    target           : name of the target column
    lgb_params       : parameter dict passed to lgb.train
    NFOLDS           : number of K-fold splits
    returns          : DataFrame with 'TransactionID', the target column and
                       a 'prediction' column (mean over the folds)
    """
    folds = KFold(n_splits=NFOLDS, shuffle=True, random_state=SEED)

    X,y = tr_df[features_columns], tr_df[target]
    P,P_y = tt_df[features_columns], tt_df[target]

    # Copy the two-column slice so that adding 'prediction' below writes to an
    # independent frame rather than to a slice of the caller's DataFrame.
    tt_df = tt_df[['TransactionID',target]].copy()
    predictions = np.zeros(len(tt_df))

    for fold_, (trn_idx, val_idx) in enumerate(folds.split(X, y)):
        print('Fold:',fold_)
        # KFold yields positional indices, so select rows with .iloc for the
        # target as well; y[trn_idx] is label-based and only coincides with
        # positions when the index is 0..n-1.
        tr_x, tr_y = X.iloc[trn_idx,:], y.iloc[trn_idx]
        vl_x, vl_y = X.iloc[val_idx,:], y.iloc[val_idx]

        print(len(tr_x),len(vl_x))
        tr_data = lgb.Dataset(tr_x, label=tr_y)

        # With LOCAL_TEST the prediction frame itself is the validation set.
        if LOCAL_TEST:
            vl_data = lgb.Dataset(P, label=P_y)
        else:
            vl_data = lgb.Dataset(vl_x, label=vl_y)

        # NOTE(review): `verbose_eval` depends on the installed LightGBM
        # version accepting it as a train() argument - confirm.
        estimator = lgb.train(
            lgb_params,
            tr_data,
            valid_sets = [tr_data, vl_data],
            verbose_eval = 200,
        )

        # Each fold contributes an equal share of the final prediction.
        pp_p = estimator.predict(P)
        predictions += pp_p/NFOLDS

        if LOCAL_TEST:
            feature_imp = pd.DataFrame(sorted(zip(estimator.feature_importance(),X.columns)), columns=['Value','Feature'])
            print(feature_imp)

        del tr_x, tr_y, vl_x, vl_y, tr_data, vl_data
        gc.collect()

    tt_df['prediction']  = predictions

    return tt_df
## -------------------

In [None]:
########################### Model Train
if LOCAL_TEST:
    # make_predictions requires NFOLDS; the call without it raised TypeError.
    # NOTE(review): 2 folds is a placeholder for quick local runs - adjust as needed.
    test_predictions = make_predictions(train_df, test_df, features_columns, TARGET, lgb_params, 2)
    print(metrics.roc_auc_score(test_predictions[TARGET], test_predictions['prediction']))
else:
    # Settings for the full run, then 10-fold training.
    lgb_params['learning_rate'] = 0.01
    lgb_params['n_estimators'] = 800
    lgb_params['early_stopping_rounds'] = 100
    test_predictions = make_predictions(train_df, test_df, features_columns, TARGET, lgb_params, 10)

In [None]:
## OUTPUT
if not LOCAL_TEST:
    # Overwrite the label column with the averaged predictions, then write
    # the two submission columns to disk.
    test_predictions['isFraud'] = test_predictions['prediction']
    submission_cols = ['TransactionID','isFraud']
    test_predictions[submission_cols].to_csv('submission.csv', index=False)

In [None]:
import pandas as pd
import numpy as np
from sklearn import base
from sklearn.model_selection import KFold

class KFoldTargetEncoderTrain(base.BaseEstimator, base.TransformerMixin):
    """Out-of-fold target-mean encoder for a single column of the training frame.

    For every fold, rows of the held-out part receive the mean target of
    their category as computed on the other folds. The result is stored in
    a new column named '<colnames>_Kfold_Target_Enc'.

    colnames            : name of the column to encode (a single string)
    targetName          : name of the target column (a single string)
    n_fold              : number of K-fold splits
    verbosity           : when True, print the correlation between the new
                          feature and the target
    discardOriginal_col : when True, drop the encoded source column from the
                          returned frame
    """

    def __init__(self, colnames,targetName,n_fold=5,verbosity=True,discardOriginal_col=False):

        self.colnames = colnames
        self.targetName = targetName
        self.n_fold = n_fold
        self.verbosity = verbosity
        self.discardOriginal_col = discardOriginal_col

    def fit(self, X, y=None):
        """Nothing to learn up front; returns self."""
        return self


    def transform(self,X):
        """Add the out-of-fold encoded column to ``X`` (in place) and return the frame."""

        assert isinstance(self.targetName, str)
        assert isinstance(self.colnames, str)
        assert(self.colnames in X.columns)
        assert(self.targetName in X.columns)

        mean_of_target = X[self.targetName].mean()
        # Folds are contiguous (shuffle=False). random_state is not passed:
        # it has no effect without shuffling, and current scikit-learn raises
        # ValueError when it is set together with shuffle=False.
        kf = KFold(n_splits = self.n_fold, shuffle = False)

        col_mean_name = self.colnames + '_' + 'Kfold_Target_Enc'
        X[col_mean_name] = np.nan

        for tr_ind, val_ind in kf.split(X):
            X_tr, X_val = X.iloc[tr_ind], X.iloc[val_ind]
            # Category means from the other folds, applied to the held-out rows.
            X.loc[X.index[val_ind], col_mean_name] = X_val[self.colnames].map(X_tr.groupby(self.colnames)[self.targetName].mean())

        # Categories unseen in the other folds fall back to the global mean.
        # Assign the result back: fillna(inplace=True) on a column selection
        # may operate on a temporary and leave X unchanged.
        X[col_mean_name] = X[col_mean_name].fillna(mean_of_target)

        if self.verbosity:

            encoded_feature = X[col_mean_name].values
            print('Correlation between the new feature, {} and, {} is {}.'.format(col_mean_name,
                                                                                      self.targetName,
                                                                                      np.corrcoef(X[self.targetName].values, encoded_feature)[0][1]))
        if self.discardOriginal_col:
            # Drop the encoded source column (the flag's stated purpose);
            # the target column was being dropped here instead.
            X = X.drop(self.colnames, axis=1)


        return X

    
class KFoldTargetEncoderTest(base.BaseEstimator, base.TransformerMixin):
    """Apply a target encoding already present in the training frame to another frame.

    train       : training DataFrame containing both `colNames` and the
                  encoded column `encodedName`
    colNames    : name of the categorical column (a single string)
    encodedName : name of the encoded column to read from `train` and to
                  create in the transformed frame
    """

    def __init__(self,train,colNames,encodedName):
        self.train = train
        self.colNames = colNames
        self.encodedName = encodedName

    def fit(self, X, y=None):
        """Nothing to learn up front; returns self."""
        return self

    def transform(self,X):
        """Map each category to the mean of its encoded values in `train`.

        Adds `encodedName` to ``X`` in place and returns ``X``. As a side
        effect the same per-category means also overwrite `encodedName` in
        the stored training frame.
        """
        # One value per category: the average of that category's encodings.
        category_means = self.train.groupby(self.colNames)[self.encodedName].mean()
        dd = category_means.to_dict()

        X[self.encodedName] = X[self.colNames].map(dd)
        # NOTE(review): this replaces the per-row values of the training
        # column with per-category means - confirm this is intended.
        self.train[self.encodedName] = self.train[self.colNames].map(dd)

        return X

In [None]:
#target_encoding_list = ['ProductCD', 'card4', 'card6', 'M4', 'P_emaildomain', 'R_emaildomain']#, 'DeviceType']

#for col in target_encoding_list:
#    targetc = KFoldTargetEncoderTrain(col,'isFraud',n_fold=10)
 #   train_df = targetc.fit_transform(train_df)

  #  test_targetc = KFoldTargetEncoderTest(train_df,col,col+'_Kfold_Target_Enc')
   # test_targetc.fit_transform(test_df)

In [None]:
########################### card4, card6, ProductCD
#################################################################################
# Converting Strings to ints(or floats if nan in column) using frequency encoding
# We will be able to use these columns as category or as numerical feature

#for col in ['card4', 'card6', 'ProductCD']:
 #   print('Encoding', col)
 #   temp_df = pd.concat([train_df[[col]], test_df[[col]]])
 #   col_encoded = temp_df[col].value_counts().to_dict()   
 #   train_df[col] = train_df[col].map(col_encoded)
 #   test_df[col]  = test_df[col].map(col_encoded)
 #   print(col_encoded)

In [None]:
########################### M columns
#################################################################################
# Converting Strings to ints(or floats if nan in column)

#for col in ['M1','M2','M3','M5','M6','M7','M8','M9']:
#    train_df[col] = train_df[col].map({'T':1, 'F':0})
#    test_df[col]  = test_df[col].map({'T':1, 'F':0})

#for col in ['M4']:
 #   print('Encoding', col)
 #   temp_df = pd.concat([train_df[[col]], test_df[[col]]])
 #   col_encoded = temp_df[col].value_counts().to_dict()   
 #   train_df[col] = train_df[col].map(col_encoded)
 #   test_df[col]  = test_df[col].map(col_encoded)
 #   print(col_encoded)

In [None]:
########################### Final Minification
#################################################################################

#train_df = reduce_mem_usage(train_df)
#test_df  = reduce_mem_usage(test_df)

#train_identity = reduce_mem_usage(train_identity)
#test_identity  = reduce_mem_usage(test_identity)

In [None]:
########################### Export
#################################################################################

#train_df.to_pickle('train_transaction.pkl')
#test_df.to_pickle('test_transaction.pkl')

#train_identity.to_pickle('train_identity.pkl')
#test_identity.to_pickle('test_identity.pkl')

In [None]:
#SEED = 42
#seed_everything(SEED)
#LOCAL_TEST = False
#TARGET = 'isFraud'

In [None]:
########################### Reset values for "noise" card1
#valid_card = train_df['card1'].value_counts()
#valid_card = valid_card[valid_card>10]
#valid_card = list(valid_card.index)
    
#train_df['card1'] = np.where(train_df['card1'].isin(valid_card), train_df['card1'], np.nan)
#test_df['card1']  = np.where(test_df['card1'].isin(valid_card), test_df['card1'], np.nan)

In [None]:
## FREQUENCY ENCODING
#i_cols = ['card1','card2','card3','card5',
#          'C1','C2','C3','C4','C5','C6','C7','C8','C9','C10','C11','C12','C13','C14',
#          'D1','D2','D3','D4','D5','D6','D7','D8','D9',
#          'addr1','addr2',
#          'dist1','dist2',
#          'P_emaildomain', 'R_emaildomain'
#         ]

#for col in i_cols:
#    temp_df = pd.concat([train_df[[col]], test_df[[col]]])
#    fq_encode = temp_df[col].value_counts().to_dict()   
#    train_df[col+'_fq_enc'] = train_df[col].map(fq_encode)
#    test_df[col+'_fq_enc']  = test_df[col].map(fq_encode)

In [None]:
## ProductCD and M4 Target mean
#for col in ['ProductCD','M4']:
#    temp_dict = train_df.groupby([col])[TARGET].agg(['mean']).reset_index().rename(
                                                  #      columns={'mean': col+'_target_mean'})
#    temp_dict.index = temp_dict[col].values
#    temp_dict = temp_dict[col+'_target_mean'].to_dict()

#    train_df[col+'_target_mean'] = train_df[col].map(temp_dict)
#    test_df[col+'_target_mean']  = test_df[col].map(temp_dict)

In [None]:
## Encode Str columns
#for col in list(train_df):
#    if train_df[col].dtype=='O':
#        print(col)
#        train_df[col] = train_df[col].fillna('unseen_before_label')
#        test_df[col]  = test_df[col].fillna('unseen_before_label')
        
#        train_df[col] = train_df[col].astype(str)
#        test_df[col] = test_df[col].astype(str)
        
#        le = LabelEncoder()
#        le.fit(list(train_df[col])+list(test_df[col]))
#        train_df[col] = le.transform(train_df[col])
#        test_df[col]  = le.transform(test_df[col])
        
#        train_df[col] = train_df[col].astype('category')
#        test_df[col] = test_df[col].astype('category')