### Load Packages

In [None]:
%matplotlib inline
import pandas as pd
import numpy as np
import scipy as sp
from scipy import stats
import seaborn as sns

import gc, os, sys, re, time
import lightgbm as lgb
from sklearn.metrics import roc_auc_score
from collections import Counter, defaultdict
import matplotlib.pyplot as plt

### Load CSV data

In [None]:
# Directory that holds the competition CSV files; listing it shows which files are available.
path='../input/ieee-fraud-detection'
os.listdir(path)

In [None]:
# Read the identity and transaction tables of the training split from `path`.
train_identity=pd.read_csv(os.path.join(path,'train_identity.csv'))
train_transaction=pd.read_csv(os.path.join(path,'train_transaction.csv'))
print('Training Dataset is loaded')

# Read the same two tables for the test split.
test_identity=pd.read_csv(os.path.join(path,'test_identity.csv'))
test_transaction=pd.read_csv(os.path.join(path,'test_transaction.csv'))
print('Test Dataset is loaded')

### Checking the target class

Transaction Table<br>
Categorical Features:
* ProductCD
* card1 - card6
* addr1, addr2
* P_emaildomain
* R_emaildomain
* M1 - M9

Identity Table<br>
Categorical Features:
* DeviceType
* DeviceInfo
* id_12 - id_38

[reference discussion](https://www.kaggle.com/c/ieee-fraud-detection/discussion/101203)

In [None]:
# Count rows per target class. sort_index() orders the counts by class label (0, 1)
# instead of by frequency, so every bar is guaranteed to sit under its own label.
class_counts = train_transaction['isFraud'].value_counts().sort_index()
x = class_counts.values

# x/y are passed by keyword: recent seaborn releases no longer accept positional data vectors.
ax = sns.barplot(x=class_counts.index, y=x)
ax.set(title='Class distribution w.rt target variables', xlabel = 'Target Class Count', ylabel='Count')

This graph shows the distribution of the target class, Fraud.

In [None]:
# Merge data (transaction and identity)
# Join on the transaction id instead of on row position: an index-on-index merge attaches
# identity row k to transaction row k regardless of which transaction it belongs to.
# The identity key is renamed first so the result keeps the TransactionID_x / TransactionID_y
# column names that the rest of the notebook relies on.
# NOTE(review): assumes TransactionID is unique in the identity table - confirm.
train = train_transaction.merge(
    train_identity.rename(columns={'TransactionID': 'TransactionID_y'}),
    how='left', left_on='TransactionID', right_on='TransactionID_y')
# Relabel in place (no data copy) to restore the name the index-merge used to produce.
train.columns = ['TransactionID_x' if col == 'TransactionID' else col for col in train.columns]
y_train = train['isFraud'].astype('uint8')
print('Train shape = ', train.shape)

# Delete train identity and transaction for system memory
del train_identity, train_transaction
# info() prints its report itself and returns None, so it is not wrapped in print().
train.info()

In [None]:
# Also Merge Test data
# Join on the transaction id instead of on row position (see the train merge): the identity
# key is renamed so the result still exposes TransactionID_x / TransactionID_y.
# NOTE(review): assumes TransactionID is unique in the identity table - confirm.
test = test_transaction.merge(
    test_identity.rename(columns={'TransactionID': 'TransactionID_y'}),
    how='left', left_on='TransactionID', right_on='TransactionID_y')
# Relabel in place (no data copy) to restore the name the index-merge used to produce.
test.columns = ['TransactionID_x' if col == 'TransactionID' else col for col in test.columns]

del test_identity, test_transaction

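### Preventing memory exhaustion
To reduce memory usage, integer columns are downcast to the smallest integer type that can hold their values (down to int8), <br>
and float columns are downcast the same way (down to float16).<br>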

In [None]:
def reduce_memory_usage(df):
    """
    Downcast the columns of ``df`` in place to smaller dtypes and report the saving.

    * object / string columns are converted to ``category``;
    * signed integer columns are cast to the smallest of int8/int16/int32/int64
      whose range contains the column's min and max (bounds are inclusive);
    * float columns are cast to float16 or float32 when their min and max fit,
      otherwise to float64 (this also covers all-NaN columns);
    * every other dtype (category, bool, unsigned int, datetime, extension dtypes)
      is left untouched, which makes the function safe to call repeatedly.

    Note that float16 keeps only about three significant decimal digits, so the
    downcast trades precision for memory.

    :param df: pandas DataFrame; it is modified in place
    :return: the same DataFrame object
    """
    start_memory = df.memory_usage().sum() / 1024**2
    print('Memory Usage in beginning is {:.2f} MB'.format(start_memory))
    int_types = (np.int8, np.int16, np.int32, np.int64)
    float_types = (np.float16, np.float32)
    for col in df.columns:
        coltype = df[col].dtype
        # Already categorical (e.g. on a second call): min()/max() would raise on it.
        if isinstance(coltype, pd.CategoricalDtype):
            continue
        if pd.api.types.is_object_dtype(coltype) or pd.api.types.is_string_dtype(coltype):
            df[col] = df[col].astype('category')
            continue
        # Only plain numpy signed-int and float columns are downcast.
        if not isinstance(coltype, np.dtype) or coltype.kind not in 'if':
            continue
        cmin = df[col].min()
        cmax = df[col].max()
        if coltype.kind == 'i':
            for candidate in int_types:
                limits = np.iinfo(candidate)
                if limits.min <= cmin and cmax <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_types:
                limits = np.finfo(candidate)
                if limits.min <= cmin and cmax <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Out of float32 range, or min/max is NaN/inf: keep full precision.
                df[col] = df[col].astype(np.float64)
    end_memory = df.memory_usage().sum() / 1024**2
    diff = start_memory - end_memory
    print('The memory now is {} MB'.format(end_memory))
    # Guard the ratio so a zero-byte frame cannot trigger a division by zero.
    reduction = 100 * (diff / start_memory) if start_memory else 0.0
    print('Memory is reduced to a tune of {:.2f}%'.format(reduction))
    return df

In [None]:
# Downcast both frames; the function prints the memory footprint before and after.
train=reduce_memory_usage(train)
test=reduce_memory_usage(test)

By downcasting the columns this way, we save about 72% of the memory.

### Match ID section
The identity columns are named in two different ways: `id_17` in the training data and `id-17` in the test data. <br>
To reconcile them, we first define a function that finds the columns that differ between the two frames.

In [None]:
def differentcols(df_train,df_test):
    """Print, one per line, every column label of ``df_train`` that ``df_test`` lacks."""
    only_in_train = [label for label in df_train if label not in df_test]
    for label in only_in_train:
        print(label)

#### 여러번 일일이 써서 맞추는게 싫어서 짠 코든데 안돌아간다...왜지

In [None]:
def differentcols2(df_train, df_test):
    """
    Print the train-only columns (except ``isFraud``) and return ``df_test`` with its
    hyphenated twins renamed to the train spelling.

    For a train column such as ``id_01`` the expected test spelling is built by
    replacing the third character with ``-`` (``id-01``). When that column exists
    in ``df_test`` it is renamed to the train label. Rebinding the loop variable,
    as the earlier version did, changes no column name, which is why it had no effect.

    :param df_train: DataFrame whose column names are the reference spelling
    :param df_test: DataFrame whose columns may use ``-`` instead of ``_``
    :return: a renamed copy of ``df_test``; ``df_test`` itself is not modified
    """
    rename_map = {}
    for col in df_train:
        if col in df_test or col == 'isFraud':
            continue
        print(col)
        hyphenated = col[:2] + "-" + col[3:]
        if hyphenated in df_test:
            rename_map[hyphenated] = col
    return df_test.rename(columns=rename_map)

In [None]:
# List the train columns that have no identically named column in test.
differentcols(train,test)

In [None]:
# Map the hyphenated names id-01 ... id-38 onto the underscore spelling id_01 ... id_38.
id_rename_map = {'id-{:02d}'.format(num): 'id_{:02d}'.format(num) for num in range(1, 39)}
test=test.rename(columns=id_rename_map)

In [None]:
# Keep the test transaction ids for the submission file; the column is dropped from test later on.
test_id=test['TransactionID_x']
# Re-run the column comparison after the rename.
differentcols(train,test)

Only `isFraud` remains.

### Missing Value

In [None]:
def get_missing_values(data):
    """
    Summarise missing values per column.

    :param data: pandas DataFrame
    :return: DataFrame indexed by column name with ``total`` (number of missing
             cells) and ``percent`` (missing cells divided by the row count, i.e.
             a fraction), each sorted in descending order before being combined
    """
    null_mask = data.isnull()
    null_counts = null_mask.sum()
    row_counts = null_mask.count()
    total = null_counts.sort_values(ascending=False)
    percent = (null_counts / row_counts).sort_values(ascending=False)
    return pd.concat([total, percent], axis=1, keys=['total', 'percent'])

In [None]:
# Missing-value summary for train; show its first 100 rows, transposed for width.
missing_data_train = get_missing_values(train)
missing_data_train.head(100).T

In [None]:
# Missing-value summary for test; show its first 100 rows, transposed for width.
missing_data_test=get_missing_values(test)
missing_data_test.head(100).T

In [None]:
# drop missing data
dropped_cols=missing_data_train[missing_data_train['total']>100000].index

train=train.drop(dropped_cols,axis=1)
test=test.drop(dropped_cols,axis=1)

dropped_cols

Most columns with many missing values have now been dropped.<br>
However, a number of missing values still remain.

In [None]:
# Total number of missing cells left in each frame.
print(train.isnull().sum().sum())
print(test.isnull().sum().sum())

print('Train shape: ', train.shape)
print('Test shape: ', test.shape)

In [None]:
# Recompute the summaries and collect the columns that still contain missing values.
# NOTE(review): dropped_cols_train / dropped_cols_test are not used again in the visible
# part of the notebook - confirm whether they are still needed.
missing_data_train=get_missing_values(train)
missing_data_test=get_missing_values(test)
dropped_cols_train=missing_data_train[missing_data_train['percent']>0].index
dropped_cols_test=missing_data_test[missing_data_test['percent']>0].index

Drop a feature judged not useful: the email domain

In [None]:
# Frequency of each P_emaildomain value in train.
train['P_emaildomain'].value_counts()

In [None]:
# Remove P_emaildomain from both frames (in place).
train.drop('P_emaildomain',axis=1,inplace = True)
test.drop('P_emaildomain',axis=1,inplace = True)

In [None]:
# Refresh both missing-value summaries after the drop and display the train one.
missing_data_train=get_missing_values(train)
missing_data_test=get_missing_values(test)
missing_data_train

Drop the columns that have more than 15000 missing values in the training data from both train and test.

In [None]:
# Columns with more than 15000 missing values in train; the same list is dropped from test.
dropped_cols_train_1=missing_data_train[missing_data_train['total']>15000].index
train.drop(dropped_cols_train_1,axis=1,inplace=True)
test.drop(dropped_cols_train_1,axis=1,inplace=True)

print('Train shape: ', train.shape)
print('Test shape: ', test.shape)

#### Find missing values again after concatenating the train and test data

In [None]:
# Row counts, kept so the combined frame can be split back into train and test later.
ntrain=train.shape[0] # number of train rows
ntest=test.shape[0] # number of test rows

# Stack train on top of test; sort=False keeps the column order as encountered.
all_data = pd.concat([train, test], axis=0, sort=False)
all_data.shape

In [None]:
all_data_cols=all_data.columns
for i in all_data_cols:
    col_dtype = all_data[i].dtype
    # Text columns may arrive here either as plain object columns or as categoricals
    # (reduce_memory_usage converts object columns to 'category'), so both are filled.
    if col_dtype == 'object' or isinstance(col_dtype, pd.CategoricalDtype):
        modes = all_data[i].mode()
        # An entirely missing column has no mode; leave it untouched instead of raising.
        if len(modes) > 0:
            all_data[i]=all_data[i].fillna(modes.iloc[0])

In [None]:
# Missing-value summary of the combined frame after filling the text columns.
missing_data=get_missing_values(all_data)
missing_data.head(100).T

Fill the missing values of the columns whose names start with 'C' or 'V' with the column's mode


In [None]:
# Fill every C*/V* column that still has gaps with its most frequent value.
cv_prefixes = ("C", "V")
for col_name in all_data_cols:
    if not col_name.startswith(cv_prefixes):
        continue
    if all_data[col_name].isnull().sum() > 0:
        most_common = all_data[col_name].mode()[0]
        all_data[col_name] = all_data[col_name].fillna(most_common)

In [None]:
# Missing-value summary after the C*/V* fill, ordered by the number of missing cells.
missing_data=get_missing_values(all_data)
missing_data.sort_values('total',ascending=False)

The variables which do not have any common names should be filled manually.

In [None]:
# Remaining columns with gaps, each filled with its own most frequent value.
for col_name in ('card3', 'D1', 'card2', 'card4', 'card5'):
    most_common = all_data[col_name].mode()[0]
    all_data[col_name] = all_data[col_name].fillna(most_common)
all_data.isnull().sum()

Separate the train and test rows again using ntrain and ntest.

In [None]:
# The first ntrain rows of the combined frame are the training rows, the rest are test rows.
train=all_data.iloc[:ntrain]
# drop() returns a new frame, so the target is removed from the test rows without
# modifying a slice of all_data in place (which triggers pandas' chained-assignment warning).
test=all_data.iloc[ntrain:].drop(['isFraud'], axis=1)

### Preview of train and test dataset

In [None]:
# Row counts of the two splits (the printed labels say "shape" but these are row counts).
ntrain=train.shape[0]
ntest=test.shape[0]
print('Train shape : ', ntrain)
print('Test shape : ', ntest)
# Stack the splits again so the encoding below is applied to both at once.
alldata=pd.concat([train,test],axis=0,sort=False)

In [None]:
# Encode the combined frame with pd.get_dummies (default arguments) and report the shape change.
print('Previous all data shape : ', alldata.shape)
alldata=pd.get_dummies(alldata)
print('After all data shape : ', alldata.shape)

In [None]:
# Split the encoded frame back into the first ntrain rows (train) and the remainder (test).
train=alldata[:ntrain]
test=alldata[ntrain:]

print('Train shape : ', train.shape)
print('Test shape : ', test.shape)

Drop useless feature

In [None]:
# Target column, saved before isFraud is dropped from the feature matrix.
target = train['isFraud']

In [None]:
# train and test are slices of alldata, so the columns are removed with a non-mutating
# drop (which returns a new frame) rather than inplace=True on a slice.
train = train.drop(['TransactionID_x', 'isFraud'], axis=1)
test = test.drop(['TransactionID_x', 'isFraud'], axis=1)


print('Train shape : ', train.shape)
print('Test shape : ', test.shape)

The total number of features goes from 122 to 120.

### Overview of Features

[EDA Reference](https://www.kaggle.com/pavitrasprabhu/eda-on-fraud-detection-dataset-in-python)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Slices of the encoded frame taken directly from alldata, so they still carry the
# isFraud and TransactionID_x columns that were dropped from train/test.
train_set=alldata[:ntrain]
test_set=alldata[ntrain:]

In [None]:
# Columns whose name starts with "C" in the training features.
Cx_plot_train = (train.filter(regex=("^C.*")))

corr_train = Cx_plot_train.corr(method="pearson")
# New figure; the heatmap below is drawn on it because it is the current axes.
fig, ax = plt.subplots(figsize=(7, 4))

# Generate a custom diverging colormap
cmap = sns.diverging_palette(0, 250, as_cmap=True)

# Draw the heatmap with square cells (no mask is applied)
sns.heatmap(corr_train, cmap=cmap, vmax=1, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5})
plt.title("Pairwise Correlation heatmap \n(Cx variables: train set)")

## Correlation heatmap for test set
Cx_plot_test=(test.filter(regex=("^C.*")))

corr_test = Cx_plot_test.corr(method="pearson")
fig, ax = plt.subplots(figsize=(7, 4))

# Generate a custom diverging colormap
cmap = sns.diverging_palette(0, 145, as_cmap=True)

# Draw the heatmap with square cells (no mask is applied)
sns.heatmap(corr_test, cmap=cmap, vmax=1, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5})
plt.title("Pairwise Correlation heatmap \n(Cx variables: test set)")

# Release the temporary column subsets.
del Cx_plot_test
del Cx_plot_train
gc.collect()

D_ features are deleted since they have a lot of missing data

In [None]:
# Columns whose name starts with "D" in the training features.
Dx_plot_train = (train.filter(regex=("^D.*")))

corr_train = Dx_plot_train.corr(method="pearson")
# New figure; the heatmap below is drawn on it because it is the current axes.
fig, ax = plt.subplots(figsize=(7, 4))

# Generate a custom diverging colormap
cmap = sns.diverging_palette(0, 250, as_cmap=True)

# Draw the heatmap with square cells (no mask is applied)
sns.heatmap(corr_train, cmap=cmap, vmax=1, center=0, square=True, linewidths=.5, cbar_kws={"shrink": .5})
plt.title("Pairwise Correlation heatmap \n(Dx variables: train set)")


## Correlation heatmap for test set
Dx_plot_test=(test.filter(regex=("^D.*")))

corr_test = Dx_plot_test.corr(method="pearson")
fig, ax = plt.subplots(figsize=(7, 4))

# Generate a custom diverging colormap
cmap = sns.diverging_palette(0, 145, as_cmap=True)

# Draw the heatmap with square cells (no mask is applied)
sns.heatmap(corr_test, cmap=cmap, vmax=1, center=0, square=True, linewidths=.5, cbar_kws={"shrink": .5})
plt.title("Pairwise Correlation heatmap \n(Dx variables: test set)")


# Release the temporary column subsets.
del Dx_plot_test
del Dx_plot_train
gc.collect()

In [None]:
# Train
Vxxx_plot_train = (train.filter(regex=("^V.*")).reset_index())

from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer

Vxxx_plot_train.fillna(-999,inplace=True)
pca = PCA(n_components=20)
pc = pca.fit_transform(Vxxx_plot_train)
print(pca.explained_variance_ratio_)

del Vxxx_plot_train
gc.collect()

PC_values = np.arange(pca.n_components_) + 1
plt.plot(PC_values, pca.explained_variance_ratio_, 'ro-', linewidth=2)
plt.title('Scree Plot')
plt.xlabel('Principal Component')
plt.ylabel('Proportion of Variance Explained')
plt.show()

In [None]:
# Test
Vxxx_plot_test = (test.filter(regex=("^V.*")).reset_index())

Vxxx_plot_test.fillna(-999,inplace=True)
pca = PCA(n_components=20)
pc = pca.fit_transform(Vxxx_plot_test)
print(pca.explained_variance_ratio_)

del Vxxx_plot_test
gc.collect()

PC_values = np.arange(pca.n_components_) + 1
plt.plot(PC_values, pca.explained_variance_ratio_, 'ro-', linewidth=2)
plt.title('Scree Plot')
plt.xlabel('Principal Component')
plt.ylabel('Proportion of Variance Explained')
plt.show()

In [None]:
# Reshape the card columns to long form (one row per card variable and transaction),
# keeping isFraud and TransactionAmt as identifier columns.
card_plt = (train_set[['card1','card2','card3','card5','isFraud','TransactionAmt']]
.melt(id_vars=['isFraud','TransactionAmt']))

# Box plots of each card variable's values split by isFraud, one panel per variable
g = sns.catplot(data=card_plt,
                x="isFraud", y="value",kind="box",
                col = "variable",hue="isFraud",sharey=False,sharex=False)
## Drop 2 Variables
# NOTE(review): card_drop_var is only defined here; no drop using it appears in the
# visible part of the notebook - confirm whether it was meant to be applied.
card_drop_var = ['card1','card2']

### Build XGB model

In [None]:
import xgboost as xgb
# Fit an XGBoost classifier with default hyper-parameters on the full training matrix.
xgmodel = xgb.XGBClassifier()
xgmodel.fit(train, target)

In [None]:
# Class probabilities for the test rows; column 1 is used for the submission below.
y_pred  = xgmodel.predict_proba(test)
print('Prediction shape : ', y_pred.shape)

In [None]:
# Build the submission table: the transaction id plus the predicted probability of class 1.
sub=pd.DataFrame()
# The id column is named TransactionID to match the competition's submission header
# (the previous 'Transaction Id' label does not match it).
sub['TransactionID']=test_id
sub['isFraud']=y_pred[:,1]
sub.to_csv('Predictions.csv',index=False)
sub.head()

### Build LGBM model

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold, KFold, RepeatedKFold, GroupKFold, GridSearchCV, train_test_split, TimeSeriesSplit
from sklearn import metrics
from sklearn import linear_model

In [None]:
n_fold = 5
# Time-ordered splitter. NOTE(review): `fols` is not used in the visible part of the
# notebook - presumably an alternative to `folds`; confirm.
fols = TimeSeriesSplit(n_splits=n_fold)
# Use n_fold here as well so both splitters stay in sync when the constant is changed.
folds = KFold(n_splits=n_fold)

In [None]:
from itertools import product
from numba import jit

import altair as alt
from altair.vega import v5
from IPython.display import HTML

# using ideas from this kernel: https://www.kaggle.com/notslush/altair-visualization-2018-stackoverflow-survey
def prepare_altair():
    """
    Build the requirejs configuration snippet used to load the vega libraries.

    The snippet maps the module names vega, vega-lib, vega-lite and vega-embed to
    jsdelivr CDN URLs (each suffixed with "?noext") and is returned as a string.
    """
    noext = "?noext"
    cdn_urls = (
        ('vega', 'https://cdn.jsdelivr.net/npm/vega@' + v5.SCHEMA_VERSION),
        ('vega-lib', 'https://cdn.jsdelivr.net/npm/vega-lib'),
        ('vega-lite', 'https://cdn.jsdelivr.net/npm/vega-lite@' + alt.SCHEMA_VERSION),
        ('vega-embed', 'https://cdn.jsdelivr.net/npm/vega-embed@3'),
    )
    paths = {module: url + noext for module, url in cdn_urls}

    return f"""    requirejs.config({{
        baseUrl: 'https://cdn.jsdelivr.net/npm/',
        paths: {paths}
    }});
    """
    

def add_autoincrement(render_func):
    """
    Decorator that gives every rendered chart a unique <div/> id.

    The wrapper remembers how many times each base ``id`` has been requested. The
    first request uses the id unchanged; later requests get ``-1``, ``-2``, ...
    appended. With ``autoincrement=False`` the id is passed through untouched
    (and registered if it has not been seen yet).
    """
    # Per-decorated-function counter of how often each base id has been used.
    seen = {}

    def wrapped(chart, id="vega-chart", autoincrement=True):
        if not autoincrement:
            seen.setdefault(id, 0)
            return render_func(chart, id=id)
        seen[id] = seen[id] + 1 if id in seen else 0
        count = seen[id]
        actual_id = id + '-' + str(count) if count != 0 else id
        return render_func(chart, id=actual_id)

    return wrapped
           

@add_autoincrement
def render(chart, id="vega-chart"):
    """
    Helper function to plot altair visualizations.

    :params: chart - either a dict holding a chart specification (serialised with
             json.dumps) or an object exposing ``to_json`` (serialised with
             ``chart.to_json(indent=None)``)
    :params: id - id of the <div/> that receives the chart
    :return: an IPython ``HTML`` object embedding the chart through vega-embed
    """
    # The notebook's import cell does not import json, so the dict branch below used
    # to raise NameError; importing it here keeps the function self-contained.
    import json

    chart_str = """
    <div id="{id}"></div><script>
    require(["vega-embed"], function(vg_embed) {{
        const spec = {chart};     
        vg_embed("#{id}", spec, {{defaultStyle: true}}).catch(console.warn);
        console.log("anything?");
    }});
    console.log("really...anything?");
    </script>
    """
    return HTML(
        chart_str.format(
            id=id,
            chart=json.dumps(chart) if isinstance(chart, dict) else chart.to_json(indent=None)
        )
    )
    

def reduce_mem_usage(df, verbose=True):
    """
    Downcast the numeric columns of ``df`` in place without losing float precision.

    * int16/int32/int64 columns are cast to the smallest of int8/int16/int32/int64
      whose range contains the column's min and max (bounds are inclusive);
    * float columns are kept as float32 only when they already have float32
      precision and their values fit; every other float column becomes float64;
    * all other columns are left untouched.

    :param df: pandas DataFrame; it is modified in place
    :param verbose: when true, print the resulting memory usage and the reduction
    :return: the same DataFrame object
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_types = (np.int8, np.int16, np.int32, np.int64)
    start_mem = df.memory_usage(deep=True).sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            for candidate in int_types:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            # Every element of a column shares the column dtype, so the precision is
            # read once from the dtype instead of once per element.
            c_prec = np.finfo(col_type).precision
            f32 = np.finfo(np.float32)
            if f32.min <= c_min and c_max <= f32.max and c_prec == f32.precision:
                df[col] = df[col].astype(np.float32)
            else:
                df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage(deep=True).sum() / 1024**2
    if verbose:
        # Guard the ratio so a zero-byte frame cannot trigger a division by zero.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df
    

@jit
def fast_auc(y_true, y_prob):
    """
    fast roc_auc computation: https://www.kaggle.com/c/microsoft-malware-prediction/discussion/76013

    Orders the labels by ascending score, then for every label adds
    ``label * (number of zero labels seen so far)`` to a running total and
    divides that total by ``nfalse * (n - nfalse)``.

    :params: y_true - labels; the arithmetic below treats them as 0/1 values
    :params: y_prob - scores, used only to order the labels (via np.argsort)

    NOTE(review): with 0/1 labels the denominator is zero when all labels are equal -
    confirm callers never pass a single-class fold.
    NOTE(review): tied scores keep whatever order np.argsort gives them; they are not
    averaged - presumably acceptable here, confirm.
    """
    y_true = np.asarray(y_true)
    # Reorder the labels so they follow increasing predicted score.
    y_true = y_true[np.argsort(y_prob)]
    nfalse = 0
    auc = 0
    n = len(y_true)
    for i in range(n):
        y_i = y_true[i]
        # Running count of zero labels seen so far.
        nfalse += (1 - y_i)
        # A label of 1 contributes the number of zero labels ranked before it.
        auc += y_i * nfalse
    # After the loop nfalse is the total number of zero labels.
    auc /= (nfalse * (n - nfalse))
    return auc


def eval_auc(y_true, y_pred):
    """
    Evaluation callback for lgb built on fast_auc.

    Returns a 3-tuple: the metric name 'auc', the score computed by fast_auc, and
    True as the third element.
    """
    score = fast_auc(y_true, y_pred)
    return ('auc', score, True)


def group_mean_log_mae(y_true, y_pred, types, floor=1e-9):
    """
    Mean over groups of the log of each group's mean absolute error.

    Metric for this competition: https://www.kaggle.com/c/champs-scalar-coupling
    Code is from this kernel: https://www.kaggle.com/uberkinder/efficient-metric

    :params: y_true - pandas Series of true values
    :params: y_pred - predictions, subtracted element-wise from y_true
    :params: types - group key passed to groupby
    :params: floor - lower bound applied to each group's MAE before taking the log
    """
    abs_error = (y_true - y_pred).abs()
    mae_per_group = abs_error.groupby(types).mean()
    floored = mae_per_group.map(lambda value: max(value, floor))
    log_mae = np.log(floored)
    return log_mae.mean()
    

def train_model_regression(X, X_test, y, params, folds=None, model_type='lgb', eval_metric='mae', columns=None, plot_feature_importance=False, model=None,
                               verbose=10000, early_stopping_rounds=200, n_estimators=50000, splits=None, n_folds=3):
    """
    A function to train a variety of regression models.
    Returns dictionary with oof predictions, test predictions, scores and, if necessary, feature importances.
    
    :params: X - training data, can be pd.DataFrame or np.ndarray (after normalizing)
    :params: X_test - test data, can be pd.DataFrame or np.ndarray (after normalizing)
    :params: y - target
    :params: params - model hyper-parameters, forwarded to the lgb / xgb / catboost model
    :params: folds - folds to split data; used when `splits` is None
    :params: model_type - type of model to use: 'lgb', 'xgb', 'sklearn' or 'cat'
    :params: eval_metric - metric to use: 'mae', 'group_mae' or 'mse'
    :params: columns - columns to use. If None - use all columns
    :params: plot_feature_importance - whether to plot feature importance of LGB
    :params: model - sklearn model, works only for "sklearn" model type
    :params: verbose - verbosity passed to the boosters; a falsy value also silences the fold banner
    :params: early_stopping_rounds - early stopping patience for the lgb model
    :params: n_estimators - number of estimators for the lgb model
    :params: splits - optional iterable of (train_index, valid_index) pairs used instead of `folds`
    :params: n_folds - number of pairs in `splits`; only used when `splits` is given
    :return: dict with 'oof', 'prediction', 'scores' and, for lgb with
             plot_feature_importance, 'feature_importance'
    
    """
    columns = X.columns if columns is None else columns
    X_test = X_test[columns]
    # The fold count must be resolved BEFORE `splits` is replaced by folds.split(X);
    # otherwise `splits` is never None at that point and the averages below would always
    # be divided by n_folds, even when `folds` defines a different number of splits.
    if splits is None:
        n_splits = folds.n_splits
        splits = folds.split(X)
    else:
        n_splits = n_folds
    
    # to set up scoring parameters
    metrics_dict = {'mae': {'lgb_metric_name': 'mae',
                        'catboost_metric_name': 'MAE',
                        'sklearn_scoring_function': metrics.mean_absolute_error},
                    'group_mae': {'lgb_metric_name': 'mae',
                        'catboost_metric_name': 'MAE',
                        'scoring_function': group_mean_log_mae},
                    'mse': {'lgb_metric_name': 'mse',
                        'catboost_metric_name': 'MSE',
                        'sklearn_scoring_function': metrics.mean_squared_error}
                    }

    
    result_dict = {}
    
    # out-of-fold predictions on train data
    oof = np.zeros(len(X))
    
    # averaged predictions on test data
    prediction = np.zeros(len(X_test))
    
    # list of scores on folds
    scores = []
    feature_importance = pd.DataFrame()
    
    # split and train on folds
    for fold_n, (train_index, valid_index) in enumerate(splits):
        if verbose:
            print(f'Fold {fold_n + 1} started at {time.ctime()}')
        if type(X) == np.ndarray:
            X_train, X_valid = X[columns][train_index], X[columns][valid_index]
            y_train, y_valid = y[train_index], y[valid_index]
        else:
            X_train, X_valid = X[columns].iloc[train_index], X[columns].iloc[valid_index]
            y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]
            
        if model_type == 'lgb':
            model = lgb.LGBMRegressor(**params, n_estimators = n_estimators, n_jobs = -1)
            model.fit(X_train, y_train, 
                    eval_set=[(X_train, y_train), (X_valid, y_valid)], eval_metric=metrics_dict[eval_metric]['lgb_metric_name'],
                    verbose=verbose, early_stopping_rounds=early_stopping_rounds)
            
            y_pred_valid = model.predict(X_valid)
            y_pred = model.predict(X_test, num_iteration=model.best_iteration_)
            
        if model_type == 'xgb':
            train_data = xgb.DMatrix(data=X_train, label=y_train, feature_names=X.columns)
            valid_data = xgb.DMatrix(data=X_valid, label=y_valid, feature_names=X.columns)

            watchlist = [(train_data, 'train'), (valid_data, 'valid_data')]
            model = xgb.train(dtrain=train_data, num_boost_round=20000, evals=watchlist, early_stopping_rounds=200, verbose_eval=verbose, params=params)
            y_pred_valid = model.predict(xgb.DMatrix(X_valid, feature_names=X.columns), ntree_limit=model.best_ntree_limit)
            y_pred = model.predict(xgb.DMatrix(X_test, feature_names=X.columns), ntree_limit=model.best_ntree_limit)
        
        if model_type == 'sklearn':
            # The estimator passed in as `model` is refitted on every fold.
            model.fit(X_train, y_train)
            
            y_pred_valid = model.predict(X_valid).reshape(-1,)
            score = metrics_dict[eval_metric]['sklearn_scoring_function'](y_valid, y_pred_valid)
            print(f'Fold {fold_n}. {eval_metric}: {score:.4f}.')
            print('')
            
            y_pred = model.predict(X_test).reshape(-1,)
        
        if model_type == 'cat':
            # NOTE(review): CatBoostRegressor is not imported in the visible part of the
            # notebook; this branch needs that import before it can run.
            model = CatBoostRegressor(iterations=20000,  eval_metric=metrics_dict[eval_metric]['catboost_metric_name'], **params,
                                      loss_function=metrics_dict[eval_metric]['catboost_metric_name'])
            model.fit(X_train, y_train, eval_set=(X_valid, y_valid), cat_features=[], use_best_model=True, verbose=False)

            y_pred_valid = model.predict(X_valid)
            y_pred = model.predict(X_test)
        
        oof[valid_index] = y_pred_valid.reshape(-1,)
        if eval_metric != 'group_mae':
            scores.append(metrics_dict[eval_metric]['sklearn_scoring_function'](y_valid, y_pred_valid))
        else:
            # The grouped metric reads its group key from the 'type' column of the fold.
            scores.append(metrics_dict[eval_metric]['scoring_function'](y_valid, y_pred_valid, X_valid['type']))

        prediction += y_pred    
        
        if model_type == 'lgb' and plot_feature_importance:
            # feature importance
            fold_importance = pd.DataFrame()
            fold_importance["feature"] = columns
            fold_importance["importance"] = model.feature_importances_
            fold_importance["fold"] = fold_n + 1
            feature_importance = pd.concat([feature_importance, fold_importance], axis=0)

    # Test predictions were summed over the folds; turn the sum into a mean.
    prediction /= n_splits
    print('CV mean score: {0:.4f}, std: {1:.4f}.'.format(np.mean(scores), np.std(scores)))
    
    result_dict['oof'] = oof
    result_dict['prediction'] = prediction
    result_dict['scores'] = scores
    
    if model_type == 'lgb':
        if plot_feature_importance:
            feature_importance["importance"] /= n_splits
            # Keep the 50 features with the highest mean importance for the plot.
            cols = feature_importance[["feature", "importance"]].groupby("feature").mean().sort_values(
                by="importance", ascending=False)[:50].index

            best_features = feature_importance.loc[feature_importance.feature.isin(cols)]

            plt.figure(figsize=(16, 12));
            sns.barplot(x="importance", y="feature", data=best_features.sort_values(by="importance", ascending=False));
            plt.title('LGB Features (avg over folds)');
            
            result_dict['feature_importance'] = feature_importance
        
    return result_dict
    


def train_model_classification(X, X_test, y, params, folds, model_type='lgb', eval_metric='auc', columns=None, plot_feature_importance=False, model=None,
                               verbose=10000, early_stopping_rounds=200, n_estimators=50000, splits=None, n_folds=3, averaging='usual', n_jobs=-1):
    """
    A function to train a variety of classification models.
    Returns dictionary with oof predictions, test predictions, scores and, if necessary, feature importances.
    
    :params: X - training data, can be pd.DataFrame or np.ndarray (after normalizing)
    :params: X_test - test data, can be pd.DataFrame or np.ndarray (after normalizing)
    :params: y - target
    :params: params - model hyper-parameters, forwarded to the lgb / xgb / catboost model
    :params: folds - folds to split data; used when `splits` is None
    :params: model_type - type of model to use: 'lgb', 'xgb', 'sklearn' or 'cat'
    :params: eval_metric - metric to use (only 'auc' is configured)
    :params: columns - columns to use. If None - use all columns
    :params: plot_feature_importance - whether to plot feature importance of LGB
    :params: model - sklearn model, works only for "sklearn" model type; must offer predict_proba
    :params: verbose - verbosity passed to the boosters
    :params: early_stopping_rounds - early stopping patience for lgb / xgb
    :params: n_estimators - number of boosting rounds / estimators
    :params: splits - optional iterable of (train_index, valid_index) pairs used instead of `folds`
    :params: n_folds - number of pairs in `splits`; only used when `splits` is given
    :params: averaging - 'usual' averages the fold predictions, 'rank' averages their ranks
    :params: n_jobs - number of parallel jobs for the lgb model
    :return: dict with 'oof', 'prediction', 'scores' and, for lgb with
             plot_feature_importance, 'feature_importance' and 'top_columns'
    :raises ValueError: if `averaging` is neither 'usual' nor 'rank'
    
    """
    # Reject an unknown averaging mode up front; previously it was only noticed after
    # all folds had been trained, as an undefined-variable error.
    if averaging not in ('usual', 'rank'):
        raise ValueError("averaging must be 'usual' or 'rank', got {!r}".format(averaging))

    columns = X.columns if columns is None else columns
    # Use caller-provided splits when given, so that the loop below and the divisor
    # n_splits always describe the same set of folds.
    if splits is None:
        n_splits = folds.n_splits
        splits = folds.split(X)
    else:
        n_splits = n_folds
    X_test = X_test[columns]
    
    # to set up scoring parameters
    metrics_dict = {'auc': {'lgb_metric_name': eval_auc,
                        'catboost_metric_name': 'AUC',
                        'sklearn_scoring_function': metrics.roc_auc_score},
                    }
    
    result_dict = {}
    # out-of-fold predictions on train data (same layout for both averaging modes)
    oof = np.zeros((len(X), 1))

    # averaged predictions on test data
    prediction = np.zeros((len(X_test), 1))

    # list of scores on folds
    scores = []
    feature_importance = pd.DataFrame()
    
    # split and train on folds
    for fold_n, (train_index, valid_index) in enumerate(splits):
        print(f'Fold {fold_n + 1} started at {time.ctime()}')
        if type(X) == np.ndarray:
            X_train, X_valid = X[columns][train_index], X[columns][valid_index]
            y_train, y_valid = y[train_index], y[valid_index]
        else:
            X_train, X_valid = X[columns].iloc[train_index], X[columns].iloc[valid_index]
            y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]
            
        if model_type == 'lgb':
            model = lgb.LGBMClassifier(**params, n_estimators=n_estimators, n_jobs = n_jobs)
            model.fit(X_train, y_train, 
                    eval_set=[(X_train, y_train), (X_valid, y_valid)], eval_metric=metrics_dict[eval_metric]['lgb_metric_name'],
                    verbose=verbose, early_stopping_rounds=early_stopping_rounds)
            
            y_pred_valid = model.predict_proba(X_valid)[:, 1]
            y_pred = model.predict_proba(X_test, num_iteration=model.best_iteration_)[:, 1]
            
        if model_type == 'xgb':
            train_data = xgb.DMatrix(data=X_train, label=y_train, feature_names=X.columns)
            valid_data = xgb.DMatrix(data=X_valid, label=y_valid, feature_names=X.columns)

            watchlist = [(train_data, 'train'), (valid_data, 'valid_data')]
            model = xgb.train(dtrain=train_data, num_boost_round=n_estimators, evals=watchlist, early_stopping_rounds=early_stopping_rounds, verbose_eval=verbose, params=params)
            y_pred_valid = model.predict(xgb.DMatrix(X_valid, feature_names=X.columns), ntree_limit=model.best_ntree_limit)
            y_pred = model.predict(xgb.DMatrix(X_test, feature_names=X.columns), ntree_limit=model.best_ntree_limit)
        
        if model_type == 'sklearn':
            # The estimator passed in as `model` is refitted on every fold.
            model.fit(X_train, y_train)
            
            # Positive-class probabilities, as in the lgb branch: AUC needs scores rather
            # than hard labels, and the accumulators below hold one column per row.
            y_pred_valid = model.predict_proba(X_valid)[:, 1]
            score = metrics_dict[eval_metric]['sklearn_scoring_function'](y_valid, y_pred_valid)
            print(f'Fold {fold_n}. {eval_metric}: {score:.4f}.')
            print('')
            
            y_pred = model.predict_proba(X_test)[:, 1]
        
        if model_type == 'cat':
            # NOTE(review): CatBoostClassifier is not imported in the visible part of the
            # notebook; this branch needs that import before it can run.
            # The loss name is passed as a string (a bare `Logloss` is an undefined name).
            model = CatBoostClassifier(iterations=n_estimators, eval_metric=metrics_dict[eval_metric]['catboost_metric_name'], **params,
                                      loss_function='Logloss')
            model.fit(X_train, y_train, eval_set=(X_valid, y_valid), cat_features=[], use_best_model=True, verbose=False)

            # Positive-class probabilities, for the same reason as in the sklearn branch.
            y_pred_valid = model.predict_proba(X_valid)[:, 1]
            y_pred = model.predict_proba(X_test)[:, 1]
        
        oof[valid_index] = y_pred_valid.reshape(-1, 1)
        scores.append(metrics_dict[eval_metric]['sklearn_scoring_function'](y_valid, y_pred_valid))

        if averaging == 'usual':
            prediction += y_pred.reshape(-1, 1)
        else:
            # 'rank': accumulate the rank of each test prediction instead of its value.
            prediction += pd.Series(y_pred).rank().values.reshape(-1, 1)        
        
        if model_type == 'lgb' and plot_feature_importance:
            # feature importance
            fold_importance = pd.DataFrame()
            fold_importance["feature"] = columns
            fold_importance["importance"] = model.feature_importances_
            fold_importance["fold"] = fold_n + 1
            feature_importance = pd.concat([feature_importance, fold_importance], axis=0)

    # Test predictions were summed over the folds; turn the sum into a mean.
    prediction /= n_splits
    
    print('CV mean score: {0:.4f}, std: {1:.4f}.'.format(np.mean(scores), np.std(scores)))
    
    result_dict['oof'] = oof
    result_dict['prediction'] = prediction
    result_dict['scores'] = scores
    
    if model_type == 'lgb':
        if plot_feature_importance:
            feature_importance["importance"] /= n_splits
            # Keep the 50 features with the highest mean importance for the plot.
            cols = feature_importance[["feature", "importance"]].groupby("feature").mean().sort_values(
                by="importance", ascending=False)[:50].index

            best_features = feature_importance.loc[feature_importance.feature.isin(cols)]

            plt.figure(figsize=(16, 12));
            sns.barplot(x="importance", y="feature", data=best_features.sort_values(by="importance", ascending=False));
            plt.title('LGB Features (avg over folds)');
            
            result_dict['feature_importance'] = feature_importance
            result_dict['top_columns'] = cols
        
    return result_dict

# setting up altair
# Build the requirejs configuration and wrap it in a <script> tag; as the last
# expression of the cell, the HTML object is displayed so the browser runs the snippet.
workaround = prepare_altair()
HTML("".join((
    "<script>",
    workaround,
    "</script>",
)))

In [None]:
# Hyper-parameters intended for the LightGBM classifier trained by train_model_classification.
params = {'num_leaves': 256,
          'min_child_samples': 79,
          'objective': 'binary',
          'max_depth': 13,
          'learning_rate': 0.03,
          "boosting_type": "gbdt",
          "subsample_freq": 3,
          "subsample": 0.9,
          "bagging_seed": 11,
          "metric": 'auc',
          "verbosity": -1,
          'reg_alpha': 0.3,
          'reg_lambda': 0.3,
          'colsample_bytree': 0.9,
         }
# NOTE(review): the disabled call below passes y=y, but no variable `y` is defined in the
# visible part of the notebook (the target is stored in `target`) - fix before re-enabling.
#result_dict_lgb = train_model_classification(X=train, X_test=test, y=y, params=params, folds=folds, model_type='lgb', eval_metric='auc', plot_feature_importance=True,
#                                                      verbose=500, early_stopping_rounds=200, n_estimators=5000, averaging='usual', n_jobs=-1)

In [None]:
#sub['isFraud'] = result_dict_lgb['prediction']
#sub.to_csv('submission.csv', index=False)

#pd.DataFrame(result_dict_lgb['oof']).to_csv('lgb_oof.csv', index=False)
#sub.head()