In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory recursively and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        full_path = os.path.join(dirname, filename)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
pip install xfeat

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
#from matplotlib_venn import venn2
import category_encoders as ce
%matplotlib inline

from xfeat import (SelectCategorical, LabelEncoder, Pipeline, ConcatCombination, SelectNumerical, 
                   ArithmeticCombinations, TargetEncoder, aggregation, GBDTFeatureSelector, GBDTFeatureExplorer)

from catboost import CatBoost
from catboost import CatBoostClassifier
from catboost import CatBoostRegressor
from catboost import Pool
from catboost import cv
from sklearn.metrics import mean_squared_log_error
import lightgbm as lgb
import xgboost as xgb
from tqdm import tqdm

import os
from glob import glob

from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import PCA
from scipy import stats
from sklearn.preprocessing import StandardScaler
from sklearn import metrics

import shap
import gc

from optuna.integration import _lightgbm_tuner as lgb_tuner
import optuna
from collections import Counter
pd.set_option('display.max_columns', 100)

import warnings
warnings.filterwarnings('ignore')

In [None]:
from contextlib import contextmanager
from time import time

@contextmanager
def timer(logger=None, format_str='{:.3f}[s]', prefix=None, suffix=None):
    """Context manager that reports the wall-clock time spent inside the ``with`` body.

    Parameters
    ----------
    logger : object with an ``info(str)`` method, optional
        When given (and truthy) the message is sent to ``logger.info``;
        otherwise it is printed.
    format_str : str
        ``str.format`` template that receives the elapsed seconds as its only
        positional argument.
    prefix, suffix : optional
        When truthy, converted with ``str`` and concatenated before / after
        ``format_str`` (so they pass through ``str.format`` as well).

    The elapsed time is reported even when the body raises; the exception then
    propagates unchanged after the message has been emitted.
    """
    if prefix:
        format_str = str(prefix) + format_str
    if suffix:
        format_str = format_str + str(suffix)

    start = time()
    try:
        yield
    finally:
        # Runs on both normal exit and exception, so failed steps are timed too.
        out_str = format_str.format(time() - start)
        if logger:
            logger.info(out_str)
        else:
            print(out_str)
        
def reduce_mem_usage(df):
    """Downcast the columns of ``df`` in place to reduce memory usage.

    * ``object`` (and pandas string-dtype) columns become ``category``.
    * Signed / unsigned NumPy integer columns are cast to the smallest integer
      type of the same signedness whose range contains the column's min and max
      (bounds inclusive).
    * NumPy float columns are cast to ``float16`` if min and max lie strictly
      inside its finite range, else ``float32`` by the same rule, else ``float64``.
      Note that ``float16`` keeps only about 3 significant decimal digits.
    * Every other dtype (bool, datetime, existing ``category``, nullable
      extension dtypes, ...) is left untouched, which makes the function safe to
      call repeatedly on the same frame.

    Prints the memory usage before and after and the percentage saved.

    Returns
    -------
    The same ``df`` object that was passed in (it is modified in place).
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    signed_ints = (np.int8, np.int16, np.int32, np.int64)
    unsigned_ints = (np.uint8, np.uint16, np.uint32, np.uint64)

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object or isinstance(col_type, pd.StringDtype):
            df[col] = df[col].astype('category')
            continue

        # Only plain NumPy integer/float columns have a meaningful numeric
        # downcast. Anything else (category, bool, datetime, extension dtypes)
        # is skipped instead of being pushed through min()/max() comparisons.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf':
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind == 'f':
            for candidate in (np.float16, np.float32):
                limits = np.finfo(candidate)
                if c_min > limits.min and c_max < limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Out of float32 range, infinite, or all-NaN: use full precision.
                df[col] = df[col].astype(np.float64)
        else:
            candidates = signed_ints if col_type.kind == 'i' else unsigned_ints
            for candidate in candidates:
                limits = np.iinfo(candidate)
                # Inclusive bounds: a column whose min is exactly -128 fits int8.
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            # No candidate matches only when min/max are NaN (empty column);
            # the column is then left as it is.

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

# Load Data

In [None]:
# Read the competition tables from the Kaggle input directory (paths are relative
# to the notebook's working directory): the training set, the test set and the
# sample submission file that is later filled with predictions.
train_df = pd.read_csv("../input/tabular-playground-series-mar-2021/train.csv")
test_df = pd.read_csv("../input/tabular-playground-series-mar-2021/test.csv")
sub_df = pd.read_csv("../input/tabular-playground-series-mar-2021/sample_submission.csv")

In [None]:
# Downcast both raw frames to save memory. reduce_mem_usage modifies the frame
# it is given and returns it; object columns come back as 'category' and float
# columns may be stored as low as float16.
train_df = reduce_mem_usage(train_df)
test_df = reduce_mem_usage(test_df)

In [None]:
# Preview the first rows of the downcast training frame.
train_df.head()

In [None]:
# baseBlock 

class BaseBlock:
    """Common interface of the feature blocks: ``fit`` and ``transform``.

    Subclasses have to provide ``transform``; the inherited ``fit`` simply
    forwards to it.
    """

    def fit(self, input_df, y=None):
        """Return ``self.transform(input_df)``; ``y`` is accepted but not used."""
        result = self.transform(input_df)
        return result

    def transform(self, input_df):
        """Placeholder that always raises ``NotImplementedError``."""
        raise NotImplementedError
        
# CountEncoding
class CountEncodingBlock(BaseBlock):
    """Count-encode ``cols`` with ``category_encoders.CountEncoder``.

    ``fit`` learns the encoder on the given frame and returns the encoded
    columns; ``transform`` then reuses that fitted encoder, mirroring the
    fit/transform split of the other encoding blocks in this notebook.
    Output columns are prefixed with ``CE_``.
    """

    def __init__(self, cols):
        self.cols = cols      # columns to encode
        self.encoder = None   # set by fit (or lazily by the first transform)

    def _fit_encoder(self, input_df):
        """Create a fresh CountEncoder and fit it on ``input_df[self.cols]``."""
        self.encoder = ce.CountEncoder()
        self.encoder.fit(input_df[self.cols])

    def fit(self, input_df, y=None):
        """Fit the encoder on ``input_df`` and return its encoded columns; ``y`` is unused."""
        self._fit_encoder(input_df)
        return self.transform(input_df)

    def transform(self, input_df):
        """Encode ``input_df[self.cols]`` with the already fitted encoder.

        Previously a new encoder was fitted on every call, so transforming a
        second frame silently used that frame's own counts. For backward
        compatibility a call made before any ``fit`` still fits on its input.
        """
        if self.encoder is None:
            self._fit_encoder(input_df)
        return self.encoder.transform(input_df[self.cols]).add_prefix("CE_")
    
# OneHotEncoding
class OneHotEncodingBlock(BaseBlock):
    """One-hot encode ``cols`` with ``category_encoders.OneHotEncoder``.

    The encoder is created with ``use_cat_names=True`` and every output column
    gets an ``OHE_`` prefix.
    """

    def __init__(self, cols):
        self.cols = cols
        self.encoder = None

    def fit(self, input_df, y=None):
        """Fit a new encoder on the selected columns and return them encoded; ``y`` is unused."""
        selected = input_df[self.cols]
        self.encoder = ce.OneHotEncoder(use_cat_names=True)
        self.encoder.fit(selected)
        return self.transform(selected)

    def transform(self, input_df):
        """Encode ``input_df[self.cols]`` with the encoder set by ``fit``."""
        encoded = self.encoder.transform(input_df[self.cols])
        return encoded.add_prefix("OHE_")

# OrdinalEncoding
class OrdinalEncodingBlock(BaseBlock):
    """Ordinal-encode ``cols`` with ``category_encoders.OrdinalEncoder``.

    Every output column gets an ``OE_`` prefix.
    """

    def __init__(self, cols):
        self.cols = cols
        self.encoder = None

    def fit(self, input_df, y=None):
        """Fit a new encoder on the selected columns and return them encoded; ``y`` is unused."""
        selected = input_df[self.cols]
        self.encoder = ce.OrdinalEncoder()
        self.encoder.fit(selected)
        return self.transform(selected)

    def transform(self, input_df):
        """Encode ``input_df[self.cols]`` with the encoder set by ``fit``."""
        encoded = self.encoder.transform(input_df[self.cols])
        return encoded.add_prefix("OE_")

In [None]:
def get_ce_features(input_df):
    """Return count-encoded versions of the columns ``cat0`` … ``cat18``.

    The columns are cast to ``str`` before being handed to
    ``CountEncodingBlock``. Only those 19 columns are selected and converted;
    the earlier version copied and stringified the entire frame even though the
    block reads nothing but these columns.
    """
    cols = [f"cat{i}" for i in range(19)]

    encoder = CountEncodingBlock(cols=cols)
    # Selecting with a list yields a new frame, so input_df itself is not modified.
    output_df = encoder.fit(input_df[cols].astype(str))
    return output_df

def get_oe_features(input_df):
    """Return ordinal-encoded versions of the columns ``cat0`` … ``cat18``.

    The frame is passed to ``OrdinalEncodingBlock`` directly: the block selects
    the columns it needs, so the full-frame copy that used to be made via
    ``pd.concat([input_df], axis=1)`` was unnecessary.
    """
    cols = [f"cat{i}" for i in range(19)]
    encoder = OrdinalEncodingBlock(cols=cols)
    output_df = encoder.fit(input_df)
    return output_df

def get_ohe_features(input_df):
    """Return one-hot-encoded versions of the columns ``cat0`` … ``cat18``.

    The frame is passed to ``OneHotEncodingBlock`` directly: the block selects
    the columns it needs, so the full-frame copy that used to be made via
    ``pd.concat([input_df], axis=1)`` was unnecessary.
    """
    cols = [f"cat{i}" for i in range(19)]
    encoder = OneHotEncodingBlock(cols=cols)
    output_df = encoder.fit(input_df)
    return output_df

# numeric_feature
def create_numeric_feature(input_df):
    """Return a copy of the eleven continuous columns ``cont0`` … ``cont10``.

    The columns are passed through unchanged; the result is independent of
    ``input_df``.
    """
    numeric_cols = ["cont{}".format(idx) for idx in range(11)]
    selected = input_df.loc[:, numeric_cols]
    return selected.copy()

In [None]:
def to_features(train, test, processes=None):
    """Build the model feature matrices for ``train`` and ``test``.

    Both frames are stacked (train first) with a fresh 0..n-1 index, every
    feature function is applied to the stacked frame, and the outputs are
    joined column-wise before being split back apart.

    Parameters
    ----------
    train, test : pandas.DataFrame
    processes : list of callables, optional
        Each callable takes the stacked frame and returns a frame with the same
        number of rows. Defaults to the notebook's ordinal, count and one-hot
        encoders plus the numeric pass-through, in that order.

    Returns
    -------
    (train_x, test_x) : the first ``len(train)`` rows and the remaining rows
    (re-indexed from 0) of the combined feature frame.

    Raises
    ------
    AssertionError
        If a feature function returns a different number of rows; the message
        is that function's ``__name__``.
    """
    input_df = pd.concat([train, test]).reset_index(drop=True)

    if processes is None:
        processes = [
            get_oe_features,
            get_ce_features,
            get_ohe_features,
            create_numeric_feature,
        ]

    # Collect the pieces and concatenate once, rather than re-copying a growing
    # frame on every iteration.
    feature_frames = []
    for func in tqdm(processes):
        _df = func(input_df)
        assert len(_df) == len(input_df), func.__name__
        feature_frames.append(_df)

    output_df = pd.concat(feature_frames, axis=1) if feature_frames else pd.DataFrame()

    n_train = len(train)
    train_x = output_df.iloc[:n_train]
    test_x = output_df.iloc[n_train:].reset_index(drop=True)
    return train_x, test_x

In [None]:
# Name of the label column in train_df.
target_data = "target" 

# Encode train and test together (to_features stacks them, builds the features
# and splits them back), then take the labels from the raw training frame.
train_x, test_x = to_features(train_df, test_df)
train_ys = train_df[target_data]

# LGBM

In [None]:
# LightGBM
def fit_lgbm(X, y, cv, params: dict=None, verbose: int=100, early_stopping_rounds: int=None):
    """Train one ``lgb.LGBMClassifier`` per CV fold and collect out-of-fold predictions.

    Parameters
    ----------
    X : array indexable with integer index arrays (e.g. ``DataFrame.values``).
    y : array-like of labels. Converted with ``np.asarray`` so fold indices are
        always applied positionally, even if a pandas Series with a
        non-default index is passed.
    cv : iterable of ``(idx_train, idx_valid)`` index-array pairs.
    params : dict, optional
        Keyword arguments for ``lgb.LGBMClassifier``; defaults to ``{}``.
    verbose : int
        Period (in boosting rounds) of the evaluation log.
    early_stopping_rounds : int, optional
        Patience for early stopping on the validation fold. Defaults to
        ``verbose``, which is what the function always used before this
        parameter existed.

    Returns
    -------
    oof_pred : float ndarray shaped like ``y`` holding, for every row, column 1
        of ``predict_proba`` from the model that had the row in its validation fold.
    models : list of the fitted classifiers, one per fold.
    """
    metric_func = roc_auc_score
    if params is None:
        params = {}
    if early_stopping_rounds is None:
        early_stopping_rounds = verbose

    y = np.asarray(y)

    models = []
    # Zero array with the same shape as the training target. It must be float,
    # otherwise the probabilities written into it would be truncated.
    # (The builtin float is used because the np.float alias was removed in NumPy 1.24.)
    oof_pred = np.zeros_like(y, dtype=float)

    for i, (idx_train, idx_valid) in enumerate(cv):
        # Cross-validation step: split the training data into train / valid
        # parts according to the indices supplied by cv.
        x_train, y_train = X[idx_train], y[idx_train]
        x_valid, y_valid = X[idx_valid], y[idx_valid]

        clf = lgb.LGBMClassifier(**params)

        fit_kwargs = {'eval_set': [(x_valid, y_valid)]}
        if hasattr(lgb, 'log_evaluation'):
            # Newer LightGBM: logging / early stopping are configured through
            # callbacks (the fit keywords below were removed in 4.0).
            callbacks = [lgb.log_evaluation(period=int(verbose))]
            if early_stopping_rounds > 0:
                callbacks.append(lgb.early_stopping(early_stopping_rounds, verbose=bool(verbose)))
            fit_kwargs['callbacks'] = callbacks
        else:
            # Older LightGBM: keyword-based API, exactly as originally written.
            fit_kwargs['early_stopping_rounds'] = early_stopping_rounds
            fit_kwargs['verbose'] = verbose

        with timer(prefix='fit fold={} '.format(i + 1)):
            clf.fit(x_train, y_train, **fit_kwargs)

        pred_i = clf.predict_proba(x_valid)[:, 1]

        oof_pred[idx_valid] = pred_i
        models.append(clf)

        # 1-based fold number, matching the 'fit fold=' timer message above.
        fold_score = metric_func(y_valid, pred_i)
        print(f'Fold {i + 1} ROC: {fold_score:.4f}')

    score = metric_func(y, oof_pred)
    print('FINISHED | Whole ROC: {:.4f}'.format(score))
    return oof_pred, models

# Hyperparameters handed to fit_lgbm, which unpacks them into lgb.LGBMClassifier(**params).
# NOTE(review): no random_state / seed is set here, so runs may not be exactly
# repeatable — confirm whether that is intended.
# NOTE(review): per the LightGBM docs, `subsample` only takes effect when
# `subsample_freq` is > 0; it is not set here — confirm.
params = {
    'learning_rate': 0.01,
    'metric': 'auc',
    'n_estimators': 10000,  # upper bound on boosting rounds; fit_lgbm also requests early stopping
    'num_leaves': 20067,
    'max_depth': 27,
    'reg_alpha': 9.630576598001266,
    'reg_lambda': 2.346945113164939,
    'colsample_bytree': 0.29858836720777177,
    'subsample': 0.6267448547447422,
}

In [None]:
# Downcast the training feature matrix before fitting. Only train_x is passed
# through reduce_mem_usage here; test_x keeps the dtypes produced by to_features.
train_x = reduce_mem_usage(train_x)

In [None]:
# 5-fold stratified, shuffled split with a fixed random_state.
fold = StratifiedKFold(n_splits=5, shuffle=True, random_state=71)
# Materialise the (train_idx, valid_idx) pairs as a list.
# NOTE: this rebinds the name `cv`, which was imported from catboost in the import cell.
cv = list(fold.split(train_x, train_ys))

# Fit one model per fold; `oof` receives the out-of-fold predictions and
# `models` the fitted classifiers.
oof, models = fit_lgbm(train_x.values, train_ys, cv, params=params)

In [None]:
def visualize_importance(models, feat_train_df):
    """Draw a boxen plot of per-fold feature importances for the top features.

    Parameters
    ----------
    models : sequence of fitted models exposing ``feature_importances_``
        (one per fold; fold numbers in the plot data start at 1).
    feat_train_df : pandas.DataFrame
        Frame whose columns name the features, in the order the models saw them.

    Features are ranked by their importance summed over all folds and at most
    the first 50 are shown, most important first.

    Returns
    -------
    (fig, ax) : the matplotlib figure and axes that were drawn on.
    """
    # One small frame per fold, concatenated once (rather than growing a
    # DataFrame inside the loop).
    per_fold = [
        pd.DataFrame({
            'feature_importance': model.feature_importances_,
            'column': feat_train_df.columns,
            'fold': fold_no,
        })
        for fold_no, model in enumerate(models, start=1)
    ]
    feature_importance_df = pd.concat(per_fold, axis=0, ignore_index=True)

    order = feature_importance_df.groupby('column')\
        .sum()[['feature_importance']]\
        .sort_values('feature_importance', ascending=False).index[:50]

    # Width grows with the number of plotted features, with a minimum of 6.
    fig, ax = plt.subplots(figsize=(max(6, len(order) * .4), 7))
    sns.boxenplot(data=feature_importance_df, x='column', y='feature_importance', order=order, ax=ax, palette='viridis')
    ax.tick_params(axis='x', rotation=90)
    ax.grid()
    fig.tight_layout()
    return fig, ax

In [None]:
# Plot the per-fold feature importances of the fitted models, labelled with train_x's columns.
fig, ax = visualize_importance(models, train_x)

In [None]:
# Column 1 of predict_proba from every fold model, stacked as one row per model.
pred1 = np.array([model.predict_proba(test_x.values)[:, 1] for model in models])
# Average over the models (axis 0) to get a single prediction per test row.
pred1 = np.mean(pred1, axis=0)

In [None]:
# Put the averaged predictions into the sample submission's "target" column and
# write it to submission.csv (without the index column).
sub_df["target"] = pred1
sub_df.to_csv('submission.csv', index=False)

In [None]:
# Preview the first rows of the submission frame that was just written.
sub_df.head()