2/2: https://www.kaggle.com/nagiss/9-solution-nagiss-part-2-2-weightshareing-nn

In [None]:
import gc
import os
import warnings
import numpy as np
import pandas as pd
import lightgbm as lgb
import time
import matplotlib.pyplot as plt
import sklearn
import seaborn as sns
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold

# Suppress every warning for the rest of the session.
warnings.filterwarnings(action="ignore")

# Directory the input CSV files are read from.
PATH = "../input/"

# Cross-validation setup: number of folds and the StratifiedKFold shuffle seed.
N_SPLITS, SEED_SKF = 10, 4221

In [None]:
def merge_train_test(df_train, df_test):
    """Stack the train and test frames into one frame with a fresh 0..n-1 index.

    If ``df_test`` has no ``"target"`` column, its rows get ``target = -1`` in
    the result. The caller's ``df_test`` is never modified: the column is
    added to a copy.

    Args:
        df_train: Training frame.
        df_test: Test frame, with or without a ``"target"`` column.

    Returns:
        A new DataFrame holding the rows of ``df_train`` followed by the rows
        of ``df_test``.
    """
    if "target" not in df_test.columns:
        # assign() returns a new frame, so the argument is left untouched.
        df_test = df_test.assign(target=-1)
    return pd.concat([df_train, df_test], ignore_index=True)

def split_train_test(df, n_train=200000, n_test=200000):
    """Split a merged frame back into its train and test parts.

    Rows with ``target >= 0`` are train rows; rows with ``target <= -1`` are
    test rows. Each part gets a fresh 0..n-1 index.

    Args:
        df: Merged frame with ``"target"`` and ``"ID_code"`` columns.
        n_train: Number of train rows expected (default 200000).
        n_test: Number of test rows expected (default 200000).

    Returns:
        Tuple ``(df_train, df_test)``.

    Raises:
        AssertionError: If the ``ID_code`` values of a part are not exactly
            ``train_0 .. train_{n_train-1}`` / ``test_0 .. test_{n_test-1}``
            in that order.
    """
    # reset_index(drop=True) returns a new frame, so nothing is modified in
    # place on a filtered slice of ``df``.
    df_train = df[df["target"] >= 0].reset_index(drop=True)
    df_test = df[df["target"] <= -1].reset_index(drop=True)

    # Sanity check: row order and row count must have survived the round trip.
    expected_train = [f"train_{i}" for i in range(n_train)]
    expected_test = [f"test_{i}" for i in range(n_test)]
    assert list(df_train["ID_code"].values) == expected_train, \
        "train rows are missing, duplicated or out of order"
    assert list(df_test["ID_code"].values) == expected_test, \
        "test rows are missing, duplicated or out of order"
    return df_train, df_test

In [None]:
%%time
# Read the train and test tables from the input directory.
train_df = pd.read_csv(f"{PATH}train.csv")
test_df = pd.read_csv(f"{PATH}test.csv")

In [None]:
class CountEncoder:
    """Encode each value of a Series by how often it occurred in the fitted Series."""

    def fit(self, series):
        """Store the occurrence count of every distinct value in ``series``."""
        # Series indexed by value; transform() looks values up in it via map().
        self.counts = series.groupby(series).count()

    def transform(self, series):
        """Replace each value with its fitted count; unseen values become 0.

        Returns:
            An ``int16`` Series, widened to ``int32`` only when some count
            does not fit in ``int16``.
        """
        encoded = series.map(self.counts).fillna(0)
        # Casting a count above 32767 to int16 would silently wrap around.
        # (max() of an empty Series is NaN, which compares False and keeps int16.)
        needs_int32 = encoded.max() > np.iinfo(np.int16).max
        return encoded.astype(np.int32 if needs_int32 else np.int16)

In [None]:
# separate into real and fake
#
# Every test value is encoded by how often it occurs within its own test
# column. A row whose minimum count over the 200 columns is 1 (it holds at
# least one value that is unique in its column) is labelled real (-1); every
# other row is labelled fake (-2).

cnt_cols = {}
for v in range(200):
    sr = test_df[f"var_{v}"]
    enc = CountEncoder()
    enc.fit(sr)
    cnt_cols[f"cnt_{v}"] = enc.transform(sr)
# Build the frame in one go instead of inserting 200 columns one at a time.
df_cnt = pd.DataFrame(cnt_cols)
# clip(upper=2): a row whose minimum count is 3 or more must still be labelled
# -2, otherwise the later `target != -2` filter would treat it as real.
test_df["target"] = -df_cnt.min(axis=1).clip(upper=2)  # target==-1 -> real, target==-2 -> fake
del df_cnt, cnt_cols

In [None]:
# Stack the train rows on top of the test rows and preview the last rows.
df_merged = merge_train_test(df_train=train_df, df_test=test_df)
df_merged.tail(n=5)

In [None]:
%%time

# count encoding
#
# Counts are fitted on every row except those labelled -2, then applied to all
# rows of the merged frame.

df_real = df_merged.loc[df_merged["target"].ne(-2)]
count_enc = []
for v in range(200):
    col = f"var_{v}"
    enc = CountEncoder()
    enc.fit(df_real[col])
    count_enc.append(enc.transform(df_merged[col]))

# Attach the encoded columns only after all of them have been computed.
for v, encoded in enumerate(count_enc):
    df_merged[f"cnt_{v}"] = encoded

del df_real

In [None]:
# Undo the merge: both parts now also carry the cnt_* columns.
train_df, test_df = split_train_test(df=df_merged)
target = train_df["target"]
gc.collect()
print(train_df.shape)
test_df.head(n=5)

# 1st step - make meta features

In [None]:
# LightGBM settings for the first-stage boosters (one model per variable).
param = dict(
    objective="binary",
    boost="gbdt",
    metric="auc",
    boost_from_average=False,
    learning_rate=0.01,
    num_leaves=5,
    max_depth=-1,
    tree_learner="serial",
    feature_fraction=1.0,
    bagging_freq=5,
    bagging_fraction=0.4,
    min_data_in_leaf=80,
    min_sum_hessian_in_leaf=10.0,
    verbosity=1,
    seed=44000,
)

In [None]:
target = train_df["target"]

# One two-column frame (raw value + its count) per variable; these are the
# inputs the first-stage models predict on.
df_merged_cut = []
for v in range(200):
    df_merged_cut.append(df_merged[[f"var_{v}", f"cnt_{v}"]])
gc.collect()

In [None]:
%%time
# First stage: for every fold, train one LightGBM model per variable on just
# (var_v, cnt_v), predict all rows of df_merged with it, and store the 200
# prediction columns of that fold in fold_<k>_meta.pickle.
#
# NOTE(review): the `verbose_eval` / `early_stopping_rounds` keyword arguments
# are not accepted by lgb.train in LightGBM >= 4.0 (callbacks replace them) --
# confirm the installed version before upgrading.
skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED_SKF)
num_round = 1000000  # upper bound only; early stopping decides the real length

# The stratified folds are derived from the labels alone, so a zero placeholder
# stands in for X. Passing train_df.values would materialise the whole
# mixed-dtype frame as one object array just to read its length.
fold_iter = skf.split(np.zeros(len(target)), target.values)

for fold_, (trn_idx, val_idx) in enumerate(fold_iter):
    print("fold n°{}".format(fold_))
    # Explicit copy: columns are added below, which must not happen on a
    # slice of df_merged.
    df_meta = df_merged[["ID_code", "target"]].copy()

    trn_X, trn_y = train_df.iloc[trn_idx], target.iloc[trn_idx]
    val_X, val_y = train_df.iloc[val_idx], target.iloc[val_idx]
    for v in range(200):
        print(f"var {v}")
        features = [f"var_{v}", f"cnt_{v}"]

        trn_data = lgb.Dataset(trn_X[features], label=trn_y)
        val_data = lgb.Dataset(val_X[features], label=val_y)

        clf = lgb.train(param,
                        trn_data,
                        num_round,
                        valid_sets=[trn_data, val_data],
                        verbose_eval=1000,
                        early_stopping_rounds=100)
        # Predict train and test rows alike, at the best iteration found.
        df_meta[f"{v}_meta"] = clf.predict(
            df_merged_cut[v], num_iteration=clf.best_iteration
        ).astype(np.float32)

    df_meta.to_pickle(f"fold_{fold_}_meta.pickle")

# 2nd step - prediction

In [None]:
# LightGBM settings for the second-stage model, trained on the 200 meta
# features: stumps (num_leaves=2), half of the features sampled per tree.
param = {
    "objective": "binary",
    "boost": "gbdt",
    "metric": "auc",
    # Boolean rather than the string "false", matching the first-stage params.
    "boost_from_average": False,
    "learning_rate": 0.01,
    "num_leaves": 2,
    "max_depth": -1,
    "tree_learner": "serial",
    "feature_fraction": 0.5,
    "bagging_freq": 5,
    "bagging_fraction": 0.4,
    "min_data_in_leaf": 80,
    "min_sum_hessian_in_leaf": 10.0,
    "verbosity": 1,
    "seed": 44000,
}

In [None]:

%%time
# Second stage: for every fold, load that fold's meta features, train one
# LightGBM model on them, fill the out-of-fold predictions and add the fold's
# share of the test prediction.
skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED_SKF)
oof = np.zeros(len(train_df))
predictions = np.zeros(len(test_df))
features = [f"{v}_meta" for v in range(200)]
num_round = 1000000  # upper bound only; early stopping decides the real length
fold_importances = []

# Same splitter settings and labels as the first stage, hence the same folds.
# The stratified folds depend on the labels only, so a zero placeholder stands
# in for X instead of materialising train_df.values.
fold_iter = skf.split(np.zeros(len(target)), target.values)

for fold_, (trn_idx, val_idx) in enumerate(fold_iter):
    print("fold n°{}".format(fold_))
    df_meta = pd.read_pickle(f"fold_{fold_}_meta.pickle")
    # Local names: the global train_df / test_df are no longer overwritten.
    meta_train, meta_test = split_train_test(df_meta)

    trn_data = lgb.Dataset(meta_train.iloc[trn_idx][features], label=target.iloc[trn_idx])
    val_data = lgb.Dataset(meta_train.iloc[val_idx][features], label=target.iloc[val_idx])

    clf = lgb.train(param,
                    trn_data,
                    num_round,
                    valid_sets=[trn_data, val_data],
                    verbose_eval=1000,
                    early_stopping_rounds=2000)
    oof[val_idx] = clf.predict(meta_train.iloc[val_idx][features], num_iteration=clf.best_iteration)

    fold_importances.append(pd.DataFrame({
        "feature": features,
        "importance": clf.feature_importance(),
        "fold": fold_ + 1,
    }))

    # Each fold contributes 1/N_SPLITS of the final test prediction.
    predictions += clf.predict(meta_test[features], num_iteration=clf.best_iteration) / N_SPLITS

# Concatenate once instead of growing the frame inside the loop.
feature_importance_df = pd.concat(fold_importances, axis=0)

print("CV score: {:<8.5f}".format(roc_auc_score(target, oof)))

In [None]:
# Bar chart of feature importance, fed with rows sorted by importance
# (descending); the figure is written to FI.png.
ranked = feature_importance_df.sort_values(by="importance", ascending=False)
plt.figure(figsize=(10, 80))
sns.barplot(data=ranked, x="importance", y="feature")
plt.title('Features importance (averaged/folds)')
plt.tight_layout()
plt.savefig('FI.png')

In [None]:
# Pair every test ID with its fold-averaged prediction and write the file.
sub_df = pd.DataFrame(
    {"ID_code": test_df["ID_code"].values, "target": predictions}
)
sub_df.to_csv(path_or_buf="submission.csv", index=False)

In [None]:
# Persist the per-fold feature importances, without the index column.
feature_importance_df.to_csv(path_or_buf="feature_importance_df.csv", index=False)