In [0]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error,roc_auc_score
from sklearn.model_selection import StratifiedKFold,train_test_split
from sklearn.preprocessing import StandardScaler
import lightgbm as lgb

## Load Data

In [0]:
# Read the train and test tables from their zipped CSV files.
train_path = '/content/train.csv.zip'
test_path = '/content/test.csv.zip'
train = pd.read_csv(train_path)
test = pd.read_csv(test_path)

In [0]:
# Separate labels from model inputs. No feature scaling is applied:
# the models below are trained on the raw feature values.
train_target = train['target']
train_features = train.drop(columns=['target', 'ID_code'])
test_features = test.drop(columns=['ID_code'])

In [0]:
# Stack the train rows on top of the test rows so the features built next
# are computed over both sets at once.
train_all = pd.concat([train_features, test_features], axis=0)

In [0]:
# Snapshot the original feature columns first: the loop below appends new
# columns to train_all, and only the original ones should be processed.
base_columns = list(train_all.columns)
for f in base_columns:
    # 1 if this row's value for feature f occurs more than once in the combined
    # train+test frame, else 0 (keep=False flags every member of a duplicate group).
    train_all[f + '_duplicate'] = train_all.duplicated(f, keep=False).astype(int)

# Per-row count of duplicated values. The flag columns are selected by name
# rather than by the positional slice 200:400, which was only correct when
# the frame held exactly 200 feature columns.
duplicate_columns = [f + '_duplicate' for f in base_columns]
train_all['count_total_all'] = train_all[duplicate_columns].sum(axis=1)

In [0]:
# Split the combined frame back into its train and test parts. The boundary is
# taken from the number of training labels instead of a hard-coded 200000, so
# the split stays correct if the input files change size (train rows were
# stacked first when train_all was built).
n_train = len(train_target)
train_features = train_all.iloc[:n_train]
test_features = train_all.iloc[n_train:]

In [0]:
def add_noise(series, noise_level):
    """Scale every element of `series` by a random factor centred on 1.

    One standard-normal sample is drawn per element from NumPy's global
    random state; each element is multiplied by ``1 + noise_level * sample``.
    With ``noise_level == 0`` the values are returned unchanged (samples are
    still drawn).
    """
    jitter = np.random.randn(len(series))
    scale = 1 + noise_level * jitter
    return series * scale
def target_encode(trn_series=None,
                  tst_series=None,
                  target=None,
                  min_samples_leaf=1,
                  smoothing=1,
                  noise_level=0):
    """Smoothed target (mean) encoding of one feature for train and test.

    Both series are rounded to one decimal place and the rounded values are
    used as categories. For each category seen in `trn_series` the encoding is
    a blend of the category's target mean and the overall target mean (the
    prior); the blend weight is a sigmoid of the category count, shifted by
    `min_samples_leaf` and scaled by `smoothing`. Values with no match in the
    training categories receive the prior.

    Parameters
    ----------
    trn_series, tst_series : pandas Series with the same name.
    target : pandas Series of labels, same length as `trn_series`.
    min_samples_leaf : count at which the category mean and the prior are
        weighted equally.
    smoothing : controls how sharply the weight moves between prior and
        category mean as the count changes.
    noise_level : multiplicative noise factor passed to `add_noise`.

    Returns
    -------
    (train_encoded, test_encoded) : two Series named ``<feature>_mean``,
        carrying the index of the corresponding input series.
    """
    assert len(trn_series) == len(target)
    assert trn_series.name == tst_series.name

    trn_series = round(trn_series, 1)
    tst_series = round(tst_series, 1)

    # Per-category mean and count of the target over the training rows.
    stats = (
        pd.concat([trn_series, target], axis=1)
        .groupby(by=trn_series.name)[target.name]
        .agg(["mean", "count"])
    )

    # Sigmoid weight: the larger the category count, the less the prior matters.
    weight = 1 / (1 + np.exp(-(stats["count"] - min_samples_leaf) / smoothing))
    prior = target.mean()
    stats[target.name] = prior * (1 - weight) + stats["mean"] * weight
    stats.drop(["mean", "count"], axis=1, inplace=True)

    # Lookup table: one row per category with its blended value in 'average'.
    lookup = stats.reset_index().rename(
        columns={'index': target.name, target.name: 'average'})

    def _encode(values):
        # Left-join the lookup onto the values; unmatched rows fall back to the prior.
        joined = pd.merge(
            values.to_frame(values.name),
            lookup,
            on=values.name,
            how='left')
        encoded = joined['average'].rename(trn_series.name + '_mean').fillna(prior)
        # pd.merge does not keep the index, so restore it
        encoded.index = values.index
        return encoded

    ft_trn_series = _encode(trn_series)
    ft_tst_series = _encode(tst_series)
    return add_noise(ft_trn_series, noise_level), add_noise(ft_tst_series, noise_level)

In [8]:
# Sanity check: shapes of the feature frames and of the label vector.
tuple(obj.shape for obj in (train_features, test_features, train_target))

((200000, 401), (200000, 401), (200000,))

In [9]:
# Number of K-fold splits
n_splits = 7

# Fix the shuffle seed: with shuffle=True and no random_state the folds (and
# every CV score computed from them) change on each kernel restart.
random_seed = 42
fold_maker = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_seed)

# Materialise the (train_idx, valid_idx) pairs once so they can be reused
# by every model trained below.
splits = list(fold_maker.split(train_features, train_target))
splits[:3]

[(array([     0,      1,      2, ..., 199996, 199998, 199999]),
  array([     6,      9,     10, ..., 199987, 199990, 199997])),
 (array([     0,      1,      2, ..., 199997, 199998, 199999]),
  array([     4,     18,     32, ..., 199972, 199978, 199985])),
 (array([     0,      1,      2, ..., 199996, 199997, 199998]),
  array([    16,     17,     20, ..., 199973, 199992, 199999]))]

In [0]:
# Keyword arguments forwarded to CatBoostClassifier in the CV loop.
cat_params = dict(
    learning_rate=0.01,
    max_depth=2,
    eval_metric='AUC',
    bootstrap_type='Bayesian',
    bagging_temperature=1,
    objective='Logloss',
    od_type='Iter',
    l2_leaf_reg=2,
    allow_writing_files=False,
)

In [11]:
from catboost import CatBoostClassifier

# Out-of-fold predictions for the training rows and fold-averaged predictions
# for the test rows.
oof_cb = np.zeros(len(train_features))
predictions_cb = np.zeros(len(test))

# Loop-invariant work: convert the frames to arrays once instead of per fold.
x_train = np.array(train_features)
y_train = np.array(train_target)
num_round = 100000

for i, (train_idx, valid_idx) in enumerate(splits):
    print(f'Fold {i + 1}')
    trn_x, trn_y = x_train[train_idx], y_train[train_idx]
    val_x, val_y = x_train[valid_idx], y_train[valid_idx]

    # num_round is only an upper bound on the number of iterations; early
    # stopping on the validation fold together with use_best_model picks the
    # iteration actually used.
    clf = CatBoostClassifier(iterations=num_round, task_type='GPU',
                             early_stopping_rounds=1000, **cat_params)
    clf.fit(trn_x, trn_y, eval_set=(val_x, val_y), cat_features=[], use_best_model=True, verbose=500)

    oof_cb[valid_idx] = clf.predict_proba(val_x)[:, 1]

    # Average over the folds actually trained. Dividing by a hard-coded 5
    # while n_splits is 7 inflated the averaged probabilities by 7/5.
    predictions_cb += clf.predict_proba(test_features)[:, 1] / len(splits)

print("CV score: {:<8.5f}".format(roc_auc_score(train_target, oof_cb)))


Fold 1
0:	learn: 0.5336323	test: 0.5321515	best: 0.5321515 (0)	total: 11.3ms	remaining: 18m 49s
500:	learn: 0.7764010	test: 0.7620753	best: 0.7620753 (500)	total: 4.66s	remaining: 15m 25s
1000:	learn: 0.8103403	test: 0.7982503	best: 0.7982503 (1000)	total: 9.11s	remaining: 15m
1500:	learn: 0.8315673	test: 0.8208628	best: 0.8208628 (1500)	total: 13.3s	remaining: 14m 32s
2000:	learn: 0.8460483	test: 0.8359794	best: 0.8359794 (2000)	total: 17.6s	remaining: 14m 24s
2500:	learn: 0.8563699	test: 0.8463387	best: 0.8463387 (2500)	total: 21.8s	remaining: 14m 11s
3000:	learn: 0.8642739	test: 0.8541797	best: 0.8541797 (3000)	total: 26.1s	remaining: 14m 4s
3500:	learn: 0.8705668	test: 0.8604226	best: 0.8604226 (3500)	total: 30.4s	remaining: 13m 57s
4000:	learn: 0.8758339	test: 0.8656713	best: 0.8656713 (4000)	total: 35s	remaining: 14m
4500:	learn: 0.8803684	test: 0.8699176	best: 0.8699176 (4500)	total: 39.5s	remaining: 13m 59s
5000:	learn: 0.8841513	test: 0.8736697	best: 0.8736697 (5000)	total: 44

In [0]:
# Parameter set passed to lgb.train in the LightGBM CV loop.
param = dict(
    bagging_freq=5,
    bagging_fraction=0.33,
    boost_from_average='false',
    boost='gbdt',
    feature_fraction=0.05,
    learning_rate=0.01,
    max_depth=-1,
    metric='auc',
    min_data_in_leaf=80,
    min_sum_hessian_in_leaf=10.0,
    num_leaves=13,
    num_threads=12,
    tree_learner='serial',
    objective='binary',
    verbosity=1,
)

In [13]:
# Out-of-fold predictions for the training rows and fold-averaged predictions
# for the test rows.
oof = np.zeros(len(train_features))
predictions = np.zeros(len(test))
feature_importance_df = pd.DataFrame()
features = [c for c in train_features.columns if c not in ['ID_code', 'target']]

# Loop-invariant work: convert the frames to arrays once instead of per fold.
x_train = np.array(train_features)
y_train = np.array(train_target)
num_round = 100000

for i, (train_idx, valid_idx) in enumerate(splits):
    print(f'Fold {i + 1}')
    trn_data = lgb.Dataset(x_train[train_idx], label=y_train[train_idx])
    val_data = lgb.Dataset(x_train[valid_idx], label=y_train[valid_idx])

    # num_round is only an upper bound; training stops once the validation
    # AUC has not improved for 3000 rounds.
    # NOTE(review): LightGBM 4.x removed the `verbose_eval` and
    # `early_stopping_rounds` arguments of lgb.train in favour of
    # callbacks=[lgb.early_stopping(3000), lgb.log_evaluation(1000)];
    # confirm the installed version before upgrading.
    clf = lgb.train(param, trn_data, num_round, valid_sets=[trn_data, val_data],
                    verbose_eval=1000, early_stopping_rounds=3000)
    oof[valid_idx] = clf.predict(x_train[valid_idx], num_iteration=clf.best_iteration)

    # Average over the folds actually trained. Dividing by a hard-coded 5
    # while n_splits is 7 inflated the averaged predictions by 7/5.
    predictions += clf.predict(test_features, num_iteration=clf.best_iteration) / len(splits)

print("CV score: {:<8.5f}".format(roc_auc_score(train_target, oof)))

Fold 1
Training until validation scores don't improve for 3000 rounds.
[1000]	training's auc: 0.903128	valid_1's auc: 0.883623
[2000]	training's auc: 0.914059	valid_1's auc: 0.891729
[3000]	training's auc: 0.921716	valid_1's auc: 0.896128
[4000]	training's auc: 0.927681	valid_1's auc: 0.898793
[5000]	training's auc: 0.932707	valid_1's auc: 0.900501
[6000]	training's auc: 0.93712	valid_1's auc: 0.901447
[7000]	training's auc: 0.941245	valid_1's auc: 0.901825
[8000]	training's auc: 0.945033	valid_1's auc: 0.902051
[9000]	training's auc: 0.948647	valid_1's auc: 0.902443
[10000]	training's auc: 0.952034	valid_1's auc: 0.902491
[11000]	training's auc: 0.955256	valid_1's auc: 0.902544
[12000]	training's auc: 0.958392	valid_1's auc: 0.902614
[13000]	training's auc: 0.961365	valid_1's auc: 0.902494
[14000]	training's auc: 0.964162	valid_1's auc: 0.902596
[15000]	training's auc: 0.966882	valid_1's auc: 0.902653
[16000]	training's auc: 0.969434	valid_1's auc: 0.90258
[17000]	training's auc: 0.97

## Ensemble the two models

In [14]:
# Equal-weight blend of the two models' out-of-fold predictions.
esemble_lgbm_cat = 0.5 * oof_cb + 0.5 * oof

# Out-of-fold AUC of each model and of the blend.
auc_lgbm = roc_auc_score(train_target, oof)
auc_cat = roc_auc_score(train_target, oof_cb)
auc_blend = roc_auc_score(train_target, esemble_lgbm_cat)

print('LightBGM auc = {:<8.5f}'.format(auc_lgbm))
print('catboost auc = {:<8.5f}'.format(auc_cat))
print('LightBGM+catboost auc = {:<8.5f}'.format(auc_blend))

LightBGM auc = 0.90368 
catboost auc = 0.91384 
LightBGM+catboost auc = 0.91033 


In [0]:
# Equal-weight blend of the two models' test-set predictions.
lgbm_share = 0.5 * predictions
cat_share = 0.5 * predictions_cb
esemble_pred_lgbm_cat = lgbm_share + cat_share

In [0]:
# Row identifiers of the test set, used as the key column of each submission.
id_code_test = test.loc[:, 'ID_code']

## Submissions

In [0]:
def _as_submission(scores):
    """Pair the test ID codes with one vector of predicted scores."""
    return pd.DataFrame({"ID_code": id_code_test, "target": scores})


# One submission table per model, plus one for the blend.
my_submission_lbgm = _as_submission(predictions)
my_submission_cat = _as_submission(predictions_cb)
my_submission_esemble_lgbm_cat = _as_submission(esemble_pred_lgbm_cat)

In [0]:
# Write each submission table to its own CSV (header row, no index column).
for frame, csv_name in (
    (my_submission_lbgm, 'submission_lbgm.csv'),
    (my_submission_cat, 'submission_cb.csv'),
    (my_submission_esemble_lgbm_cat, 'my_submission_esemble_lgbm_cat.csv'),
):
    frame.to_csv(csv_name, index=False, header=True)


### LightGBM 
* Fine-tuning the parameters moved the score from 0.898 to 0.899

### Ensemble LightGBM and catboost
* Leaderboard Scores:
  - Public: 0.90215
  - Private: 0.90083
