---
# MODELING
---

Feature engineering notebook: [Feature Engineering - Customer Transaction Santander](https://www.kaggle.com/jamesngoa/santander-ctp-feature-engineering)

EDA notebook: [EDA - Customer Transaction Santander](https://www.kaggle.com/jamesngoa/eda-customer-transaction)

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
import xgboost as xgb
import lightgbm as lgb

In [None]:
# Load the prepared train and test tables.
train = pd.read_csv('../input/ct-datasets/final_train (1).csv')
test = pd.read_csv('../input/ct-datasets/final_test (1).csv')

# Split the label from the features. The label is dropped by name (not by
# position) so it can never end up among the model inputs if the column order
# of the CSV changes.
y = train['target']
train = train.drop('target', axis=1)

# ID-only frame built from fake_test.csv with a constant target of 0.
# NOTE(review): not referenced again in the visible part of this notebook.
remaining_test = pd.read_csv('../input/test-ctr/fake_test.csv')[['ID_code']]
remaining_test['target'] = 0

# Keep the test IDs for the submission file and remove them from the
# model inputs.
test_index = test.ID_code
test = test.drop('ID_code', axis=1)

## Training the LGBM model

In [None]:
# LightGBM parameters passed unchanged to lgb.train for every CV fold.
lgb_params = dict(
    # Task and evaluation metric.
    objective="binary",
    metric="auc",
    boosting='gbdt',
    # Tree shape.
    # NOTE(review): max_depth=1 together with num_leaves=13 looks
    # contradictory (a depth-1 tree cannot have 13 leaves) - confirm which
    # of the two is intended.
    max_depth=1,
    num_leaves=13,
    learning_rate=0.03,
    # Row / column subsampling.
    bagging_freq=5,
    bagging_fraction=0.4,
    feature_fraction=0.05,
    # Leaf-level regularisation.
    min_data_in_leaf=80,
    min_sum_hessian_in_leaf=10,
    tree_learner="serial",
    # Kept as the string "false", exactly as in the original configuration.
    boost_from_average="false",
    # Reproducibility and logging.
    bagging_seed=42,
    verbosity=1,
    seed=42,
)

In [None]:
# 5-fold stratified cross-validation.
#   oof         - out-of-fold prediction for every training row (used for the
#                 CV score printed at the end)
#   predictions - test predictions averaged over the fold models
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=44000)
oof = np.zeros(len(train))
predictions = np.zeros(len(test))

# Upper bound on boosting rounds; early stopping picks the actual number.
num_round = 15000

# The `verbose_eval=` / `early_stopping_rounds=` keyword arguments of
# lgb.train were removed in LightGBM 4.0, so the same behaviour is requested
# through callbacks instead. Older releases expose the logging callback as
# `print_evaluation`, hence the fallback.
log_evaluation = getattr(lgb, "log_evaluation", None) or lgb.print_evaluation

for fold_, (trn_idx, val_idx) in enumerate(folds.split(train, y)):
    print("Fold {}".format(fold_))
    trn_data = lgb.Dataset(train.iloc[trn_idx], label=y.iloc[trn_idx])
    val_data = lgb.Dataset(train.iloc[val_idx], label=y.iloc[val_idx])

    # Callbacks are created per fold so no early-stopping state is carried
    # over from one fold's training run to the next.
    clf = lgb.train(
        lgb_params,
        trn_data,
        num_round,
        valid_sets=[trn_data, val_data],
        callbacks=[
            lgb.early_stopping(stopping_rounds=200),
            log_evaluation(period=1000),
        ],
    )

    # Score the held-out rows with the best iteration found by early stopping.
    oof[val_idx] = clf.predict(train.iloc[val_idx], num_iteration=clf.best_iteration)

    # Each fold contributes an equal share to the final test prediction.
    predictions += clf.predict(test, num_iteration=clf.best_iteration) / folds.n_splits

print("CV score: {:<8.5f}".format(roc_auc_score(y, oof)))

## Submission

In [None]:
# Load the competition's sample submission file; it is merged on ID_code
# with the model predictions further down.
sub_file = pd.read_csv('/kaggle/input/santander-customer-transaction-prediction/sample_submission.csv')

In [None]:
# Our predictions for the real data
sub = pd.DataFrame({'ID_code': test_index, 'target': predictions})

# Left-join the predictions onto the IDs of the sample submission so the
# output has one row per ID in that file. Only the ID column of `sub_file` is
# used, which avoids depending on pandas' automatic `_x` / `_y` suffixes for
# the clashing `target` column.
final_sub = pd.merge(sub_file[['ID_code']], sub, on='ID_code', how='left')

# IDs without a prediction in `sub` get a target of 0.
final_sub['target'] = final_sub['target'].fillna(0)
final_sub

In [None]:
# Write the submission to Finalsub.csv, leaving out the DataFrame index.
final_sub.to_csv('Finalsub.csv', index=False)