# Santander Customer Transaction Prediction - Ensemble
## Ensemble of Random Forest, GBM, XGBoost and LightGBM

In this Kaggle competition, the objective is to predict which customers will make a transaction in the future.

**Link to the competition**: https://www.kaggle.com/c/santander-customer-transaction-prediction/  
**Type of Problem**: Classification  
**Metric for evaluation**: AUC (Area Under the ROC Curve)

This Python 3 environment comes with many helpful analytics libraries installed.
It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python

In [None]:
import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
import lightgbm
import xgboost
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

import matplotlib.pyplot as plt

In [None]:
# The competition data is mounted read-only under "../input/".
# Running this cell (Run button or Shift+Enter) prints the full path of every file found there.

import os

# Walk the input tree lazily and yield one joined path per file.
input_files = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

# Up to 20GB written to the current directory (/kaggle/working/) is preserved as output
# when a version is created with "Save & Run All".
# Temporary files may go to /kaggle/temp/, but they are not kept beyond the current session.

## 1. Read Train CSV

In [None]:
# Folder that holds the competition CSV files inside the Kaggle environment.
input_dir = '/kaggle/input/santander-customer-transaction-prediction/'

# Load the training set; the trailing expression displays it in the notebook.
train_csv_path = input_dir + 'train.csv'
df_train = pd.read_csv(train_csv_path)
df_train

In [None]:
# Every column except the row identifier and the label is used as a feature.
non_feature_columns = ('ID_code', 'target')
var_columns = [c for c in df_train.columns if c not in non_feature_columns]

X = df_train.loc[:, var_columns]
y = df_train.loc[:, 'target']

# Hold out 20% of the rows for validation.
# - random_state pins the shuffle so the reported AUCs are reproducible between runs.
# - stratify=y keeps the class ratio of `target` the same in both partitions.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Displayed in the notebook as a sanity check of the partition sizes.
X_train.shape, X_valid.shape, y_train.shape, y_valid.shape

## 3. Create Models
#### 3.a Random Forest

In [None]:
# Random Forest on the raw features, with class weights balanced against the label frequencies.
# - n_jobs=-1 fits the trees in parallel on all available cores (same result, less wall time).
# - random_state fixes the bootstrap/feature sampling so the fitted forest is reproducible.
model_rf = RandomForestClassifier(
    class_weight='balanced',
    criterion='gini',
    max_depth=55,
    max_features='log2',
    min_samples_leaf=0.005,
    min_samples_split=0.005,
    n_estimators=190,
    n_jobs=-1,
    random_state=42,
)

model_rf.fit(X_train, y_train)

#### 3.b GBM

In [None]:
# Gradient boosting with a generous tree budget; validation_fraction together with
# n_iter_no_change lets scikit-learn stop early on an internal hold-out of the training data.
# random_state makes the row subsampling and that internal hold-out reproducible.
model_gbm = GradientBoostingClassifier(
    n_estimators=5000,
    learning_rate=0.05,
    max_depth=3,
    subsample=0.5,
    validation_fraction=0.1,
    n_iter_no_change=20,
    max_features='log2',
    random_state=42,
    verbose=0,
)
model_gbm.fit(X_train, y_train)

#### 3.c LightGBM

In [None]:
# Wrap both partitions as LightGBM datasets; the validation set references the
# training set so both are constructed against the same training data.
lgbm_train_data = lightgbm.Dataset(X_train, label=y_train)
lgbm_valid_data = lightgbm.Dataset(X_valid, label=y_valid, reference=lgbm_train_data)

parameters = {'objective': 'binary',
              'metric': 'auc',
              'is_unbalance': 'true',
              'boosting': 'gbdt',
              'num_leaves': 63,
              'feature_fraction': 0.5,
              'bagging_fraction': 0.5,
              'bagging_freq': 20,
              'learning_rate': 0.01,
              'verbose': 0
             }

# Early stopping is requested through the callback API: the `early_stopping_rounds`
# keyword of lightgbm.train() was removed in LightGBM 4.0, while the callback is
# accepted by both older and current releases.
model_lgbm = lightgbm.train(parameters,
                            lgbm_train_data,
                            num_boost_round=5000,
                            valid_sets=[lgbm_valid_data],
                            callbacks=[lightgbm.early_stopping(stopping_rounds=50)])

#### 3.d XGBoost

In [None]:
# XGBoost classifier with shallow trees and AUC-based early stopping.
# NOTE: early_stopping_rounds is configured on the estimator (xgboost >= 1.6);
# passing it to fit() was deprecated in 1.6 and removed in 2.0.
# The obsolete `use_label_encoder` flag is no longer passed: the label encoder it
# controlled was removed from the scikit-learn wrapper.
model_xgboost = xgboost.XGBClassifier(learning_rate=0.05,
                                      max_depth=2,
                                      n_estimators=5000,
                                      subsample=0.5,
                                      colsample_bytree=0.25,
                                      eval_metric='auc',
                                      early_stopping_rounds=20,
                                      verbosity=0)

# Validation partition monitored for early stopping.
eval_set = [(X_valid, y_valid)]

model_xgboost.fit(X_train,
                  y_train,
                  eval_set=eval_set,
                  verbose=False)

## 4. Combine scores

In [None]:
# Score both partitions with each model. The scikit-learn style estimators expose
# predict_proba (column 1 is taken); the LightGBM booster is scored through predict().
y_train_pred_rf, y_valid_pred_rf = (
    model_rf.predict_proba(part)[:, 1] for part in (X_train, X_valid)
)
y_train_pred_gbm, y_valid_pred_gbm = (
    model_gbm.predict_proba(part)[:, 1] for part in (X_train, X_valid)
)
y_train_pred_lgbm, y_valid_pred_lgbm = (
    model_lgbm.predict(part) for part in (X_train, X_valid)
)
y_train_pred_xgboost, y_valid_pred_xgboost = (
    model_xgboost.predict_proba(part)[:, 1] for part in (X_train, X_valid)
)

# Ensemble score: unweighted average of the four models, row by row.
y_train_pred_all = np.stack(
    [y_train_pred_rf, y_train_pred_gbm, y_train_pred_lgbm, y_train_pred_xgboost]
).mean(axis=0)
y_valid_pred_all = np.stack(
    [y_valid_pred_rf, y_valid_pred_gbm, y_valid_pred_lgbm, y_valid_pred_xgboost]
).mean(axis=0)

In [None]:
# One row per model plus the averaged ensemble, in a fixed order shared by all columns.
technique_names = ["Random Forest", "GBM", "LightGBM", "XGBoost", "All"]

train_predictions = [y_train_pred_rf,
                     y_train_pred_gbm,
                     y_train_pred_lgbm,
                     y_train_pred_xgboost,
                     y_train_pred_all]
valid_predictions = [y_valid_pred_rf,
                     y_valid_pred_gbm,
                     y_valid_pred_lgbm,
                     y_valid_pred_xgboost,
                     y_valid_pred_all]

# ROC AUC of every prediction vector against the labels of its own partition.
train_auc_list = [roc_auc_score(y_train, scores) for scores in train_predictions]
valid_auc_list = [roc_auc_score(y_valid, scores) for scores in valid_predictions]

# Comparison table, rendered as the cell output.
pd.DataFrame({"Technique": technique_names,
              "Train_AUC": train_auc_list,
              "Valid_AUC": valid_auc_list})

## 5. Score the test data
First, let us load test.csv and the sample submission file.


In [None]:
# Load the test features and the submission template from the competition folder.
df_test, df_sample_submissions = (
    pd.read_csv(input_dir + csv_name)
    for csv_name in ('test.csv', 'sample_submission.csv')
)

# Show both shapes as the cell output.
df_test.shape, df_sample_submissions.shape

In [None]:
# Select the same feature columns that were used for training.
X_test = df_test.loc[:, var_columns]

# Per-model scores for the test rows.
y_test_pred_rf = model_rf.predict_proba(X_test)[:, 1]
y_test_pred_gbm = model_gbm.predict_proba(X_test)[:, 1]
y_test_pred_lgbm = model_lgbm.predict(X_test)
y_test_pred_xgboost = model_xgboost.predict_proba(X_test)[:, 1]

# The submitted target is the unweighted average of the four model scores.
test_score_matrix = np.stack(
    [y_test_pred_rf, y_test_pred_gbm, y_test_pred_lgbm, y_test_pred_xgboost]
)
df_sample_submissions['target'] = test_score_matrix.mean(axis=0)
df_sample_submissions

In [None]:
# Write the submission file, without the index column, into the Kaggle working directory.
output_dir = '/kaggle/working/'
submission_path = output_dir + "07_ensemble_scores.csv"
df_sample_submissions.to_csv(submission_path, index=False)