## Preliminaries

In [None]:
!pip install scorecardpy

In [None]:
import numpy as np
import pandas as pd
from scipy.special import logit
import lightgbm as lgb
import scorecardpy as sc

## Data Exploration

### Load the data

In [None]:
# Read the two competition files. The training frame drops its ID_code
# identifier column right away; the test frame keeps it for now because the
# identifiers are still needed further down.
test = pd.read_csv("../input/santander-customer-transaction-prediction/test.csv")
train = pd.read_csv("../input/santander-customer-transaction-prediction/train.csv").drop(columns = 'ID_code')
train.head()

In [None]:
# Keep the test identifiers for the submission file, then remove them from the
# feature frame. The guard makes this cell safe to re-run on its own: once
# ID_code has been dropped, `test_id` from the first run is simply kept instead
# of raising AttributeError.
if 'ID_code' in test.columns:
    test_id = test['ID_code']
    test = test.drop(columns = 'ID_code')
test.head()

### Check for missing values

In [None]:
# Total number of NaN cells in each frame (column-wise counts, then summed).
n_missing_train = train.isna().sum().sum()
n_missing_test = test.isna().sum().sum()

print(f"The number of missing values in the training set is: {n_missing_train}")
print(f"The number of missing values in the test set is: {n_missing_test}")

### Obtain the correlations between different variables/response

In [None]:
# Absolute pairwise Pearson correlations between the predictors, reshaped to
# long format (columns `level_0`, `level_1` and `0` for the value) and sorted
# in ascending order.
correlations = (
    train.drop(columns = "target")
    .corr()
    .abs()
    .unstack()
    .sort_values(kind = "quicksort")
    .reset_index()
)
# Keep each unordered pair exactly once: the strict `<` drops the self-pairs
# (correlation 1 by definition) as well as the mirrored duplicate (b, a) of
# every (a, b), which would otherwise fill half of any head()/tail() listing.
correlations = correlations[correlations['level_0'] < correlations['level_1']]
correlations.head(10)

In [None]:
# Last ten rows of the ascending-sorted frame, i.e. the largest absolute correlations.
correlations.iloc[-10:]

The pairwise correlations between the predictors are all very small. What about the correlations between each predictor and the target?

In [None]:
# Pearson correlation between every predictor and the target.
# `variables` is the list of predictor column names; `corr_pre_res` is a 1-D
# float array aligned with it (entry k belongs to variables[k]).
variables = train.drop(columns = "target").columns.values.tolist()
corr_pre_res = np.array(
    [np.corrcoef(train[var], train["target"])[0, 1] for var in variables],
    dtype = float,
)

In [None]:
# Absolute predictor/target correlations as a one-column table, sorted
# ascending. Rows are labelled with the variable names so the sorted output
# shows which predictor each value belongs to (a bare 0..N-1 index does not).
corr_pre_res = pd.DataFrame(corr_pre_res, index = variables, columns = ['corr_pre_res']).abs()
corr_pre_res.sort_values(by = 'corr_pre_res')

The correlations between the target and the individual variables are all small as well, so correlation alone gives no grounds for dropping any variable.

## Feature Exploration (by WOE & IV)

In [None]:
# WOE binning of every predictor against the target, followed by the per-variable plots.
# NOTE(review): the parameter descriptions below follow the scorecardpy
# documentation; `min_perc_fine_bin`, `min_perc_coarse_bin` and `max_num_bin`
# appear to be legacy aliases there - confirm against the installed version.
woebin_options = {
    'min_perc_fine_bin': 0.05,    # minimum share of observations in each initial (fine) bin
    'min_perc_coarse_bin': 0.05,  # minimum share of observations in each final bin
    'stop_limit': 0.1,            # stop splitting once the IV gain ratio drops below this
    'max_num_bin': 8,             # maximum number of bins per variable
    'method': 'tree',
}
bins = sc.woebin(train, y = 'target', **woebin_options)

sc.woebin_plot(bins)

### The original predictors are not useful enough for prediction on their own (none of the IVs is larger than 0.1). Consider constructing some new ones for better prediction.

## Feature Engineering & Model Building (refer to Dott)
[922 in 3 minutes](https://www.kaggle.com/dott1718/922-in-3-minutes/comments)

#### Since the variables are independent, each one is paired with the frequency of its values to form a two-column predictor, and an LGBM model is fitted on it to predict the probability of purchasing. The logit of each predicted probability is then computed, and the logits from all 200 per-variable models are summed to give the final result.

In [None]:
features = [x for x in train.columns if x.startswith("var")]

# For every test row and feature, flag whether that value occurs more than once
# in train + test combined. Built in a single DataFrame construction instead of
# 200 column insertions, and with pd.concat because Series.append was removed
# in pandas 2.0.
hist_df = pd.DataFrame({
    var: test[var].map(pd.concat([train[var], test[var]]).value_counts()) > 1
    for var in features
})

# Test rows that hold at least one value which is unique across train + test.
# NOTE(review): the referenced kernel treats exactly these rows as the genuine
# (non-synthetic) test samples - confirm against that kernel.
ind = hist_df.sum(axis = 1) != len(features)

# Per-feature value frequencies, counted over train plus the selected test rows only.
var_stats = {var: pd.concat([train[var], test.loc[ind, var]]).value_counts() for var in features}

# Hyper-parameters shared by all per-feature models (loop-invariant).
lgb_params = {'learning_rate': 0.05,
              'max_bin': 165,
              'max_depth': 5,
              'min_child_samples': 150,
              'min_child_weight': 0.1,
              'min_split_gain': 0.0018,
              'n_estimators': 41,
              'num_leaves': 6,
              'reg_alpha': 2.0,
              'reg_lambda': 2.54,
              'objective': 'binary',
              'n_jobs': -1}


def _value_with_count(values, counts):
    """Return an (n, 2) array holding the raw values and their frequency.

    `values` is one feature column and `counts` the matching value-count
    Series; values absent from `counts` get NaN in the second column.
    """
    return np.hstack([values.values.reshape(-1, 1),
                      values.map(counts).values.reshape(-1, 1)])


# One small model per feature on (value, frequency); the final score is the sum
# of the per-feature logits, i.e. a ranking score rather than a probability.
pred = 0
for var in features:
    model = lgb.LGBMClassifier(**lgb_params)
    model = model.fit(_value_with_count(train[var], var_stats[var]), train["target"].values)
    pred += logit(model.predict_proba(_value_with_count(test[var], var_stats[var]))[:, 1])

pd.DataFrame({"ID_code": test_id, "target": pred}).to_csv("submission.csv", index = False)