In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

1. Split the training data into train and validation sets. 
   Check the distribution of the dependent variable in both sets; they should align. 
   For a more robust split, the PSI (Population Stability Index) can be calculated.
2. **Baseline model.**
    Feature Engineering <br>
    Confirm linear distribution on features or transformations. (square, root, log, 1/3) <br>
    Cap values at floor of 0. Essentially making all negative values as 0 and only keeping positives as is.<br>
    Note: Missing values could mean data category and could be predictive feature separating the dependent.<br>
    When too many features and calculating correlation is not possible.<br>
    Calculate R2/ K-S for each independent and select top 5-7 by forward feature selection from top ~90.<br>
    Restrictions for features; <br>
    a. VIF < 2/3<br>
    b. Contribution of each feature (any shouldn't be more than 50%)<br>
    c. Significance<br>
    d. Rank order. - Predictions should follow rank order, if not add more features<br>
    Else: Select one feature from each cluster of correlated features.<br>
    Best feature from each cluster by best R2.
    
    
3. **Final model.**
    Feature Engineering/ Selection<br>
    Throw all the features to XGBoost or Random Forest. and reduce number of features by selecting top features, this on small sample ~100k<br>
    Variable reduction is done in two or more steps, depending on how many variables you are reducing. Feature selection is done later, based on model performance.<br>
    Perform broad grid search using small sample, 100k.<br>
    Perform narrow grid search based on best parameters from broad search.<br>
    The most important parameter is **Min Child Weight**: it helps control overfitting by keeping the trees shorter, so the model generalises better.<br>

    Measure performance on validation set (ITV)<br>
    Also on hold-out set (OTV)<br>
    If performance within 5-10% range then model is fine.<br>
    If the model underperforms on ITV, it is overfitting: rebuild it by tuning parameters — shorten tree depth, reduce the learning rate, increase min child weight.<br>
    Rank order: if the predictions do not preserve rank order, redo the model, possibly adding more variables; adding variables typically improves rank order. An alternative method is to keep only those variables in the model which do not disturb the rank order between predicted and actual values.
    
4. **Compare with linear model.**
    The final model should perform better than the linear model. The only cases where this does not hold are when all features are already linear, so the linear model explains everything and leaves no scope for XGBoost to shine, or when the boosting algorithm adds nothing new because the model is not able to learn anything from the errors.

     
     

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, roc_curve

In [None]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [None]:
import pandas as pd
# Read the train and test CSV files, then preview the first training rows.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/santander-customer-transaction-prediction/train.csv',
        '../input/santander-customer-transaction-prediction/test.csv',
    )
)
train_df.head()

In [None]:
# Row count per distinct value of the target column.
train_df['target'].value_counts()

In [None]:
# Summary statistics of train_df from describe(); stored in df_desc and displayed.
df_desc = train_df.describe()
df_desc

In [None]:
def plot_feature_scatter(df1, df2, features):
    """Scatter each feature of ``df1`` against the same feature of ``df2``.

    One panel is drawn per feature on a fixed 4x4 grid; panels that are not
    needed are hidden instead of being left as empty axes.

    Parameters
    ----------
    df1, df2 : objects indexable by feature name (e.g. DataFrames)
        ``df1[feature]`` is plotted on the x axis and ``df2[feature]`` on the
        y axis of each panel.
    features : iterable of column names
        At most 16 names (one per panel).

    Raises
    ------
    ValueError
        If more features are given than the grid has panels.
    """
    n_rows, n_cols = 4, 4
    features = list(features)
    if len(features) > n_rows * n_cols:
        raise ValueError(
            "at most {} features can be plotted, got {}".format(n_rows * n_cols, len(features))
        )

    sns.set_style('whitegrid')
    # A single subplots() call creates the figure; a separate plt.figure()
    # would only add an extra, empty figure to the output.
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(14, 14))
    panels = axes.flatten()

    for ax, feature in zip(panels, features):
        ax.scatter(df1[feature], df2[feature], marker='+')
        ax.set_xlabel(feature, fontsize=9)

    # Hide the panels that received no feature.
    for ax in panels[len(features):]:
        ax.set_visible(False)
    plt.show()

In [None]:
# First sixteen predictors, compared between every 20th train row and every 20th test row.
features = [
    'var_0', 'var_1', 'var_2', 'var_3',
    'var_4', 'var_5', 'var_6', 'var_7',
    'var_8', 'var_9', 'var_10', 'var_11',
    'var_12', 'var_13', 'var_14', 'var_15',
]
plot_feature_scatter(train_df.iloc[::20], test_df.iloc[::20], features)

In [None]:
def plot_feature_distribution(df1, df2, label1, label2, features):
    """Overlay the density of each feature for two data sets.

    One panel is drawn per feature on a fixed 10x10 grid; panels that are not
    needed are hidden.

    Parameters
    ----------
    df1, df2 : objects indexable by feature name (e.g. DataFrames)
        The two groups whose distributions are compared.
    label1, label2 : str
        Curve labels for ``df1`` and ``df2`` respectively.
    features : iterable of column names
        At most 100 names (one per panel).

    Raises
    ------
    ValueError
        If more features are given than the grid has panels.
    """
    n_rows, n_cols = 10, 10
    features = list(features)
    if len(features) > n_rows * n_cols:
        raise ValueError(
            "at most {} features can be plotted, got {}".format(n_rows * n_cols, len(features))
        )

    sns.set_style('whitegrid')
    # A single subplots() call creates the figure; a separate plt.figure()
    # would only add an extra, empty figure to the output.
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(18, 22))
    panels = axes.flatten()

    for ax, feature in zip(panels, features):
        # kdeplot draws the density curve that distplot(hist=False) drew;
        # distplot itself is deprecated in seaborn.
        sns.kdeplot(df1[feature], label=label1, ax=ax)
        sns.kdeplot(df2[feature], label=label2, ax=ax)
        ax.set_xlabel(feature, fontsize=9)
        ax.tick_params(axis='x', which='major', labelsize=6, pad=-6)
        ax.tick_params(axis='y', which='major', labelsize=6)

    # Hide the panels that received no feature.
    for ax in panels[len(features):]:
        ax.set_visible(False)
    plt.show()

In [None]:
# Split the training rows by target value (0 and 1), then compare the
# distributions of the columns at positions 2..101 between the two groups.
t0, t1 = (train_df[train_df['target'] == label] for label in (0, 1))
features = train_df.columns.values[2:102]
plot_feature_distribution(t0, t1, '0', '1', features)

In [None]:
# Same comparison of t0 vs t1 for the remaining columns (position 102 onward).
features = train_df.columns.values[102:]
plot_feature_distribution(t0, t1, '0', '1', features)

In [None]:
# Pairwise absolute correlations between ALL predictor columns (position 2
# onward). The columns are selected explicitly here rather than through the
# `features` variable, which at this point only holds the columns from
# position 102 onward and would leave the first half unchecked.
corr_features = train_df.columns[2:]
correlations = (
    train_df[corr_features]
    .corr()
    .abs()
    .unstack()
    .sort_values(kind="quicksort")
    .reset_index()
)
# Drop the self-correlations (feature paired with itself).
correlations = correlations[correlations['level_0'] != correlations['level_1']]
# Sorted ascending, so the head shows the weakest correlations.
correlations.head(10)

In [None]:
# Last 10 rows of `correlations`; the frame was sorted ascending, so these are the largest values.
correlations.tail(10)

The maximum correlation we see is 0.009, which is not significant. Hence we assume there is no correlation between the independent variables.

In [None]:
## Calculating VIF
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant

try:
    # Reuse the cached table when present.
    # NOTE(review): a VIF.csv written before the intercept was added below
    # holds the old (uncentered) values -- delete it to force a recompute.
    vif_data = pd.read_csv('VIF.csv')
except FileNotFoundError:
    features = train_df.columns[2:]
    # variance_inflation_factor regresses one column on the others exactly as
    # given and does not add an intercept, so a constant column is prepended
    # here; without it the VIF of columns with a non-zero mean is inflated.
    # The design matrix is built once instead of once per feature.
    exog = add_constant(train_df[features].to_numpy(), prepend=True, has_constant='add')
    vif_data = pd.DataFrame({
        "feature": features,
        # Column 0 is the constant, so feature i sits at column i + 1.
        "VIF": [variance_inflation_factor(exog, col) for col in range(1, exog.shape[1])],
    })

    vif_data.to_csv('VIF.csv', index = False)

In [None]:
# Order the VIF table from highest to lowest and print the ten largest.
vif_data = vif_data.sort_values(by='VIF', ascending=False)
print(vif_data.head(10))

In [None]:
# Number of features whose VIF exceeds 2.
print(int((vif_data['VIF'] > 2).sum()))

In [None]:
# Rows of the VIF table whose VIF exceeds 2.
vif_data[vif_data['VIF'] > 2]

# Building logistic regression model against each variable and calculating ks value to rank best variables

### Scaling data

In [None]:
from sklearn import preprocessing

# Fit a StandardScaler on the columns from position 2 onward and transform them.
features = train_df.columns[2:]
scaler = preprocessing.StandardScaler()
scaler.fit(train_df[features])
X_scaled = scaler.transform(train_df[features])

## Same layout as train_df: ID_code, target, then the scaled feature columns
train_df_scaled = pd.concat(
    [
        train_df[['ID_code', 'target']],
        pd.DataFrame(X_scaled, columns=features, index=train_df.index),
    ],
    axis=1,
)
train_df_scaled.head()

In [None]:
from sklearn.linear_model import LogisticRegression
from scipy.stats import ks_2samp


features = train_df.columns[2:]
y = train_df_scaled['target']
# Row masks for the two target classes, computed once for all features.
non_event = (y == 0).to_numpy()
event = (y == 1).to_numpy()

kss = []
for feature in features:
    # Single-column design matrix of shape (n_rows, 1).
    X = train_df_scaled[[feature]].to_numpy()
    clf = LogisticRegression(random_state=0).fit(X, y)
    p_event = clf.predict_proba(X)[:, 1]
    # KS statistic between the predicted probabilities of the two classes.
    kss.append(ks_2samp(p_event[non_event], p_event[event]).statistic)

ks_df = pd.DataFrame({'feature': features, 'ks': kss}).sort_values(by='ks', ascending=False)
print(ks_df.head())

Now that we have the most important features for logistic regression, the next step is to build the logistic regression by forward selection — adding features one at a time while tracking the KS metric — and to stop when the metric stops improving.

Features requirements for logistic regression.
1. No feature should contribute more than 50%
2. Not correlated, VIF < 2-3

In [None]:
## Top 10 features
# First ten feature names in ks_df's current row order.
ks_df['feature'].head(10).tolist()

In [None]:
## Checking KS using all variables
## Scikit Learn Logistic Regression

X = train_df_scaled[features]
y = train_df_scaled['target']
# Fit and score on the same object. Fitting on the bare `X_scaled` array and
# predicting on the DataFrame `X` mixes inputs with and without column names,
# and depends on `X_scaled` still holding the full scaled matrix.
clf = LogisticRegression(random_state=0).fit(X, y)
probs = clf.predict_proba(X)
res = pd.DataFrame({'p': probs[:,1], 'y' : y})

# KS statistic between the predicted probabilities of the two classes.
ks = ks_2samp(res.loc[res.y==0,"p"], res.loc[res.y==1,"p"]).statistic
print('KS using Scikit and all features: ',ks)

#print(res.head())

In [None]:
import statsmodels.api as sm

## statsmodels Logistic Regression
X = train_df_scaled[features]
y = train_df_scaled['target']
# sm.Logit does not add an intercept by itself. One is added explicitly so
# this fit is comparable with the scikit-learn model (which fits an intercept
# by default) and with the forward-addition loop, which uses add_constant.
X_const = sm.add_constant(X, has_constant='add')
log_reg = sm.Logit(y, X_const).fit()

probs = log_reg.predict(X_const)
res = pd.DataFrame({'p': probs, 'y' : y})
# KS statistic between the predicted probabilities of the two classes.
ks = ks_2samp(res.loc[res.y==0,"p"], res.loc[res.y==1,"p"]).statistic
print('KS using StatsModels and all features: ',ks)

In [None]:
#log_reg.summary()

In [None]:
# Join VIF and KS per feature, keep the features with VIF below 3 and rank
# them by KS (best first). sort_values returns a new frame, so its result has
# to be assigned -- otherwise the rows stay in vif_data's order and the
# "top" features taken from this table are not the highest-KS ones.
ks_vif_features = (
    vif_data.merge(ks_df, on = 'feature')
    .loc[lambda merged: merged.VIF < 3]
    .sort_values(by = 'ks', ascending = False)
)
ks_vif_features.head(10)

In [None]:
from statsmodels.tools.tools import add_constant
n = 10

# Forward addition: fit a Logit on the first 1, 2, ..., n features of
# ks_vif_features (in its current row order) and report KS at each step.
# After the loop, `topn`, `log_reg`, `probs` and `res` describe the last fit.
y = train_df['target']
ranked_features = ks_vif_features.feature.to_list()

for i in range(min(n, len(ranked_features))):
    topn = ranked_features[:i+1]
    # A dedicated name is used for the design matrix so the global `X_scaled`
    # (the full scaled feature array) is not overwritten by this loop.
    X_topn = add_constant(train_df_scaled[topn])

    log_reg = sm.Logit(y, X_topn).fit()
    probs = log_reg.predict(X_topn)

    res = pd.DataFrame({'p': probs, 'y' : y})
    ks = ks_2samp(res.loc[res.y==0,"p"], res.loc[res.y==1,"p"]).statistic

    print(f'{i} {topn} ks: {ks}')
    print('VIF: \n',vif_data.loc[vif_data.feature.isin(topn)])

In [None]:
# Add the log-odds of the predicted probabilities and the train_df columns
# named in topn to the result frame.
res = res.assign(
    log_odds=np.log(probs / (1 - probs)),
    **{column: train_df[column] for column in topn},
)

## Checking for linearity with log-odds 

In [None]:
# One scatter panel per feature in topn: log-odds on x, feature value on y.
fig, axes = plt.subplots(nrows=4, ncols=3)
fig.set_figheight(20)
fig.set_figwidth(20)
panels = axes.flatten()

# Draw on the axes returned by subplots() rather than re-selecting them
# through plt.subplot().
for ax, feature in zip(panels, topn):
    ax.scatter(x = res['log_odds'], y = res[feature])
    ax.set_title(feature)
    ax.set_xlabel('log_odds')

# Hide the panels that received no feature.
for ax in panels[len(topn):]:
    ax.set_visible(False)

In [None]:

## Plotting multiple plots same figure
# Two stacked panels: axL is the first (top) axes, axR the second (bottom).
fig, (axL, axR) = plt.subplots(2, figsize=(15, 15))
plt.suptitle("Logistic Regression Residual Plots \n using Seaborn Lowess line")


# Deviance Residuals
# x and y are passed by keyword: newer seaborn releases no longer accept them
# positionally, while the keyword form works on old and new versions alike.
sns.regplot(x=log_reg.fittedvalues, y=log_reg.resid_dev, ax= axL,
            color="black", scatter_kws={"s": 5},
            line_kws={"color":"b", "alpha":1, "lw":2}, lowess=True)

axL.set_title("Deviance Residuals \n against Fitted Values")
axL.set_xlabel("Linear Predictor Values")
axL.set_ylabel("Deviance Residuals")

# Studentized Pearson Residuals
# NOTE(review): the values plotted are log_reg.resid_pearson; confirm that the
# "Studentized" wording in the labels below is intended.
sns.regplot(x=log_reg.fittedvalues, y=log_reg.resid_pearson, ax= axR,
            color="black", scatter_kws={"s": 5},
            line_kws={"color":"g", "alpha":1, "lw":2}, lowess=True)

axR.set_title("Studentized Pearson Residuals \n against Fitted Values")
axR.set_xlabel("Linear Predictor Values")
axR.set_ylabel("Studentized Pearson Residuals")

plt.show()

In [None]:

# For every column at positions 2..201, record [feature, count of its most
# frequent value, that value] -- once for train_df and once for test_df.
features = train_df.columns.values[2:202]
unique_max_train, unique_max_test = [], []
for frame, summary in ((train_df, unique_max_train), (test_df, unique_max_test)):
    for feature in features:
        counts = frame[feature].value_counts()
        summary.append([feature, counts.max(), counts.idxmax()])

In [None]:
# Fifteen train features with the highest duplicate count, shown transposed.
(
    pd.DataFrame(unique_max_train, columns=['Feature', 'Max duplicates', 'Value'])
    .sort_values(by='Max duplicates', ascending=False)
    .head(15)
    .T
)

In [None]:
# Fifteen test features with the highest duplicate count, shown transposed.
(
    pd.DataFrame(unique_max_test, columns=['Feature', 'Max duplicates', 'Value'])
    .sort_values(by='Max duplicates', ascending=False)
    .head(15)
    .T
)

In [None]:
# Model inputs: every column except the identifier and the label; the label is kept separately.
features = train_df.columns.drop(['ID_code', 'target'], errors='ignore').tolist()
target = train_df.loc[:, 'target']

In [None]:
# Parameter dictionary handed to lgb.train in the cross-validation cells.
param = dict(
    bagging_freq=5,
    bagging_fraction=0.4,
    boost_from_average='false',
    boost='gbdt',
    feature_fraction=0.05,
    learning_rate=0.01,
    max_depth=-1,
    metric='auc',
    min_data_in_leaf=80,
    min_sum_hessian_in_leaf=10.0,
    num_leaves=13,
    num_threads=8,
    tree_learner='serial',
    objective='binary',
    verbosity=1,
)

In [None]:
# 5-fold stratified cross-validation on all features.
# oof holds the out-of-fold prediction for every training row; predictions is
# the average of the per-fold test predictions.
folds = StratifiedKFold(n_splits=5)
oof = np.zeros(len(train_df))
predictions = np.zeros(len(test_df))
# Per-fold importance tables are collected in a list and concatenated once
# after the loop, instead of growing a frame from an empty DataFrame.
fold_importances = []

# Select the model columns once; the splitter only needs the row count and
# the labels, so the whole frame is not converted to an array.
X_all = train_df[features]

for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_all, target)):
    print("Fold {}".format(fold_))
    X_trn, X_val = X_all.iloc[trn_idx], X_all.iloc[val_idx]
    trn_data = lgb.Dataset(X_trn, label=target.iloc[trn_idx])
    val_data = lgb.Dataset(X_val, label=target.iloc[val_idx])

    num_round = 1000000
    # NOTE(review): verbose_eval / early_stopping_rounds are keyword arguments
    # of lgb.train in older LightGBM releases; confirm the installed version
    # still accepts them (newer releases use callbacks instead).
    clf = lgb.train(param, trn_data, num_round, valid_sets = [trn_data, val_data], verbose_eval=1000, early_stopping_rounds = 3000)
    oof[val_idx] = clf.predict(X_val, num_iteration=clf.best_iteration)

    fold_importances.append(pd.DataFrame({
        "Feature": features,
        "importance": clf.feature_importance(),
        "fold": fold_ + 1,
    }))

    predictions += clf.predict(test_df[features], num_iteration=clf.best_iteration) / folds.n_splits

feature_importance_df = pd.concat(fold_importances, axis=0)

print("CV score: {:<8.5f}".format(roc_auc_score(target, oof)))

In [None]:
# Mean importance per feature across folds, ranked from highest to lowest.
mean_importance = feature_importance_df[["Feature", "importance"]].groupby("Feature").mean()
ranked_importance = mean_importance.sort_values(by="importance", ascending=False)
# Names of the first 150 ranked features, and their per-fold importance rows.
cols = ranked_importance.index[:150]
best_features = feature_importance_df[feature_importance_df["Feature"].isin(cols)]

plt.figure(figsize=(14,28))
sns.barplot(
    data=best_features.sort_values(by="importance", ascending=False),
    x="importance",
    y="Feature",
)
plt.title('Features importance (averaged/folds)')
plt.tight_layout()

**Using only best 100 features to see the performance**

In [None]:
# 5-fold stratified cross-validation restricted to the best features.
#
# `best_features` is a table of per-fold importance rows (columns Feature /
# importance / fold), not a list of column names, so it cannot be used to
# select columns of train_df. The column names are derived from it here:
# the N_BEST_FEATURES features with the highest mean importance.
N_BEST_FEATURES = 100
selected_features = (
    best_features.groupby("Feature")["importance"]
    .mean()
    .sort_values(ascending=False)
    .head(N_BEST_FEATURES)
    .index
    .tolist()
)

folds = StratifiedKFold(n_splits=5)
oof = np.zeros(len(train_df))
predictions = np.zeros(len(test_df))
# Per-fold importance tables are collected in a list and concatenated once
# after the loop.
fold_importances = []

X_best = train_df[selected_features]

for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_best, target)):
    print("Fold {}".format(fold_))
    X_trn, X_val = X_best.iloc[trn_idx], X_best.iloc[val_idx]
    trn_data = lgb.Dataset(X_trn, label=target.iloc[trn_idx])
    val_data = lgb.Dataset(X_val, label=target.iloc[val_idx])

    num_round = 1000000
    # NOTE(review): verbose_eval / early_stopping_rounds are keyword arguments
    # of lgb.train in older LightGBM releases; confirm the installed version
    # still accepts them (newer releases use callbacks instead).
    clf = lgb.train(param, trn_data, num_round, valid_sets = [trn_data, val_data], verbose_eval=1000, early_stopping_rounds = 3000)
    oof[val_idx] = clf.predict(X_val, num_iteration=clf.best_iteration)

    fold_importances.append(pd.DataFrame({
        "Feature": selected_features,
        "importance": clf.feature_importance(),
        "fold": fold_ + 1,
    }))

    predictions += clf.predict(test_df[selected_features], num_iteration=clf.best_iteration) / folds.n_splits

feature_importance_df = pd.concat(fold_importances, axis=0)

print("CV score: {:<8.5f}".format(roc_auc_score(target, oof)))