<img src = "https://www.santanderbank.com/us/documents/22507/2202391/DebitCardPage-Debit-Prmier-Cards_437x336.png/68eaaf59-35ec-44c7-82f8-2cf46a38d733?t=1554058009547" width="400"></img>

# Introduction
Santander is always looking for ways to help its customers understand their financial health and identify which products and services might help them achieve their monetary goals.

Problem Statement:
Is a customer satisfied? Will a customer buy this product? Can a customer pay this loan?

The dataset contains numeric feature variables, a binary target column, and a string ID_code column.
The task is to predict the value of the target column in the test set.

Data: 
1. train.csv - the training set.
2. test.csv - the test set.  
3. sample_submission.csv  

Data reference: (https://www.kaggle.com/c/santander-customer-transaction-prediction/data)

# <a id='0'>Content</a>

- <a href='#1'>1. Read the data</a>
- <a href='#2'>2. Data Understanding</a>
 - <a href='#5'>2.1 Missing values</a>
 - <a href='#6'>2.2 Statistics</a>
- <a href='#3'>3. Data Exploration</a>
 - <a href='#7'>3.1 Distribution of Train vs Test</a>
 - <a href='#8'>3.2 Distribution of Y variable</a>
 - <a href='#9'>3.3 Distribution of X variables</a>
 - <a href='#10'>3.4 Correlation</a>
 - <a href='#11'>3.5 Repeated values</a>
- <a href='#4'>4. Additional Features</a>
- <a href='#5'>5. Model</a>

In [None]:
# Input path

import os
# Walk the Kaggle input directory tree and print the full path of every file.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

In [None]:
# Import necessary libraries

import numpy as np
import pandas as pd

import seaborn as sns
from matplotlib import pyplot as plt

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, roc_curve

from sklearn.model_selection import train_test_split

import lightgbm as lgb
import skopt

# <a id='1'>1. Read the data</a>

In [None]:
# Load the competition's training and test sets (train first, then test).
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/santander-customer-transaction-prediction/train.csv',
        '/kaggle/input/santander-customer-transaction-prediction/test.csv',
    )
)

In [None]:
# Report the (rows, columns) shape of each loaded frame.
for caption, frame in (('train shape:', train), ('test shape:', test)):
    print(caption, frame.shape)

In [None]:
# Preview the first two rows of the training set.
train.head(n=2)

In [None]:
# Preview the first two rows of the test set.
test.head(n=2)

# <a id='2'>2. Data Understanding</a>

### <a id='5'>2.1 Missing values</a>

In [None]:
# Finding missing values in train and test data

def func(df):
    """Return the total number of missing (null) cells in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    int
        Number of null cells summed over every column; 0 when nothing is
        missing.
    """
    # isnull() flags each missing cell, the first sum() counts them per column
    # and the second sum() adds the per-column counts together.
    return int(df.isnull().sum().sum())

In [None]:
# Total number of missing cells in each dataset, as counted by func().
for caption, frame in (('missing values in train data:', train),
                       ('missing values in test data:', test)):
    print(caption, func(frame))

### Summary:
There are no missing values in the train or test datasets.

### <a id='6'>2.2 Statistics</a>

In [None]:
# Summary statistics of the training set (pandas describe() defaults).
train.describe()

In [None]:
# Summary statistics of the test set (pandas describe() defaults).
test.describe()

### Summary: 
The min, max, mean and standard deviation values seem to be similar for the train and test data.

# <a id='3'>3. Data Exploration</a>

### <a id='7'>3.1 Distribution of Train vs Test</a>

#### Let's try to plot Train[variables] vs Test[variables] for a few features

In [None]:
# Compare the train and test distributions of the first ten features.
features = ['var_0', 'var_1', 'var_2', 'var_3', 'var_4',
            'var_5', 'var_6', 'var_7', 'var_8', 'var_9']

# Create a bare figure: plt.subplots() would also create a full-size axes that
# stays underneath the grid of subplots added below.
fig = plt.figure(figsize=(12, 12))

for i, feature in enumerate(features, start=1):
    ax = fig.add_subplot(4, 4, i)
    # Overlaid density histograms compare the two distributions directly.
    # A row-by-row scatter would pair unrelated train/test rows and would
    # require both frames to have the same number of rows.
    ax.hist(train[feature], bins=50, density=True, alpha=0.5, label='train')
    ax.hist(test[feature], bins=50, density=True, alpha=0.5, label='test')
    ax.set_title(feature, fontsize=9)
    ax.legend(fontsize=7)

fig.tight_layout()

### <a id='8'>3.2 Distribution of Y variable</a>

In [None]:
# Y distribution in train data

# Pass the column by keyword: recent seaborn releases treat the first
# positional argument of countplot as `data`, not `x`.
sns.countplot(x=train['target'])

In [None]:
# Share of rows whose target equals 1, expressed as a percentage.
positive_count = train['target'].value_counts()[1]
positive_pct = (positive_count / len(train)) * 100
print('% of 1 in train data:', positive_pct)

### Summary: 
The data is highly imbalanced: only about 10% of the values in the target (Y) column are 1.

### <a id='9'>3.3 Distribution of X variables</a>

In [None]:
# function to generate subplots for ('X' variables vs 'Y')

def plot_feature_distribution(df1, df2, label1, label2, features):
    """Overlay per-feature density curves of two frames on a 10x10 grid.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        Frames to compare; both must contain every column in ``features``.
    label1, label2 : str
        Legend labels for the curves of ``df1`` and ``df2`` respectively.
    features : sequence of str
        Column names to plot, one subplot each (at most 100).

    Raises
    ------
    ValueError
        If more than 100 features are given (the grid has 100 cells).
    """
    n_rows, n_cols = 10, 10
    if len(features) > n_rows * n_cols:
        raise ValueError(
            'at most {} features can be plotted, got {}'.format(
                n_rows * n_cols, len(features)))

    sns.set_style('whitegrid')
    # A single subplots() call creates the whole grid; no separate
    # plt.figure() is needed (it would only leave an empty figure behind).
    _, axes = plt.subplots(n_rows, n_cols, figsize=(18, 22))

    for ax, feature in zip(axes.flat, features):
        # kdeplot draws the same density curve as the deprecated
        # distplot(..., hist=False).
        sns.kdeplot(df1[feature], label=label1, ax=ax)
        sns.kdeplot(df2[feature], label=label2, ax=ax)
        ax.set_xlabel(feature, fontsize=9)
        ax.tick_params(axis='x', which='major', labelsize=6, pad=-6)
        ax.tick_params(axis='y', which='major', labelsize=6)
        # Make the curve labels visible; kdeplot does not add a legend for
        # plain `label=` arguments in current seaborn.
        ax.legend(fontsize=6)
    plt.show()

In [None]:
# Compare class-conditional densities for a subset of the variables:
# rows with target 0 versus rows with target 1, over columns 2..101.
labels = train['target']
df1 = train[labels == 0]
df2 = train[labels == 1]
features = train.columns[2:102].values
plot_feature_distribution(df1, df2, '0', '1', features)

### Summary: 
Most of the variables seem to be approximately normally distributed.

### <a id='10'>3.4 Correlation</a>

#### Relationship between X variables

In [None]:
# Absolute pairwise correlations between the columns in `features`, one row
# per (feature, feature) pair, sorted from weakest to strongest.
correlations = train[features].corr().abs().unstack().sort_values(kind="quicksort").reset_index()

# Drop each feature's correlation with itself: it is always 1.0 and would
# otherwise fill the top of the ranking.
correlations = correlations[correlations['level_0'] != correlations['level_1']]

# Show the weakest and the strongest pairs; only the strongest ones can tell
# whether the features are effectively uncorrelated.
pd.concat([correlations.head(), correlations.tail()])

### Summary:
It is clear that the correlation between the X variables is very low.

### <a id='11'>3.5 Repeated values</a>

In [None]:
# Columns at positions 2..201 of the training frame.
features = train.columns.values[2:202]


def _most_repeated(column):
    """Return [column, count of its most frequent value, that value]."""
    counts = train[column].value_counts()
    return [column, counts.max(), counts.idxmax()]


unique_max_train = [_most_repeated(feature) for feature in features]

In [None]:
# Ten features whose most frequent value repeats the most, one per column.
repeat_table = pd.DataFrame(unique_max_train, columns=['Feature', 'Count', 'Value'])
top_repeats = repeat_table.sort_values(by='Count', ascending=False).head(10)
top_repeats.T

# <a id='4'>4. Additional Features</a>

#### As we don't have much information about the columns, let's try to create aggregated columns

In [None]:
idx = features = train.columns.values[2:202]

# (new column name, DataFrame method) pairs, applied row-wise in this order.
row_aggregates = (
    ('sum', 'sum'),
    ('min', 'min'),
    ('max', 'max'),
    ('avg', 'mean'),
    ('std', 'std'),
    ('med', 'median'),
)

for df in [test, train]:
    # Select the source columns once so that the aggregates added below never
    # feed into one another.
    base = df[idx]
    for column, method in row_aggregates:
        df[column] = getattr(base, method)(axis=1)

# <a id='5'>5. Model</a>

### LightGBM (Leaf-Wise growth)

https://lightgbm.readthedocs.io/en/latest/Features.html


In [None]:
# Preview the first two rows of the training set.
train.head(n=2)

In [None]:
# X columns: everything except the row identifier and the label.
non_feature_columns = ('ID_code', 'target')
features = [column for column in train.columns
            if column not in non_feature_columns]

# Y column
y = train.target

### Some of the parameters of 'LightGBM'

In [None]:
# LightGBM training parameters, passed unchanged to lgb.train().
params = dict(
    learning_rate=0.01,
    max_depth=-1,
    num_leaves=12,
    feature_fraction=0.1,
    subsample=0.2,
    objective='binary',
    metric='auc',
    is_unbalance=True,
    bagging_freq=5,
    boosting='gbdt',
)

In [None]:
# 5-fold stratified cross-validation of the LightGBM model.
folds = StratifiedKFold(n_splits = 5, shuffle = False)

# Out-of-fold predictions: each row is predicted by the model of the fold in
# which that row was held out.
oof = np.zeros(len(train))

# The `verbose_eval` / `early_stopping_rounds` keyword arguments of lgb.train
# were removed in LightGBM 4.0; the equivalent callbacks work on old and new
# releases. `log_evaluation` replaced `print_evaluation` in LightGBM 3.3.
log_evaluation = getattr(lgb, 'log_evaluation', None) or lgb.print_evaluation

# Split on the feature frame rather than train.values, which would build a
# large object array (it includes the string ID column) just to get indices.
for fold_, (idxT, idxV) in enumerate(folds.split(train[features], y)):
    print("Fold {}".format(fold_))

    X_train = train.iloc[idxT][features]
    y_train = y.iloc[idxT]
    X_val = train.iloc[idxV][features]
    y_val = y.iloc[idxV]

    train_data = lgb.Dataset(X_train, y_train)
    val_data = lgb.Dataset(X_val, y_val)

    clf = lgb.train(params = params,
                    train_set = train_data,
                    valid_sets = [train_data, val_data],
                    num_boost_round = 20000,
                    # Fresh callbacks per fold so that no early-stopping state
                    # is carried over from the previous fold.
                    callbacks = [lgb.early_stopping(5000),
                                 log_evaluation(1000)])

    oof[idxV] = clf.predict(X_val, num_iteration=clf.best_iteration)

# ROC AUC of the out-of-fold predictions over the whole training set.
print("CV score: {:<8.5f}".format(roc_auc_score(y, oof)))

# Conclusion
 The cross-validated ROC AUC is around 0.90 and can be further improved with the steps below:
 
 1) Tuning Hyperparameters
 
 2) Testing with other models
 
 3) Ensemble of different models