In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
sns.set()

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and print the full path of every file found there.
input_files = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for file_path in input_files:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Training split of the Santander competition data, read into a DataFrame.
path = '../input/santander-customer-transaction-prediction/train.csv'
df = pd.read_csv(filepath_or_buffer = path)

In [None]:
# Report the size of the training frame.
n_rows, n_cols = df.shape
print("Data has {} rows, {} columns".format(n_rows, n_cols))

In [None]:
# Preview the first rows of the training frame.
df.head()

In [None]:
# Count missing cells: isnull().sum() gives the per-column count and the second
# sum() adds the columns together. (any().sum() would instead count the number
# of columns that contain a null, which is not what the message reports.)
print("Data has {} null values".format(df.isnull().sum().sum()))

The training data contains no null values.

Distribution of values in the Target column

In [None]:
# Class counts are computed once and reused for both the wedge sizes and their labels.
target_counts = df.target.value_counts()

# Left panel: absolute counts per class; right panel: the same split as percentages.
f, ax = plt.subplots(nrows = 1, ncols = 2)
sns.countplot(x = 'target', data = df, ax = ax[0])
ax[1].pie(x = target_counts.values, labels = target_counts.index, autopct = "%.2f%%")
ax[1].set_title("Percentage distribution")
plt.show()

### **The target classes are heavily imbalanced**

### corr_help: a function to compute the correlation of each feature with a given column

In [None]:
def corr_help(df, col, n_features = None):
    """Correlate ``df[col]`` with each of the ``var_<i>`` feature columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding ``col`` and feature columns named ``var_0``, ``var_1``, ...
    col : str
        Name of the column every feature is correlated against.
    n_features : int, optional
        Number of ``var_<i>`` columns to use, starting at ``var_0``. When omitted,
        the consecutive ``var_<i>`` columns present in ``df`` are counted, so the
        function is no longer tied to frames with exactly 200 features.

    Returns
    -------
    list of float
        ``result[i]`` is the correlation (``Series.corr``) between ``df[col]``
        and ``df['var_<i>']``.
    """
    if n_features is None:
        # Count var_0, var_1, ... until the first missing name.
        n_features = 0
        while 'var_' + str(n_features) in df.columns:
            n_features += 1

    reference = df[col]
    return [reference.corr(df['var_' + str(i)]) for i in range(n_features)]

In [None]:
# Distribution of the per-feature correlations with the target.
x = corr_help(df, 'target')
sns.distplot(a = x)
plt.show()

In [None]:
# Keep the target aside, then leave only the feature columns in df.
y = df['target']
df = df.drop(columns = ['ID_code', 'target'])

Removing outlier rows using the 1.5 × IQR rule

In [None]:
# Per-column quartiles and inter-quartile range.
Q1, Q3 = df.quantile(0.25), df.quantile(0.75)
IQR1 = Q3 - Q1

# A row is an outlier when any of its features lies outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR].
lower_bound = Q1 - 1.5 * IQR1
upper_bound = Q3 + 1.5 * IQR1
is_outlier_row = ((df < lower_bound) | (df > upper_bound)).any(axis = 1)
df_c = df[~is_outlier_row]

In [None]:
# Share of rows discarded by the outlier filter, as a percentage of the original rows.
rows_before = len(y)
rows_lost = rows_before - len(df_c)
print('Data loss is {}%'.format((rows_lost / rows_before) * 100))

Removing the target values that belong to the rows dropped as outliers

In [None]:
# Keep exactly the target values whose rows survived the outlier filter.
# Selecting by index label also handles rows dropped before the first or after
# the last surviving index, which a scan of the gaps between consecutive
# surviving indices would miss (leaving y longer than df_c).
y = list(y.loc[df_c.index])

# df_c is a filtered selection of df; assign() returns a new frame, so the
# column is added without writing into a slice (no SettingWithCopyWarning).
df_c = df_c.assign(y = y)

In [None]:
# Alias for the outlier-filtered frame (features plus the 'y' target column) used from here on.
data = df_c

Visualising the distribution of values in different variables (selected randomly)

In [None]:
# Draw four *distinct* feature columns; independent randint draws could pick the
# same column twice. The candidate list comes from the frame itself instead of
# assuming 200 features, and the generator is seeded so a fresh run reproduces
# the same figure.
var_cols = [name for name in data.columns if name.startswith('var_')]
picked = np.random.RandomState(9).choice(var_cols, size = 4, replace = False)

f, axes = plt.subplots(nrows = 2, ncols = 2)
for panel, name in zip(axes.flat, picked):
    name = str(name)
    values = data[name].values
    # Mean-centre each feature so the four panels are comparable around zero.
    sns.distplot(a = values - values.mean(), ax = panel, axlabel = name)
plt.tight_layout()
plt.show()

In [None]:
# Display the filtered frame (the notebook renders the last expression of a cell).
data

# **Machine Learning**

To counter the class imbalance, I'll oversample the minority class (SMOTE) and undersample the majority class

In [None]:
# Split the frame into an integer label vector and a float feature matrix.
features_frame = data.drop(columns = 'y')
y = data['y'].values.astype(int)
X = features_frame.values.astype(float)

Scaling the values

In [None]:
from sklearn.preprocessing import StandardScaler
# Standardise the features; the fitted scaler is reused later to transform the test set.
# NOTE(review): the scaler is fitted on every row of X, i.e. before the
# train/validation split made further down, so validation rows contribute to
# the fitted statistics. Consider fitting on the training part only.
scaler = StandardScaler()
X1 = scaler.fit_transform(X)

Using SMOTE to oversample the target = 1 data points (up to a minority/majority ratio of 3/7), and then randomly undersampling the majority class so that the final ratio of target = 1 to target = 0 data points is 1/2

In [None]:
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
# Step 1: SMOTE synthesises minority samples until minority/majority = 3/7.
smote = SMOTE(sampling_strategy = 3/7, k_neighbors = 5, random_state = 9)
# Step 2: randomly drop majority samples until minority/majority = 1/2.
# Seeded like SMOTE above so the resampled data set is reproducible across runs.
under = RandomUnderSampler(sampling_strategy = 0.5, random_state = 9)
X1, y = smote.fit_resample(X1, y)
X1, y = under.fit_resample(X1, y)

In [None]:
# The values are printed with a '%' suffix, so express the class shares as
# percentages rather than as fractions of 1.
pct_positive = 100 * len(y[y==1])/len(y)
pct_negative = 100 * len(y[y==0])/len(y)
print('1\t', pct_positive, '% \n0\t', pct_negative, '%')

Split the data into train and validation

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% for validation. The split is seeded so results are reproducible,
# and stratified so both parts keep the same class ratio as the resampled data.
X_train, X_val, y_train, y_val = train_test_split(X1, y, test_size = 0.2, random_state = 9, stratify = y)

First, applying Logistic Regression (LR). LR forms the baseline, and every other algorithm's performance will be compared to it.

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
# Candidate inverse-regularisation strengths for the logistic-regression baseline.
params = {'C' : [0.0001, 0.0003, 0.0005, 0.001, 0.003, 0.005, 0.01, 0.03, 0.05, 0.1, 0.3, 0.5]}

# Cross-validated grid search over C with GridSearchCV's default settings.
base_lr = LogisticRegression()
clf = GridSearchCV(estimator = base_lr, param_grid = params)
clf.fit(X_train, y_train)

In [None]:
# Score of the tuned baseline on the validation split.
clf.score(X_val, y_val)

In [None]:
# Predicted class probabilities of the tuned baseline for the validation set;
# column 1 is used further down as the positive-class score for the ROC curve.
pred_prob = clf.predict_proba(X_val)

For an imbalanced classification problem, accuracy is not a good measure of performance, so the ROC curve and its AUC are used to compare the models.

In [None]:
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
def auc_roc_plot(x, y, l, colors):
    """Plot the baseline ROC curve together with the curves of other models.

    The baseline (logistic regression) curve and its AUC are computed here from
    the notebook-level variables ``y_val`` and ``pred_prob``, which must be
    defined before the function is called.

    Parameters
    ----------
    x : sequence of array-like
        False-positive rates, one array per additional model.
    y : sequence of array-like
        True-positive rates, in the same order as ``x``.
    l : sequence of str
        Legend label for each additional model.
    colors : sequence
        Matplotlib colour for each additional model.
    """
    # ROC curve and AUC of the baseline classifier (column 1 = positive class).
    fpr, tpr, _ = roc_curve(y_val, pred_prob[:, 1], pos_label = 1)
    auc_score = roc_auc_score(y_val, pred_prob[:, 1])

    # Baseline curve; its AUC is shown in the legend so the number is actually reported.
    plt.plot(fpr, tpr, linestyle = '--', color = 'orange',
             label = "LogisticRegression (AUC = {:.3f})".format(auc_score))

    # One curve per additional model.
    for model_fpr, model_tpr, model_label, model_color in zip(x, y, l, colors):
        plt.plot(model_fpr, model_tpr, linestyle = '--', color = model_color, label = model_label)

    # Chance line (fpr == tpr), drawn directly as the diagonal.
    plt.plot([0, 1], [0, 1], linestyle = '--', color = 'blue')

    plt.title("ROC curve")
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.legend(loc = 'best')
    plt.show()

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# Per-model ROC data collected by the cells below: FPR arrays, TPR arrays,
# legend labels and line colours, all in the same order.
fprs, tprs, labels, colors = [], [], [], []

Applying RandomForestClassifier

In [None]:
# Random forest with default hyper-parameters. Seeded so the comparison is
# reproducible; n_jobs = -1 only parallelises training and does not change the result.
clf_rf = RandomForestClassifier(random_state = 9, n_jobs = -1)
clf_rf.fit(X_train, y_train)

# Positive-class probabilities on the validation split -> ROC curve points.
clf_rf_pp = clf_rf.predict_proba(X_val)
clf_rf_fpr, clf_rf_tpr, _ = roc_curve(y_val, clf_rf_pp[:, 1], pos_label = 1)

# Register the curve for the combined ROC plot.
fprs.append(clf_rf_fpr)
tprs.append(clf_rf_tpr)
labels.append('Random Forest')
colors.append('r')

Applying DecisionTreeClassifier

In [None]:
# Single decision tree with default hyper-parameters, seeded for reproducibility.
clf_dt = DecisionTreeClassifier(random_state = 9)
clf_dt.fit(X_train, y_train)

# Positive-class probabilities on the validation split -> ROC curve points.
clf_dt_pp = clf_dt.predict_proba(X_val)
clf_dt_fpr, clf_dt_tpr, _ = roc_curve(y_val, clf_dt_pp[:, 1], pos_label = 1)

# Register the curve for the combined ROC plot.
fprs.append(clf_dt_fpr)
tprs.append(clf_dt_tpr)
labels.append('Decision Tree')
colors.append('c')

Applying XGBoost Classifier

In [None]:
# Gradient-boosted trees with default hyper-parameters, seeded for reproducibility.
clf_xgb = XGBClassifier(random_state = 9)
clf_xgb.fit(X_train, y_train)

# Positive-class probabilities on the validation split -> ROC curve points.
clf_xgb_pp = clf_xgb.predict_proba(X_val)
clf_xgb_fpr, clf_xgb_tpr, _ = roc_curve(y_val, clf_xgb_pp[:, 1], pos_label = 1)

# Register the curve for the combined ROC plot.
fprs.append(clf_xgb_fpr)
tprs.append(clf_xgb_tpr)
labels.append('XGB')
colors.append('m')

Visualising the AUC-ROC plots

In [None]:
# Draw the baseline curve plus every curve registered in the accumulator lists.
auc_roc_plot(x = fprs, y = tprs, l = labels, colors = colors)

It is evident that the RandomForestClassifier does a good job followed by XGBClassifier then LogisticRegression (which was the baseline for comparison)

In [None]:
# Score (mean accuracy for scikit-learn classifiers) of the random forest on the validation split.
clf_rf.score(X_val, y_val)

In [None]:
# Test split of the competition data, read into a DataFrame.
path_test = '../input/santander-customer-transaction-prediction/test.csv'
test = pd.read_csv(filepath_or_buffer = path_test)

In [None]:
# Number of test columns that contain at least one missing value.
test.isna().any().sum()

Amazing to have no nulls

In [None]:
# Row identifiers go into the submission file; everything else is the feature matrix.
ids = test.ID_code.values
X_test = test.drop(columns = 'ID_code').values

In [None]:
# Display the raw (unscaled) test feature matrix.
X_test

In [None]:
# Apply the scaler fitted on the training features to the test features.
# NOTE(review): X_test is overwritten, so running this cell twice scales the data twice.
X_test = scaler.transform(X_test)

In [None]:
# Hard class labels predicted by the random forest for the test set.
y_preds = clf_rf.predict(X_test)

In [None]:
# Make sure the predictions are a NumPy array so they can be boolean-masked below.
y_preds = np.asarray(y_preds)

In [None]:
# Fraction of test rows that the random forest labels as class 0.
np.count_nonzero(y_preds == 0)/len(y_preds)

In [None]:
# The competition is scored on ROC AUC, which depends on how the rows are
# ranked by score. Hard 0/1 labels throw the ranking away, so submit the
# predicted probability of the positive class (column 1) instead.
submission = pd.DataFrame({'ID_code':ids,
                          'target':clf_rf.predict_proba(X_test)[:, 1]})
submission.to_csv('cust.csv',index=False)

Using ML (RandomForestClassifier) the score obtained on submission is about 0.5.

Applying Deep Learning

In [None]:
from keras.models import Sequential
from keras.layers import Dense
# Hidden-layer widths of the fully connected network, in order.
hidden_units = [256, 256, 512, 512, 1024, 1024, 1024, 1024, 512, 512, 256, 256]

# The first layer declares the input size; every hidden layer uses ReLU, and a
# single sigmoid unit outputs the probability of the positive class.
layers = [Dense(hidden_units[0], activation = 'relu', input_dim = X_train.shape[1])]
layers += [Dense(units, activation = 'relu') for units in hidden_units[1:]]
layers.append(Dense(1, activation = 'sigmoid'))
model = Sequential(layers)

# `metrics` is documented as a list; a bare string is rejected by newer Keras releases.
model.compile(optimizer = 'adam', loss = 'BinaryCrossentropy', metrics = ['AUC'])

In [None]:
# Print the layer-by-layer summary of the network.
model.summary()

In [None]:
# Train for 20 epochs, passing the held-out split as validation data.
model.fit(X_train, y_train, epochs = 20, validation_data = (X_val, y_val))

In [None]:
# Network outputs (sigmoid activations) for the scaled test features.
preds_dl = model.predict(X_test)

In [None]:
# Turn the network outputs into 0/1 labels with a 0.5 cut-off (vectorised).
preds_dl = (np.asarray(preds_dl).reshape(-1) > 0.5).astype(int).tolist()

In [None]:
# Fraction of test rows that the network labels as class 1.
preds_dl = np.array(preds_dl)
np.count_nonzero(preds_dl == 1)/len(preds_dl)

In [None]:
# The competition is scored on ROC AUC, which depends on how the rows are
# ranked by score. Submit the network's sigmoid outputs (flattened to 1-D)
# rather than the thresholded 0/1 labels, which throw the ranking away.
submission = pd.DataFrame({'ID_code':ids,
                          'target':model.predict(X_test).ravel()})
submission.to_csv('cust1.csv',index=False)

Score obtained via DL is 0.65 (significant improvement)