In [None]:
import os

# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# Santander Customer Transaction Prediction

## 1. Introduction


This Data Science Project aims to solve the Santander Customer Transaction Prediction Challenge.

In short, this challenge is about identifying which customers will make a specific transaction in the future, irrespective of the amount of money transacted.

The dataset, available on Kaggle's website, presents 200 anonymous variables and 200 thousand records, in both training and test datasets.

With respect to the target variable, the code 1 represents the realization of a transaction, while the code 0 indicates that no transaction was performed. The work is divided into several steps, detailed below.

#### 2. EDA (Exploratory Data Analysis):

In this initial stage, an exploratory analysis of the data is performed, seeking to identify: correlations between variables, outliers, missing values, and characteristics of the probabilistic distribution of variables, among others.
INSIGHTS FROM EDA
#### 3. Validation Strategy:

After the EDA, a study and definition for the dataset validation strategy is performed.

#### 4. Model Training:

Several candidate models are defined and trained, and their hyperparameters are then tuned on the training data.

#### 5. Submission:

In the final stage, a file with the model's predictions is generated for submission to the competition.

#### 6. References:

A list of references that support the development of this project.


## 2. EDA

In [None]:
# Library import
import pandas as pd
import seaborn as sns
import pandas as pd
import pylab as pl
import numpy as np
import scipy.optimize as opt
from sklearn import preprocessing
%matplotlib inline 
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix
import itertools
import sklearn.model_selection
from sklearn.linear_model import LogisticRegression
# Support Vector Machine classification algorithm
from sklearn.svm import SVC
# Decision Tree classification algorithm
from sklearn.tree import DecisionTreeClassifier
# K Nearest Neighbors classification algorithm
from sklearn.neighbors import KNeighborsClassifier
from collections import Counter
from sklearn.datasets import make_classification
from imblearn.over_sampling import SMOTE
from sklearn.metrics import f1_score
from sklearn import metrics
from sklearn.model_selection import cross_val_score
from sklearn.datasets import make_blobs
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

# Load the competition training and test tables into DataFrames
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in ("/kaggle/input/dataset/train.csv",
                     "/kaggle/input/dataset/test.csv")
)

In [None]:
# df_train = df_train.iloc[:10000,:]
# df_test = df_test.iloc[:10000,:]

In [None]:
# (rows, columns) of the competition test table
df_test.shape

In [None]:
# (rows, columns) of the training table
df_train.shape

Summary of key statistical measures of the training and test data sets

In [None]:
# Per-column summary statistics of the training table
df_train.describe()

In [None]:
# Per-column summary statistics of the test table
df_test.describe()

In [None]:
# Target vector and feature matrix (identifier and target columns removed)
y = df_train['target']
X = df_train.drop(columns=['ID_code', 'target'])

In [None]:
# (rows, columns) of the feature matrix after dropping 'ID_code' and 'target'
X.shape

Analysis of variable distributions

In [None]:
def plot_feature_distribution(df1, df2, label1, label2, features, ncols=10):
    """Overlay the kernel density estimate of each feature for two data frames.

    One subplot is drawn per feature: ``df1`` in blue and ``df2`` in green.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        Frames that both contain every column named in ``features``.
    label1, label2 : str
        Legend labels for ``df1`` and ``df2`` respectively.
    features : iterable of str
        Column names to plot.
    ncols : int, default 10
        Number of subplot columns; the number of rows is derived from
        ``len(features)`` so the grid always fits the requested features.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    features = list(features)
    if not features:
        # Nothing to draw; avoid creating an empty figure.
        return

    # Ceiling division: enough rows to hold every feature.
    nrows = -(-len(features) // ncols)

    sns.set_style('whitegrid')
    # A single figure whose grid matches the axes that are actually drawn on.
    # 2.2 inches per row reproduces the original 18x44 canvas for 200 features.
    fig, axes = plt.subplots(nrows, ncols, figsize=(18, 2.2 * nrows), squeeze=False)
    flat_axes = axes.ravel()

    for ax, feature in zip(flat_axes, features):
        sns.kdeplot(df1[feature], label=label1, color='b', ax=ax)
        sns.kdeplot(df2[feature], label=label2, color='g', ax=ax)
        ax.set_xlabel(feature, fontsize=9)
        ax.tick_params(axis='x', which='major', labelsize=6, pad=-6)
        ax.tick_params(axis='y', which='major', labelsize=6)

    # Hide grid cells left over when len(features) is not a multiple of ncols.
    for ax in flat_axes[len(features):]:
        ax.set_visible(False)

    # The colours are the same in every panel, so one legend is enough.
    flat_axes[0].legend(fontsize=6)
    plt.show()

In [None]:
# Columns 2..202 of the training table are the features to plot
features = df_train.columns.values[2:203]

# Rows of each target class, compared feature by feature
t0, t1 = (df_train.loc[df_train['target'] == cls] for cls in (0, 1))
plot_feature_distribution(t0, t1, '0', '1', features)

Correlation matrix of the variables

In [None]:
# Compute a correlation matrix and convert to long-form.
# Only numeric columns are used: the frame still contains the string 'ID_code'
# column, on which DataFrame.corr() raises in recent pandas versions.
corr_mat = (
    df_train.select_dtypes(include="number")
    .corr()
    .stack()
    .reset_index(name="correlation")
)

# Draw each cell as a scatter point with varying size and color
g = sns.relplot(
    data=corr_mat,
    x="level_0", y="level_1", hue="correlation", size="correlation",
    palette="vlag", hue_norm=(-1, 1), edgecolor=".7",
    height=10, sizes=(50, 250), size_norm=(-.2, .8),
)

# Tweak the figure to finalize
g.set(xlabel="", ylabel="", aspect="equal")
g.despine(left=True, bottom=True)
g.ax.margins(.02)
for label in g.ax.get_xticklabels():
    label.set_rotation(90)

# Matplotlib renamed Legend.legendHandles to Legend.legend_handles; support both.
legend_handles = getattr(g.legend, "legend_handles", None)
if legend_handles is None:
    legend_handles = g.legend.legendHandles
for artist in legend_handles:
    artist.set_edgecolor(".7")

Identifying Null Values

In [None]:
# Total count of null cells across all columns of each table
n_missing_train = df_train.isnull().sum().sum()
n_missing_test = df_test.isnull().sum().sum()

print("The number of missing values in train set = ", n_missing_train)
print("The number of missing values in test set = ", n_missing_test)

Distribution of the target variable data

In [None]:
# Number of training rows per target value
target_counts = df_train.groupby('target').target.count()
label_0 = target_counts[0]
label_1 = target_counts[1]

In [None]:
# Pie chart of the share of each target class
labels = [0, 1]
data = [label_0, label_1]

# First five colours of seaborn's 'pastel' palette
colors = sns.color_palette('pastel')[:5]

plt.pie(x=data, labels=labels, colors=colors, autopct='%.0f%%')
plt.show()

Looking at the target variable, transactions performed (code 1) account for only about 10% of the records, indicating that the data set is imbalanced.

Data Balancing - SMOTE - Oversampling

In [None]:
# Class frequencies of the target before any resampling
original_counts = Counter(df_train['target'])
print('Original dataset shape %s' % original_counts)

In [None]:
# Oversample the minority class with SMOTE (5 nearest neighbours, fixed seed)
sm = SMOTE(
    sampling_strategy='minority',
    k_neighbors=5,
    random_state=42,
)
X_res, y_res = sm.fit_resample(X, y)

resampled_counts = Counter(y_res)
print('Resampled dataset shape %s' % resampled_counts)

In [None]:
#X = np.asarray(X)
#y = np.asarray(y)

## 3. Validation Strategy

Splitting the data set into training and testing (80/20)

In [None]:
# Hold out 20% of the ORIGINAL (un-resampled) rows first, preserving the class
# ratio in both parts, so that the evaluation split contains only real
# observations with the real class balance.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=4, stratify=y)

# Oversample the minority class on the training part only. Resampling before
# the split would interpolate synthetic rows from held-out rows (leaking them
# into training) and would score the models on synthetic data.
X_train, y_train = SMOTE(sampling_strategy='minority', k_neighbors=5,
                         random_state=42).fit_resample(X_train, y_train)

print ('Train set:', X_train.shape,  y_train.shape)
print ('Test set:', X_test.shape,  y_test.shape)
print('Train class counts after resampling %s' % Counter(y_train))

Normalization of the dataset

In [None]:
# Standardise the features (zero mean, unit variance per column).
# The scaler is fitted on the training split only and the SAME fitted scaler is
# applied to the evaluation split; fitting a second scaler on X_test would put
# the two splits on different scales and use evaluation statistics.
scaler = preprocessing.StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

## 4. Model Training

### Logistic Regression

In [None]:
# Logistic regression with the liblinear solver and inverse regularisation strength C=0.05
LR = LogisticRegression(solver='liblinear', C=0.05)
LR.fit(X_train, y_train)
#logist_pred = LR.predict_proba(X_test)[:,1]
#logist_pred

In [None]:
#logist_pred_test = LR.predict_proba(df_test.drop(columns = ['ID_code']))[:,1]

In [None]:
# Logistic regression outputs on both splits
y_train_pred = LR.predict_proba(X_train)        # class probabilities, training split
y_test_pred = LR.predict_proba(X_test)[:, 1]    # second probability column, evaluation split
y_tpred = LR.predict(X_test)                    # hard labels, evaluation split

# ROC AUC on the evaluation split
auc_LR = metrics.roc_auc_score(y_test, y_test_pred)

print(auc_LR)

In [None]:
# F1 score of the logistic regression labels on the evaluation split
f1_lr = f1_score(y_true=y_test, y_pred=y_tpred)
f1_lr

In [None]:
# Per-class precision / recall / F1 for logistic regression
lr_report = classification_report(y_test, LR.predict(X_test))
print(lr_report)

In [None]:
# Confusion matrix with class 1 listed first
lr_confusion = confusion_matrix(y_test, y_tpred, labels=[1, 0])
print(lr_confusion)

In [None]:
# parameters ={"C":[0.01,0.1,1],'penalty':['l2'], 'solver':['lbfgs']}# l1 lasso l2 ridge
#lr=LogisticRegression()
#grid_search = GridSearchCV(lr, parameters, cv=10)
#lr_cv = grid_search.fit(X_train, y_train)

In [None]:
#print("tuned hpyerparameters :(best parameters) ",lr_cv.best_params_)
#print("accuracy :",lr_cv.best_score_, auc_lr)

In [None]:
#yhat=lr_cv.predict(X_test)
#print(confusion_matrix(y_test, yhat, labels=[1,0]))

### Decision Tree

In [None]:
# Unpruned decision tree with a fixed seed
dt = DecisionTreeClassifier(
    max_depth=None,
    min_samples_split=2,
    random_state=0,
)
#scores = cross_val_score(dt, X_train, y_train, cv=5)
#scores.mean()

In [None]:
# Fit the decision tree and collect its outputs on both splits
dt.fit(X_train, y_train)
y_train_pred = dt.predict_proba(X_train)       # class probabilities, training split
y_test_pred = dt.predict_proba(X_test)[:, 1]   # second probability column, evaluation split
y_tpred = dt.predict(X_test)                   # hard labels, evaluation split

# ROC AUC on the evaluation split
auc_dt = metrics.roc_auc_score(y_true=y_test, y_score=y_test_pred)

print(auc_dt)

In [None]:
# F1 score of the decision tree labels on the evaluation split
f1_dt = f1_score(y_true=y_test, y_pred=y_tpred)

In [None]:
# Per-class precision / recall / F1 for the decision tree.
# The confusion matrix is printed by the following cell, so it is not
# repeated here (this also matches the other model sections).
print (classification_report(y_test, dt.predict(X_test)))

In [None]:
# Confusion matrix with class 1 listed first
dt_confusion = confusion_matrix(y_test, y_tpred, labels=[1, 0])
print(dt_confusion)

### Random Forest

In [None]:
# Random forest: 200 trees, depth capped at 10, fixed seed.
# (The stray "..." REPL continuation prompt that was pasted into the call made
# this cell a SyntaxError and has been removed.)
rfclf = RandomForestClassifier(n_estimators=200, max_depth=10,
                               min_samples_split=2, random_state=0)
#scores = cross_val_score(rfclf, X_train, y_train, cv=5)

#scores.mean()

In [None]:
# Fit the random forest and collect its outputs on both splits
rfclf.fit(X_train, y_train)
y_train_pred = rfclf.predict_proba(X_train)       # class probabilities, training split
y_test_pred = rfclf.predict_proba(X_test)[:, 1]   # second probability column, evaluation split
y_tpred = rfclf.predict(X_test)                   # hard labels, evaluation split

# ROC AUC on the evaluation split
auc_rf = metrics.roc_auc_score(y_true=y_test, y_score=y_test_pred)

print(auc_rf)

In [None]:
# F1 score of the random forest labels on the evaluation split
f1_rf = f1_score(y_true=y_test, y_pred=y_tpred)

In [None]:
# Per-class precision / recall / F1 for the random forest
rf_report = classification_report(y_test, rfclf.predict(X_test))
print(rf_report)

In [None]:
# Confusion matrix with class 1 listed first
rf_confusion = confusion_matrix(y_test, y_tpred, labels=[1, 0])
print(rf_confusion)

### Extra Trees

In [None]:
# Extra-trees ensemble: 200 trees, depth capped at 10, fixed seed.
# (The stray "..." REPL continuation prompt that was pasted into the call made
# this cell a SyntaxError and has been removed.)
extclf = ExtraTreesClassifier(n_estimators=200, max_depth=10,
                              min_samples_split=2, random_state=0)

In [None]:
# Fit the extra-trees model and collect its outputs on both splits.
extclf.fit(X_train,y_train)
y_train_pred = extclf.predict_proba(X_train)       # class probabilities, training split
# Score the evaluation split, like every other model section. The previous code
# scored the raw competition frame `df_test`, which has not been through the
# scaling applied to X_train and does not line up with `y_test`.
y_test_pred = extclf.predict_proba(X_test)[:, 1]
y_tpred = extclf.predict(X_test)                   # hard labels, evaluation split

In [None]:
# ROC AUC of the extra-trees model on the evaluation split
ext_scores = extclf.predict_proba(X_test)[:, 1]
auc_ext = metrics.roc_auc_score(y_test, ext_scores)

print(auc_ext)

In [None]:
# F1 score of the extra-trees labels on the evaluation split
f1_ext = f1_score(y_true=y_test, y_pred=y_tpred)
f1_ext

In [None]:
# Per-class precision / recall / F1 for the extra-trees model
ext_report = classification_report(y_test, extclf.predict(X_test))
print(ext_report)


In [None]:
# Confusion matrix with class 1 listed first
ext_confusion = confusion_matrix(y_test, y_tpred, labels=[1, 0])
print(ext_confusion)

### Ada Boost

In [None]:
from sklearn.ensemble import AdaBoostClassifier

# AdaBoost with 50 boosting rounds. The seed is fixed, as for the other
# ensembles in this notebook, so that re-running the notebook gives the same model.
adbclf = AdaBoostClassifier(n_estimators=50, random_state=0)
#scores = cross_val_score(adbclf, X_train, y_train, cv=5)
#scores.mean()

In [None]:
# Fit AdaBoost and collect its outputs on both splits
adbclf.fit(X_train, y_train)
y_train_pred = adbclf.predict_proba(X_train)       # class probabilities, training split
y_test_pred = adbclf.predict_proba(X_test)[:, 1]   # second probability column, evaluation split
y_tpred = adbclf.predict(X_test)                   # hard labels, evaluation split

# ROC AUC on the evaluation split
auc_adb = metrics.roc_auc_score(y_true=y_test, y_score=y_test_pred)

print(auc_adb)

In [None]:
# F1 score of the AdaBoost labels on the evaluation split
f1_adb = f1_score(y_true=y_test, y_pred=y_tpred)
f1_adb

In [None]:
# Per-class precision / recall / F1 for AdaBoost
adb_report = classification_report(y_test, adbclf.predict(X_test))
print(adb_report)

In [None]:
# Confusion matrix with class 1 listed first
adb_confusion = confusion_matrix(y_test, y_tpred, labels=[1, 0])
print(adb_confusion)

### XgBoost

In [None]:
# Hyperparameter grid for the XGBoost classifier grid search.
# Every value is a list (one candidate each) as GridSearchCV expects.
# Numeric hyperparameters are given as numbers rather than strings, and the
# learning rate uses the scikit-learn API name `learning_rate` (XGBoost's
# `eta` is an alias of it).
parameters = {
    'booster': ["gbtree"],
    'objective': ["binary:logistic"],
    'learning_rate': [0.02],
    #gamma=80,
    'max_depth': [2],
    'min_child_weight': [1],
    'subsample': [0.5],
    'colsample_bytree': [0.1],
    #scale_pos_weight = round(sum(!trainY) / sum(trainY), 2)
}

In [None]:
from xgboost import XGBClassifier

# 10-fold grid search over `parameters` with an XGBoost classifier
xgb = XGBClassifier()
grid_search = GridSearchCV(estimator=xgb, param_grid=parameters, cv=10)
xgb_cv = grid_search.fit(X_train, y_train)

In [None]:
# ROC AUC of the grid-search model on the evaluation split
xgb_cv_scores = grid_search.predict_proba(X_test)[:, 1]
auc_xgb = metrics.roc_auc_score(y_test, xgb_cv_scores)

In [None]:
# Report the grid-search outcome. The two scores are different metrics, so each
# gets its own label: best_score_ is the mean cross-validated score of the best
# candidate (the grid search above sets no `scoring`, so the estimator's default
# scorer is used), while auc_xgb is the ROC AUC computed on the evaluation split.
print("tuned hyperparameters (best parameters):", xgb_cv.best_params_)
print("best cross-validation score:", xgb_cv.best_score_)
print("evaluation-split AUC:", auc_xgb)

In [None]:
# Fit the XGBoost classifier and collect its outputs on both splits.
# NOTE(review): this refits `xgb` (constructed with default parameters), not the
# tuned `xgb_cv.best_estimator_`, and overwrites `auc_xgb` — confirm that is intended.
xgb.fit(X_train, y_train)
y_train_pred = xgb.predict_proba(X_train)       # class probabilities, training split
y_test_pred = xgb.predict_proba(X_test)[:, 1]   # second probability column, evaluation split
y_tpred = xgb.predict(X_test)                   # hard labels, evaluation split

# ROC AUC on the evaluation split
auc_xgb = metrics.roc_auc_score(y_true=y_test, y_score=y_test_pred)
auc_xgb

In [None]:
# F1 score of the XGBoost labels on the evaluation split
f1_xgb = f1_score(y_true=y_test, y_pred=y_tpred)
f1_xgb

In [None]:
# Per-class precision / recall / F1 for XGBoost
xgb_report = classification_report(y_test, xgb.predict(X_test))
print(xgb_report)

In [None]:
# Confusion matrix with class 1 listed first
xgb_confusion = confusion_matrix(y_test, y_tpred, labels=[1, 0])
print(xgb_confusion)

### Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB

# 10-fold grid search over 20 log-spaced var_smoothing values (1 down to 1e-15),
# scored by average precision
nb = GaussianNB()
param = {'var_smoothing': np.logspace(0, -15, num=20)}
clf = GridSearchCV(
    estimator=nb,
    param_grid=param,
    cv=10,
    scoring='average_precision',
    verbose=3,
    n_jobs=-1,
    return_train_score=True,
)
clf.fit(X_train, y_train)

In [None]:
# Fit Gaussian Naive Bayes with var_smoothing=0.16 and collect its outputs.
NB = GaussianNB(var_smoothing=0.16)
NB.fit(X_train, y_train)
y_train_pred = NB.predict_proba(X_train)       # class probabilities, training split
y_test_pred = NB.predict_proba(X_test)[:, 1]   # second probability column, evaluation split
y_tpred = NB.predict(X_test)                   # hard labels, evaluation split

# (Two leftover Logistic Regression predict_proba calls that only fed unused
# variables were removed from this cell.)

# ROC AUC on the evaluation split
auc_NB = metrics.roc_auc_score(y_test, y_test_pred)

#print AUC score
print(auc_NB)

In [None]:
# F1 score of the Naive Bayes labels on the evaluation split
f1_NB = f1_score(y_true=y_test, y_pred=y_tpred)

In [None]:
# Per-class precision / recall / F1 for Naive Bayes
nb_report = classification_report(y_test, NB.predict(X_test))
print(nb_report)

In [None]:
# Confusion matrix with class 1 listed first
nb_confusion = confusion_matrix(y_test, y_tpred, labels=[1, 0])
print(nb_confusion)

### Overall Metrics

In [None]:
# One row per model: name, evaluation-split AUC and F1 score
method_names = ['Logistic Regression', 'Decision Tree', 'Random Forest', 'Extra Trees',
                'Ada Boosting', 'XgBoost', 'Naive Bayes']
auc_scores = [auc_LR, auc_dt, auc_rf, auc_ext, auc_adb, auc_xgb, auc_NB]
f1_scores = [f1_lr, f1_dt, f1_rf, f1_ext, f1_adb, f1_xgb, f1_NB]

overall_metrics = {'Method': method_names, 'AUC': auc_scores, 'F1 Score': f1_scores}
pd.DataFrame(overall_metrics)

## 5. Submission

Based on the overall metrics, XGBoost was chosen as the model to be submitted.

In [None]:
# Settings for the final XGBoost model

# Tree construction
TREE_METHOD = 'hist'
MAX_TREE_DEPTH = 8
GAMMA = 0.3

# Sampling, regularisation and class weighting
SUBSAMPLE = 0.6
REGULARIZATION = 0.1
POS_WEIGHT = 1.0

# Boosting schedule
ITERATIONS = 1000
EARLY_STOP = 10

In [None]:
# Booster parameters for the native xgboost training API.
# - Scalar options are passed as scalars instead of one-element lists.
# - The obsolete `silent` flag is replaced by `verbosity` (0 = silent), and the
#   obsolete `n_gpus` option is dropped.
# - POS_WEIGHT is now actually passed, as `scale_pos_weight`.
parameters = {
    'tree_method': TREE_METHOD,
    'max_depth': MAX_TREE_DEPTH,
    'alpha': REGULARIZATION,
    'gamma': GAMMA,
    'subsample': SUBSAMPLE,
    'learning_rate': 0.05,
    'scale_pos_weight': POS_WEIGHT,
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'verbosity': 0,
}

In [None]:
%%time
from sklearn.model_selection import StratifiedKFold
import xgboost as xgb
nfold = 5
skf = StratifiedKFold(n_splits=nfold, shuffle=True, random_state=2019)

oof = np.zeros(len(df_train))
predictions = np.zeros(len(df_test))

target = 'target'
predictors = df_train.columns.values.tolist()[2:]

i = 1
for train_index, valid_index in skf.split(df_train, df_train.target.values):
    print("\nFold {}".format(i))
    xg_train = xgb.DMatrix(df_train.iloc[train_index][predictors].values,
                           df_train.iloc[train_index][target].values,                           
                           )
    xg_valid = xgb.DMatrix(df_train.iloc[valid_index][predictors].values,
                           df_train.iloc[valid_index][target].values,                           
                           )   
    
    clf = xgb.train(parameters, xg_train, ITERATIONS, evals=[(xg_train, "train"), (xg_valid, "eval")],
                early_stopping_rounds=EARLY_STOP, verbose_eval=False)
    oof[valid_index] = clf.predict(xgb.DMatrix(df_train.iloc[valid_index][predictors].values)) 
    
    predictions += clf.predict(xgb.DMatrix(df_test[predictors].values)) / nfold
    i = i + 1


In [None]:
# Submission table: test identifiers alongside the fold-averaged predictions
sub_df = pd.DataFrame({
    "ID_code": df_test["ID_code"].values,
    "target": predictions,
})
sub_df.head(10)

In [None]:
# Write the submission file without the index column
sub_df.to_csv(path_or_buf="submission.csv", index=False)

## 6. References

In [None]:
# https://www.kaggle.com/code/sohaibanwaar1203/diminsionality-reduction-and-smote-sampling/notebook
# https://imbalanced-learn.org/stable/references/generated/imblearn.over_sampling.SMOTE.html
# https://machinelearningmastery.com/tactics-to-combat-imbalanced-classes-in-your-machine-learning-dataset/
# https://imbalanced-learn.org/stable/references/generated/imblearn.over_sampling.SMOTE.html
# https://github.com/mirianfsilva/santander-customer-transaction-prediction/blob/main/transaction-prediction.ipynb
# https://www.kaggle.com/code/amrmahmoud123/1-guide-to-ensembling-methods
# https://www.kaggle.com/code/danielgrimshaw/sklearn-model-exploration/notebook
# https://deb-sahoo19.medium.com/santander-customer-transaction-prediction-9e0edc8f9baab
# https://www.kaggle.com/code/shirellamosi/logistic-regression-and-gaussian-nb
# https://www.kaggle.com/code/vinhnguyen/accelerating-xgboost-with-gpu