# Fraudulent Transaction Predictions

Test several models for predicting fraudulent bank-customer transactions. This is a classification task on typically imbalanced data, where the class of interest makes up only a small proportion of the samples.

In [None]:
import numpy as np
import pandas as pd
import time

from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif, chi2

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.svm import SVC, LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier

from sklearn.naive_bayes import GaussianNB
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis


from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score
import matplotlib.pyplot as plt

### Import data

In [None]:
# Location of the training CSV file
train_csv = '../input/santander-customer-transaction-prediction/train.csv'
train_df = pd.read_csv(train_csv)

# Show the first rows of the dataframe
train_df.head()

The dataframe consists of 200 features and 1 target. The predictor features are unnamed and the target variable has 2 possible values, 0 or 1. Typically, data for fraudulent transaction predictions is highly imbalanced, with a sizable difference in the number of samples per category. Let us see this in the code below.

### Sample population

Let us see the number of samples per class in the train dataset.

In [None]:
def sample_population(y):
    """Count how many samples belong to each class.

    Parameters
    ----------
    y : array-like
        Class label of every sample.

    Returns
    -------
    dict
        Maps each distinct label to its number of occurrences; keys follow
        the sorted order in which ``np.unique`` returns the labels.
    """
    classes, occurrences = np.unique(y, return_counts=True)
    return dict(zip(classes, occurrences))

# Count the samples of each target class in the loaded training dataframe.
sample_population(train_df.target)

In classification problems, class imbalance typically leads to overfitting and to a misleading interpretation of the accuracy metric. In these cases the accuracy may seem to reach almost 100%; however, that figure mostly reflects the most represented category, while the less represented category remains unlearned. There is more than one way to overcome this situation, for instance by introducing class weights or by synthetically oversampling the less represented category. Here, we simply tell the models to work with balanced class weights.

### $K$-best features 

Distributions and histograms show how the data is distributed in the space of input variables. For binary classification there are two distributions per input variable (one per class), which can be viewed in one dimension or in two dimensions. However, the feature selection performed by the SelectKBest() function from scikit-learn is a univariate statistical test that analyzes the variables one at a time.

The method uses the f_classif() function to compute the ANOVA $F$-value for the provided samples. It measures the distance between the means of the different classes divided by the combined spread (compactness) within each class. For a single variable $x$ and two classes, 0 and 1, with $n$ and $m$ samples, class means $\bar{x}_0$ and $\bar{x}_1$, and overall mean $\bar{x}$, the formula for the $F$-value is as follows:

$F = \dfrac{n(\bar{x}_0-\bar{x})^2+m(\bar{x}_1-\bar{x})^2}{\dfrac{1}{(n-1)+(m-1)}\left[\sum_{i=1}^{n}(x_i-\bar{x}_0)^2 + \sum_{j=1}^{m}(x_j-\bar{x}_1)^2\right]}$.

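The formula includes the factors $(n-1)$ and $(m-1)$ in the compactness (within-class) term: they are the degrees of freedom of each class and apply Bessel's correction, so that the within-class variance is an unbiased estimate of the true parameter.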

Let us include the correlation formula for comparison.

$r = \dfrac{\sum_{i}(x_i-\bar{x})(y_i-\bar{y})}{\sqrt{\sum_{i}(x_i-\bar{x})^2}\,\sqrt{\sum_{i}(y_i-\bar{y})^2}}$.

The search for the best predictors must be done leaving out the test dataframe, which is kept for validation purposes.

In [None]:
# Define the train and test dataframes.
# Stratify on the target so both parts keep the same class proportions;
# with a heavily imbalanced target a purely random split can shift the share
# of the rare class between the two parts.
train_df, test_df = train_test_split(
    train_df, test_size=.25, random_state=1, stratify=train_df.target
)

print(train_df.shape)
print(test_df.shape)

In [None]:
# Plot the per-class histograms of the first n features
def hist_plots(X, y, n_features):
    """Plot per-class density histograms of the leading columns of ``X``.

    For every selected column two step histograms are overlaid: one built
    from the rows where ``y == 0`` and one from the rows where ``y == 1``.
    Columns are drawn two per row in one figure; when the number of selected
    columns is odd, the last column gets its own single-panel figure.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table; the columns ``X.columns[:n_features]`` are plotted.
    y : pandas.Series
        Binary target (0/1) aligned with the rows of ``X``. Its ``name`` is
        used as the y-axis label of every panel.
    n_features : int
        Number of leading columns to plot. Nothing is drawn when this
        selects no column.
    """
    data_0 = X[y == 0]
    data_1 = X[y == 1]
    ylabel = y.name
    x_labels = X.columns[:n_features]

    def plot_panels(positions):
        # One figure: a single panel, or a grid with two panels per row.
        n_cols = 2 if len(positions) > 1 else 1
        n_rows = len(positions) // n_cols
        # squeeze=False always yields a 2-D array of axes, so the 1x1, 1x2
        # and nx2 layouts can share the same loop.
        fig, axes = plt.subplots(
            n_rows, n_cols, figsize=(6 * n_cols, 4 * n_rows), squeeze=False
        )
        for ax, pos in zip(axes.flat, positions):
            # Data and label are looked up with the same position so that a
            # panel can never show one column under another column's name.
            ax.hist(data_0.iloc[:, pos], bins=10, density=True, histtype='step')
            ax.hist(data_1.iloc[:, pos], bins=10, density=True, histtype='step')
            ax.set(xlabel=x_labels[pos], ylabel=ylabel)
            ax.legend(labels=[0, 1])
        plt.tight_layout()
        plt.show()

    n_selected = len(x_labels)
    n_paired = n_selected - n_selected % 2

    if n_paired:
        plot_panels(range(n_paired))
    if n_selected % 2:
        # Leftover column of an odd selection: plot it by its own position
        # (not column 0) and label it with its full column name.
        plot_panels([n_selected - 1])

In [None]:
# Input variables: every column from the third one onwards
X_train = train_df.iloc[:, 2:]
# Target variable
y_train = train_df.target
# Plot two histograms to compare the per-class distributions
hist_plots(X_train, y_train, 2)

The histograms of the first two predictors look very much the same for both classes.

Now, select the $k$ best predictors and plot four histograms.

In [None]:
def k_best_transform(X_train, y_train, X_test, y_test, k):
    """Keep the ``k`` features with the highest ANOVA F-score.

    The selector is fitted on the training data only and then applied
    unchanged to the test data, so the test set plays no part in choosing
    the features.

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        Feature tables sharing the same columns.
    y_train, y_test : pandas.Series
        Targets of the train and test rows.
    k : int
        Number of features to keep (passed to ``SelectKBest``).

    Returns
    -------
    tuple
        ``(X_train, y_train, X_test, y_test)``. The feature tables are new
        dataframes holding only the selected columns under their original
        names, with a fresh 0..n-1 index; the targets have their index reset
        the same way.
    """
    selector = SelectKBest(f_classif, k=k)
    train_values = selector.fit_transform(X_train, y_train)
    test_values = selector.transform(X_test)

    # get_support() is a boolean mask over the input columns, so it can
    # index the column labels directly.
    selected_columns = X_train.columns[selector.get_support()]

    X_train = pd.DataFrame(train_values, columns=selected_columns)
    X_test = pd.DataFrame(test_values, columns=selected_columns)

    # The rebuilt feature tables carry a default 0..n-1 index; reset the
    # targets so they stay aligned with them row by row.
    y_train = y_train.reset_index(drop=True)
    y_test = y_test.reset_index(drop=True)

    return X_train, y_train, X_test, y_test

# Features and target of the test data
X_test = test_df.iloc[:, 2:]
y_test = test_df.target

# Reduce both datasets to the 100 features selected on the train dataset
X_train, y_train, X_test, y_test = k_best_transform(
    X_train, y_train, X_test, y_test, 100
)

In [None]:
# Plot the per-class histograms of the first four remaining features.
hist_plots(X_train, y_train, 4)

After leaving out 100 predictors, the variables var3 and var4 are no longer among the selected features.

## Model assessment and model selection

First, define a series of models.

In [None]:
# Settings shared by the two SGD-based linear models
sgd_settings = dict(
    class_weight='balanced',
    max_iter=1000,
    tol=0.001,
    early_stopping=False,
    validation_fraction=0.1,
    n_iter_no_change=5,
)

# Candidate models keyed by a short label; all of them use balanced class weights
models = {
    'DTC': DecisionTreeClassifier(
        criterion='entropy',
        splitter='random',
        max_depth=5,
        class_weight='balanced',
    ),
    'RFC': RandomForestClassifier(
        max_depth=5,
        n_estimators=10,
        max_features=1,
        class_weight='balanced',
    ),
    # SGD with the perceptron loss, a constant learning rate and no penalty
    'PCN': SGDClassifier(
        loss='perceptron',
        eta0=1.0,
        learning_rate="constant",
        penalty=None,
        **sgd_settings,
    ),
    # SGD with the hinge loss
    'SVC': SGDClassifier(loss='hinge', **sgd_settings),
    'LRC': LogisticRegression(class_weight='balanced'),
}

Then, compute the recall of model predictions

In [None]:
# Model assesment
def model_assesment(X_train, y_train, X_test, y_test):
    """Fit every model of the module-level ``models`` dict and score it.

    Each model is wrapped in a pipeline that standardizes the inputs first,
    fitted on the training data and evaluated on the test data.

    Parameters
    ----------
    X_train, y_train
        Features and target used to fit the pipelines.
    X_test, y_test
        Independent features and target used to compute the score.

    Returns
    -------
    tuple of list
        ``(score_t, trainning_time)``: the test recall and the fit time in
        seconds of each model, in the iteration order of ``models``.
    """
    from sklearn.base import clone

    score_t = []
    trainning_time = []
    for estimator in models.values():
        # Always scale the input; a pipeline is the most convenient way.
        # Fit an unfitted copy so that the shared estimators in `models` are
        # not modified by this assessment.
        pipeline = make_pipeline(StandardScaler(), clone(estimator))
        # Time only the fit; perf_counter is the clock intended for
        # measuring elapsed time.
        start = time.perf_counter()
        pipeline.fit(X_train, y_train)
        trainning_time.append(time.perf_counter() - start)
        # Predict on the independent test sample and compute the recall.
        predictions = pipeline.predict(X_test)
        score_t.append(recall_score(y_test, predictions))
    return score_t, trainning_time

recall_t, trainning_time = model_assesment(X_train, y_train, X_test, y_test)

# Report the test recall and the fit time of every model
for name, recall, seconds in zip(models, recall_t, trainning_time):
    print('%s = %.2f,  Time = %.2f' % (name, recall, seconds))

## Results

### $k$-parameter

Let us evaluate the model's quality for different numbers of model predictors and plot the recall for all models and $k$-values.

In [None]:
def k_features_assesment(X_train, y_train, X_test, y_test, k):
    """Return the test score of every model when only k features are kept.

    The data is first reduced with ``k_best_transform`` and then passed to
    ``model_assesment``; the fit times it also returns are discarded.
    """
    reduced = k_best_transform(X_train, y_train, X_test, y_test, k)
    scores, _fit_times = model_assesment(*reduced)
    return scores

def k_scores_plot(data, grid, labels):
    """Scatter-plot one series of scores per label against the k values.

    Parameters
    ----------
    data : sequence of sequences
        One row per value in ``grid`` and one column per label.
    grid : sequence
        The k values, used as x coordinates.
    labels : iterable
        Legend label of each column of ``data``.
    """
    # Transpose so that each row holds the scores of a single label
    per_label = np.array(data).transpose()
    fig, ax = plt.subplots(figsize=(6, 4))
    for row, label in enumerate(labels):
        ax.scatter(grid, per_label[row], marker='^', label=label)
    ax.set(title='', xlabel='k', ylabel='Recall')
    plt.legend()
    plt.show()
    
k_grid = [5, 10, 20, 30, 40, 80, 120, 160, 200]

# The full feature tables do not depend on k, so slice them once instead of
# on every iteration.
X_train, y_train = train_df.iloc[:,2:], train_df.target
X_test, y_test = test_df.iloc[:,2:], test_df.target

# One list of per-model recalls for every k in the grid
recall_k = [
    k_features_assesment(X_train, y_train, X_test, y_test, k) for k in k_grid
]

k_scores_plot(recall_k, k_grid, models.keys())

### Cross-validation

Here, we validate the model using cross-validation.

In [None]:
def cv_testing_models(X, y, models, scoring, n_splits):
    """Cross-validate every model and collect its per-fold scores.

    Parameters
    ----------
    X, y
        Features and target of the whole dataset.
    models : dict
        Estimators to evaluate; only the values are used.
    scoring : str
        Scoring name forwarded to ``cross_val_score``.
    n_splits : int
        Number of folds (each fold holds out 1/n_splits of the data).

    Returns
    -------
    list of numpy.ndarray
        One array of absolute per-fold scores for each model, in the
        iteration order of ``models``.
    """
    # A single repetition of K-fold with a fixed seed
    folds = RepeatedKFold(n_splits=n_splits, n_repeats=1, random_state=0)

    def fold_scores(estimator):
        # Scale the inputs inside the pipeline that is cross-validated
        pipeline = make_pipeline(StandardScaler(), estimator)
        scores = cross_val_score(pipeline, X, y, scoring=scoring, cv=folds, n_jobs=-1)
        # Absolute value: presumably so that negated scorers come out positive
        return np.abs(scores)

    return [fold_scores(estimator) for estimator in models.values()]

# Load the whole data again: train_df was replaced by its training part when
# the train/test split was made.
train_df = pd.read_csv('../input/santander-customer-transaction-prediction/train.csv')
X_train, y_train = train_df.iloc[:,2:], train_df.target

# Measure the elapsed time with perf_counter, the clock intended for timing
# intervals, rather than the wall clock.
starttime = time.perf_counter()
recall_cv = cv_testing_models(X_train, y_train, models, scoring='recall', n_splits=4)
print('Time: {:0.2f} seconds'.format(time.perf_counter() - starttime))

## Test vs cross-validation recall

Let us compare, in a box plot, the recall obtained from cross-validation with the recall obtained on the test set, considering all 200 features.

In [None]:
# define a custom box whisker plot
def box_whisker_plot(data, labels, metric_t):
    """Draw horizontal box plots of per-model scores with two overlays.

    Parameters
    ----------
    data : sequence of sequences
        One row of scores per model; each row becomes one box.
    labels : iterable
        Label of each box.
    metric_t : sequence
        One value per model, drawn as 'Test recall' markers.
    """
    cv_mean = np.mean(data, axis=1)
    # Vertical coordinates 1..n, one per model, used for both marker series
    positions = np.arange(1, len(cv_mean) + 1)

    fig, ax = plt.subplots(figsize=(6, 4))
    ax.set(title='', xlabel='Recall', ylabel='Model')
    # whis=(0, 100) stretches the whiskers over the full range of the scores
    ax.boxplot(data, labels=labels, vert=False, whis=(0, 100))
    ax.scatter(metric_t, positions, marker='^', label='Test recall')
    ax.scatter(cv_mean, positions, marker='^', label='C-V recall')
    plt.legend()
    plt.show()

# Compare the cross-validation recall with the test recall stored in the
# last entry of recall_k (the last value of k_grid, i.e. k = 200).
box_whisker_plot(recall_cv, models.keys(), recall_k[-1])

As final remarks:

- Cross-validation estimates are in agreement with the test recall.
- The Logistic Regression Classifier performs best in both validation schemes.