In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = [os.path.join(dirname, filename) for filename in filenames]
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from matplotlib import pyplot as plt
import seaborn as sns
# Lift the cap on the number of columns pandas shows when a DataFrame is displayed.
pd.options.display.max_columns = None

In [None]:
# Locations of the competition CSVs inside the Kaggle input directory.
train_csv_path = r"/kaggle/input/santander-customer-transaction-prediction/train.csv"
test_csv_path = r"/kaggle/input/santander-customer-transaction-prediction/test.csv"

# Read both splits into DataFrames.
train_data = pd.read_csv(train_csv_path)
test_data = pd.read_csv(test_csv_path)

In [None]:
# Preview the first five rows of the training set.
train_data.head(n=5)

In [None]:
# Preview the first five rows of the test set.
test_data.head(n=5)

### 1. Summary Statistics

In [None]:
# Report the (rows, columns) shape of each split.
for label, frame in (("shape of training set is ", train_data),
                     ("shape of test set is ", test_data)):
    print(label + str(frame.shape))

In [None]:
# Summary statistics (DataFrame.describe) for the training set.
train_data.describe()

In [None]:
# Summary statistics (DataFrame.describe) for the test set, for comparison with the training set.
test_data.describe()

The summary statistics of the training and test data are similar.

### 2. Check null values

In [None]:
# Count the NaN entries in every training column, then show only the columns that have at least one.
missing_data_train = train_data.isnull().sum(axis=0)
missing_data_train.loc[missing_data_train.ne(0)]

In [None]:
# Count the NaN entries in every *test* column, then show only the columns that have at least one.
# The counts must come from test_data and be filtered with their own mask: the training
# counts describe a different frame (it has an extra `target` column) and say nothing
# about missing values in the test set.
missing_data_test = test_data.isna().sum()
missing_data_test[missing_data_test != 0]

There are no missing values in either the training set or the test set.

### 3. Check categorical values

In [None]:
### https://stackoverflow.com/questions/29803093/check-which-columns-in-dataframe-are-categorical 
# Remove the identifier and the label, then keep only the columns whose dtype is one of
# the listed candidates; an empty result means no such column exists.
candidate_dtypes = ['category', 'object', 'int']
train_data.drop(['ID_code', 'target'], axis='columns').select_dtypes(include=candidate_dtypes)

In [None]:
# Same dtype check for the test set. The ID_code identifier is dropped first, exactly as
# for the training set, so that it is not reported as a categorical feature.
test_data.drop(columns=['ID_code']).select_dtypes(include=['category', 'object', 'int'])

There are no categorical columns in this dataset.

### 4. Distribution of target variable

In [None]:
### http://seaborn.pydata.org/tutorial/categorical.html?highlight=bar%20plot
# Count plot of the `target` column: one bar per distinct value in the training set.
sns.countplot(data=train_data, x='target')

##### The dataset is imbalanced. Since plenty of data is available, we will undersample the rows with target class 0 before training.

### 5. Plots

In [None]:
# PCA
from sklearn.decomposition import PCA

# Project the feature columns (label and identifier removed) onto two principal
# components so that they can be drawn as a 2-D scatter plot.
# random_state is pinned because scikit-learn may select its randomized SVD solver for
# an input like this one, which would otherwise let the projection vary between runs.
pca = PCA(n_components=2, random_state=0)
pca_train_X = pca.fit_transform(train_data.drop(columns = ['target', 'ID_code']))

In [None]:
# Shape of the array returned by the 2-component PCA fit_transform on the training features.
pca_train_X.shape

In [None]:
from matplotlib import pyplot as plt

# Scatter the first PCA component against the second, colouring every point by its target value.
first_component = pca_train_X[:, 0]
second_component = pca_train_X[:, 1]

fig, ax = plt.subplots()
ax.scatter(first_component, second_component, c=train_data['target'])
plt.show()

No pattern is visible in the PCA scatter plot, so we try violin plots next.

In [None]:
import math

## Violin plot for training data
# Every figure is one row of `num_columns` panels, and every panel shows `num_features`
# variables as split violins (one half per target class).
num_features = 5
num_columns = 3
features_per_figure = num_features * num_columns

# All columns except the identifier and the label.
columns = [col for col in train_data.columns if col not in ('ID_code', 'target')]
num_graphs = math.ceil(len(columns) / num_features)
num_rows = math.ceil(num_graphs / num_columns)

for row in range(num_rows):
    first = row * features_per_figure
    # Derive the range from the two settings above and clamp it to the number of
    # variables, so the (usually partial) last figure reports only what it really shows.
    last = min(first + features_per_figure, len(columns)) - 1
    print("plotting for variables %d - %d" % (first, last))

    # squeeze=False keeps `axes` two-dimensional even when num_columns == 1.
    fig, axes = plt.subplots(1, num_columns, figsize=(8 * num_columns, 5), squeeze=False)
    for col in range(num_columns):
        ax = axes[0, col]
        start = first + col * num_features
        curr_cols = columns[start:start + num_features]
        if not curr_cols:
            # Nothing left to draw in this panel: hide it rather than show an empty frame.
            ax.set_visible(False)
            continue
        # Reshape to long format: one row per (target, variable name, value).
        df = train_data[curr_cols + ['target']].melt(
            id_vars=['target'], var_name='Vars', value_name='Values'
        )
        sns.violinplot(x="Vars", y="Values", data=df, hue='target',
                       split=True, inner="quart", ax=ax)
    plt.show()

In [None]:
# Violin plot for train vs test dataset
# Tag every row with the split it came from, so the two splits can be drawn as the two
# halves of a split violin. The label column is removed from the training copy because
# the test frame is not expected to have it.
# NOTE: `math`, `num_features` and `num_columns` are defined by the previous violin-plot cell.
copy_train_data = train_data.drop(columns=['target']).assign(dataset='train')
copy_test_data = test_data.assign(dataset='test')

# All columns except the identifier and the split tag.
columns = [col for col in copy_train_data.columns if col not in ('ID_code', 'dataset')]

features_per_figure = num_features * num_columns
num_graphs = math.ceil(len(columns) / num_features)
num_rows = math.ceil(num_graphs / num_columns)

for row in range(num_rows):
    first = row * features_per_figure
    # Clamp the reported range to the number of variables so the (usually partial) last
    # figure reports only what it really shows.
    last = min(first + features_per_figure, len(columns)) - 1
    print("plotting for variables %d - %d" % (first, last))

    # squeeze=False keeps `axes` two-dimensional even when num_columns == 1.
    fig, axes = plt.subplots(1, num_columns, figsize=(8 * num_columns, 5), squeeze=False)
    for col in range(num_columns):
        ax = axes[0, col]
        start = first + col * num_features
        curr_cols = columns[start:start + num_features]
        if not curr_cols:
            # Nothing left to draw in this panel: hide it rather than show an empty frame.
            ax.set_visible(False)
            continue
        curr_cols = curr_cols + ['dataset']
        train_df = copy_train_data[curr_cols]
        test_df = copy_test_data[curr_cols]
        # DataFrame.append no longer exists in pandas 2.x; pd.concat stacks the same rows.
        df = pd.concat([train_df, test_df])
        # Reshape to long format: one row per (dataset, variable name, value).
        df = df.melt(id_vars=['dataset'], var_name='Vars', value_name='Values')
        sns.violinplot(x="Vars", y="Values", data=df, hue='dataset',
                       split=True, inner="quart", ax=ax)
    plt.show()

The violin plots look nearly identical for the training and test datasets, so we expect a model that performs well on the training set to perform well on the test set too.

In [None]:
## plot the histogram of corr coef across all variables

# this function calculates the correlation coefficient for large dataset
def corr_coef(df):
    """Return the Pearson correlation matrix between the columns of ``df``.

    Each column is standardised with its mean and population standard deviation
    (ddof=0); the result is the cross-product of the standardised data divided by
    the number of rows.

    Parameters
    ----------
    df : pandas.DataFrame or 2-D array-like
        Numeric data, one row per observation and one column per variable.

    Returns
    -------
    pandas.DataFrame or numpy.ndarray
        Square matrix with one row and one column per input column. For DataFrame
        input the result is a DataFrame labelled with the input's column names on
        both axes; for any other input it is a plain ndarray. A constant column
        has zero standard deviation and therefore produces NaN entries.
    """
    # Work on a plain float array: the matrix product must be positional, and how NumPy
    # functions dispatch on DataFrames (label alignment) depends on the pandas version.
    values = np.asarray(df, dtype=float)
    n_rows = values.shape[0]

    scaled = (values - values.mean(axis=0)) / values.std(axis=0)
    corr = (scaled.T @ scaled) / n_rows

    if isinstance(df, pd.DataFrame):
        # Put the labels back so callers can keep using DataFrame methods on the result.
        return pd.DataFrame(corr, index=df.columns, columns=df.columns)
    return corr

def plot_corrcoef(mat):
    """Plot a 100-bin histogram of the entries above the diagonal of a square matrix.

    Only entries ``mat[i, j]`` with ``i < j`` are used, so the diagonal is left out
    and, for a symmetric matrix, every pair of variables is counted once.

    Parameters
    ----------
    mat : numpy.ndarray or 2-D array-like
        Square matrix (e.g. a correlation matrix).
    """
    mat = np.asarray(mat)
    # k=1 starts one step above the main diagonal; the index pairs come back in
    # row-major order, i.e. the same order a nested i < j loop would visit them.
    row_idx, col_idx = np.triu_indices(len(mat), k=1)
    res = mat[row_idx, col_idx]

    plt.hist(res, bins=100)
    plt.show()

In [None]:
## training set
# Correlate the feature columns only (label and identifier removed), then plot the
# distribution of the pairwise coefficients.
train_features = train_data.drop(['target', 'ID_code'], axis='columns')
corr = corr_coef(train_features)
plot_corrcoef(corr.to_numpy())

In [None]:
## test set
# Same analysis for the test features (only the identifier has to be removed here).
test_features = test_data.drop(['ID_code'], axis='columns')
test_corr = corr_coef(test_features)
plot_corrcoef(test_corr.to_numpy())

The variables exhibit little or no linear correlation with each other.