# Santander Customer Transaction Prediction


# Loading the data

In [None]:
import math
from pathlib import Path

import numpy as np
import pandas as pd

In [None]:
# Root directory under which the input datasets are looked up.
DATA_DIR = Path('/kaggle/input')

# Walk DATA_DIR recursively and print every path found beneath it.
for filepath in DATA_DIR.glob('**/*'):
    print(filepath)

In [None]:
# Directory of the competition dataset inside DATA_DIR.
data_filepath = DATA_DIR.joinpath('santander-customer-transaction-prediction')

# Read the training set, using the ID_code column as the row index.
train_data = pd.read_csv(
    data_filepath.joinpath('train.csv'),
    index_col='ID_code',
)

In [None]:
# Preview the first rows of the training data (shown as the cell's output).
train_data.head()

In [None]:
# Seed handed to mutual_info_classif as `random_state` further down.
RANDOM_STATE = 24
# Name of the label column in train_data; used for grouping, plot hue and
# for splitting the labels from the features.
TARGET = 'target'

# Check for missing values

In [None]:
# Print pandas' summary of the training DataFrame.
train_data.info()

In [None]:
# utility function
def missing_value_stats(dataframe):
    """Summarise the missing values of each column of ``dataframe``.

    Returns a DataFrame indexed by column name with two columns:
    'missing values' (number of NaN/None entries in that column) and
    'proportion' (that number divided by the number of rows). Rows are
    ordered from the most to the fewest missing values.
    """
    n_rows = len(dataframe)
    missing_counts = dataframe.isna().sum().sort_values(ascending=False)

    stats_df = missing_counts.to_frame(name='missing values')
    stats_df['proportion'] = missing_counts / n_rows
    return stats_df

In [None]:
# Per-column missing-value summary; show only columns that actually have gaps.
train_data_missing = missing_value_stats(train_data)
train_data_missing.loc[train_data_missing['proportion'] > 0]

# Visualizing the distribution

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Default figure size for subsequent plots, plus seaborn's whitegrid theme.
plt.rcParams.update({'figure.figsize': (14, 8)})
sns.set_theme(style='whitegrid')

In [None]:
# Draw one histogram per column of train_data, split by TARGET, on a grid
# that is `cols` axes wide.
columns = train_data.columns
cols = 2
rows = math.ceil(len(columns) / cols)
# squeeze=False keeps `axes` two-dimensional even when the grid has a single
# row, so the [row, col] indexing below always works.
fig, axes = plt.subplots(rows, cols, figsize=(14, 8 // cols * rows),
                         squeeze=False)

for i, col in enumerate(columns):
    ax = axes[i // cols, i % cols]
    sns.histplot(data=train_data, x=col, hue=TARGET, element='step', ax=ax)
    # y=0.88 lowers the title from its default position at the top edge.
    ax.set_title(f'Histogram of {col}', y=0.88)

# Hide the grid cells left over when len(columns) is not a multiple of cols.
for unused_ax in axes.flat[len(columns):]:
    unused_ax.set_visible(False)

# Compute the layout only after everything is drawn, so that the axis labels
# and legends added by the plotting calls are taken into account.
fig.tight_layout()
plt.show()

# One-way ANOVA

In [None]:
from scipy import stats

In [None]:
# One-way ANOVA of every feature against the classes found in TARGET.
# Select the features by name rather than by position, so the result does not
# depend on TARGET happening to be the first column of train_data.
features = train_data.columns.drop(TARGET)

# The class labels and the grouping are identical for every feature, so they
# are computed once instead of once per loop iteration.
categories = train_data[TARGET].dropna().unique()
grouped = train_data.groupby(TARGET)

anova_results = []
for col in features:
    groupby = grouped[col]
    # one sample of feature values per class
    anova_data = [groupby.get_group(category) for category in categories]

    F, p = stats.f_oneway(*anova_data)
    anova_results.append([col, F, p])

# Collect the results in a table indexed by feature, smallest p-value first.
columns = ['feature', 'F-statistic', 'p-value']
anova_df = pd.DataFrame(anova_results, columns=columns)
anova_df = anova_df.sort_values('p-value').set_index('feature')
anova_df

In [None]:
# Keep the features whose ANOVA p-value falls below the significance level.
threshold = 0.01
significant_anova = anova_df.loc[anova_df['p-value'].lt(threshold)]

n_significant = len(significant_anova)
n_insignificant = len(anova_df) - n_significant

print("Features with significant ANOVA p-value: {}".format(n_significant))
print("Features with insignificant ANOVA p-value: {}".format(n_insignificant))

In [None]:
# Show both ends of the table of significant features.
display(significant_anova.head(), significant_anova.tail())

# Mutual information

In [None]:
from sklearn.feature_selection import mutual_info_classif

In [None]:
# Split the training data into the feature matrix X and the label vector y,
# leaving train_data itself untouched.
X = train_data.drop(columns=[TARGET])
y = train_data[TARGET].copy()

In [None]:
# Mutual information between each feature and the target, highest first.
mi_scores = mutual_info_classif(X, y, random_state=RANDOM_STATE)
mutual_info = (
    pd.Series(mi_scores, index=X.columns, name='mutual_info')
    .sort_values(ascending=False)
)
# Features whose score is not exactly zero.
nonzero_mutual_info = mutual_info.loc[mutual_info.ne(0)]

n_nonzero = len(nonzero_mutual_info)
n_zero = len(mutual_info) - n_nonzero

print('Number of features with non-zero MI score: {}'.format(n_nonzero))
print('Number of features with zero MI score: {}'.format(n_zero))

In [None]:
# Print the five highest and five lowest non-zero MI scores.
top_scores = nonzero_mutual_info.head()
bottom_scores = nonzero_mutual_info.tail()
print(f'Top 5: \n{top_scores}\n')
print(f'Bottom 5: \n{bottom_scores}\n')

In [None]:
# Horizontal bar chart of the 25 highest mutual information scores.
data = nonzero_mutual_info.iloc[:25]
fig, ax = plt.subplots()

sns.barplot(x=data.to_numpy(), y=data.index, ax=ax)
ax.set_title('Mutual information scores')
plt.show()

In [None]:
# Names of the 25 features with the highest mutual information score.
nonzero_mutual_info.head(25).index.tolist()

In [None]:
# Compare the two feature selections as sets of feature names.
anova_features = set(significant_anova.index)
mi_features = set(nonzero_mutual_info.index)

# Selected by ANOVA only.
list_a = list(anova_features - mi_features)
print(f'Features with significant ANOVA p-value but zero mutual info: {len(list_a)}')

# Selected by mutual information only.
list_b = list(mi_features - anova_features)
print(f'Features with non-zero mutual info but insignificant ANOVA p-value: {len(list_b)}')

In [None]:
# Bar chart of the MI scores of the features in list_b, highest first.
data = nonzero_mutual_info.loc[list_b].sort_values(ascending=False)
fig, ax = plt.subplots()

sns.barplot(x=data.to_numpy(), y=data.index, ax=ax)
ax.set_title('Mutual information scores')
plt.show()