# About this notebook

This is a Prediction competition, so, unlike in Code competitions, we have access to all of the test data. It is therefore important to analyse the public and private sets and spot any drift in variable behaviour and distributions between them.

This has been discussed in many forum topics; here I'd like to provide a simple code snippet to plot these two subsets.

Many thanks to [@raddar](https://www.kaggle.com/raddar) for publishing such a great [dataset](https://www.kaggle.com/datasets/raddar/amex-data-integer-dtypes-parquet-format).

# Imports/Read

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

In [None]:
# Load the test set from the parquet dataset.
test_path = '../input/amex-data-integer-dtypes-parquet-format/test.parquet'
df = pd.read_parquet(test_path)

# Parse S_2 as datetimes and pin the column to nanosecond resolution so it can
# be compared against date strings later on.
s2_parsed = pd.to_datetime(df['S_2'])
df['S_2'] = s2_parsed.astype('datetime64[ns]')

print(df.shape)

In [None]:
# Keep only the last row of every customer_ID group, in the frame's current
# row order.
# NOTE(review): presumably this is each customer's most recent statement, which
# assumes the file is already ordered by S_2 within a customer - confirm.
last_rows = df.groupby('customer_ID').tail(1)
df = last_rows.reset_index(drop=True)

print(df.shape)
display(df.head())

# Dates

In [None]:
# Distribution of the S_2 dates left after the per-customer reduction above.
df['S_2'].hist(figsize=(12, 6));

# Split Public x Private

In [None]:
# Split the rows on the S_2 date: strictly before the cutoff -> `public`,
# on or after the cutoff -> `private`.
split_date = pd.Timestamp('2019-07-01')

# reset_index(drop=True) discards the old row labels instead of materialising
# them as a stray 'index' column in the subset.
public = df[df['S_2'] < split_date].reset_index(drop=True)
public = public.drop('S_2', axis=1)
print(public.shape)

# `>=` (not `>`) so rows dated exactly on the cutoff are kept; with the strict
# `<` / `>` pair they would silently fall out of both subsets.
private = df[df['S_2'] >= split_date].reset_index(drop=True)
private = private.drop('S_2', axis=1)
print(private.shape)

# Histograms

In [None]:
# Every column except the customer identifier and the S_2 date is plotted.
excluded = {'customer_ID', 'S_2'}
cols = [name for name in df.columns if name not in excluded]
len(cols)

In [None]:
# Window of `cols` to draw; change begin/end to page through the features.
begin = 100
end = 120

selected = cols[begin:end]
n_vars_to_plot = len(selected)

# One row per variable: left panel = public subset, right panel = private.
# squeeze=False keeps `axes` two-dimensional even when the window holds a
# single variable, so the [row, column] indexing below always works.
fig, axes = plt.subplots(
    n_vars_to_plot, 2,
    figsize=(14, n_vars_to_plot * 3),
    squeeze=False,
)

for row, var in enumerate(tqdm(selected)):

  axes[row, 0].set_title(f'Public: {var}', color='blue')
  sns.histplot(data=public, x=var, ax=axes[row, 0], bins = 20)

  axes[row, 1].set_title(f'Private: {var}', color='red')
  sns.histplot(data=private, x=var, ax=axes[row, 1], bins = 20)

plt.tight_layout()
plt.show()

# Categorical features

In [None]:
# Features plotted in the "Categorical features" section below.
cat_cols = [
    'B_30', 'B_38',
    'D_114', 'D_116', 'D_117', 'D_120', 'D_126',
    'D_63', 'D_64', 'D_66', 'D_68',
]
len(cat_cols)

In [None]:
# Window of `cat_cols` to draw; by default all categorical features.
begin = 0
end = len(cat_cols)

selected = cat_cols[begin:end]
n_vars_to_plot = len(selected)

# One row per variable: left panel = public subset, right panel = private.
# squeeze=False keeps `axes` two-dimensional even when the window holds a
# single variable, so the [row, column] indexing below always works.
fig, axes = plt.subplots(
    n_vars_to_plot, 2,
    figsize=(14, n_vars_to_plot * 3),
    squeeze=False,
)

for row, var in enumerate(tqdm(selected)):

  axes[row, 0].set_title(f'Public: {var}', color='blue')
  sns.histplot(data=public, x=var, ax=axes[row, 0], bins = 20)

  axes[row, 1].set_title(f'Private: {var}', color='red')
  sns.histplot(data=private, x=var, ax=axes[row, 1], bins = 20)

plt.tight_layout()
plt.show()

# NaN

In [None]:
# Percentage of missing values per feature in each subset, computed for all
# columns at once instead of filling the table cell-by-cell with .loc inside a
# Python loop. `.loc[cols]` puts the result in the same order as `cols`.
public_na_pct = 100 * public.isna().sum().loc[cols] / len(public)
private_na_pct = 100 * private.isna().sum().loc[cols] / len(private)

na = pd.DataFrame({
    'features': cols,
    'public': public_na_pct.to_numpy(),
    'private': private_na_pct.to_numpy(),
})

# Long format: one row per (feature, subset) pair. melt() stores the subset
# name in a 'variable' column and the percentage in a 'value' column.
na = na.melt(id_vars=['features'])
na

In [None]:
# Window of `cols` to draw; change begin/end to page through the features.
begin = 100
end = 120

selected = cols[begin:end]
n_vars_to_plot = len(selected)

# One panel per feature. squeeze=False keeps `axes` a 2-D array even when a
# single feature is selected (otherwise plt.subplots returns a bare Axes and
# indexing it fails).
fig, axes = plt.subplots(
    n_vars_to_plot, 1,
    figsize=(14, n_vars_to_plot * 3),
    squeeze=False,
)

for row, feature in enumerate(tqdm(selected)):

  ax = axes[row, 0]

  # Rows of the long-format NaN table that belong to this feature.
  feature_na = na[na.features == feature]

  ax.set_title(feature, color='blue')
  sns.pointplot(data=feature_na, x='variable', y='value', ax=ax)
  ax.set_xlabel('')
  # Fixed y-range with a small margin around 0-100 so panels are comparable.
  ax.set(ylim=(-1, 101))
  ax.set_ylabel('Percentage of NaN')

plt.tight_layout()
plt.show()