# UMAP / HDBSCAN exploration of the data

In [None]:
# Install hdbscan from a wheel shipped as an input dataset: the wheel is copied
# into a local directory and pip is pointed at that directory with --no-index /
# --find-links, so no package index is contacted.
!mkdir -p /tmp/pip/cache/
!cp ../input/hdbscan0827-whl/hdbscan-0.8.27-cp37-cp37m-linux_x86_64.whl /tmp/pip/cache/
!pip install --no-index --find-links /tmp/pip/cache/ hdbscan

In [None]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
import umap
import hdbscan
import matplotlib.pyplot as plt

# Fast-iteration switch: when True, the train and test frames are subsampled to
# 20% of their rows and HDBSCAN uses min_cluster_size=200 instead of 1000.
DEBUG = True

In [None]:
%%time

# Load the training data and keep a single row per customer: the last
# occurrence of each customer_ID in file order.
train_data = pd.read_parquet('../input/amex-data-integer-dtypes-parquet-format/train.parquet')
train_data = train_data.drop_duplicates(subset=['customer_ID'], keep='last')

if DEBUG:
    # Fixed seed so the 20% subsample (and everything derived from it: scaling,
    # embedding, clusters) is reproducible after a kernel restart.
    train_data = train_data.sample(frac=0.2, random_state=42)

# Reference date for the S_2 conversion. It is computed BEFORE S_2 is
# overwritten and kept in `date_min` so later cells can reuse the same origin.
date_min = pd.to_datetime(train_data['S_2']).min()

# Re-express S_2 as the number of whole days elapsed since `date_min`.
train_data['S_2'] = (pd.to_datetime(train_data['S_2']) - date_min).dt.days

In [None]:
%%time

# Load the test data and keep a single row per customer: the last occurrence
# of each customer_ID in file order.
test_data = pd.read_parquet('../input/amex-data-integer-dtypes-parquet-format/test.parquet')
test_data = test_data.drop_duplicates(subset=['customer_ID'], keep='last')

if DEBUG:
    # Fixed seed so the 20% subsample is reproducible after a kernel restart.
    test_data = test_data.sample(frac=0.2, random_state=42)

# Reuse the reference date `date_min` defined by the train-loading cell.
# It must NOT be recomputed from train_data.S_2 here: at this point that column
# already holds integer day offsets, so its minimum passed to pd.to_datetime
# would be read as an offset from the Unix epoch, putting the test S_2 values
# on a completely different scale from the train ones.
test_data['S_2'] = (pd.to_datetime(test_data['S_2']) - pd.to_datetime(date_min)).dt.days

In [None]:
# Split the columns by dtype: object columns are set aside in `col_cat`,
# everything else is used as a feature in `col_nums`.
is_object_col = (train_data.dtypes == 'object').to_numpy()

# The label must never be part of the features: it would leak into the
# embedding, and selecting it from a frame without that column raises a
# KeyError. The mask is a no-op when no column is named 'target'.
is_label_col = (train_data.columns == 'target')

col_nums = train_data.columns[~is_object_col & ~is_label_col]
col_cat = list(train_data.columns[is_object_col])

In [None]:
%%time

def _to_scaled_frame(values):
    """Wrap scaled values in a DataFrame labelled with `col_nums`, replacing NaN with 0."""
    return pd.DataFrame(values, columns=col_nums).fillna(0)


# The scaler is fitted on the train features only; the same fitted scaler is
# then applied to the test features.
scaler = StandardScaler()
train_scaled = _to_scaled_frame(scaler.fit_transform(train_data[col_nums]))
test_scaled = _to_scaled_frame(scaler.transform(test_data[col_nums]))

In [None]:
%%time

# Fit UMAP on the scaled train features (seeded for reproducibility) and read
# the train coordinates from the fitted model.
reducer = umap.UMAP(random_state=42)
reducer.fit(train_scaled)
embedding_train = reducer.embedding_

# Project the test features into the space learned from train.
embedding_test = reducer.transform(test_scaled)

In [None]:
%%time

# Smaller clusters are allowed in DEBUG mode, where only a subsample is used.
min_size = 200 if DEBUG else 1000

# Train and test embeddings are clustered independently, so the label ids of
# the two clusterings are unrelated to each other.
clusterer_train = hdbscan.HDBSCAN(min_cluster_size=min_size, prediction_data=True)
clusterer_train.fit(embedding_train)
u_train, counts_train = np.unique(clusterer_train.labels_, return_counts=True)

clusterer_test = hdbscan.HDBSCAN(min_cluster_size=min_size, prediction_data=True)
clusterer_test.fit(embedding_test)
u_test, counts_test = np.unique(clusterer_test.labels_, return_counts=True)

# Label ids and their sizes: train first, then test.
for summary in (u_train, counts_train, u_test, counts_test):
    print(summary)

In [None]:
# Train embedding, coloured by the HDBSCAN label of each point.
fig, ax = plt.subplots(figsize=(10, 8))
ax.scatter(
    embedding_train[:, 0],
    embedding_train[:, 1],
    c=clusterer_train.labels_,
    s=5,
    edgecolors='none',
    cmap='jet',
);

In [None]:
# Test embedding, coloured by the HDBSCAN label of each point.
fig, ax = plt.subplots(figsize=(10, 8))
ax.scatter(
    embedding_test[:, 0],
    embedding_test[:, 1],
    c=clusterer_test.labels_,
    s=5,
    edgecolors='none',
    cmap='jet',
);

The colors differ between the two plots (HDBSCAN is fitted separately on train and test, so the cluster labels come out in a different order), but the data distribution does not seem to change much between train and test.

In [None]:
# Train embedding, coloured by the target value of each point.
fig, ax = plt.subplots(figsize=(10, 8))
points = ax.scatter(
    embedding_train[:, 0],
    embedding_train[:, 1],
    c=train_data['target'],
    s=5,
    edgecolors='none',
    cmap='jet',
)
fig.colorbar(points, ax=ax);

Default rate by cluster

In [None]:
cluster_labels = clusterer_train.labels_

# Mean target (default rate) per HDBSCAN label. The labels and the target are
# paired by position, so the target is passed as a plain array.
DR_by_cluster = (
    pd.DataFrame({'cluster': cluster_labels, 'target': train_data['target'].to_numpy()})
    .groupby('cluster')
    .mean()
)

# Default rate of the cluster each point belongs to. A vectorised lookup
# replaces one `.loc` call per row, which is very slow on large frames.
DR_map = pd.Series(cluster_labels).map(DR_by_cluster['target']).to_numpy()

plt.figure(figsize=(10, 8))
plt.scatter(embedding_train[:, 0], embedding_train[:, 1], s=5, c=DR_map, edgecolors='none', cmap='jet');
plt.colorbar();

# Numerical

Main feature negatively correlated with default (`P_2`):

In [None]:
# Train embedding, coloured by the value of P_2.
# TODO: find a better colouring for integer-valued features.
fig, ax = plt.subplots(figsize=(10, 8))
points = ax.scatter(
    embedding_train[:, 0],
    embedding_train[:, 1],
    c=train_data['P_2'],
    s=5,
    edgecolors='none',
    cmap='jet',
)
fig.colorbar(points, ax=ax)
plt.show()

Test embedding colored by the last date (`S_2`) of each customer: the dates are spread relatively uniformly over the embedding.

In [None]:
# Test embedding, coloured by the value of S_2.
# TODO: find a better colouring for integer-valued features.
fig, ax = plt.subplots(figsize=(10, 8))
points = ax.scatter(
    embedding_test[:, 0],
    embedding_test[:, 1],
    c=test_data['S_2'],
    s=5,
    edgecolors='none',
    cmap='jet',
)
fig.colorbar(points, ax=ax)
plt.show()

# Categoricals

In [None]:
# Columns plotted as categorical features below. This replaces any earlier
# value of `col_cat`.
col_cat = [
    'D_63', 'D_64', 'D_66', 'D_68',
    'B_30', 'B_38',
    'D_114', 'D_116', 'D_117', 'D_120', 'D_126',
]

In [None]:
# One scatter plot of the train embedding per categorical column, coloured by
# the integer code of each point's category.
for c in col_cat:
    print(c)

    # Derive the codes without touching train_data. Assigning to
    # `.cat.categories` mutates the frame, is not supported by recent pandas
    # versions, and fails when the column is not of category dtype.
    codes = pd.Categorical(train_data[c]).codes.astype(float)
    # Missing values are encoded as -1; turn them into NaN so they are not
    # coloured as if they were a real category.
    codes[codes < 0] = np.nan

    plt.figure(figsize=(10, 8))
    plt.scatter(embedding_train[:, 0], embedding_train[:, 1], s=5, c=codes, edgecolors='none', cmap='jet')
    plt.colorbar()
    plt.show()