In [None]:
import numpy as np
import pandas as pd
import os

import matplotlib.pyplot as plt 
import seaborn as sns

# Let pandas render up to 1000 columns / rows before it truncates the display.
pd.set_option("display.max_columns", 1000)
pd.set_option("display.max_rows", 1000)


# Notebook-wide settings.
path_submissions = '/'
target_name = 'target'
scores_folds = {}

# Print the full path of every file found under the Kaggle input directory.
for root, _, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# When True, the loading cell reads only a sample of rows.
DEBUG = False

In [None]:
# Row limit handed to read_csv: 10000 rows in DEBUG mode, otherwise None
# (read the whole file).
if DEBUG:
    nrows = 10000
else:
    nrows = None

train = pd.read_csv(
    "/kaggle/input/tabular-playground-series-nov-2021/train.csv",
    nrows=nrows,
)
test = pd.read_csv(
    "/kaggle/input/tabular-playground-series-nov-2021/test.csv",
    nrows=nrows,
)

In [None]:
# Display the summary statistics that DataFrame.describe() computes for the
# training frame (left as the cell's last expression so it is rendered).
train.describe()

In [None]:
# Number of missing values in each training column.
train.isna().sum()

In [None]:
# Feature columns: every column of the test frame except the 'id' column.
# The comparison must be an equality test: `col not in 'id'` is a *substring*
# check on the string 'id', so it would also drop columns named 'i', 'd' or ''
# and would raise TypeError for non-string column labels.
colNames = [col for col in test.columns if col != 'id']

# UMAP / HDBSCAN

In [None]:
# Install hdbscan from a wheel shipped as a notebook input rather than from PyPI.
# 1) Create a local directory for pip to search (-p: no error if it already exists).
!mkdir -p /tmp/pip/cache/
# 2) Copy the prebuilt wheel there (the filename tags suggest CPython 3.7 / linux x86_64).
!cp ../input/hdbscan0827-whl/hdbscan-0.8.27-cp37-cp37m-linux_x86_64.whl /tmp/pip/cache/
# 3) --no-index disables the package index; --find-links points pip at the local directory.
!pip install --no-index --find-links /tmp/pip/cache/ hdbscan

In [None]:
%%time

from sklearn.preprocessing import StandardScaler
import hdbscan
import umap

scaler = StandardScaler()

X = scaler.fit_transform(train[colNames])

reducer = umap.UMAP(random_state=42, n_components=2)
embedding = reducer.fit_transform(X)

clusterer = hdbscan.HDBSCAN(prediction_data=True, min_cluster_size = 250).fit(embedding)

u, counts = np.unique(clusterer.labels_, return_counts=True)

print(u)
print(counts)

# Plot clusters

In [None]:
# 2-D embedding of the training rows, coloured by HDBSCAN cluster label.
fig, ax = plt.subplots(figsize=(10, 8))
ax.scatter(
    embedding[:, 0],
    embedding[:, 1],
    s=5,
    c=clusterer.labels_,
    cmap='jet',
    edgecolors='none',
);

# Plot relation to target

In [None]:
# Same embedding, this time coloured by the training target value.
fig, ax = plt.subplots(figsize=(10, 8))
ax.scatter(
    embedding[:, 0],
    embedding[:, 1],
    s=5,
    c=train["target"],
    cmap='jet',
    edgecolors='none',
);

# Better scaler

In [None]:
from sklearn.preprocessing import RobustScaler

# Re-run the scale -> embed -> cluster pipeline with RobustScaler instead of
# StandardScaler. This rebinds scaler, X, reducer, embedding and clusterer,
# so every later cell works on this second fit.
scaler = RobustScaler()
X = scaler.fit_transform(train[colNames])

reducer = umap.UMAP(n_components=2, random_state=42)
embedding = reducer.fit_transform(X)

# Much smaller minimum cluster size than the first run (10 instead of 250).
clusterer = hdbscan.HDBSCAN(min_cluster_size=10, prediction_data=True)
clusterer.fit(embedding)

# Distinct cluster labels and the number of points assigned to each.
u, counts = np.unique(clusterer.labels_, return_counts=True)
print(u, counts, sep='\n')

In [None]:
# Embedding from the RobustScaler run, coloured by HDBSCAN cluster label.
fig, ax = plt.subplots(figsize=(10, 8))
ax.scatter(
    embedding[:, 0],
    embedding[:, 1],
    s=5,
    c=clusterer.labels_,
    cmap='jet',
    edgecolors='none',
);

In [None]:
# Embedding from the RobustScaler run, coloured by the training target value.
fig, ax = plt.subplots(figsize=(10, 8))
ax.scatter(
    embedding[:, 0],
    embedding[:, 1],
    s=5,
    c=train["target"],
    cmap='jet',
    edgecolors='none',
);

# Count target by cluster

In [None]:
# Contingency table: one row per cluster label, one column per target value,
# each cell holding the number of training rows with that combination.
table_target = pd.crosstab(index=clusterer.labels_, columns=train["target"])

In [None]:
# Per-cluster share of rows falling in the crosstab's second target column
# (count in column position 1 divided by the cluster's total row count).
proba = table_target.iloc[:, 1].div(table_target.sum(axis=1))

# Plot probabilities

In [None]:
# Colour each training point by the share stored in `proba` for its cluster.
# One vectorised label lookup replaces a per-element Python loop over every
# training row; every label in clusterer.labels_ is a row of the crosstab
# `proba` was derived from, so .loc cannot miss.
point_proba = proba.loc[clusterer.labels_].to_numpy()

plt.figure(figsize=(10, 8))
# The colour scale is clipped to [0.25, 0.75] via vmin / vmax.
plt.scatter(embedding[:, 0], embedding[:, 1], s=5, c=point_proba, edgecolors='none', cmap='jet', vmin=0.25, vmax=0.75);

# Use on test

In [None]:
# Push the test features through the already-fitted scaler and UMAP reducer,
# then assign each embedded test point to one of the training clusters.
Xtest = scaler.transform(test[colNames])
embedding_test = reducer.transform(Xtest)
test_labels, strengths = hdbscan.approximate_predict(clusterer, embedding_test)

# Map each predicted label to its cluster share with one vectorised reindex.
# A label absent from the training crosstab (e.g. the noise label -1 when no
# training point was classed as noise) used to raise KeyError in `proba[i]`;
# such rows now fall back to the overall share across all training rows.
fallback_proba = table_target.iloc[:, 1].sum() / table_target.to_numpy().sum()
test_proba = proba.reindex(test_labels).fillna(fallback_proba).to_numpy()

In [None]:
# Embedded test rows, coloured by the cluster label predicted for each.
fig, ax = plt.subplots(figsize=(10, 8))
ax.scatter(
    embedding_test[:, 0],
    embedding_test[:, 1],
    s=5,
    c=test_labels,
    cmap='jet',
    edgecolors='none',
);

# Prediction

In [None]:
# Read the sample submission (same row limit as the data load), overwrite its
# 'target' column with the per-row cluster probabilities, and write the result
# without the index column.
sub = pd.read_csv(
    "../input/tabular-playground-series-nov-2021/sample_submission.csv",
    nrows=nrows,
).assign(target=test_proba)
sub.to_csv("submission.csv", index=False)