In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib import style
style.use("fivethirtyeight")

In [None]:
# Work on a fixed-size random subset of the competition file and discard the
# `row_id` identifier column. The sample is seeded so that every
# "Restart & Run All" analyses exactly the same rows.
data = (
    pd.read_csv("../input/tabular-playground-series-jun-2022/data.csv")
    .sample(axis=0, n=100000, random_state=11)
    .drop("row_id", axis=1)
)

In [None]:
# (rows, columns) of the sampled frame after `row_id` was dropped.
data.shape

In [None]:
# Peek at the first rows of the sample.
data.head()

In [None]:
# Per-column dtype and non-null counts; the dtypes matter below because
# int64 columns are treated differently from the rest.
data.info()

In [None]:
def plot_distributions(data, ncols=5):
    """Draw one distribution panel per column of ``data`` in a grid.

    Columns whose dtype is ``int64`` are drawn as count plots (one bar per
    distinct value); every other column is drawn as a histogram.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose columns are plotted, in column order.
    ncols : int, default 5
        Number of panels per row. The number of rows is derived from the
        column count, so frames of any width can be plotted.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    n_features = data.shape[1]
    if n_features == 0:
        # Nothing to draw; avoid asking matplotlib for a zero-row grid.
        return

    # Ceiling division: enough rows to hold every column.
    nrows = -(-n_features // ncols)
    # 4 x 7 inches per panel reproduces the original 20 x 112 figure for a
    # 16 x 5 grid. squeeze=False keeps `axes` two-dimensional even when the
    # grid has a single row or column.
    fig, axes = plt.subplots(
        nrows=nrows, ncols=ncols, figsize=(4 * ncols, 7 * nrows), squeeze=False
    )
    panels = axes.ravel()

    for ind in range(n_features):
        ax = panels[ind]
        # Positional access so duplicate column labels still yield a Series.
        col = data.iloc[:, ind]
        if col.dtype == "int64":
            sns.countplot(x=col, ax=ax, palette="GnBu")
        else:
            sns.histplot(x=col, ax=ax, color="mediumturquoise")
        ax.set_title(data.columns[ind])
        ax.set_xlabel(None)
        ax.set_ylabel(None)

    # Hide the panels of the last row that have no column to show.
    for ax in panels[n_features:]:
        ax.set_visible(False)

    plt.show()

In [None]:
# Distribution of every column of the sample before any rescaling.
plot_distributions(data)

In [None]:
# Split the column names by dtype: int64 columns are handled as
# "categoricals", everything else as "numericals". Column order is kept.
is_int64 = {name: data[name].dtype == "int64" for name in data.columns}
categoricals = [name for name, flag in is_int64.items() if flag]
numericals = [name for name, flag in is_int64.items() if not flag]

In [None]:
def scale(cats, nums, data, lower_q=0.1, upper_q=0.9):
    """Shift and rescale features so they share a common centre and spread.

    Every column in ``cats`` is shifted so that its median lands on the
    largest median found among the ``cats`` columns, then cast to ``int64``.
    Every column in ``nums`` is centred on its own median, divided by its own
    ``lower_q`` -> ``upper_q`` quantile range, multiplied by the average
    quantile range of the ``cats`` columns and shifted onto that same median.

    Parameters
    ----------
    cats : sequence of str
        Names of the columns treated as categorical. Must not be empty,
        because the shared median and spread are derived from them.
    nums : sequence of str
        Names of the columns treated as numerical.
    data : pandas.DataFrame
        Source frame; it is not modified.
    lower_q, upper_q : float, default 0.1 and 0.9
        Quantiles whose difference is used as the spread measure.

    Returns
    -------
    scaled_data : pandas.DataFrame
        Transformed columns (``cats`` first, then ``nums``) on ``data``'s index.
    cat_medians : dict
        Median of each categorical column, keyed by column name.
    cat_extended_iqr : list
        Quantile range of each categorical column, in ``cats`` order.
    max_med : float
        Largest categorical median (the shared centre).
    av_extended_iqr : float
        Mean categorical quantile range (the shared spread).

    Raises
    ------
    ValueError
        If ``cats`` is empty.
    """
    if len(cats) == 0:
        raise ValueError(
            "scale() needs at least one categorical column to derive the "
            "shared median and spread"
        )

    def _spread(column):
        # Quantile range used as a robust measure of spread.
        return column.quantile(upper_q) - column.quantile(lower_q)

    # Each median is computed once and reused for the shift below.
    cat_medians = {c: data[c].median() for c in cats}
    cat_extended_iqr = [_spread(data[c]) for c in cats]
    max_med = max(cat_medians.values())
    av_extended_iqr = sum(cat_extended_iqr) / len(cat_extended_iqr)

    scaled_data = pd.DataFrame(index=data.index)
    for c in cats:
        # Pure shift: the shape of the distribution is left untouched.
        scaled_data[c] = (data[c] - cat_medians[c] + max_med).astype("int64")
    for c in nums:
        spread = _spread(data[c])
        if spread == 0:
            # A column with no spread between the two quantiles cannot be
            # normalised; dividing by zero would yield inf/NaN. Fall back to a
            # unit divisor so the column is only centred and rescaled.
            spread = 1.0
        scaled_data[c] = av_extended_iqr * (data[c] - data[c].median()) / spread + max_med
    return scaled_data, cat_medians, cat_extended_iqr, max_med, av_extended_iqr

In [None]:
# Rescale the sample; the extra return values (per-column medians and
# spreads, plus the shared centre and spread) are inspected in the next cells.
scaled_data, cat_medians, cat_extended_iqr, max_med, av_extended_iqr = scale(categoricals, numericals, data)

In [None]:
# How often each median value occurs among the categorical features.
median_values = list(cat_medians.values())
medians_ax = sns.countplot(x=median_values, palette="GnBu")
medians_ax.set_title("Distribution of Categorical Medians")
plt.show()

In [None]:
# How often each 10%->90% quantile range occurs among the categorical features.
spread_values = list(cat_extended_iqr)
spreads_ax = sns.countplot(x=spread_values, palette="GnBu")
spreads_ax.set_title("Distribution of 10%->90% IQR for Categorical Features")
plt.show()

In [None]:
# Report the shared centre and spread that scale() applied to every feature.
summary_lines = (
    f"Maximum Categorical Median: {max_med}",
    f"Average 10%->90% IQR for Categorical Features: {av_extended_iqr}",
)
print("\n".join(summary_lines))

In [None]:
# Same grid as before, now on the rescaled columns, for a visual comparison.
plot_distributions(scaled_data)

In [None]:
from umap import UMAP

In [None]:
# Seeded so the embedding is repeatable between runs; n_neighbors sets the
# size of the local neighbourhood UMAP uses when building its graph.
reducer = UMAP(random_state=11, n_neighbors=25)

In [None]:
# Rows containing any NaN are dropped before fitting, so `embedding` can have
# fewer rows than `scaled_data`.
embedding = reducer.fit_transform(scaled_data.dropna())

In [None]:
# Scatter the first two embedding coordinates. The figure carries a title and
# axis labels so it can be read on its own when the notebook is skimmed.
fig, ax = plt.subplots(figsize=(20, 12))
sns.scatterplot(x=embedding[:, 0], y=embedding[:, 1], color="crimson", ax=ax)
ax.set_title("UMAP Embedding of Scaled Features")
ax.set_xlabel("UMAP component 1")
ax.set_ylabel("UMAP component 2")
plt.show()