In [None]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler

# Global plotting / display configuration for the notebook.
# Render negative tick labels with an ASCII hyphen instead of the Unicode minus.
plt.rcParams['axes.unicode_minus'] = False
# The bare 'seaborn' style sheet was deprecated in matplotlib 3.6 (renamed to
# 'seaborn-v0_8') and is unavailable in newer releases; use whichever name the
# installed matplotlib actually provides so the cell runs on old and new versions.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')
sns.set(font_scale=1)
# Show every column when a DataFrame is displayed (no "..." truncation).
pd.set_option('display.max_columns', None)

In [None]:
# Read the train and test CSVs (in that order), using the `id` column as the
# row index of each frame.
train, test = (
    pd.read_csv(csv_path, index_col='id')
    for csv_path in (
        '../input/tabular-playground-series-nov-2021/train.csv',
        '../input/tabular-playground-series-nov-2021/test.csv',
    )
)

In [None]:
def distributionGraph(train, test, cols=5, bins=40, figsize=(16, 65)):
    """Draw overlaid train/test histograms for every column of ``test``.

    One subplot is drawn per feature on a grid ``cols`` wide. Both histograms
    of a feature use the same range (the min/max of the combined train + test
    values) and the same number of bins, so the two distributions can be
    compared directly. The figure is displayed with ``plt.show()``.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Data sets to compare. The plotted features are ``test.columns``;
        ``train`` must contain each of them (other columns of ``train`` are
        not plotted).
    cols : int, default 5
        Number of subplot columns in the grid.
    bins : int, default 40
        Number of histogram bins per feature.
    figsize : tuple, default (16, 65)
        Overall figure size passed to ``plt.subplots``.

    Returns
    -------
    None
    """
    feature_names = test.columns.values
    n_features = len(feature_names)
    if n_features == 0:
        # Nothing to draw; a zero-row subplot grid cannot be created.
        return

    # Ceiling division: exactly as many rows as needed. (`n // cols + 1` would
    # add a fully hidden extra row whenever n is a multiple of `cols`.)
    rows = -(-n_features // cols)

    # squeeze=False keeps `axs` two-dimensional even when the grid has a single
    # row or column, so the flat iteration below always works.
    fig, axs = plt.subplots(ncols=cols, nrows=rows, figsize=figsize,
                            sharex=False, squeeze=False)
    fig.subplots_adjust(hspace=0.3)

    for idx, ax in enumerate(axs.flat):
        if idx >= n_features:
            # Unused trailing cell of the grid.
            ax.set_visible(False)
            continue

        name = feature_names[idx]

        # Common histogram range over both data sets, computed once per feature.
        combined = pd.concat([train[name], test[name]], axis=0)
        hist_kwargs = dict(range=(combined.min(), combined.max()),
                           bins=bins,
                           edgecolor="black",
                           alpha=0.7)

        ax.hist(train[name].values,
                color="deepskyblue",
                label="Train Dataset",
                **hist_kwargs)
        ax.hist(test[name].values,
                color="palevioletred",
                label="Test Dataset",
                **hist_kwargs)

        ax.set_title(name, fontsize=12, pad=5)

        # Pin the tick positions before relabelling them, then show counts in
        # thousands. `:g` keeps fractional values (2500 -> "2.5k") instead of
        # truncating them to a misleading integer.
        ticks = ax.get_yticks()
        ax.set_yticks(ticks)
        ax.set_yticklabels(["{:g}k".format(tick / 1000) for tick in ticks])

        ax.tick_params(axis="y", labelsize=10)
        ax.tick_params(axis="x", labelsize=10)
        ax.grid(axis="y")

        # A single legend on the first subplot is enough; colours are shared.
        if idx == 0:
            ax.legend(fontsize=10)

    plt.show()
    

# Feature distributions before scaling

In [None]:
# Plot the train vs. test histograms of every feature.
distributionGraph(train=train, test=test)

# Standard scaling (fit on train, applied to train and test)

In [None]:
# Standardise every feature column of `train` and `test` in place.
# The scaler is fitted ONCE on all training features and then applied to both
# splits. StandardScaler standardises each column independently, so the result
# matches a per-column fit, but `sscaler` now retains the statistics of every
# feature (a scaler re-fitted inside a per-column loop would end up holding
# only the last column's mean/variance and could not be reused or inverted).
FEATURES = test.columns
sscaler = StandardScaler()

train[FEATURES] = sscaler.fit_transform(train[FEATURES])
test[FEATURES] = sscaler.transform(test[FEATURES])

# Feature distributions after scaling

In [None]:
# Plot the train vs. test histograms of every feature again, using the current
# contents of `train` and `test`.
distributionGraph(train=train, test=test)