# Setup and imports

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_files = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_file in input_files:
    print(input_file)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Echo the value of every top-level expression in a cell, not only the last one,
# so several bare expressions (e.g. successive `.shape` lookups) can share a cell.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all' # IPython's default is 'last_expr'

import math, time, datetime, os, pathlib
import matplotlib.pyplot as plt

In [None]:
# Training data, indexed by the `id` column.
df_train = pd.read_csv(
    '/kaggle/input/tabular-playground-series-jan-2021/train.csv',
    index_col='id',
)
# Discard every row whose target is below 2 (the author's "obvious outlier" filter).
outlier_ids = df_train.index[df_train['target'] < 2]
df_train = df_train.drop(index=outlier_ids)

# Keep the label in its own single-column frame ...
df_target = df_train.loc[:, ['target']]
df_target.shape

# ... and leave only the feature columns in df_train.
df_train = df_train.drop(columns='target')
df_train.shape

# Test data, also indexed by `id`.
df_test = pd.read_csv(
    '/kaggle/input/tabular-playground-series-jan-2021/test.csv',
    index_col='id',
)
df_test.shape

## Helper functions

In [None]:
def plots_for_dfs(plot_func_to_dfs_list, stack_count = 1) -> None:
    """Draw a grid of subplots with one column per plot spec and one cell per feature.

    Parameters
    ----------
    plot_func_to_dfs_list : sequence of tuple
        Each entry is ``(plot_func, dfs)`` or ``(plot_func, dfs, labels)``.
        ``plot_func(ax, dfs, col_name)`` draws one feature on ``ax``; ``dfs`` is a
        list of DataFrames whose first element supplies the feature columns;
        ``labels`` is an optional list of strings prefixed to each title and, for
        two-frame specs, used as the x/y axis label prefixes.
    stack_count : int, default 1
        How many features share one physical row of the figure. Values above 1
        repeat the block of spec columns ``stack_count`` times side by side, so
        the figure becomes wider and shorter.

    Notes
    -----
    Prints the grid size as ``"<rows> x <cols>"``. Returns without drawing when
    the spec list is empty or no spec has any column. Cells that receive no
    plot are hidden instead of being left as empty frames.
    """
    if not plot_func_to_dfs_list:
        return

    col_count_with_1_stack = len(plot_func_to_dfs_list)

    # Only the first frame of each spec is iterated below, so it alone decides
    # how many cells that spec needs.
    max_features = max(len(spec[1][0].columns) for spec in plot_func_to_dfs_list)
    if max_features == 0:
        return

    col_count = col_count_with_1_stack * stack_count
    row_count = math.ceil(max_features / stack_count)

    draw_size_factor = 5
    figsize = (draw_size_factor * col_count, draw_size_factor * row_count)
    print(f'{row_count} x {col_count}')

    # squeeze=False always yields a 2-D array, so 1x1 and single-row/column
    # grids can be reshaped like any other.
    _, axs = plt.subplots(row_count, col_count, figsize=figsize,
                          layout='constrained', squeeze=False)
    # One logical row per feature, one logical column per spec.
    axs = axs.reshape(-1, col_count_with_1_stack)

    for i, func_and_dfs in enumerate(plot_func_to_dfs_list):
        plot_func = func_and_dfs[0]
        dfs = func_and_dfs[1]
        labels = func_and_dfs[2] if len(func_and_dfs) == 3 else None
        feature_names = dfs[0].columns

        for col_i, col_name in enumerate(feature_names):
            ax = axs[col_i, i]
            plot_func(ax, dfs, col_name)

            if labels is None:
                ax.set_title(col_name)
            else:
                ax.set_title(f"{', '.join(labels)} - {col_name}")

            # A legend already identifies the data, so axis labels are only
            # added when the plot function did not create one.
            if not ax.get_legend():
                if len(dfs) == 1:
                    ax.set_ylabel(col_name)
                elif len(dfs) == 2 and labels is not None:
                    ax.set_xlabel(f'{labels[0]} - {col_name}')
                    ax.set_ylabel(f'{labels[1]} - {col_name}')

        # Hide the cells of this column that received no plot.
        for unused_ax in axs[len(feature_names):, i]:
            unused_ax.set_visible(False)

# Plot the data

In [None]:
def hist(ax, dfs, col_name):
    """Overlay a horizontal, density-normalised histogram of one column per frame.

    Parameters
    ----------
    ax : object with ``hist`` and ``legend`` methods (a matplotlib Axes)
        Target the histograms are drawn on.
    dfs : iterable of mapping-like frames
        Each must support ``df[col_name]``; the histograms are drawn in this order.
    col_name : hashable
        Column to plot from every frame.

    The legend is fixed to ``['train', 'test']`` and is created once, after all
    histograms are drawn; nothing is added when ``dfs`` is empty.
    """
    drew_any = False
    for df in dfs:
        # alpha < 1 keeps the overlapping distributions visible through each other
        ax.hist(df[col_name], density=True, orientation='horizontal', bins=50, alpha = 0.5)
        drew_any = True
    if drew_any:
        ax.legend(['train', 'test'])
# Marker size passed as `s=` by the scatter helpers defined in this cell.
s = 0.5
def scatter_x_order_y_df(ax, dfs, col_name):
    """Scatter one column of the first frame against its row position.

    x is 0 .. len(dfs[0]) - 1, y is ``dfs[0][col_name]``; the marker size comes
    from the module-level ``s``.
    """
    frame = dfs[0]
    row_positions = [*range(len(frame))]
    ax.scatter(row_positions, frame[col_name], s=s)
    ax.set_xlabel('range(len(df))')

def scatter_x_df1_y_df2(ax, dfs, col_name):
    """Scatter the same column of two frames against each other.

    x is ``dfs[0][col_name]``, y is ``dfs[1][col_name]``; the marker size comes
    from the module-level ``s``. Returns whatever ``ax.scatter`` returns.
    """
    return ax.scatter(dfs[0][col_name], dfs[1][col_name], s=s)


def scatter_x_df_y_df_target(ax, dfs, col_name):
    """Scatter a column of the first frame against the second frame's first column.

    x is ``dfs[0][col_name]``, y is ``dfs[1].iloc[:, 0]``; the marker size comes
    from the module-level ``s``. Returns whatever ``ax.scatter`` returns.
    """
    return ax.scatter(dfs[0][col_name], dfs[1].iloc[:, 0], s=s)

# Thin out the frames used for plotting: keep only every `selector_step`-th row.
selector_step = 10
every_nth_row = slice(None, None, selector_step)
df_train_plot, df_test_plot, df_target_plot = (
    frame.iloc[every_nth_row] for frame in (df_train, df_test, df_target)
)

# Trim the train sample to at most as many rows as the test sample before the two
# are scattered against each other.
train_rows_for_test = df_train_plot.iloc[:len(df_test_plot)]

# One figure column per entry: (plot function, frames, labels).
plot_specs = [
    (hist, [df_train_plot, df_test_plot], ['train', 'test']),
    (scatter_x_order_y_df, [df_train_plot], ['train']),
    (scatter_x_order_y_df, [df_test_plot], ['test']),
    (scatter_x_df_y_df_target, [df_train_plot, df_target_plot], ['train', 'target']),
    (scatter_x_df1_y_df2, [train_rows_for_test, df_test_plot], ['train', 'test']),
]
plots_for_dfs(plot_specs)