In [None]:
import numpy as np
import pandas as pd
from pathlib import Path
from tqdm import tqdm
import time

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

In [None]:
# Show every column when a DataFrame is displayed (no truncation of wide frames).
pd.set_option('display.max_columns', None)

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import seaborn as sns

# Plotly imports; offline notebook mode is initialised before graph_objs is imported.
from plotly.offline import init_notebook_mode, iplot, plot
import plotly.express as px
import plotly as py
init_notebook_mode(connected=True)
import plotly.graph_objs as go

# Ignore every warning category from this point on.
# NOTE(review): this also hides library warnings that may matter later in the
# notebook (e.g. convergence or chained-assignment warnings) - consider
# narrowing the filter to specific categories.
from warnings import simplefilter
simplefilter("ignore")

# Set Matplotlib defaults
# The bundled seaborn styles were renamed to "seaborn-v0_8-*" in Matplotlib 3.6
# and the old names were removed in 3.8, so use whichever name this
# installation provides (new name first, legacy name as fallback).
for style_name in ("seaborn-v0_8-whitegrid", "seaborn-whitegrid"):
    if style_name in plt.style.available:
        plt.style.use(style_name)
        break

# Figure-level defaults: automatic layout, default size and bold titles.
plt.rc(
    "figure",
    autolayout=True,
    figsize=(12, 6),
    titlesize=18,
    titleweight='bold',
)
# Axes-level defaults: bold, larger labels and titles with extra title padding.
plt.rc(
    "axes",
    labelweight="bold",
    labelsize="large",
    titleweight="bold",
    titlesize=16,
    titlepad=10,
)
# Reusable keyword arguments for line plots (grey line, dark point markers).
plot_params = dict(
    color="0.75",
    style=".-",
    markeredgecolor="0.25",
    markerfacecolor="0.25",
)


# Read data

In [None]:
# Directory holding the competition files.
input_path = Path('/kaggle/input/tabular-playground-series-jun-2022/')

# Feature table, indexed by its `row_id` column.
data = pd.read_csv(input_path.joinpath('data.csv'), index_col='row_id')
# Submission template, indexed by its `row-col` column.
submission = pd.read_csv(
    input_path.joinpath('sample_submission.csv'),
    index_col='row-col',
)

# Missing values

In [None]:
features = list(data.columns)

# Number of missing cells per feature, kept as a one-column DataFrame
# (column label 0) indexed by feature name.
missing_values_count = pd.DataFrame(data[features].isnull().sum())

# A figure should stand on its own: give it a title and axis labels, and drop
# the legend, which would only show the column label "0".
ax = missing_values_count.plot(legend=False, title='Missing values per feature')
ax.set_xlabel('feature')
ax.set_ylabel('missing values');

In [None]:
# Wide view of the per-feature counts: one column per feature.
missing_values_count.T

* Group F_2 has no missing values

# Feature distributions

In [None]:
# One histogram per feature, laid out on a grid with a fixed number of columns.
# The number of rows is derived from the feature count so that no feature is
# silently dropped when the dataset does not have exactly 80 columns.
n_cols = 5
n_rows = max(1, -(-len(features) // n_cols))  # ceiling division, at least one row

# 55 / 16 inches per row reproduces the 24 x 55 figure for a 16-row grid.
fig, axs = plt.subplots(
    nrows=n_rows,
    ncols=n_cols,
    figsize=(24, 55 / 16 * n_rows),
    squeeze=False,  # always a 2-D array of axes, even for a single row
)
for ax, feature in zip(axs.flat, features):
    # Refer to the column by name; `data=` already supplies the frame.
    sns.histplot(data=data, x=feature, ax=ax)
    ax.set_title(feature)

# Hide the unused panels at the end of the grid, if any.
for ax in axs.flat[len(features):]:
    ax.set_visible(False)

plt.tight_layout()
plt.show()

# Correlation

In [None]:
# Heatmap of the pairwise correlations between all columns of `data`.
fig, ax = plt.subplots(figsize=(24, 15))
corr = data.corr()
sns.heatmap(data=corr, ax=ax)

* The dataset has four feature groups that are not correlated with each other. Let's split the dataset into four.

In [None]:
# The second '_'-separated token of a feature name is matched against the
# group ids below. F[i] and df[i] are indexed directly by group id; slot 0 is
# an unused placeholder so that no index arithmetic is needed.
group_ids = [1, 2, 3, 4]
group_by_token = {str(i): i for i in group_ids}

# F[i]: names of the columns that belong to group i.
F = [[] for _ in range(len(group_ids) + 1)]
for feature in features:
    i = group_by_token.get(feature.split('_')[1])
    if i is not None:  # columns of any other group are ignored
        F[i].append(feature)

# df[i]: DataFrame holding only the columns of group i.
df = [[] for _ in range(len(group_ids) + 1)]

fig, axs = plt.subplots(nrows=len(group_ids), ncols=1, figsize=(18, 30))

for i in group_ids:
    # Take an explicit copy: the group frames are overwritten in place later
    # (`df[i][:] = ...`) and `data` itself is imputed in place, so each frame
    # must own its values instead of being a slice of `data`.
    df[i] = data[F[i]].copy()
    corr = df[i].corr()
    sns.heatmap(corr, ax=axs[i-1], annot=True)
    axs[i-1].set_title(f'Group F_{i}')

# Imputation

## Use IterativeImputer with the whole dataset

In [None]:
# Baseline: a single imputer fitted on all columns at once.
# The imputed values are written back into `data` in place, so re-running this
# cell without reloading `data` times an already complete frame.
whole_start = time.time()
imp = IterativeImputer(max_iter=10, random_state=0)
data[:] = imp.fit_transform(data)
time1 = time.time() - whole_start  # wall-clock seconds for the whole-frame run
print('Whole dataset fit_transform time:', time1)

## Use IterativeImputer with three feature groups, excluding F_2 (it has no missing values)

In [None]:
# Impute groups 1, 3 and 4 separately, one fresh imputer per group, printing
# the group id followed by its own wall-clock time.
split_start = time.time()
for group_id in (1, 3, 4):
    group_start = time.time()
    print(group_id, end=' ')
    imp = IterativeImputer(max_iter=10, random_state=0)
    # Overwrite the group frame in place with its imputed values.
    df[group_id][:] = imp.fit_transform(df[group_id])
    print(time.time() - group_start)
time2 = time.time() - split_start
# Report the total time and its ratio to the whole-dataset run (`time1`).
print('Splitted dataset fit_transform time:', time2, ',', time1/time2, 'faster')

# Submission

In [None]:
# Stitch the four group frames back together side by side, in group order.
d = pd.concat([df[group_id] for group_id in (1, 2, 3, 4)], axis=1)

In [None]:
# Each submission id has the form "<row_id>-<column>". Split all ids at once
# instead of looping over every row in Python with per-cell `.loc` access.
id_parts = submission.index.to_series().str.split('-')
rows = id_parts.str[0].astype(int)
cols = id_parts.str[1]

# Translate the labels into integer positions; -1 marks a label that does not
# exist in `d`, which the per-cell `.loc` lookup would have reported as KeyError.
row_pos = d.index.get_indexer(rows)
col_pos = d.columns.get_indexer(cols)
if (row_pos < 0).any() or (col_pos < 0).any():
    raise KeyError('submission index references a row or column that is missing from the imputed data')

# Gather every requested cell in a single vectorised NumPy lookup.
submission['value'] = d.to_numpy()[row_pos, col_pos]

submission.to_csv('sub_134.csv')

* The public score is the same as with whole-dataset processing, but the imputation runs much faster.