In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from cycler import cycler
from tqdm import tqdm

# Plot theme: blue axes background, and '#ffd700' substituted for the first
# colour of matplotlib's current colour cycle (remaining colours unchanged).
default_colors = plt.rcParams['axes.prop_cycle'].by_key()['color']
plt.rcParams['axes.facecolor'] = '#0057b8'  # blue
plt.rcParams['axes.prop_cycle'] = cycler(color=['#ffd700', *default_colors[1:]])

In [None]:
# Load the competition table; 'row_id' becomes the index, so every remaining
# column of df is a feature.
df = pd.read_csv(
    "../input/tabular-playground-series-jun-2022/data.csv",
    index_col='row_id',
)

In [None]:
# Preview the first rows of the loaded frame.
df.head()

In [None]:
# Names of the float64 columns, in their original column order.
num_features = df.select_dtypes(include='float64').columns.tolist()

In [None]:
# Names of the int64 (discrete) columns, in their original column order.
dis_features = df.select_dtypes(include='int64').columns.tolist()

In [None]:
# Number of int64 columns found above.
len(dis_features)

## Basic EDA

In [None]:
# Per-feature count of missing values.
# 'row_id' was consumed as the index when df was read, so every column here is
# a real feature and none must be skipped (slicing with [1:] silently dropped
# the first feature from the summary).
# The resulting frame is indexed by feature name (inherited from the Series)
# and also carries the name in an explicit 'Feature' column for plotting.
missing_counts = df.isna().sum()
mis_df = pd.DataFrame({'Feature': missing_counts.index, 'Missing values': missing_counts})

In [None]:
# Preview the missing-value summary.
mis_df.head()

In [None]:
# Total number of rows; used as the denominator for the missing-value percentage.
len(df)

In [None]:
# Share of rows (in percent) in which each feature is missing.
n_rows = len(df)
mis_df['% Missing'] = mis_df['Missing values'] * 100 / n_rows

In [None]:
# Preview the summary again, now including the '% Missing' column.
mis_df.head()

In [None]:
# Horizontal bar chart of missing-value counts, restricted to the features
# that actually have at least one missing value.
fig, ax = plt.subplots(figsize=(8, 15))
features_with_gaps = mis_df.loc[mis_df['Missing values'] != 0]
sns.barplot(data=features_with_gaps, x='Missing values', y='Feature', ax=ax)

In [None]:
# One density histogram per float feature on an 11x5 grid of axes; zip stops
# at the shorter of the two sequences, so surplus axes (or features) are skipped.
fig, axs = plt.subplots(nrows=11, ncols=5, figsize=(32, 40))
for ax, feature in zip(axs.flat, num_features):
    values = df[feature]
    ax.hist(values, bins=100, density=True)
    ax.set_title(f'Train {feature}, std={values.std():.1f}')
fig.suptitle('Histograms of the float features', y=0.93, fontsize=20)
plt.show()

In [None]:
# Pairwise correlations between the float features, drawn as a heatmap with
# the colour scale centred on zero and each cell annotated to two decimals.
float_corr = df[num_features].corr()
plt.figure(figsize=(30, 20))
sns.heatmap(float_corr, annot=True, fmt='.2f', center=0)
plt.show()

In [None]:
# Distribution of the integer (int64) features, one density histogram per
# feature on a 5x5 grid of axes.
# Every entry of dis_features is plotted: 'row_id' was consumed as the index
# when df was read, so the list contains no id column to skip (slicing with
# [1:] left the first integer feature unplotted).
# zip stops at the shorter sequence, so at most 25 features are drawn.
fig, axs = plt.subplots(5, 5, figsize=(32, 32))
for f, ax in zip(dis_features, axs.ravel()):
    ax.hist(df[f], density=True, bins=100)
    ax.set_title(f'Train {f}, std={df[f].std():.1f}')
# Title corrected: this figure shows the integer features, not the float ones.
plt.suptitle('Histograms of the integer features', y=0.93, fontsize=20)
plt.show()

In [None]:
# Number of int64 features (compare with the 5x5 = 25 axes of the grid above).
len(dis_features)

In [None]:
# Sample submission, indexed by its 'row-col' id column.
submission = pd.read_csv(
    '../input/tabular-playground-series-jun-2022/sample_submission.csv',
    index_col='row-col',
)

## Iterative Imputer with XGBoost

In [None]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer,KNNImputer,IterativeImputer
from catboost import CatBoostRegressor
import xgboost

In [None]:
# Regressor used by the imputer to predict each feature from the others.
# NOTE(review): tree_method='gpu_hist' needs a GPU-enabled XGBoost build, and
# newer XGBoost releases deprecate it in favour of tree_method='hist' with
# device='cuda' — confirm against the installed version.
xgb_estimator = xgboost.XGBRegressor(
    n_estimators=150,
    tree_method='gpu_hist',
    random_state=42,
)

# Iterative imputation: NaNs are first filled with the column mean, then
# features are re-estimated in 'ascending' order for at most 20 rounds.
imp = IterativeImputer(
    estimator=xgb_estimator,
    missing_values=np.nan,
    initial_strategy='mean',
    imputation_order='ascending',
    max_iter=20,
    random_state=42,
    verbose=2,
)

# Fit on the full table and write the imputed values back into df in place,
# keeping its index and column labels.
imputed_values = imp.fit_transform(df)
df[:] = imputed_values

In [None]:
# Fill the submission with the imputed values.
# Each submission id has the form '<row_id>-<column name>'. Rather than doing
# one scalar .loc read and one scalar .loc write per id (very slow for a large
# submission), translate all ids to positional (row, column) indices once and
# gather the values with a single NumPy fancy-indexing operation.
id_parts = [key.split('-') for key in submission.index]
# get_indexer returns -1 for labels it cannot find (it requires df's index and
# columns to be unique).
row_pos = df.index.get_indexer([int(parts[0]) for parts in id_parts])
col_pos = df.columns.get_indexer([parts[1] for parts in id_parts])

# Preserve the original failure mode: an unknown row or column is a KeyError,
# not a silently wrong value picked up through the -1 position.
if (row_pos < 0).any() or (col_pos < 0).any():
    raise KeyError('submission index references a row or column that is missing from df')

submission['value'] = df.to_numpy()[row_pos, col_pos]

submission.to_csv('submission.csv')