In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from tqdm import tqdm
# Snapshot the colors of the currently active matplotlib property cycle; the
# histogram comparison at the end of the notebook indexes into this list.
# NOTE(review): this is read before plt.style.use('ggplot') below, so it
# presumably holds the pre-ggplot color cycle rather than the ggplot one —
# confirm that this ordering is intended.
color_pal = plt.rcParams['axes.prop_cycle'].by_key()['color']

import warnings
# Suppress every warning for the rest of the session.
warnings.filterwarnings("ignore")
# Switch the plotting theme to the 'ggplot' style.
plt.style.use('ggplot')

In [None]:
# Read the feature table (indexed by row_id) and the sample submission
# (indexed by its "row-col" key), then preview the first rows of the table.
data = pd.read_csv(
    "../input/tabular-playground-series-jun-2022/data.csv",
    index_col='row_id',
)
sample = pd.read_csv(
    "../input/tabular-playground-series-jun-2022/sample_submission.csv",
    index_col='row-col',
)
data.head()

# Quick Missing Values EDA

In [None]:
# Summary statistics with one row per column: a bar for the mean and a color
# gradient for both the standard deviation and the median.
(
    data.describe().T
    .style
    .bar(subset=["mean"], color="#606ff2")
    .background_gradient(subset=["std"], cmap="PuBu")
    .background_gradient(subset=["50%"], cmap="PuBu")
)

In [None]:
# Number of missing entries in each column.
data.isna().sum()

In [None]:
# Build the null mask once and derive both summary columns from it.
null_mask = data.isnull()

# Missing count per column, largest first.
total_null = null_mask.sum().sort_values(ascending=False)
# Missing share per column as a fraction between 0 and 1, largest first.
percentage = (null_mask.sum() / null_mask.count()).sort_values(ascending=False)

missing_data = pd.concat(
    [total_null, percentage],
    axis=1,
    keys=['Total', 'Percentage'],
)
missing_data.head(56)

In [None]:
# Columns that contain at least one missing value are the imputation targets.
null_counts = data.isnull().sum(axis=0)
features = null_counts.loc[null_counts > 0].index.values
N_targets = features.size
features

In [None]:
# Map of where the gaps are: the null mask of `data` drawn as a heatmap,
# with the row tick labels switched off.
fig, ax = plt.subplots(figsize=(20, 15))
sns.heatmap(data.isnull(), yticklabels=False, cbar=True, cmap='mako', ax=ax)

In [None]:
# Fraction of missing values per column, kept as a one-column frame.
ncounts = data.isna().mean().to_frame(name="train_missing")

# Plot only the columns that actually have gaps.
columns_with_gaps = ncounts.query("train_missing > 0")
columns_with_gaps.plot(kind="barh", figsize=(20, 20), title="% of Values Missing")
plt.show()

# Sklearn Imputation


- `SimpleImputer`: similar to pandas `fillna`
- `IterativeImputer`
- `KNNImputer`

Using scikit-learn is good because its imputers provide `fit` and `transform` methods. This allows us to fit on the training set and then transform both the training and validation sets.

In real-world situations you will want to `fit` and `transform` *within* your cross-validation loop to ensure no leakage.

In [None]:
# Mean imputation with pandas: every column listed in `features` has its gaps
# filled with that column's own mean.
# fillna is called on the whole frame and returns a new object, so `data` stays
# untouched. This avoids chained in-place assignment
# (`df[col].fillna(..., inplace=True)`), which can act on a temporary copy and
# does nothing under pandas Copy-on-Write.
column_means = data[features].mean()
data_simple_impute = data.fillna(column_means)
data_simple_impute.head()

In [None]:
# Fill the submission from the mean-imputed frame.
# Every index entry of `sample` has the form "<row_id>-<column>". All keys are
# split and resolved to array positions in one pass, instead of doing one scalar
# .loc read and one scalar .loc write per submission row in a Python loop.
keys = sample.index.to_series().str.split('-', expand=True)
row_labels = keys[0].astype(int)
col_labels = keys[1]

# Materialize only the requested columns; an unknown column raises KeyError here.
wanted_cols = pd.Index(col_labels.unique())
values = data_simple_impute[wanted_cols].to_numpy()

row_pos = data_simple_impute.index.get_indexer(row_labels)
if (row_pos < 0).any():
    # get_indexer marks unknown labels with -1; fail loudly like .loc would.
    raise KeyError("sample index refers to a row_id that is not in data_simple_impute")

sample['value'] = values[row_pos, wanted_cols.get_indexer(col_labels)]

sample.to_csv('data_simple_impute.csv')

# IterativeImputer

In [None]:
# The enable_iterative_imputer import is needed for its side effect: it must
# come before IterativeImputer is imported from sklearn.impute.
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
#from sklearn.ensemble import GradientBoostingRegressor 
from catboost import CatBoostRegressor

# Regressor used to predict each feature's missing values (configured for GPU).
catboost_regressor = CatBoostRegressor(
    iterations=500,
    random_state=123,
    task_type='GPU',
)

# A single imputation round, starting from a mean fill.
imputer = IterativeImputer(
    estimator=catboost_regressor,
    missing_values=np.nan,
    max_iter=1,
    initial_strategy='mean',
    imputation_order='ascending',
    random_state=42,
    verbose=2,
)

In [None]:
# Run the iterative imputation on the columns that have gaps.
# By default fit_transform returns a bare array, which loses the row_id labels.
# The frame is rebuilt with data's own index so that the later label-based
# lookups (.loc[row_id, column]) address the same rows as in `data`, even if
# row_id is not a gap-free 0..n-1 range.
imputed_values = imputer.fit_transform(data[features])
data_imp_iter = pd.DataFrame(imputed_values, columns=features, index=data.index)

In [None]:
# Fill the submission from the IterativeImputer result.
# Every index entry of `sample` has the form "<row_id>-<column>". All keys are
# split and resolved to array positions in one pass, instead of doing one scalar
# .loc read and one scalar .loc write per submission row in a Python loop.
keys = sample.index.to_series().str.split('-', expand=True)
row_labels = keys[0].astype(int)
col_labels = keys[1]

# Materialize only the requested columns; an unknown column raises KeyError here.
wanted_cols = pd.Index(col_labels.unique())
values = data_imp_iter[wanted_cols].to_numpy()

row_pos = data_imp_iter.index.get_indexer(row_labels)
if (row_pos < 0).any():
    # get_indexer marks unknown labels with -1; fail loudly like .loc would.
    raise KeyError("sample index refers to a row_id that is not in data_imp_iter")

sample['value'] = values[row_pos, wanted_cols.get_indexer(col_labels)]

sample.to_csv('data_pool_imp.csv')

# LGBMImputer

In [None]:
# Fetch the kuma_utils repository into the working directory; a later cell
# imports LGBMImputer from it. Uncomment the rm line to discard an existing
# checkout before cloning again.
# !rm -r kuma_utils
!git clone https://github.com/analokmaus/kuma_utils.git

In [None]:
import sys
# Put the cloned checkout on the module search path before importing from it.
# The path is relative, so this assumes the clone sits in the current working
# directory.
sys.path.append("kuma_utils/")
from kuma_utils.preprocessing.imputer import LGBMImputer

In [None]:
%%time
# Impute the columns listed in `features` with kuma_utils' LGBMImputer; the
# cell magic above reports how long the whole cell takes.
lgbm_imtr = LGBMImputer(n_iter=150, verbose=True)
data_lgbmimp = lgbm_imtr.fit_transform(data[features])
# Wrap the result in a DataFrame labeled with the feature names.
# NOTE(review): no index is passed here. If fit_transform returns a bare array,
# the frame gets a default 0..n-1 index instead of data's row_id — confirm that
# the later .loc[row, col] lookups still address the intended rows.
data_lgbm_imp = pd.DataFrame(data_lgbmimp, columns=features)

In [None]:
# Fill the submission from the LGBMImputer result.
# Every index entry of `sample` has the form "<row_id>-<column>". All keys are
# split and resolved to array positions in one pass, instead of doing one scalar
# .loc read and one scalar .loc write per submission row in a Python loop.
keys = sample.index.to_series().str.split('-', expand=True)
row_labels = keys[0].astype(int)
col_labels = keys[1]

# Materialize only the requested columns; an unknown column raises KeyError here.
wanted_cols = pd.Index(col_labels.unique())
values = data_lgbm_imp[wanted_cols].to_numpy()

row_pos = data_lgbm_imp.index.get_indexer(row_labels)
if (row_pos < 0).any():
    # get_indexer marks unknown labels with -1; fail loudly like .loc would.
    raise KeyError("sample index refers to a row_id that is not in data_lgbm_imp")

sample['value'] = values[row_pos, wanted_cols.get_indexer(col_labels)]

sample.to_csv('data_lgbm_imp.csv')

# Imputation for completing missing values using k-Nearest Neighbors

In [None]:
from sklearn.impute import KNNImputer

# KNN imputation is run on one column only.
# NOTE(review): with a single input column the neighbors cannot draw on any
# other feature — confirm this restriction is intended (presumably for speed).
knn_columns = ['F_1_0']

knn_imptr = KNNImputer(n_neighbors=3)
print("fiting......")
data_knnimp = knn_imptr.fit_transform(data[knn_columns])
data_knnimp_df = pd.DataFrame(data_knnimp, columns=knn_columns)

In [None]:
"""for i in tqdm(sample.index):
    row = int(i.split('-')[0])
    col = i.split('-')[1]
    sample.loc[i, 'value'] = data_knnimp_df.loc[row, col]

sample.to_csv('data_knnimp_df.csv')"""

# Check The Imputation Distribution

In [None]:
# Compare the distribution of one feature ("F_1_0") after each imputation
# method, one histogram per panel in a 2x2 grid.
# The whole cell stays best-effort: if an earlier imputation cell was skipped,
# its frame is undefined, so the frames are looked up inside the try block.
# Unlike a bare `except:`, the handler reports what went wrong and does not
# swallow KeyboardInterrupt or SystemExit.
try:
    imputed_frames = [
        ('Simple Impute', data_simple_impute),
        ('LGBM Impute', data_lgbm_imp),
        ('IterativeImputer', data_imp_iter),
        ('KNNImputer', data_knnimp_df),
    ]
    fig, axs = plt.subplots(2, 2, figsize=(8, 8))
    # zip pairs each panel with its frame and with the next color of the cycle.
    for ax, (title, frame), color in zip(axs.flatten(), imputed_frames, color_pal):
        frame["F_1_0"].plot(kind='hist', bins=50, ax=ax, title=title, color=color)
    plt.show()
except Exception as exc:
    print(f"An error occurred: {exc!r}")
    


### Reference :- https://www.kaggle.com/code/robikscube/handling-with-missing-data-youtube-stream

# Work in progress...