In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display

In [None]:
# Read the submission template and the competition table from the Kaggle input directory.
sample_submission = pd.read_csv("../input/tabular-playground-series-jun-2022/sample_submission.csv")
data = pd.read_csv("../input/tabular-playground-series-jun-2022/data.csv")

In [None]:
# Print the frame summary; max_cols=10 makes pandas fall back to the truncated
# (non per-column) output when the frame has more than 10 columns.
data.info(max_cols=10)

In [None]:
# Preview the first rows of the raw table.
data.head()

In [None]:
# Preview the submission template that has to be filled in at the end.
sample_submission.head()

In [None]:
# Split the columns by cardinality: fewer than CARDINALITY_THRESHOLD distinct
# values -> categorical, everything else -> numerical.
# The distinct-value counts are computed once, and the boundary value itself is
# assigned to the numerical side so that a column with exactly 25 distinct
# values is no longer dropped from both lists.
CARDINALITY_THRESHOLD = 25
n_unique = data.nunique()
cat_cols = n_unique[n_unique < CARDINALITY_THRESHOLD].index
num_cols = list(n_unique[n_unique >= CARDINALITY_THRESHOLD].index)
num_cols.remove("row_id")  # reported separately in the summary line below
print(f"categorical columns count: {len(cat_cols)}, numerical columns count:{len(num_cols)},row id count:1, total columns count:{len(data.columns)}")

In [None]:
# Down-cast in a single pass: numerical columns become float32 and
# categorical columns become int16; columns in neither list keep their dtype.
dtype_by_column = {name: np.float32 for name in num_cols}
dtype_by_column.update({name: np.int16 for name in cat_cols})
data = data.astype(dtype_by_column)


In [None]:
# Per-column null counts, then the totals overall and per column group.
null_per_column = data.isnull().sum()
total_nulls = null_per_column.sum()
categorical_nulls = null_per_column.loc[cat_cols].sum()
numerical_nulls = null_per_column[num_cols].sum()
print(f"total null values count: {total_nulls}, total null categorical values count: {categorical_nulls}, total null numerical values count: {numerical_nulls}")

In [None]:
# Show the columns that contain nulls as two aligned grids: the first holds the
# column names, the second the matching null counts.
# The null scan is done once instead of four times, and only the grid width is
# fixed; the number of rows follows from how many columns contain nulls
# (that number still has to be a multiple of NULL_GRID_WIDTH).
NULL_GRID_WIDTH = 11
null_counts = data.isnull().sum()
null_counts = null_counts[null_counts > 0]
display(pd.DataFrame(null_counts.index.to_numpy().reshape(-1, NULL_GRID_WIDTH)))
pd.DataFrame(null_counts.values.reshape(-1, NULL_GRID_WIDTH))

In [None]:
# Bar chart of the null count of every column that has at least one null.
null_count_per_column = data.isnull().sum()
fig, ax = plt.subplots(figsize=(15, 5))
null_count_per_column[null_count_per_column > 0].plot(kind="bar", ax=ax)
ax.set_title("null value counts");

In [None]:
def show_na(df, cols=None, figsize=(12, 6)):
    """Draw a heatmap of missing cells and print the null count of every column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.
    cols : list-like of column labels, optional
        Columns shown in the heatmap. When omitted, the notebook-level
        ``num_cols`` list is used, which is what this function always did.
    figsize : tuple of (width, height), optional
        Size of the matplotlib figure; defaults to the previous fixed (12, 6).

    Returns
    -------
    None
        The figure is shown and the per-column null counts (for all columns of
        ``df``, not only ``cols``) are printed.
    """
    heatmap_cols = num_cols if cols is None else cols
    plt.figure(figsize=figsize)
    sns.heatmap(df[heatmap_cols].isnull(), yticklabels=False, cbar=False, cmap='viridis')
    plt.show()
    print(df.isnull().sum())


show_na(data)

In [None]:
# Summary statistics of the numerical columns, one row per column, ordered by
# mean; the mean gets an in-cell bar and both mean and std get a colour gradient.
numeric_summary = data[num_cols].describe().T.sort_values("mean")
styled_summary = numeric_summary.style.bar(subset=["mean"])
for statistic in ("mean", "std"):
    styled_summary = styled_summary.background_gradient(subset=[statistic], cmap="viridis")
styled_summary

## Distribution Plots

In [None]:
# One density histogram per numerical column, laid out on a grid.
# The number of rows is derived from the number of columns so that no column is
# silently left out, and the axes that stay unused are hidden.
# (No subplots_adjust call: tight_layout below recomputes the spacing anyway.)
HIST_GRID_COLS = 7
hist_grid_rows = max(1, -(-len(num_cols) // HIST_GRID_COLS))  # ceiling division
fig, axs = plt.subplots(ncols=HIST_GRID_COLS, nrows=hist_grid_rows, figsize=(10, 10), squeeze=False)
# A dedicated loop name keeps the notebook-level `col` variable untouched.
for feature, ax in zip(num_cols, axs.flat):
    data[feature].plot(kind="hist", density=True, ax=ax, bins=30)
    ax.set_title(feature)
for unused_ax in axs.flat[len(num_cols):]:
    unused_ax.set_visible(False)
fig.suptitle("dist plots of numerical columns")
fig.tight_layout()

## Line Plots

In [None]:

# Line plot of the first 100 rows of every numerical column, laid out on a grid.
# The number of rows is derived from the number of columns so that no column is
# silently left out, and the axes that stay unused are hidden.
# (No subplots_adjust call: tight_layout below recomputes the spacing anyway.)
LINE_GRID_COLS = 7
line_grid_rows = max(1, -(-len(num_cols) // LINE_GRID_COLS))  # ceiling division
fig, axs = plt.subplots(ncols=LINE_GRID_COLS, nrows=line_grid_rows, figsize=(10, 10), squeeze=False)
# A dedicated loop name keeps the notebook-level `col` variable untouched.
for feature, ax in zip(num_cols, axs.flat):
    # Explicit positional slice instead of chained [] indexing.
    data[feature].iloc[:100].plot(kind="line", ax=ax)
    ax.set_title(feature)
for unused_ax in axs.flat[len(num_cols):]:
    unused_ax.set_visible(False)
fig.suptitle("line plots of numerical columns")
fig.tight_layout()


In [None]:
# Pairwise correlation of the numerical columns, drawn as a lower-triangle heatmap.
corr = data[num_cols].corr()
# Boolean mask that hides the upper triangle and the diagonal. It is built from
# the matrix shape only; using np.triu(corr) as the mask would leave any
# exactly-zero correlation in the upper triangle visible.
mask = np.triu(np.ones_like(corr, dtype=bool))
plt.figure(figsize=(10, 10))
sns.heatmap(corr, annot=False, mask=mask, cbar=True, cbar_kws=dict(use_gridspec=False, location="top"))
plt.tight_layout()

## Next: scikit-learn imputation
* Reference: https://scikit-learn.org/stable/auto_examples/impute/plot_missing_values.html#sphx-glr-auto-examples-impute-plot-missing-values-py
* imputation by the constant value 0
* imputation by the mean value
* simple imputation
* k nearest neighbor imputation
* iterative imputation

In [None]:
from sklearn.ensemble import RandomForestRegressor,ExtraTreesRegressor, BaggingRegressor

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, KNNImputer, IterativeImputer
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from xgboost import XGBRegressor

In [None]:
# Independent copy of `data`; the imputer benchmark cells below read from it.
data2 = data.copy()

In [None]:
# Among the columns that contain nulls, show the ten with the fewest
# (sorted descending, so the tail holds the smallest counts).
data2.isnull().sum()[data2.isnull().sum()>0].sort_values(ascending=False).tail(10)

In [None]:
# Column used as the regression target when the imputers are benchmarked below.
col="F_3_7" 

In [None]:
# Row positions used for the imputer benchmark.
# A seeded generator makes the sample (and therefore every score below)
# reproducible, and replace=False prevents the same row from being drawn twice
# and ending up on both sides of a cross-validation split.
BENCHMARK_SAMPLE_SIZE = 300
rng = np.random.default_rng(0)
idx = rng.choice(len(data2), size=min(BENCHMARK_SAMPLE_SIZE, len(data2)), replace=False)

In [None]:
# Benchmark set: the sampled rows with the target column removed form the
# (still incomplete) feature matrix; the target has its own gaps filled with
# the mean of the sampled target values.
# NOTE(review): only `col` is dropped, so "row_id" stays among the features.
sampled_rows = data2.loc[idx]
sampled_target = sampled_rows[col]
X_missing = sampled_rows.drop(columns=col)
y = sampled_target.fillna(sampled_target.mean())

In [None]:
import warnings 
# NOTE(review): this installs a blanket "ignore" filter, so every warning raised
# after this cell is hidden for the rest of the session, not only the noisy ones
# from the benchmark. Consider narrowing it by category or module.
warnings.filterwarnings("ignore")

In [None]:
# (nulls left in the feature matrix, nulls left in the target); the target was
# mean-filled above, so the second number is expected to be 0.
X_missing.isnull().sum().sum(),y.isnull().sum().sum()

In [None]:
def get_scores_for_imputer(imputer, X_missing, y, estimator=None, cv=None):
    """Cross-validate an ``imputer -> regressor`` pipeline and return its scores.

    Parameters
    ----------
    imputer : scikit-learn transformer
        Imputer placed as the first pipeline step.
    X_missing : array-like or DataFrame
        Feature matrix that may contain missing values.
    y : array-like
        Regression target.
    estimator : scikit-learn regressor, optional
        Final pipeline step. When omitted, the notebook-level ``regressor`` is
        used, which is what this function always did.
    cv : int or cross-validation splitter, optional
        Passed to ``cross_val_score``. When omitted, the notebook-level
        ``N_SPLITS`` is used.

    Returns
    -------
    numpy.ndarray
        One ``neg_root_mean_squared_error`` score per fold (negated RMSE, so
        values closer to zero are better).
    """
    # The globals are looked up at call time, so they only have to exist before
    # the first call, not before this definition.
    final_estimator = regressor if estimator is None else estimator
    folds = N_SPLITS if cv is None else cv
    pipeline = make_pipeline(imputer, final_estimator)
    return cross_val_score(
        pipeline, X_missing, y, scoring="neg_root_mean_squared_error", cv=folds
    )
import time

In [None]:
N_SPLITS = 4
# Seeded so that the cross-validated scores of the final regressor are
# reproducible from run to run.
regressor = ExtraTreesRegressor(n_estimators=100, random_state=0)

# Candidate imputers, keyed by the label used in the report and the plot.
imputers = {
    "Mean_imputer": SimpleImputer(missing_values=np.nan, strategy="mean"),
    "KNN_imputer": KNNImputer(
        missing_values=np.nan,
        n_neighbors=10,
        weights='uniform',
    ),
    "Iterative_imputer": IterativeImputer(
        missing_values=np.nan,
        random_state=0,
        n_nearest_features=10,
        max_iter=10,
    ),
    "Iterative_imputer2": IterativeImputer(
        estimator=regressor,
        missing_values=np.nan,
        random_state=123,
        max_iter=10,
    ),
    # credits and thanks:
    # next imputer from @hiro5299834 's work
    # https://www.kaggle.com/code/hiro5299834/tps-jun-2022-iterativeimputer-baseline
    "Iterative_imputer3": IterativeImputer(
        estimator=regressor,
        missing_values=np.nan,
        max_iter=10,
        initial_strategy='mean',
        imputation_order='ascending',
        random_state=42,
    ),
    "Iterative_imputer4": IterativeImputer(
        estimator=regressor,
        missing_values=np.nan,
        max_iter=10,
        initial_strategy='mean',
        imputation_order='descending',
        random_state=42,
    ),
    "Iterative_imputer5": IterativeImputer(
        estimator=regressor,
        missing_values=np.nan,
        max_iter=10,
        initial_strategy='mean',
        imputation_order='roman',
        random_state=42,
    ),
}

# Result arrays, one slot per imputer in dictionary order.
# Despite its name, `mses_xtrain` holds the mean RMSE (the scorer is
# neg_root_mean_squared_error); the name is kept because later cells use it.
imputer_count = len(imputers)
mses_xtrain = np.zeros(imputer_count)
stds_xtrain = np.zeros(imputer_count)
runtime = np.zeros(imputer_count)
for i, (name, imputer) in enumerate(imputers.items()):
    started = time.perf_counter()  # monotonic clock, meant for measuring intervals
    scores = get_scores_for_imputer(imputer, X_missing, y)
    runtime[i] = time.perf_counter() - started
    # The scorer returns negated RMSE; flip the sign to report a positive RMSE.
    mses_xtrain[i] = -scores.mean()
    stds_xtrain[i] = scores.std()
    print(f"{name} done.")

In [None]:
n_bars = len(mses_xtrain)
xval = np.arange(n_bars)
colors = ["darkblue", "green", "red", "violet", "black","blue","cyan","darkgreen"]

# Runtime rescaled to [0, 1], computed once for all bars. When every imputer
# took the same time (e.g. a single imputer) the span is zero; draw empty
# runtime bars in that case instead of dividing by zero.
runtime_span = runtime.max() - runtime.min()
if runtime_span > 0:
    runtime_scaled = (runtime - runtime.min()) / runtime_span
else:
    runtime_scaled = np.zeros_like(runtime)

# Horizontal bars: cross-validated RMSE (with its std as error bar) per imputer,
# plus a thinner orange bar for the rescaled runtime.
plt.figure(figsize=(6, 6))
for j in xval:
    plt.barh(
        j,
        mses_xtrain[j],
        0.3,
        xerr=stds_xtrain[j],
        color=colors[j % len(colors)],  # cycle if there are more imputers than colours
        alpha=0.6,
        align="center",
    )
    plt.barh(
        j + 0.2,
        runtime_scaled[j],
        0.2,
        label="test time",
        color="darkorange",
    )
# Dashed reference line at the best (lowest) RMSE.
plt.vlines(x=mses_xtrain.min(),ymin=0,ymax=n_bars,color="red",linestyles="--",linewidth=0.7)
plt.title("Imputation Techniques")
# NOTE(review): fixed x-range; bars outside 0.7-1.5 are clipped or invisible.
plt.xlim(left=0.7, right=1.5)
plt.yticks(xval,labels=list(imputers.keys()))
plt.xlabel("RMSE")
# Text annotations per imputer; x=0.465 is left of the x-range set above.
for position, seconds, rmse in zip(xval, runtime, mses_xtrain):
    plt.text(0.465, position+.2, f"runtime: {seconds/60:.2f} min.")
    plt.text(0.465, position-0.4, f"rmse: {rmse:.3f}")
    plt.text(0.465, position-0.5, "___________________")
plt.show()

In [None]:
# Name of the imputer with the lowest mean RMSE; relies on `mses_xtrain` being
# filled in the same order as the keys of `imputers`.
print(f"imputer min rmse: {list(imputers.keys())[mses_xtrain.argmin()]}")

In [None]:
# Imputer used for the submission. NOTE(review): it is picked by a fixed key,
# not from the benchmark result, so it may differ from the best-scoring one.
imputer = imputers["Iterative_imputer5"]

In [None]:
# Index the submission template by its "row-col" key.
# Guarded and non-inplace so that re-running this cell is a no-op instead of a
# KeyError (after the first run "row-col" is the index, no longer a column).
if "row-col" in sample_submission.columns:
    sample_submission = sample_submission.set_index("row-col")

In [None]:
# Frame that will be imputed: a copy of `data` with "row_id" moved into the
# index, so that only the remaining columns are passed to the imputer.
data3=data.copy().set_index("row_id")

In [None]:
# Fit the chosen imputer on all columns and rebuild the frame from its output,
# reusing the original row and column labels. This replaces the hard-coded
# `[:, :80]` column count and the in-place slice assignment: the shape now
# follows from `data3` itself, and a mismatch between the imputer output and
# the frame is reported by the DataFrame constructor.
data3 = pd.DataFrame(
    imputer.fit_transform(data3),
    index=data3.index,
    columns=data3.columns,
)

In [None]:
from tqdm.notebook import tqdm

In [None]:
# Fill the submission with the imputed values.
# Every index entry has the form "<row_id>-<column name>". Split all keys at
# once, translate them into positions in `data3`, and read every value with a
# single array lookup instead of one `.loc` write per submission row. This also
# leaves the notebook-level `col` variable alone, which the loop overwrote.
submission_keys = sample_submission.index.to_series().str.split("-", expand=True)
row_positions = data3.index.get_indexer(submission_keys[0].astype(int))
col_positions = data3.columns.get_indexer(submission_keys[1])
# get_indexer marks unknown labels with -1; fail loudly rather than silently
# reading the last row or column.
if (row_positions < 0).any() or (col_positions < 0).any():
    raise KeyError("sample_submission refers to a row or column that is not present in data3")
sample_submission["value"] = data3.to_numpy()[row_positions, col_positions]

sample_submission.to_csv('submission.csv')

In [None]:
# Display the filled submission as a final visual check.
sample_submission