In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from pathlib import Path
from tqdm import tqdm
from matplotlib import pyplot as plt
import seaborn as sns

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found beneath the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Problem Statement

For this challenge, we are given (simulated) manufacturing control data that contains missing values due to electronic errors. 
We need to predict the values of all missing data in this dataset. 

In [None]:
# Competition files: the table to impute and the sample submission keyed by 'row-col'.
input_path = Path('/kaggle/input/tabular-playground-series-jun-2022/')
data = pd.read_csv(input_path.joinpath('data.csv'), index_col='row_id')
submission = pd.read_csv(input_path.joinpath('sample_submission.csv'), index_col='row-col')

## Data Exploration

### Which columns have missing values?

Everything except the F_2_* columns (int64 columns)

In [None]:
# Display the column labels of the loaded frame.
data.columns

In [None]:
# Print the frame summary (per-column dtype and non-null count) to see which columns have gaps.
data.info()

### Frequency plot of missing values across rows

In [None]:
# Count missing cells in each row, then plot how many rows share each count.
missing_per_row = data.isnull().sum(axis=1).to_frame('Missing value count')
rows_per_missing_count = (
    missing_per_row
    .reset_index()
    .groupby('Missing value count')['row_id']
    .count()
)
rows_per_missing_count.plot.barh()

### Approximately 1-2% of data missing in each column

In [None]:
# Render floats with a trailing '%' so the per-column missing share reads as a percentage.
pd.set_option('display.float_format', '{:,.2f} %'.format)
data.isnull().sum().div(len(data)).mul(100)

In [None]:
# Drop the '%' suffix again so later outputs show plain two-decimal floats.
pd.set_option('display.float_format', '{:,.2f}'.format)

### Which features are correlated with each other?

1. F_2 features are correlated only with other F_2 features, so we can't use them to predict any other features. Since they also have no missing values, we ignore them.
2. F_4 features are correlated with other F_4 features, so we can use regression on them to impute the missing F_4 values.
3. F_1 and F_3 features have 0 correlation with any other features, so we need to resort to mean imputation for them.

In [None]:
# Pairwise correlation of all columns, drawn as an annotated heatmap on an explicit axes.
data_corr = data.corr()
fig, ax = plt.subplots(figsize=(25, 20))
sns.heatmap(
    data_corr,
    annot=True,
    cmap="YlOrRd",
    fmt='0.1f',
    vmin=-0.6,
    vmax=0.6,
    ax=ax,
);

### Data distribution of the continuous variables - using boxplot

Acknowledgement : https://www.kaggle.com/code/dhirajkumar612/iterative-imputer-linear-regression

In [None]:
# The fifteen mutually correlated F_4 features: F_4_0 ... F_4_14.
columns_with_correlations = [f'F_4_{idx}' for idx in range(15)]

In [None]:
# One horizontal boxplot per F_4 feature.
# The grid is sized from the number of features (the previous fixed 11x5 grid
# left most of the figure empty and squashed the 15 panels that were drawn).
n_grid_cols = 5
n_grid_rows = -(-len(columns_with_correlations) // n_grid_cols)  # ceiling division
fig, axes = plt.subplots(
    n_grid_rows, n_grid_cols,
    figsize=(18, 2.5 * n_grid_rows),
    squeeze=False,
)
flat_axes = axes.ravel()
for ax, col in zip(flat_axes, columns_with_correlations):
    sns.boxplot(data=data, x=col, ax=ax)
# Hide any trailing panels when the feature count does not fill the grid.
for ax in flat_axes[len(columns_with_correlations):]:
    ax.set_visible(False)
fig.suptitle('Data distribution of continuous variables')
fig.tight_layout()

### Power transformation

From the box plots above, we see that some of the data is skewed to the left and contains outliers. A power transformation can help: it makes the probability distribution of a variable more Gaussian.

In [None]:
from sklearn.preprocessing import PowerTransformer

# Yeo-Johnson transform of the F_4 features (no standardisation). The fitted
# `power` object is kept so transformed values can be mapped back later.
power = PowerTransformer(method='yeo-johnson', standardize=False)
f4_data = pd.DataFrame(
    power.fit_transform(data[columns_with_correlations]),
    columns=columns_with_correlations,
    # Keep the row_id index so rows stay label-aligned with `data`
    # instead of silently falling back to a positional RangeIndex.
    index=data.index,
)

In [None]:
# One horizontal boxplot per power-transformed F_4 feature.
# The grid is sized from the number of features (the previous fixed 11x5 grid
# left most of the figure empty and squashed the 15 panels that were drawn).
n_grid_cols = 5
n_grid_rows = -(-len(columns_with_correlations) // n_grid_cols)  # ceiling division
fig, axes = plt.subplots(
    n_grid_rows, n_grid_cols,
    figsize=(18, 2.5 * n_grid_rows),
    squeeze=False,
)
flat_axes = axes.ravel()
for ax, col in zip(flat_axes, columns_with_correlations):
    sns.boxplot(data=f4_data, x=col, ax=ax)
# Hide any trailing panels when the feature count does not fill the grid.
for ax in flat_axes[len(columns_with_correlations):]:
    ax.set_visible(False)
# Title names the transform so this figure is distinguishable from the raw-data one.
fig.suptitle('Data distribution of continuous variables (after power transformation)')
fig.tight_layout()

## Strategy to identify missing values

Let's use mean imputation for the F_1 and F_3 features and an XGBoost-based regression imputer for the F_4 features.

### Mean Imputation

In [None]:
# Features without usable correlations: F_1_0 ... F_1_14 followed by F_3_0 ... F_3_24.
columns_to_mean_impute = (
    [f'F_1_{idx}' for idx in range(15)]
    + [f'F_3_{idx}' for idx in range(25)]
)

In [None]:
from sklearn.impute import SimpleImputer

# Replace every NaN in the F_1 / F_3 features with that column's mean.
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
mean_imputed_data = pd.DataFrame(
    imp.fit_transform(data[columns_to_mean_impute]),
    columns=columns_to_mean_impute,
    # Keep the row_id index so later label-based lookups (`.loc[row_id, col]`)
    # address the intended row rather than relying on a positional RangeIndex.
    index=data.index,
)

### MICE (Multivariate feature imputation) Regression based imputation

A strategy for imputing missing values by modeling each feature with missing values as a function of the other features in a round-robin fashion. It performs multiple regressions over random samples of the data, then takes the average of the multiple regression values and uses that value to impute the missing value. In sklearn, it is implemented as follows:

In [None]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import xgboost

In [None]:
# Round-robin regression imputation of the (power-transformed) F_4 features,
# using a GPU XGBoost regressor as the per-feature estimator.
# NOTE(review): `eval_metric` is only consulted when an eval_set is supplied to
# fit(); IterativeImputer presumably does not pass one — confirm it is needed.
# NOTE(review): 'gpu_hist' / 'gpu_predictor' are tied to the installed xgboost
# version — confirm they are still accepted by the version in use.
reg_imputer = IterativeImputer(
    estimator=xgboost.XGBRegressor(
        n_estimators=1000,
        tree_method='gpu_hist',
        predictor="gpu_predictor",
        eval_metric=mean_squared_error,
    ),
    verbose=2,
    max_iter=20,
)

# Imputed values are still in Yeo-Johnson-transformed space at this point.
reg_imputed_transformed = pd.DataFrame(
    reg_imputer.fit_transform(f4_data),
    columns=f4_data.columns,
    index=f4_data.index,
)

# Map back to the original feature scale with the already-fitted `power`
# transformer; otherwise the F_4 predictions written to the submission would be
# on the transformed scale rather than the scale of the raw data.
reg_imputed_data = pd.DataFrame(
    power.inverse_transform(reg_imputed_transformed),
    columns=f4_data.columns,
    index=f4_data.index,
)

### Join both imputations

In [None]:
# Place the mean-imputed and regression-imputed feature groups side by side.
final_imputed_data = pd.concat(
    (mean_imputed_data, reg_imputed_data),
    axis='columns',
)

In [None]:
# Per-column count of values that are still missing after imputation.
final_imputed_data.isnull().sum()

## Generate Submission File

In [None]:
# Fill the submission in one vectorised lookup instead of one `.loc` write per
# row, which is very slow when the submission has many rows.
# Each index entry has the form '<row_id>-<column>'; split on the first '-' only.
key_parts = submission.index.str.split('-', n=1)
row_labels = key_parts.str[0].astype(int)
col_labels = key_parts.str[1]

# Translate labels to positions. get_indexer returns -1 for unknown labels,
# which would silently select the last row/column, so fail loudly instead
# (the per-row `.loc` lookup raised KeyError in that situation too).
row_pos = final_imputed_data.index.get_indexer(row_labels)
col_pos = final_imputed_data.columns.get_indexer(col_labels)
if (row_pos < 0).any() or (col_pos < 0).any():
    raise KeyError('submission contains row-col keys that are missing from final_imputed_data')

submission['value'] = final_imputed_data.to_numpy()[row_pos, col_pos]

submission.to_csv('mean_and_regression_imputer_power_transform_xgboost.csv')