In [None]:
import numpy as np
import pandas as pd
from tqdm import tqdm

import seaborn as sns
from matplotlib import pyplot as plt

from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

import warnings
# Silences every warning category for the rest of the notebook, including
# deprecation notices raised by the libraries used in later cells.
warnings.filterwarnings('ignore')

In [None]:
# Load the competition table; the `row_id` column becomes the DataFrame index.
data = pd.read_csv(
    '/kaggle/input/tabular-playground-series-jun-2022/data.csv',
    index_col='row_id',
)

# EDA

In [None]:
# Preview the first five rows of the table.
data.head(n=5)

In [None]:
# Per-column summary statistics (count, mean, std, min, quartiles, max).
data.describe(percentiles=[0.25, 0.5, 0.75])

## Data availability heatmap

In [None]:
from matplotlib.colors import ListedColormap

# Two-colour map for the boolean mask returned by `data.isna()`:
# False (value present) -> dark, True (value missing) -> light.
colors = [(0.1, 0.1, 0.1), (0.9, 0.9, 0.9)]
cmap = ListedColormap(colors)

plt.figure(figsize=(25, 8))
ax = sns.heatmap(data.isna(), yticklabels=False, cmap=cmap)

# Fix the tick positions first and only then attach the labels. Labelling
# before the ticks are pinned pairs a fixed set of labels with whatever ticks
# the colorbar's automatic locator happens to produce.
cbar = ax.collections[0].colorbar
cbar.set_ticks([0.25, 0.75])
cbar.set_ticklabels(['non-missing', 'missing'])
_ = ax.set(ylabel=None)

## The number of missing features per row

In [None]:
# Count the NaN cells in each record (row-wise sum of the boolean mask).
missing_per_row = pd.DataFrame({'num of missing features': data.isna().sum(axis=1)})

g = sns.displot(missing_per_row, kind='hist', aspect=2, rug=True, x='num of missing features', binwidth=0.2)
# `fontweight` is the documented suptitle keyword for bold text; assigning the
# result to `_` keeps the Text repr out of the cell output.
_ = g.fig.suptitle('Missing features per record distribution', fontweight='bold')

The number of missing fields per record ranges from 0 to 9.

In [None]:
# Flag, per column, whether at least one value is missing, then split the
# column labels into the "has gaps" and "complete" groups.
col_has_nan = data.isna().any(axis=0)
nan_cols = data.columns[col_has_nan]
non_nan_cols = data.columns[~col_has_nan]

# Index labels of every record that has at least one missing field.
nan_indices = data.index[data.isna().any(axis=1)]

In [None]:
# Number of missing cells in each column that has any gaps, drawn as bars.
missing_counts = data[nan_cols].isna().sum()
ax = missing_counts.plot.bar(figsize=(20, 8))
ax.set_title('Count of missing values')

In [None]:
# Mean of the per-column missing counts, truncated to an integer for display.
avg_missing = int(data[nan_cols].isna().sum().mean())
print('Columns with missing data have an average of {} missing values'.format(avg_missing))

## Distribution of columns containing missing values

In [None]:
# Size the grid from the number of columns instead of hard-coding it, so that
# `zip` below can never silently drop a column for lack of an axis.
n_grid_cols = 8
n_grid_rows = max(1, -(-len(nan_cols) // n_grid_cols))  # ceiling division
fig, axes = plt.subplots(n_grid_rows, n_grid_cols, figsize=(18, 14), squeeze=False)
axes = axes.ravel()

fig.suptitle('Distribution of columns containing missing values', fontsize=16)
for col, ax in zip(nan_cols, axes):
    sns.histplot(data[col], ax=ax)
    ax.set(ylabel=None, xlabel=None)
    ax.set_title(col)

# Hide the trailing axes that did not receive a histogram.
for spare_ax in axes[len(nan_cols):]:
    spare_ax.set_visible(False)
fig.tight_layout()

The column values appear to be continuous and normally distributed.

## Distribution of columns that do not contain missing values

In [None]:
# Size the grid from the number of columns instead of hard-coding it, so that
# `zip` below can never silently drop a column for lack of an axis.
n_grid_cols = 5
n_grid_rows = max(1, -(-len(non_nan_cols) // n_grid_cols))  # ceiling division
fig, axes = plt.subplots(n_grid_rows, n_grid_cols, figsize=(13, 10), squeeze=False)
axes = axes.ravel()

fig.suptitle('Distribution of non-missing values columns', fontsize=16)
for col, ax in zip(non_nan_cols, axes):
    sns.histplot(data[col], ax=ax, binwidth=0.2)
    ax.set(ylabel=None, xlabel=None)
    ax.set_title(col)

# Hide the trailing axes that did not receive a histogram.
for spare_ax in axes[len(non_nan_cols):]:
    spare_ax.set_visible(False)
fig.tight_layout()

The column values appear to be categorical.

# Imputing missing data

Missing data falls into three types:
* Missing completely at random **(MCAR)**: Whether a field is missing is determined neither by the value of the missing field itself nor by the values of other observed fields. Simple imputation methods can be used in this case.
* Missing at random **(MAR)**: Whether a field is missing is correlated with other variables in the data set. Advanced imputation methods should be applied.
* Missing not at random **(MNAR)**: Whether a field is missing correlates with the field's own (unobserved) value. This is the hardest case to impute, because the missingness mechanism itself has to be modeled. Since there is no information about the nature of the data, it is almost impossible to detect and impute this type of missingness.

## scikit-learn SimpleImputer

Imputing missing values using statistical methods with `SimpleImputer` offers several strategies: mean, median, most_frequent, and constant.
Since the columns with missing data are normally distributed, mean and median imputation would give nearly the same result. The most_frequent strategy is not suitable here because those columns are continuous, not categorical. And, of course, imputing with a constant value is not a wise decision.

In [None]:
from sklearn.impute import SimpleImputer

meanImp = SimpleImputer(
        missing_values=np.nan,
        strategy='mean')

# Hold out 10% of the rows; the column means are learned from the rest only.
train, test_raw = train_test_split(data[nan_cols], test_size=0.1, random_state=42)
meanImp.fit(train)

# Score only on held-out rows whose true values are all known.
test = test_raw.dropna()

# Blank out every cell so the imputer has to fill the whole frame. Keeping the
# frame's labels means the imputer sees the same column names it was fitted on
# and the predictions share `test`'s index.
blanked = pd.DataFrame(np.nan, index=test.index, columns=test.columns)
pred = pd.DataFrame(meanImp.transform(blanked), index=test.index, columns=train.columns)

# Per-column [RMSE, R2] of mean imputation against the known values.
SimpleImputer_eval = dict()
for target in nan_cols:
    # Take the root explicitly: the `squared=False` shortcut of
    # mean_squared_error is deprecated and removed in recent scikit-learn.
    rmse = np.sqrt(mean_squared_error(test[target], pred[target]))
    r2 = r2_score(test[target], pred[target])
    SimpleImputer_eval[target] = [rmse, r2]

In [None]:
# One row per imputed column, with its RMSE and R2 side by side.
SimpleImputer_eval_df = pd.DataFrame.from_dict(
    SimpleImputer_eval,
    orient='index',
    columns=['RMSE', 'R2'],
)
ax = SimpleImputer_eval_df.plot.bar(figsize=(20, 8))
ax.set_title('Mean Imputation')

The R2 value is nearly 0 for every imputed column, as expected: predicting the column mean for every row explains none of the variance.

## Regression Imputation

One technique for imputing missing data is to predict the missing values with a regression model. Because some records have more than one missing feature, the features fed to the model may themselves contain missing values, so a regression algorithm that handles missing data is needed.
This is why the LightGBM Regressor is used: it handles missing values by default.


In [None]:
from lightgbm import LGBMRegressor

# The fully observed columns are passed to LightGBM as categorical features;
# the list is the same for every target, so build it once.
categorical_cols = [*non_nan_cols]

LGBM_models = dict()  # target column -> fitted regressor
LGBM_eval = dict()    # target column -> [RMSE, R2] on the held-out split
for target in nan_cols:
    # Train only on rows where the target is known; the other features may
    # still contain NaN.
    clean_data = data.dropna(subset=[target])
    X = clean_data.drop(target, axis=1)
    y = clean_data[target]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

    model = LGBMRegressor(n_estimators=1500, metric='rmse')
    model.fit(X_train, y_train, categorical_feature=categorical_cols)
    y_pred = model.predict(X_test)
    # Take the root explicitly: the `squared=False` shortcut of
    # mean_squared_error is deprecated and removed in recent scikit-learn.
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)

    LGBM_models[target] = model
    LGBM_eval[target] = [rmse, r2]

In [None]:
# One row per target column, with the regressor's RMSE and R2 side by side.
LGBM_eval_df = pd.DataFrame.from_dict(
    LGBM_eval,
    orient='index',
    columns=['RMSE', 'R2'],
)
ax = LGBM_eval_df.plot.bar(figsize=(20, 8))
ax.set_title('LightGBM Regressor')

The R2 values show that only features with the prefix F_4_ can be predicted with a regression model; for the rest of the features, the R2 values show that the model does no better than imputing with the mean.

## Comparing LightGBM Regressor and Mean Imputation

In [None]:
# Put the RMSE of both approaches next to each other, one row per column.
compare_df = pd.DataFrame({
    'LightGBM Regressor': LGBM_eval_df['RMSE'],
    'Mean Imputation': SimpleImputer_eval_df['RMSE'],
})

ax = compare_df.plot.bar(figsize=(20, 8))
ax.set_title('LightGBM Regressor VS Mean Imputation')
ax.set_ylabel('RMSE')

In [None]:
# Select the incomplete records once; the selection does not depend on the
# target, so there is no reason to repeat it for every column.
rows_with_nan = data.loc[nan_indices]

# Predict every target column for every incomplete record, collecting the
# arrays first and building the frame in a single step.
predicted_columns = dict()
for target in tqdm(nan_cols):
    X = rows_with_nan.drop(target, axis=1)
    predicted_columns[target] = LGBM_models[target].predict(X)

predictions = pd.DataFrame(predicted_columns, index=nan_indices)

In [None]:
submission = pd.read_csv('/kaggle/input/tabular-playground-series-jun-2022/sample_submission.csv', index_col='row-col')

# Each index label has the form "<row_id>-<column name>"; split all labels at
# once instead of parsing them one by one in a Python loop.
keys = submission.index.str.split('-', expand=True)
row_ids = keys.get_level_values(0).astype(int)
col_names = keys.get_level_values(1)

# Translate the labels into positions inside `predictions`. `get_indexer`
# returns -1 for an unknown label, so check before indexing to keep the
# KeyError a label-based lookup would have raised.
row_pos = predictions.index.get_indexer(row_ids)
col_pos = predictions.columns.get_indexer(col_names)
if (row_pos < 0).any() or (col_pos < 0).any():
    raise KeyError('submission references a row or column that has no prediction')

# Pick one (row, column) cell per submission line in a single vectorised step.
submission['value'] = predictions.to_numpy()[row_pos, col_pos]

In [None]:
# Write the filled-in submission table to the working directory.
submission_path = 'LGBMRegressor_1500-estimator.csv'
submission.to_csv(submission_path)