In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input directory tree and print the full path of every file found.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## In this competition, we have been given a single dataset with some missing values, and our goal is to predict those missing values. 

## The aim of this notebook is to show how to create training and validation sets from the entire data and use that to improve the model's performance.

## Some functions to generate plots have been taken from [this](https://www.kaggle.com/code/robikscube/handling-with-missing-data-youtube-stream) notebook.

## Update

#### 1. The actual % of missing values per column is 1.8% and not 18%. This has been rectified.
#### 2. A Cross validation loop has been included in place of simple train_test_split.
#### 3. The metric has been changed to RMSE instead of MSE.

In [None]:
# pyplot is the supported plotting interface; matplotlib.pylab is a legacy
# namespace that the matplotlib documentation discourages.
import matplotlib.pyplot as plt
# Colour cycle in effect at this point, i.e. before the ggplot style is applied below.
color_pal = plt.rcParams['axes.prop_cycle'].by_key()['color']

import warnings
# NOTE(review): this silences every warning for the rest of the notebook,
# including deprecation notices from the libraries used below.
warnings.filterwarnings("ignore")
plt.style.use('ggplot')

import seaborn as sns

In [None]:
# Read the competition data and the sample submission, in that order.
train, ss = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/tabular-playground-series-jun-2022/data.csv",
        "../input/tabular-playground-series-jun-2022/sample_submission.csv",
    )
)

In [None]:
# Dimensions of the loaded data as (rows, columns).
train.shape

## Function to plot % of missing values

In [None]:
def show_perc_values_missing(df):
    """Plot the percentage of missing values for every column that has any.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are inspected for missing values.

    Returns
    -------
    None
        A horizontal bar chart is drawn and shown. When no column contains
        missing values a short message is printed instead.
    """
    # isna().mean() is a fraction in [0, 1]; scale it so the plotted numbers
    # agree with the "%" in the chart title.
    pct_missing = pd.DataFrame({"train_missing": df.isna().mean() * 100})
    pct_missing = pct_missing[pct_missing["train_missing"] > 0]

    if pct_missing.empty:
        # pandas raises when asked to plot an empty frame, so report instead.
        print("No missing values found.")
        return

    pct_missing.plot(
        kind="barh", figsize=(8, 15), title="% of Values Missing"
    )
    plt.show()

In [None]:
# Visualise which columns of the loaded data contain missing values.
show_perc_values_missing(train)

## ~1.8% of the values are missing in each column

In [None]:
# Names of all columns with missing values: the F_1_*, F_3_* and F_4_* families,
# listed family by family in ascending index order.
nacols = [
    f"F_{family}_{position}"
    for family, width in ((1, 15), (3, 25), (4, 15))
    for position in range(width)
]

In [None]:
# Per-row count of missing entries across the columns listed in `nacols`.
train["n_missing"] = train[nacols].isnull().sum(axis="columns")

In [None]:
# Bar chart of how many rows have each number of missing values.
missing_per_row = train["n_missing"].value_counts()
missing_per_row.plot.bar(title="Number of Missing Values per Sample")

In [None]:
# Rows in which none of the `nacols` columns is missing. The explicit copy makes
# this an independent frame: later cells modify it in place, and without the copy
# pandas treats those writes as edits to a filtered slice of `train`.
rows_with_no_missing = train.query("n_missing == 0").copy()
rows_with_no_missing

## ~36% of the rows have no missing values. 

## We can use these 36% of the rows to create training and validation sets. But we have to keep certain things in mind to ensure that the train data we create resembles the original dataset as closely as possible. This is done by the below steps.

* Introduce missing values **"randomly"** in the 36% of the rows with no missing data.
* Make sure to keep "F_2_*" columns as non-missing.
* Ensure around 1.8% of the data is missing in each column.

## Since we know the ground truth values for these 36% of the rows, we can use the training and validation sets thus created to compare models and improve performance.

In [None]:
# Remove the row identifier and the helper missing-count column.
# Reassigning instead of using inplace=True keeps the cell free of in-place
# mutation, and errors="ignore" makes it safe to re-run after the columns are gone.
rows_with_no_missing = rows_with_no_missing.drop(columns=['row_id', 'n_missing'], errors="ignore")

## Create a copy of the dataframe containing rows with no missing values and then randomly introduce ~1.8% of missing values in each column

In [None]:
# Keep an untouched snapshot of the complete rows; it serves as the ground truth
# once values are blanked out of `rows_with_no_missing`.
sub_train_ground_truth = rows_with_no_missing.copy(deep=True)

In [None]:
# Fixed seed so the artificial missingness pattern, and therefore every error
# reported afterwards, is reproducible. One shared generator is advanced across
# the loop so each column gets its own random set of rows.
mask_rng = np.random.RandomState(42)

for col in nacols:
    # Pick ~1.8% of the rows at random and blank out this column for them.
    vals_to_nan = rows_with_no_missing[col].sample(frac=0.018, random_state=mask_rng).index
    # np.nan is the canonical spelling; the np.NaN alias was removed in NumPy 2.0.
    rows_with_no_missing.loc[vals_to_nan, col] = np.nan

In [None]:
# Check % of missing values in newly created dataframe
# Check the share of missing values per column after the artificial masking,
# to confirm it is close to the ~1.8% seen in the original data.
show_perc_values_missing(rows_with_no_missing)

## ~1.8% of the values are now missing in each of these columns. Now we will split the data into training and validation sets, try an Imputer on both sets and check its performance.

In [None]:
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold

## Simple Imputer

In [None]:
from sklearn.impute import SimpleImputer
# Baseline imputer: mean strategy, with no missing-indicator columns appended.
imptr = SimpleImputer(add_indicator=False, strategy="mean")

In [None]:
kf = KFold(n_splits=5)

# Per-fold scores. The list names say "mse", but the square root (RMSE) is stored.
mse_train = []
mse_valid = []

# NOTE(review): the error is averaged over every cell of the frame, not only the
# cells that were blanked out -- confirm this is the intended metric.
for fold_no, (train_idx, valid_idx) in enumerate(kf.split(rows_with_no_missing), start=1):
    print("CV fold:" + str(fold_no))

    # Fit the imputer on the training part only, then reuse it on the held-out part.
    train_part = rows_with_no_missing.iloc[train_idx]
    valid_part = rows_with_no_missing.iloc[valid_idx]
    train_imputed = pd.DataFrame(imptr.fit_transform(train_part), columns=rows_with_no_missing.columns)
    valid_imputed = pd.DataFrame(imptr.transform(valid_part), columns=rows_with_no_missing.columns)

    # Ground truth for the same rows, taken from the snapshot made before masking.
    y_train = sub_train_ground_truth.iloc[train_idx]
    y_valid = sub_train_ground_truth.iloc[valid_idx]

    # Score each part once and reuse the value for both the list and the printout.
    rmse_train = np.sqrt(mean_squared_error(y_train, train_imputed))
    rmse_valid = np.sqrt(mean_squared_error(y_valid, valid_imputed))
    mse_train.append(rmse_train)
    mse_valid.append(rmse_valid)

    print("Training error:" + str(rmse_train))
    print("Validation error:" + str(rmse_valid))

In [None]:
# Average the per-fold scores of the simple imputer and report both figures.
avg_train_error = sum(mse_train) / len(mse_train)
avg_valid_error = sum(mse_valid) / len(mse_valid)
print("The average training error is: " + str(avg_train_error))
print("The average CV error is: " + str(avg_valid_error))

## LGBM Imputer

In [None]:
# !rm -r kuma_utils
!git clone https://github.com/analokmaus/kuma_utils.git

In [None]:
import sys
# Add the cloned repository directory to the module search path before importing from it.
sys.path.append("kuma_utils/")
from kuma_utils.preprocessing.imputer import LGBMImputer

In [None]:
lgbm_imtr = LGBMImputer(n_iter=100, verbose=True)

# Per-fold scores. The list names say "mse", but the square root (RMSE) is stored.
mse_train_lgbm = []
mse_valid_lgbm = []

# NOTE(review): the error is averaged over every cell of the frame, not only the
# cells that were blanked out -- confirm this is the intended metric.
for fold_no, (train_idx, valid_idx) in enumerate(kf.split(rows_with_no_missing), start=1):
    print("CV fold:" + str(fold_no))

    # Fit the imputer on the training part only, then reuse it on the held-out part.
    train_part = rows_with_no_missing.iloc[train_idx]
    valid_part = rows_with_no_missing.iloc[valid_idx]
    train_imputed = pd.DataFrame(lgbm_imtr.fit_transform(train_part), columns=rows_with_no_missing.columns)
    valid_imputed = pd.DataFrame(lgbm_imtr.transform(valid_part), columns=rows_with_no_missing.columns)

    # Ground truth for the same rows, taken from the snapshot made before masking.
    y_train = sub_train_ground_truth.iloc[train_idx]
    y_valid = sub_train_ground_truth.iloc[valid_idx]

    # Score each part once and reuse the value for both the list and the printout.
    rmse_train = np.sqrt(mean_squared_error(y_train, train_imputed))
    rmse_valid = np.sqrt(mean_squared_error(y_valid, valid_imputed))
    mse_train_lgbm.append(rmse_train)
    mse_valid_lgbm.append(rmse_valid)

    print("Training error:" + str(rmse_train))
    print("Validation error:" + str(rmse_valid))

In [None]:
# Average the per-fold scores of the LGBM imputer and report both figures.
avg_train_error_lgbm = sum(mse_train_lgbm) / len(mse_train_lgbm)
avg_valid_error_lgbm = sum(mse_valid_lgbm) / len(mse_valid_lgbm)
print("The average training error is: " + str(avg_train_error_lgbm))
print("The average CV error is: " + str(avg_valid_error_lgbm))

In [None]:
# Build the comparison table from the per-fold scores collected by the two
# cross-validation loops instead of pasting numbers from an earlier run, and
# keep the errors numeric so the table can be sorted and formatted.
data = [
    ['Simple_Imputer', sum(mse_train) / len(mse_train), sum(mse_valid) / len(mse_valid)],
    ['LGBM_Imputer', sum(mse_train_lgbm) / len(mse_train_lgbm), sum(mse_valid_lgbm) / len(mse_valid_lgbm)],
]
df = pd.DataFrame(data, columns=['Model', 'Training_Error', 'Cross Validation_Error'])

df

## Will add and compare more models that have been used in this competition so far.

# Thanks for reading!!