In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
# Print the full path of every file found under /kaggle/input.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler
import gc
from multiprocessing import Pool
from functools import partial
import seaborn as sns
import matplotlib.pyplot as plt

# Step 0: Helper functions

In [None]:
def set_missing_values(df, incomplete_variables, missing_ratio, random_state=42):
    """
    Blank out a random share of rows in each listed column.

    Intended for cross validation, parameter tuning etc. The frame is
    modified in place and the same object is returned.

    Inputs:
        df : pd.DataFrame
            Dataframe that receives the artificial missing values
        incomplete_variables : list
            Columns in which missing values are generated
        missing_ratio : float
            Share of rows set to NaN per column, e.g. 0.0187 means 1.87%
        random_state : int
            Integer seed for the first column; later columns use derived seeds
    Returns:
        pd.DataFrame
            The input dataframe, now containing the generated NaN values
    """
    seed = random_state
    for col in incomplete_variables:
        rows_to_blank = df.sample(frac=missing_ratio, random_state=seed).index
        df.loc[rows_to_blank, col] = np.nan
        # Derive a different but deterministic seed for the next column so
        # that the columns do not all lose the same rows.
        seed = (seed * 6397) % 7919
    return df

def generate_missing_ids(df, incomplete_variables):
    """
    List every missing cell of the given columns as a "<row label>_<column>" id.

    Inputs:
        df : pd.DataFrame
            Dataframe whose missing cells are enumerated
        incomplete_variables : list
            Columns to scan for missing values
    Returns:
        pd.DataFrame
            Single-column frame ("row-col") with one id per missing cell,
            ordered column by column
    """
    row_col_ids = [
        str(row_label) + "_" + col
        for col in incomplete_variables
        for row_label in df.index[df[col].isnull().to_numpy()]
    ]
    return pd.DataFrame(row_col_ids, columns=["row-col"])

def grab_values(df, missing_ids, incomplete_variables, variable_name="predict"):
    """
    Collect the cell values of `df` that correspond to the given row-col ids.

    The index of `df` is concatenated with "_" and the column name, so it
    must hold strings.

    Inputs:
        df : pd.DataFrame
            Dataframe that holds the values to extract (e.g. the predictions)
        missing_ids: pd.DataFrame
            Dataframe with a "row-col" column listing the cells of interest
        incomplete_variables : list
            Columns to extract values from
        variable_name : str
            Name of the value column in the result
    Returns:
        pd.DataFrame
            Frame with the columns "row-col" and `variable_name`, one row per
            matched cell, ordered column by column
    """
    missing_values = list(missing_ids["row-col"])
    pieces = []
    for col in incomplete_variables:
        row_col_keys = df.index + "_" + col
        selected = row_col_keys.isin(missing_values)
        tdf = df.loc[selected]
        pieces.append(pd.DataFrame({
            "row-col": row_col_keys[selected],
            variable_name: tdf[col],
        }))
    if not pieces:
        # Nothing to extract: still return the expected columns so that a
        # later merge on "row-col" keeps working.
        return pd.DataFrame(columns=["row-col", variable_name])
    # DataFrame.append no longer exists in pandas 2.x; concatenate once.
    return pd.concat(pieces)

def fill_miss_means(df, incomplete_variables):
    """
    Fill the missing values of each listed column with that column's mean.

    Inputs:
        df : pd.DataFrame
            Dataframe containing the missing values
        incomplete_variables : list
            Columns whose missing values are replaced
    Returns:
        No return, the original dataframe is modified in place
    """
    for col in incomplete_variables:
        column_mean = np.nanmean(df[col])
        # Restrict the assignment to `col`; without the column selector the
        # mean would overwrite every column of the affected rows.
        df.loc[df[col].isnull(), col] = column_mean
        
def get_rmse(all_together, v1="realized", v2="forecast", to_print=False):
    """
    Calculates the RMSE between two columns and the per-row squared errors.

    A "diff" column holding the squared error is added to `all_together`
    in place. NaN squared errors are skipped in the sum, but the divisor is
    the total number of rows.

    Input:
        all_together: pd.DataFrame
            Df containing both value columns and a "row-col" column
        v1: str
            Name of the first column, e.g. realized
        v2: str
            Name of the second column, e.g. predict or forecast
        to_print: bool
            If True, print the RMSE rounded to two decimals

    Returns:
        rmse: float
            The RMSE value
        pd.DataFrame
            The "row-col" and "diff" columns of `all_together`
    """
    squared_error = (all_together[v1] - all_together[v2]) ** 2
    all_together["diff"] = squared_error
    rmse = np.sqrt(np.nansum(squared_error) / len(all_together))
    if to_print:
        print("RMSE : " + str(round(rmse, 2)))
    return rmse, all_together[["row-col", "diff"]]

def plot_rmse(rmse_list, title=""):
    """
    Draw a histogram of RMSE values and print their summary statistics.

    Inputs:
        rmse_list : list
            RMSE values, one per cross validation repeat
        title : str
            Text appended to the fixed plot title
    Returns:
        No return, the plot is drawn on the current matplotlib figure
    """
    plt.hist(rmse_list, color="#BA55D3")
    plt.title("RMSE values in cross validation" + title)
    summaries = (("Mean", np.mean), ("Std", np.std), ("Min", np.min), ("Max", np.max))
    for label, statistic in summaries:
        print(label + ": " + str(round(statistic(rmse_list), 4)))
    print("Folds: " + str(len(rmse_list)))

In [None]:
def single_perform_cross_validation(random_state,
                                    data_complete,
                                    incomplete_variables,
                                    missing_ratio,
                                    all_variables,
                                    fill_miss_engine=None,
                                    subsample=0.1,
                                    **kwargs):
    """
    Runs one cross validation repeat: hide known values, impute, score.

    Inputs:
        random_state : int
            Repeat number; multiplied by 100 to seed the missing value generation
        data_complete : pd.DataFrame
            Dataframe without missing values, indexed by string row ids
        incomplete_variables : list
            List of variables subject to missing value generation
        missing_ratio : float
            Ratio of missing variables e.g.: 0.0187 translates to 1.87% per col
        all_variables : list
            List of ALL variables in the dataframe
        fill_miss_engine : function
            Called as fill_miss_engine(frame, incomplete_variables, **kwargs);
            it must fill the missing values of the frame in place
        subsample : scalar
            Ratio of the rows to use, e.g. 0.1=10%; None uses all rows
        **kwargs
            Forwarded to fill_miss_engine
    Returns:
        tuple
            The (rmse, per-cell squared error dataframe) pair from get_rmse
    Raises:
        TypeError
            If fill_miss_engine is not callable
    """
    # Fail before any expensive work rather than at the imputation step.
    if not callable(fill_miss_engine):
        raise TypeError("fill_miss_engine must be a callable, got "
                        + type(fill_miss_engine).__name__)
    # NOTE(review): all_variables is accepted but not used here; the engine
    # receives incomplete_variables. Confirm whether that is intended.

    random_state = random_state * 100
    # 0. Use complete data and generate test data
    if subsample is not None:
        # The fixed seed means every repeat works on the same subsample;
        # only the pattern of hidden values differs between repeats.
        data_complete_small = data_complete.sample(frac=subsample, random_state=1)
    else:
        # Only read below, so no defensive copy is needed.
        data_complete_small = data_complete

    data_pipeline = set_missing_values(data_complete_small.copy(),
                                       incomplete_variables,
                                       missing_ratio,
                                       random_state=random_state)
    # 1. get missing ids
    missing_ids = generate_missing_ids(data_pipeline, incomplete_variables)

    # 2. get realized data
    value_realized = grab_values(df=data_complete_small,
                                 missing_ids=missing_ids,
                                 incomplete_variables=incomplete_variables,
                                 variable_name="realized")
    # 3. perform prediction
    fill_miss_engine(data_pipeline, incomplete_variables, **kwargs)

    # 4. get predicted data in format
    value_forc = grab_values(df=data_pipeline,
                             missing_ids=missing_ids,
                             incomplete_variables=incomplete_variables,
                             variable_name="forecast")
    # 5. evaluate
    all_together = value_forc.merge(value_realized, how="outer", on="row-col")
    return get_rmse(all_together)

def perform_cross_validation_parallel(data_complete,
                                      incomplete_variables,
                                      missing_ratio,
                                      all_variables,
                                      fill_miss_engine=None,
                                      subsample=0.1,
                                      repeat=3,
                                      processes=1,
                                      **kwargs,
                                      ):
    """
    Performs cross validation using a prespecified engine

    Each repeat is run by single_perform_cross_validation in a process pool.

    Inputs:
        data_complete : pd.DataFrame
            Dataframe to be enriched with missing values
        incomplete_variables : list
            List of variables subject to missing value generation
        missing_ratio : float
            Ratio of missing variables e.g.: 0.0187 translates to 1.87% per col
        all_variables : list
            List of ALL variables in the dataframe
        subsample : scalar
            Set subsample to use only a certain ratio of the data e.g. 0.1=10%
        repeat : int
            Number of repeats with different random_state (1..repeat)
        fill_miss_engine : function
            The imputation engine forwarded to single_perform_cross_validation
        processes : int
            Number of processes to be used in pool
        **kwargs
            Forwarded to the engine through single_perform_cross_validation
    Returns:
        list
            RMSE value of each repeat
        pd.DataFrame
            Per-cell squared errors of all repeats, concatenated
    Raises:
        ValueError
            If repeat is smaller than 1
    """
    # Without this check the failure would only surface in pd.concat below,
    # after a pool has already been started.
    if repeat < 1:
        raise ValueError("repeat must be at least 1, got " + str(repeat))

    run_single_repeat = partial(single_perform_cross_validation,
                                data_complete=data_complete,
                                incomplete_variables=incomplete_variables,
                                missing_ratio=missing_ratio,
                                all_variables=all_variables,
                                fill_miss_engine=fill_miss_engine,
                                subsample=subsample,
                                **kwargs)

    with Pool(processes=processes) as pool:
        # The repeat number doubles as the random_state of each run.
        results = pool.map(run_single_repeat, range(1, repeat + 1))

    rmse_values = [rmse for rmse, _ in results]
    df_final = pd.concat([detail for _, detail in results], axis=0)
    return rmse_values, df_final

# Step 1: Read data and explore

In [None]:
# Load the sample submission and show its first rows to see the expected result format.
sample_submit=pd.read_csv("/kaggle/input/tabular-playground-series-jun-2022/sample_submission.csv")
display(sample_submit.head())

In [None]:
# Raw competition data; data0 is the source frame for the cells below.
data0=pd.read_csv("/kaggle/input/tabular-playground-series-jun-2022/data.csv")

In [None]:
# First rows of the raw data.
display(data0.head())

In [None]:
# Summary statistics, transposed so that each row describes one column of data0.
display(data0.describe().T)

In [None]:
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in display() would only render an extra "None".
data0.info()

### -> Around 2% of the records are missing in each column

In [None]:
# Every column except the first one (presumably row_id - confirm) is a variable.
all_variables = list(data0.columns[1:])
# Columns whose name starts with "F_2" are treated as complete, all others as incomplete.
complete_variables = [var for var in all_variables if var.startswith("F_2")]
incomplete_variables = [var for var in all_variables if not var.startswith("F_2")]
# Save a complete table without any missing values. dropna already returns a
# new frame, so copying data0 first is unnecessary.
data_complete = data0.dropna(axis="rows")
print("Rows without any missing values: " + str(len(data_complete)))
# Per-column share of missing values used when generating artificial gaps.
missing_ratio = 1 - 0.9815

In [None]:
# Index the complete rows by row_id as a string; the helper functions build
# "<row id>_<column>" keys by string concatenation on the index.
# The guard makes this cell safe to re-run: once row_id is the index it is
# no longer a column and the conversion is skipped.
if "row_id" in data_complete.columns:
    data_complete = (
        data_complete
        .assign(row_id=data_complete["row_id"].astype(str))
        .set_index("row_id")
    )

### -> Non-missing observations are around 364 thousand (36.4% of total)
* <font size="4"> We have 55 columns with missing values; each column has approximately 2% missing</font>
* <font size="4"> If values were missing independently across columns, the probability that a row has no missing value at all would be (1-2%)^55, which is roughly 30-40%</font>
* <font size="4"> So at first glance there is no correlation between the missing values of the observations</font>
* <font size="4"> We might change our view later once we see more interaction between variables</font>


# Step 2: Naive approach with simple means

In [None]:
%%time
# Baseline: cross validate the column-mean imputation.
rmse_list_means, mse_df_means = perform_cross_validation_parallel(
    data_complete=data_complete,
    incomplete_variables=incomplete_variables,
    missing_ratio=missing_ratio,
    all_variables=all_variables,
    fill_miss_engine=fill_miss_means,
    subsample=0.1,
    repeat=24,
    processes=4,
)

In [None]:
# Histogram and summary statistics of the RMSE values from the mean imputation runs.
plot_rmse(rmse_list_means, title=" (Simple Means)")

<font size="4"> Notes: the scores observed here are around the mean benchmark which is 1.41613 </font>

# Step 3: KNN imputer with the wrapper function

<font size="4"><b> Explanation in steps </b></font><br>
<font size="3">1. We are going to compute distances, which are sensitive to scaling, hence we standardize </font><br>
<font size="3">2. KNNImputer identifies the "n_neighbors" nearest neighbors </font><br>
<font size="3">3. The imputed value will be the weighted average of the feature values of these "n_neighbors" points; only neighbors with a non-missing value for that feature are used </font><br>
<font size="3">4. We can have multiple weighting schemes: the default is 'uniform' (simple average), the 'distance' option sets the weights inversely proportional to the distance, and we can also supply our own weighting function </font><br>
<font size="3">5. Finally we need to scale our data back to the original scale </font>

<a href="https://scikit-learn.org/stable/modules/impute.html#knnimpute">Please read more in KNNImputer documentation</a>

In [None]:
def KNN_engine(data_pipeline, all_variables, **kwargs):
    """
    Impute missing values with KNNImputer on standardized data, in place.

    Inputs:
        data_pipeline : pd.DataFrame
            Dataframe containing the missing values; modified in place
        all_variables : list
            Columns that are standardized, imputed and scaled back
        **kwargs
            Passed to the KNNImputer constructor (e.g. n_neighbors, weights)
    Returns:
        No return, the original dataframe is modified
    """
    standardizer = StandardScaler()
    data_pipeline.loc[:, all_variables] = standardizer.fit_transform(
        data_pipeline.loc[:, all_variables]
    )
    neighbour_imputer = KNNImputer(**kwargs)
    # Impute on the standardized scale, then return to the original scale;
    # each step reads the columns from the frame and writes them back.
    for step in (neighbour_imputer.fit_transform, standardizer.inverse_transform):
        data_pipeline.loc[:, all_variables] = step(data_pipeline.loc[:, all_variables])
    gc.collect()

In [None]:
%%time
# KNN imputation with distance-weighted neighbours.
knn1_rmse_list, _ = perform_cross_validation_parallel(
    data_complete=data_complete,
    incomplete_variables=incomplete_variables,
    missing_ratio=missing_ratio,
    all_variables=all_variables,
    fill_miss_engine=KNN_engine,
    subsample=0.04,
    repeat=12,
    processes=4,
    n_neighbors=20,
    weights='distance',
)

In [None]:
# Histogram and summary statistics of the RMSE values from the distance-weighted KNN runs.
plot_rmse(knn1_rmse_list, title=" (KNN distance weighted)")

In [None]:
%%time
# KNN imputation with uniformly weighted neighbours.
knn2_rmse_list, _ = perform_cross_validation_parallel(
    data_complete=data_complete,
    incomplete_variables=incomplete_variables,
    missing_ratio=missing_ratio,
    all_variables=all_variables,
    fill_miss_engine=KNN_engine,
    subsample=0.04,
    repeat=12,
    processes=4,
    n_neighbors=20,
    weights='uniform',
)

In [None]:
# Histogram and summary statistics of the RMSE values from the uniformly weighted KNN runs.
plot_rmse(knn2_rmse_list, title=" (KNN uniform weighted)")

# Step 4: Iterative imputer with Simple OLS

<font size="4"><b> Explanation in steps </b></font><br>
<font size="3">0. The framework is built in a flexible way, so it can use multiple estimators</font><br>
<font size="3">1. We are going to standardize the data with a scaler, which will be handy if regularization is used</font><br>
<font size="3">2. IterativeImputer estimates the missing values for a given feature by using the other features as explanatory variables </font><br>
<font size="3">3. Once a model for a feature is fitted, the missing values of that feature are estimated with that model and the iteration proceeds to the next feature </font><br>
<font size="3">4. When a new iteration starts (all features have been imputed at least once), the estimation is performed using the newly imputed data as well, and the estimates for the missing values are refined </font><br>
<font size="3">5. The iteration goes on until the "max_iter" parameter is reached; its default value is 10 </font><br>
<font size="3">6. Finally we need to scale our data back to the original scale </font>

<a href="https://scikit-learn.org/stable/modules/impute.html#iterative-imputer">Please read more in IterativeImputer documentation</a>

In [None]:
from sklearn.experimental import enable_iterative_imputer
#We need to add the above import so the iterativeimputer can work
from sklearn.impute import IterativeImputer
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from lightgbm import LGBMRegressor

In [None]:
def IterativeImputerBase(data_pipeline, all_variables, **kwargs):
    """
    Impute missing values with IterativeImputer on standardized data, in place.

    Inputs:
        data_pipeline : pd.DataFrame
            Dataframe containing the missing values; modified in place
        all_variables : list
            Columns that are standardized, imputed and scaled back
        **kwargs
            Passed to the IterativeImputer constructor (e.g. estimator,
            max_iter). random_state defaults to 42 and may be overridden.
    Returns:
        No return, the original dataframe is modified
    """
    scaler = StandardScaler()
    data_pipeline.loc[:,all_variables]=scaler.fit_transform(data_pipeline.loc[:,all_variables])
    # Merge the default seed with the caller's options so that an explicit
    # random_state no longer collides with the built-in one.
    imputer_options = {"random_state": 42, **kwargs}
    imputer = IterativeImputer(**imputer_options)
    data_pipeline.loc[:,all_variables] = imputer.fit_transform(data_pipeline.loc[:,all_variables])
    # Return the imputed columns to their original scale.
    data_pipeline.loc[:,all_variables] = scaler.inverse_transform(data_pipeline.loc[:,all_variables])
    gc.collect()

In [None]:
%%time
# Iterative imputation with ordinary least squares as the per-feature estimator.
ols_rmse_list, ols_rmse_df = perform_cross_validation_parallel(
    data_complete=data_complete,
    incomplete_variables=incomplete_variables,
    missing_ratio=missing_ratio,
    all_variables=all_variables,
    fill_miss_engine=IterativeImputerBase,
    subsample=0.02,
    repeat=12,
    processes=4,
    estimator=LinearRegression(),
    max_iter=30,
)

In [None]:
# Histogram and summary statistics of the RMSE values from the iterative OLS runs.
plot_rmse(ols_rmse_list, title=" (Iterative OLS)")

# Step 5: Iterative imputer with DecisionTreeRegressor and LGBMRegressor

In [None]:
%%time
# Iterative imputation with a decision tree as the per-feature estimator.
cb_rmse_list, cb_rmse_df = perform_cross_validation_parallel(
    data_complete=data_complete,
    incomplete_variables=incomplete_variables,
    missing_ratio=missing_ratio,
    all_variables=all_variables,
    fill_miss_engine=IterativeImputerBase,
    subsample=0.02,
    repeat=12,
    processes=4,
    estimator=DecisionTreeRegressor(random_state=11),
    max_iter=10,
)

In [None]:
# Histogram and summary statistics of the RMSE values from the iterative decision tree runs.
plot_rmse(cb_rmse_list, title=" (Iterative RegressionTree)")

In [None]:
%%time
# Iterative imputation with a small LightGBM model as the per-feature estimator.
lgbm_rmse_list, lgbm_rmse_df = perform_cross_validation_parallel(
    data_complete=data_complete,
    incomplete_variables=incomplete_variables,
    missing_ratio=missing_ratio,
    all_variables=all_variables,
    fill_miss_engine=IterativeImputerBase,
    subsample=0.02,
    repeat=12,
    processes=2,
    estimator=LGBMRegressor(random_state=11, n_estimators=10),
    max_iter=2,
)

In [None]:
# Histogram and summary statistics of the RMSE values from the iterative LightGBM runs.
plot_rmse(lgbm_rmse_list, title=" (Iterative LGBM regressor)")

### Note 1: The "more" sophisticated methods take a lot of time to finish unfortunately
### Note 2: We are close to leaderboard benchmarks even with these small subsamples