In [None]:
# set up LightGBM to use GPUs. See https://www.kaggle.com/code/dromosys/gpu-accelerated-lightgbm-full/notebook

!rm -r /opt/conda/lib/python3.6/site-packages/lightgbm
!git clone --recursive https://github.com/Microsoft/LightGBM

In [None]:
!apt-get install -y -qq libboost-all-dev

In [None]:
%%bash
# Build LightGBM from source with OpenCL (GPU) support.
# Stop at the first failing step: without this, a failed `cd` would let the
# remaining commands (including the `rm`) run in the wrong directory, and a
# failed cmake would still fall through to make.
set -euo pipefail
cd LightGBM
# -f: a fresh clone has no build/ directory yet, so a plain `rm -r` would fail.
rm -rf build
mkdir build
cd build
cmake -DUSE_GPU=1 -DOpenCL_LIBRARY=/usr/local/cuda/lib64/libOpenCL.so -DOpenCL_INCLUDE_DIR=/usr/local/cuda/include/ ..
make -j$(nproc)

In [None]:
!cd LightGBM/python-package/;python3 setup.py install --precompile

In [None]:
!mkdir -p /etc/OpenCL/vendors && echo "libnvidia-opencl.so.1" > /etc/OpenCL/vendors/nvidia.icd
!rm -r LightGBM

## In this notebook, I will demonstrate various techniques for imputing data. While the results are for the TPS June 2022 challenge, the various functions and concepts can be used on any data science project. Please feel free to use whatever code or ideas help, and give this notebook an upvote!

In [None]:
import os, random, time
import numpy as np
import pandas as pd
from pandas.api.types import CategoricalDtype
from scipy.stats import mode

from sklearn.preprocessing import OrdinalEncoder, MinMaxScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer, SimpleImputer, KNNImputer
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, ExtraTreesRegressor, ExtraTreesClassifier
from sklearn.compose import ColumnTransformer
from category_encoders import MEstimateEncoder
from lightgbm import LGBMClassifier, LGBMRegressor, early_stopping

import seaborn as sns
from matplotlib import pyplot as plt

# Matplotlib 3.6 renamed the bundled seaborn styles to "seaborn-v0_8-*" and
# later releases dropped the old names, so use whichever spelling this
# installation actually provides.
_whitegrid_style = (
    "seaborn-whitegrid"
    if "seaborn-whitegrid" in plt.style.available
    else "seaborn-v0_8-whitegrid"
)
plt.style.use(_whitegrid_style)

# Global figure/axes defaults shared by every plot in the notebook.
plt.rc("figure", autolayout=True)
plt.rc(
    "axes",
    labelweight="bold",
    labelsize="large",
    titleweight="bold",
    titlesize=14,
    titlepad=10,
)

# 1. Exploratory Data Analysis
- look at the data set, and in particular, what data types are present and what gaps exist
- get counts of missing data
- look for any patterns in the missing data (see [here](https://www.kaggle.com/code/abdulravoofshaik/top-3-solution-lgbm-mean/notebook?scriptVersionId=97501106) for code and ideas used in this analysis)

In [None]:
df = pd.read_csv("../input/tabular-playground-series-jun-2022/data.csv", index_col='row_id')

In [None]:
df.head()

In [None]:
plt.figure(figsize=(20,8))
nulls = df.isnull().sum()
sns.barplot(x= nulls.index, y=nulls.values)
plt.title("Counts of Missing Values per Column")

In [None]:
plt.figure(figsize=(20,8))
sns.heatmap(df.isnull(), yticklabels=False, cbar=False)
plt.title("Locations of Null Values in the Data")

In [None]:
# look at counts of missing values for records

record_nulls = df.isnull().sum(axis=1)

In [None]:
np.unique(record_nulls, return_counts=True)

In [None]:
# Restrict the frame to the 'F_4' block, i.e. the columns whose second
# underscore-separated token is '4', then count missing cells per record.

f_4_df = df[df.columns[df.columns.isin([name for name in df.columns if name.split('_')[1] == '4'])]]
f_4_record_nulls = f_4_df.isna().sum(axis=1)

In [None]:
np.unique(f_4_record_nulls, return_counts=True)

- We have about 18-20% missing values within each column, except for the 'F_2' columns
- All of the variables appear to be numeric, so model-based imputation will be regression based
- The columns with 'F_4' variable type appear to have some correlations, as do the 'F_2' columns

# 2. Specify Helper Functions
- function for making sure categorical columns are labeled as such
- function for encoding categorical columns as a label encoder, with a function for reversing the encoding

In [None]:
def specify_categoricals(df):
    """Convert every object-dtype column of ``df`` to a pandas categorical.

    Each converted column also gets an extra ``"None"`` category registered
    (unless it already has one). Only the category is added; missing cells
    themselves are left as NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to convert. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for convenient reassignment by the caller.
    """
    # Nominal categories
    for name in df.select_dtypes("object"):
        as_category = df[name].astype("category")
        if "None" not in as_category.cat.categories:
            # add_categories returns a new Series; its `inplace=` argument was
            # deprecated and then removed from pandas, so reassign instead.
            as_category = as_category.cat.add_categories("None")
        df[name] = as_category
    return df

In [None]:
def label_encode(df):
    """Ordinal-encode the category-dtype columns of a copy of ``df``.

    Returns a 3-tuple ``(encoded, encoder, cat_columns)``:

    - ``encoded``: copy of ``df`` whose categorical columns hold the codes
      produced by the encoder,
    - ``encoder``: the fitted ``OrdinalEncoder`` (keep it to reverse the
      encoding later),
    - ``cat_columns``: the labels of the columns that were encoded.
    """
    encoded = df.copy()
    cat_columns = encoded.select_dtypes(["category"]).columns
    encoder = OrdinalEncoder()
    encoded[cat_columns] = encoder.fit_transform(encoded[cat_columns])
    return encoded, encoder, cat_columns

# 3. Specify Different Imputers
- Base imputer using some measure like mean or median for numerical, and mode for categorical
- kNN imputer
- Imputation by machine learning model (in this case, LightGBM)

In [None]:
def simple_impute(df):
    '''
    Impute the numerical columns by the mean value of each column and
    impute the categorical columns by the most frequent value (mode) of each column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with numeric and/or category-dtype columns containing missing values.

    Returns
    -------
    pandas.DataFrame
        A new frame carrying ``df``'s index. When ``df`` has categorical columns the
        result follows ``df``'s column order; otherwise only the (imputed) numeric
        columns are returned.

    Note: one can easily switch in different imputers for each of the data types,
    e.g. kNN or an IterativeImputer (with ExtraTreesRegressor / ExtraTreesClassifier).
    '''

    # Impute missing values for numerical data
    imp_num = SimpleImputer(strategy='mean')
    numerical_df = df.select_dtypes("number")
    numerical_df = pd.DataFrame(
        data=imp_num.fit_transform(numerical_df),
        index=numerical_df.index,
        columns=numerical_df.columns,
    )

    categorical_df = df.select_dtypes("category")
    if categorical_df.shape[1] == 0:
        return numerical_df

    # Impute missing values for categorical data: encode to ordinal codes,
    # fill with the most frequent code, then map the codes back to labels.
    enc = OrdinalEncoder()
    imp_cat = SimpleImputer(strategy='most_frequent')
    encoded = enc.fit_transform(categorical_df)
    categorical_imputations = enc.inverse_transform(imp_cat.fit_transform(encoded))
    # Keep the original index: the join below aligns on it, so rebuilding the
    # frame on a default RangeIndex would misalign (or blank out) the rows
    # whenever df is not indexed 0..n-1.
    categorical_df = pd.DataFrame(
        data=categorical_imputations,
        index=df.index,
        columns=categorical_df.columns,
        dtype="category",
    )
    return categorical_df.join(numerical_df).reindex(columns=df.columns)

In [None]:
def knn_impute(df, n_neighbors=5):
    '''
    Impute the numerical columns by k-Nearest Neighbors and
    impute the categorical columns by the most frequent value (mode) of each column.

    For the numerical impute the numeric columns are min-max scaled and the
    categorical columns (if any) are one-hot encoded and appended as extra
    neighbor features. The categorical impute uses the categorical columns only.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with numeric and/or category-dtype columns containing missing values.
    n_neighbors : int, default 5
        Number of neighbors handed to ``KNNImputer``.

    Returns
    -------
    pandas.DataFrame
        A new frame carrying ``df``'s index. When ``df`` has categorical columns the
        result follows ``df``'s column order; otherwise only the (imputed) numeric
        columns are returned.
    '''

    numerical_df = df.select_dtypes("number")
    categorical_df = df.select_dtypes("category")
    has_categoricals = categorical_df.shape[1] > 0
    n_numeric = numerical_df.shape[1]

    # Begin with imputing the numerical data by kNN first.
    # The index is kept on every intermediate frame so that joins align by row label.
    scaler = MinMaxScaler()
    features = pd.DataFrame(
        data=scaler.fit_transform(numerical_df),
        index=numerical_df.index,
        columns=numerical_df.columns,
    )

    if has_categoricals:
        # Use the encoder's default (sparse) output and densify it ourselves; the
        # keyword that requested dense output was renamed between scikit-learn
        # releases, so this spelling works on both sides of the rename.
        one_hot_enc = OneHotEncoder()
        one_hot = one_hot_enc.fit_transform(categorical_df)
        if hasattr(one_hot, "toarray"):
            one_hot = one_hot.toarray()
        one_hot_df = pd.DataFrame(
            data=one_hot,
            index=categorical_df.index,
            columns=one_hot_enc.get_feature_names_out(categorical_df.columns),
        )
        features = features.join(one_hot_df)  # add in categorical features

    imputed = KNNImputer(n_neighbors=n_neighbors).fit_transform(features)
    # The scaler was fitted on the numeric columns only, so drop the one-hot
    # columns before mapping the imputed values back to their original scales.
    numerical_df = pd.DataFrame(
        data=scaler.inverse_transform(imputed[:, :n_numeric]),
        index=df.index,
        columns=numerical_df.columns,
    )

    if not has_categoricals:
        return numerical_df

    # Now impute the categorical variables: encode to ordinal codes, fill with
    # the most frequent code, then map the codes back to labels.
    enc = OrdinalEncoder()
    imp_cat = SimpleImputer(strategy='most_frequent')
    encoded = enc.fit_transform(categorical_df)
    categorical_imputations = enc.inverse_transform(imp_cat.fit_transform(encoded))
    categorical_df = pd.DataFrame(
        data=categorical_imputations,
        index=df.index,
        columns=categorical_df.columns,
        dtype="category",
    )
    return categorical_df.join(numerical_df).reindex(columns=df.columns)

In [None]:
def ML_impute(df, params=None):
    '''
    Impute missing values by treating the imputation as a machine learning problem. For numerical
    columns, we treat the problem as a regression problem, and for categorical, a classification problem.
    We iterate through all of the columns (in random order) with one column being the target variable
    and the others being predictor variables.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with missing values. It is copied; the caller's frame is not modified.
    params : dict, optional
        Extra keyword arguments for the LightGBM models. Keys given here take
        precedence over the defaults set below (``n_estimators``, ``device``).

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with every missing cell replaced by a model prediction.
        Categorical columns are returned as the decoded original labels.
    '''
    params = {} if params is None else params
    df = df.copy()

    # label encode categorical variables (skipped when there are none, so the
    # encoder is never fitted on a zero-column frame)
    columns = df.columns.to_list()
    cat_cols = df.select_dtypes("category").columns.to_list()
    enc = OrdinalEncoder()
    if cat_cols:
        df[cat_cols] = enc.fit_transform(df[cat_cols])

    # Randomized column selection
    # (alternative: most null values to least, np.argsort(-df.isnull().sum().values))
    for i in random.sample(range(len(columns)), len(columns)):
        column = columns[i]
        missing = df[column].isnull()
        # Check to make sure there are null values that need to be imputed
        if not missing.any():
            continue

        print("Imputing Column: {}".format(column))

        # Rows where the column is observed train the model; rows where it is
        # missing are the ones to predict. Columns imputed earlier in this loop
        # are already filled and therefore act as complete predictors.
        X_train = df.loc[~missing].drop(columns=column)
        y_train = df.loc[~missing, column]
        X_test = df.loc[missing].drop(columns=column)

        # If we have more data, we use more estimators for the imputation model
        # (at least one, so tiny frames do not request zero trees)
        n_estimators = max(1, min(5000, len(X_train) // 10))
        model_kwargs = {"n_estimators": n_estimators, "device": "gpu", **params}

        if column in cat_cols:
            model = LGBMClassifier(**model_kwargs)
        else:
            model = LGBMRegressor(**model_kwargs)

        model.fit(X_train, y_train)
        # Note: this is the score on the training rows, not a held-out estimate.
        print("Score of Column {} is {}".format(column, model.score(X_train, y_train)))
        preds = model.predict(X_test)
        df.loc[missing, column] = preds.flatten()

    # Recode the categorical variables to their original values
    if cat_cols:
        df[cat_cols] = enc.inverse_transform(df[cat_cols])

    return df

In [None]:
def ensemble_impute(df, num_runs=5, params=None):
    '''
    Since the order of columns can matter in a model-based, iterative imputation, with this function
    we create an ensemble of model-based imputations - with different column orderings - to get more
    robust imputation results.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with missing values. It is not modified.
    num_runs : int, default 5
        Number of independent ``ML_impute`` runs to combine (must be >= 1).
    params : dict, optional
        Keyword arguments forwarded to ``ML_impute``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` in which numerical columns hold the mean across runs and
        categorical columns hold the most frequent label across runs.

    Raises
    ------
    ValueError
        If ``num_runs`` is smaller than 1.
    '''
    if num_runs < 1:
        raise ValueError("num_runs must be at least 1")
    params = {} if params is None else params

    # Work from private copies so the caller's frame is never overwritten.
    original_df = df.copy()
    final_df = df.copy()

    cat_cols = df.select_dtypes("category").columns.to_list()
    num_cols = [col for col in df.columns if col not in cat_cols]

    imputed_dfs = []
    for run in range(num_runs):
        start_time = time.time()
        imputed_dfs.append(ML_impute(original_df, params))
        print("--------- Run {} Complete ----------".format(run))
        print("---- Run {} Complete in time {:.3f} -----".format(run, time.time()-start_time))

    # Use mode across the categorical columns.
    # ML_impute hands back decoded labels, so encode each run to ordinal codes,
    # vote on the codes, and decode exactly once.
    if cat_cols:
        enc = OrdinalEncoder()
        enc.fit(original_df[cat_cols])
        codes = np.array([enc.transform(run_df[cat_cols]) for run_df in imputed_dfs])
        # Depending on the SciPy version, mode() either keeps or drops the reduced
        # axis; reshaping to (rows, columns) is correct in both cases.
        modal_codes = np.asarray(mode(codes, axis=0)[0]).reshape(codes.shape[1:])
        decoded = enc.inverse_transform(modal_codes)
        for position, col in enumerate(cat_cols):
            # Restore the column's original categorical dtype.
            final_df[col] = pd.Categorical(decoded[:, position], dtype=original_df[col].dtype)

    # Use mean across the numerical columns. Observed cells are identical in
    # every run (ML_impute only writes to missing cells), so only imputed
    # cells are actually averaged.
    if num_cols:
        final_df[num_cols] = np.mean(
            np.array([run_df[num_cols].values for run_df in imputed_dfs]), axis=0
        )

    return final_df

# 4. Now, lets do some data imputations
- Do any preprocessing of the dataframes, to identify different column dtypes
- specify a function for saving out the imputed data
- Based on previous notebooks, we will do two imputations
    - simple imputation for columns in the 'F_1' and 'F_3' blocks
    - model-based imputation for columns in the 'F_4' block

In [None]:
def make_submission(df, results):
    """Build the long-format submission table for every missing cell of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        The original frame, still containing the missing values.
    results : pandas.DataFrame
        The imputed frame. It must share ``df``'s index labels and contain
        every column of ``df`` that has missing values.

    Returns
    -------
    pandas.DataFrame
        Two columns: ``'row-col'`` (``"<row label>-<column name>"``) and
        ``'value'`` (the imputed value read from ``results``). Rows are ordered
        by column, then by row order within ``df``. The frame is empty, but
        still has both columns, when ``df`` has no missing values.
    """
    per_column = []
    for col in df.columns:
        null_rows = df.index[df[col].isnull()]
        if len(null_rows) == 0:
            continue
        per_column.append(
            pd.DataFrame(
                {
                    'row-col': ["{}-{}".format(row, col) for row in null_rows],
                    # One label-based lookup per column instead of one per cell.
                    'value': results.loc[null_rows, col].to_numpy(),
                }
            )
        )

    if not per_column:
        return pd.DataFrame(columns=['row-col', 'value'])

    return pd.concat(per_column, ignore_index=True)

In [None]:
df = specify_categoricals(df)

## 4(a) Using each of the Different Imputers

### Simple Impute
------------------------
Simple Impute results: ~1.41708

In [None]:
simple_impute_df = simple_impute(df)

In [None]:
simple_df_results = make_submission(df, simple_impute_df)
simple_df_results.to_csv("simple_impute_submission.csv", index=False)

### k-NN Impute
-----------------------
***note, this one can take a long time to run***. You can also easily craft an IterativeImputer from this with scikit-learn's imputers.

In [None]:
#knn_impute_df = knn_impute(df)

In [None]:
#knn_impute_df_results = make_submission(df, knn_impute_df)
#knn_impute_df_results.to_csv("knn_impute_submission.csv", index=False)

### Imputation by Machine Learning Model
------------------------------
LightGBM Boost Impute with standard settings (~0.91645)

In [None]:
#boost_impute_df = ML_impute(df)

In [None]:
#boost_df_results = make_submission(df, boost_impute_df)
#boost_df_results.to_csv("basic_lgbm_impute_submission.csv", index=False)

## 4(b) Using a combination of imputers on different columns

We will use this code to create the submission. The use of a combination of imputers was inspired by [this notebook](https://www.kaggle.com/code/djustin/mean-and-lgb/notebook?scriptVersionId=97493929), [this notebook](https://www.kaggle.com/code/mirenaborisova/tps-june-22-simpleimputer-lgbm-lb-0-87540/notebook?scriptVersionId=97680406), and [this notebook](https://www.kaggle.com/code/abdulravoofshaik/top-3-solution-lgbm-mean/notebook?scriptVersionId=97501106). Please consider giving their notebooks a look (and an upvote).

In [None]:
# Create a working df with the imputations and just carry forward the 'F_2' block, since no imputation is needed there
F_2_cols = [i for i in list(df.columns) if i.split('_')[1] in ['2']]
working_df = df[df.columns[df.columns.isin(F_2_cols)]]

### Simple Impute Step
Simple imputations for those columns in F_1 and F_3 blocks

In [None]:
F_1_and_3_cols = [i for i in list(df.columns) if i.split('_')[1] in ['1', '3']]
simple_impute_df = simple_impute(df[df.columns[df.columns.isin(F_1_and_3_cols)]])

In [None]:
# Add in the simple imputations from just the F_1 and F_3 blocks to the working df
working_df = working_df.merge(simple_impute_df, left_index=True, right_index=True)

### Model-based (Ensemble) Impute Step
Model-based imputations for columns in the F_4 block

In [None]:
# Add in the F_4 columns to the working_df that already has imputations for all of the other blocks, so that we can use previous imputations in determining F_4's imputations

F_4_cols = [i for i in list(df.columns) if i.split('_')[1] in ['4']]
working_df = working_df.merge(df[df.columns[df.columns.isin(F_4_cols)]], left_index=True, right_index=True)

In [None]:
# Add in the number of missing cells per record as a feature

working_df['total_na_counts'] = df.isnull().sum(axis=1)
f_4_df = df[df.columns[df.columns.isin([i for i in list(df.columns) if i.split('_')[1] in ['4']])]]
working_df['F_4_na_counts']= f_4_df.isnull().sum(axis=1)

In [None]:
lgb_params = {
    'lambda_l1': 1,
    'lambda_l2': 1,
    'bagging_freq': 1,
    'bagging_fraction': 0.7,
    'verbose':-1
}

In [None]:
working_df = ensemble_impute(working_df, params=lgb_params)

In [None]:
# Keep only the columns of the original data set; this drops the helper
# NA-count features that were added purely as model inputs.
# (`partial_working_df` was never defined in this notebook; the imputed frame is `working_df`.)
final_df = working_df[working_df.columns[working_df.columns.isin(df.columns)]]

In [None]:
# Finally, submit results of the imputation

df_results = make_submission(df, final_df)
df_results.to_csv("submission.csv", index=False)

As you can see from the results, an ML-model-based imputer tends to give the best results for this type of missing data situation. However, also including simple imputation for some of the columns will give better results. There are some future avenues to investigate based on this:
- changing the order in which the ML-model imputer sees the columns
- changing the ML model in the imputer, or its settings
- trying some feature engineering or imputation across the 'F' block of features

***Please don't hesitate to post any comments or questions, and consider upvoting***