In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _, files in os.walk('/kaggle/input'):
    for file_path in (os.path.join(folder, name) for name in files):
        print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# September 2021 Tabular Playground
This notebook aims to show an entire workflow of Data Science by using the dataset from September 2021 Tabular Playground. A competitive performance is ensured.
* [**Author**](https://www.linkedin.com/in/chi-wang-22a337207/)
* [**Dataset**](https://www.kaggle.com/c/tabular-playground-series-sep-2021/data)

# Key Findings
* Generating a feature that counts the **number of missing values in each record** significantly improves the performance. (Given the scenario of the data, i.e. whether a customer made a claim upon an insurance policy, missing information is itself a suspicious signal.)
* Data with **appropriate group imputation** performs better than doing nothing with the missing values. The group key in this dataset is whether a record has any missing value (**missing_sign**).
* If the label ("claim") is used as the group imputation key, the performance reaches about 0.9 AUC (not shown in this notebook), although this is not allowed because it is data leakage. However, it indicates that finding or generating features that are highly related to the label can improve the performance a lot. ---**The importance of features**
* Calculating **mutual information** or simply visualizing each feature can reveal several insights before modelling. Compare these results with the feature importance results.
* Using the **aggregated results** from the chosen models can slightly improve the final results (if the chosen models are all quite competitive), e.g. CatBoost and LightGBM in this notebook.

# Issues
* Broader range of parameter tuning sets are required. (The limitation of Kaggle's resource)
* Could try to generate more important feature to boost the performance.
* Efficient way to find the optimal parameter

# Reference(Thanks for Inspiration)
* [TPS September 2021 EDA](https://www.kaggle.com/dwin183287/tps-september-2021-eda)
* [Takeaways from TPS Sep 2021](https://www.kaggle.com/c/tabular-playground-series-sep-2021/discussion/274966)

# Table of Contents
1. [Data Overview](#1)
    * [1. Load Data](#1.1)
    * [2. Data Type](#1.2)
    * [3. Statistical View](#1.3)
2. [Data Preprocessing](#2)
    * [1. Drop Irrelevant Columns](#2.1)
    * [2. Missing Value Detection](#2.2)
    * [3. New Feature Generation](#2.3)
    * [4. Data Imputation](#2.4)
        * [1. Drop all NaN/NA/null](#2.4.1)
        * [2. Median Imputation](#2.4.2)
        * [3. Mean Imputation](#2.4.3)
3. [Data Analysis](#3)
    * [1. What is the distribution of claim? ](#3.1)
    * [2. What is the distribution of features on claim? ](#3.2)
4. [Feature Engineering](#4)
    * [1. Correlation Analysis ](#4.1)
    * [2. Mutual Information ](#4.2)
5. [Modelling](#5)
    * [1. Train Test Split ](#5.1)
    * [2. Train Models ](#5.2)
        * [0. Manual Prediction ](#5.2.0)
        * [1. XGboost ](#5.2.1)
        * [2. CatBoost ](#5.2.2)
        * [3. LightGBM ](#5.2.3)
    * [3. Model Comparison ](#5.3)
    * [4. Best Model Explanation ](#5.4)
    * [5. Parameter/Feature Tuning ](#5.5)
6. [Prediction](#6)
    * [1. Load Data](#6.1)
    * [2. Drop Irrelevant Columns](#6.2)
    * [3. New Feature Generation](#6.3)
    * [4. Data Imputation](#6.4)
    * [5. Make Prediction](#6.5)
    * [6. Save the Prediction to CSV file](#6.6)

<a id="1"></a>
# 1. Data Overview

In [None]:
# Import packages
import time
import gc

## Basic data processing
import numpy as np
import pandas as pd

## Data Visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots

## Modelling
from sklearn.feature_selection import mutual_info_classif
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.pipeline import make_pipeline
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier, plot_importance

## Model Explanatory
import shap  # package used to calculate Shap values
import eli5

## Settings
# Raise pandas' display limits so that wide frames are shown in full.
for option_name, option_value in (
    ('display.max_columns', 500),       # columns rendered when a DataFrame is displayed
    ('display.max_info_columns', 150),  # columns listed individually by DataFrame.info()
):
    pd.set_option(option_name, option_value)

<a id="1.1"></a>
## 1.1. Load Data

In [None]:
# Read the competition training set into memory.
data_df = pd.read_csv(
    filepath_or_buffer="../input/tabular-playground-series-sep-2021/train.csv"
)
# Summarise the frame: number of entries, column dtypes and memory usage.
data_df.info()

In [None]:
# Preview the first five rows of the training data.
data_df.head(n=5)

* int64: id and claim
* float64: all other features

<a id="1.2"></a>
## 1.2. Data Type

> [NOIR](https://www.questionpro.com/blog/nominal-ordinal-interval-ratio/): Nominal, Ordinal, Interval, Ratio.  

As the features in this dataset have been anonymized, we just assume that the data type of each feature is what it looks like. --**Ratio**

<a id="1.3"></a>
## 1.3. Statistical View 

In [None]:
# Basic statistics on the label.
# Casting to "object" makes describe() report count / unique / top / freq,
# which is the natural summary for nominal data.
data_df.claim.astype(object).describe()

There are two classes in label(claim), quite balanced.

In [None]:
# Basic statistics on every column except the label.
data_df.drop(columns=["claim"]).describe()

It is hard to draw insights from the raw statistics alone. However, the ranges of several features are very large, which is worth noting:  
f9, f12, f26, f27, f35, f62, f73, f74, f82, f86, f98, f108, f116

<a id="2"></a>
# 2. Data Preprocessing

<a id="2.1"></a>
## 2.1. Drop Irrelevant Columns

In [None]:
# Irrelevant columns
#   id: a row identifier, useless for analysis and modelling.
irrelevant_columns = ['id']

# Work on a new frame without those columns; data_df itself is left untouched.
data_preprocessed_df = data_df.drop(columns=irrelevant_columns)
data_preprocessed_df

<a id="2.2"></a>
## 2.2. Missing Value Detection

In [None]:
# Treat empty / single-space entries as missing so they are counted together with NaN.
for blank_token in ("", " "):
    data_preprocessed_df.replace(blank_token, np.nan, inplace=True)

# Percentage of missing values (NaN / None) per column, sorted from most to least missing.
count_missing_value = data_preprocessed_df.isna().sum() / len(data_preprocessed_df) * 100
count_missing_value_df = count_missing_value.sort_values(ascending=False).to_frame('Missing%')

In [None]:
# Visualize the percentage (> 0) of missing values in each column.
missing_value_df = count_missing_value_df[count_missing_value_df['Missing%'] > 0]

plt.figure(figsize=(10, 20)) # Set the figure size
missing_value_graph = sns.barplot(y = missing_value_df.index, x = "Missing%", data=missing_value_df, orient="h")
missing_value_graph.set_title("Percentage Missing Value", fontsize = 20)
# Horizontal bars: the features are on the y-axis and the percentage is on the x-axis,
# so each axis gets the label that matches what it actually shows.
missing_value_graph.set_xlabel("Missing%")
missing_value_graph.set_ylabel("Features")

It seems there aren't many missing values in any single feature. However, almost every feature has about **1.6%** missing values.

<a id="2.3"></a>
## 2.3. New Feature Generation

In [None]:
# New feature: how many values are missing in each record. Inspired by the scenario
# of the dataset (whether a customer made a claim upon an insurance policy):
# the less information we have about a user, the higher the risk of that user.
data_preprocessed_df['missing_num'] = data_preprocessed_df.isna().sum(axis="columns")

In [None]:
# New feature: how many values are negative in each record
# (counted over all columns currently in the frame).
data_preprocessed_df['neg_num'] = data_preprocessed_df.lt(0).sum(axis="columns")

In [None]:
# New feature derived from missing_num: 1 if the record has at least one missing
# value, 0 otherwise (stored as an integer rather than a boolean).
data_preprocessed_df['missing_sign'] = data_preprocessed_df['missing_num'].ne(0).astype('int64')

<a id="2.4"></a>
## 2.4. Data Imputation
> Choose the suitable imputation tech which can highly represent the central tendency of the data.

<a id="2.4.1"></a>
### 2.4.1. Drop all NaN/NA/null 

In [None]:
# Keep only the records without any NaN/NA/null and renumber the rows from 0.
data_preprocessed_dropNaN_df = data_preprocessed_df.dropna().reset_index(drop=True)

In [None]:
# Percentage of rows that contain at least one missing value
# (i.e. the share of rows removed by dropna()).
(len(data_preprocessed_df) - len(data_preprocessed_dropNaN_df)) / len(data_preprocessed_df) * 100

It seems that we can't simply drop the rows with missing values. Although each feature has very few missing values, they are scattered across different rows, so almost 40% of the rows have at least one missing value.

<a id="2.4.2"></a>
### 2.4.2. Median Imputation 

In [None]:
# Names of the features that contain missing values (the index of missing_value_df).
missing_features = missing_value_df.index.tolist()

In [None]:
# Grouped median imputation. The group key is missing_sign, a feature that is highly
# related to the label (claim): every gap is filled with the median of the records
# that share the same missing_sign value.
data_preprocessed_median_df = data_preprocessed_df.copy()
group_key = data_preprocessed_df['missing_sign']
for feature in missing_features:
    column = data_preprocessed_df[feature]
    # transform() returns a Series aligned with the original index, so the result can be
    # used directly by fillna(). groupby(...).apply(...) is avoided because, depending on
    # the pandas version, it prepends the group key to the index and breaks the assignment.
    group_median = column.groupby(group_key, sort=False).transform('median')
    data_preprocessed_median_df[feature] = column.fillna(group_median)

<a id="2.4.3"></a>
### 2.4.3. Mean Imputation 

In [None]:
# Grouped mean imputation. The group key is missing_sign, a feature that is highly
# related to the label (claim): every gap is filled with the mean of the records
# that share the same missing_sign value.
data_preprocessed_mean_df = data_preprocessed_df.copy()
group_key = data_preprocessed_df['missing_sign']
for feature in missing_features:
    column = data_preprocessed_df[feature]
    # transform() returns a Series aligned with the original index, so the result can be
    # used directly by fillna(). groupby(...).apply(...) is avoided because, depending on
    # the pandas version, it prepends the group key to the index and breaks the assignment.
    group_mean = column.groupby(group_key, sort=False).transform('mean')
    data_preprocessed_mean_df[feature] = column.fillna(group_mean)

We now have 4 datasets:
* **data_preprocessed_df**: The original data without imputation.
* **data_preprocessed_dropNaN_df**: All records containing a ***NaN*** value are dropped.
* **data_preprocessed_median_df**: Grouped median imputation on ***missing_sign***.
* **data_preprocessed_mean_df**: Grouped mean imputation on ***missing_sign***.

In [None]:
# Pick one of the candidate datasets for analysis and modelling; the final choice is
# the one with the best performance. Alternatives that can be swapped in:
#   data_preprocessed_df, data_preprocessed_dropNaN_df, data_preprocessed_median_df
data_best_df = data_preprocessed_mean_df.copy(deep=True)

In [None]:
# Display the dataset selected for the analysis below.
data_best_df

<a id="3"></a>
# 3. Data Analysis

<a id="3.1"></a>
## 3.1. What is the distribution of claim?

In [None]:
# Count the number of records for each claim value (0/1) and store the counts in a
# one-column DataFrame indexed by the claim value.
claim_counts = data_best_df["claim"].value_counts()
# Name the column "claim" explicitly: the plotting code reads claim_counts_df["claim"],
# but the name value_counts() gives its result depends on the pandas version
# (the column name in older releases, "count" in pandas >= 2.0).
claim_counts_df = claim_counts.rename("claim").to_frame()

In [None]:
# Visualize the distribution of the claim (label):
# a bar chart of the counts on the left and a pie chart of the shares on the right.
claim_fig = make_subplots(
    rows=1,
    cols=2,
    specs=[[dict(type="xy"), dict(type="domain")]],
)

claim_fig.add_trace(
    go.Bar(
        x=claim_counts_df.index,
        y=claim_counts_df["claim"],
        text=claim_counts_df["claim"],
        textposition='outside',
        showlegend=False,
    ),
    row=1,
    col=1,
)

claim_fig.add_trace(
    go.Pie(
        labels=claim_counts_df.index,
        values=claim_counts_df["claim"],
        showlegend=True,
    ),
    row=1,
    col=2,
)

claim_fig.update_layout(
    height=600,
    width=1000,
    title=dict(
        text="The distribution of claim",
        font=dict(size=24),
        y=0.95,
        x=0.5,
        xanchor='center',
        yanchor='top',
    ),
    xaxis1_title='claim',
    yaxis1_title='Counts',
    legend_title_text="claim",
)
# Show the claim values as categories instead of a continuous numeric axis.
claim_fig.update_xaxes(type='category')
claim_fig.show()

The label(claim) is quite balanced.

<a id="3.2"></a>
## 3.2. What is the distribution of features on claim?

In [None]:
# Box plot of every original feature (f1 .. f118) split by claim, on a 20 x 6 grid.
feature_names = ['f' + str(number) for number in range(1, 119)]

f, axes = plt.subplots(20, 6, figsize=(30, 100))
flat_axes = axes.ravel()  # row-major order: panel k sits at row k // 6, column k % 6
for ax, yaxix_name in zip(flat_axes, feature_names):
    sns.boxplot(x='claim', y=yaxix_name, data=data_best_df, ax=ax).set_title(yaxix_name)

# The grid has 120 panels but only 118 are used: hide the empty ones.
for unused_ax in flat_axes[len(feature_names):]:
    unused_ax.set_visible(False)

No feature shows a significant difference across claim. The features that show a slight difference are: **f3, f5, f8, f9, f21, f31, f34, f38, f45, f57**.

In [None]:
# Box plots of the features we created, split by claim.
new_features = ['missing_num', 'neg_num', 'missing_sign']

f, axes = plt.subplots(2, 2, figsize=(15, 10))  # 2 x 2 grid: three plots, one spare panel
flat_axes = axes.ravel()  # row-major order: panel k sits at row k // 2, column k % 2
for ax, yaxix_name in zip(flat_axes, new_features):
    sns.boxplot(x='claim', y=yaxix_name, data=data_best_df, ax=ax).set_title(yaxix_name)

# Hide the panel(s) that did not receive a plot.
for unused_ax in flat_axes[len(new_features):]:
    unused_ax.set_visible(False)

It shows that the feature "**missing_num**" is important for the label ("claim"), and so is "**missing_sign**".

<a id="4"></a>
# 4. Feature Engineering
As all the features of the dataset are not categorical, there is no need to do with encoding. However, there are more than 100 features in this dataset, it is better to do **dimension reduction**.

<a id="4.1"></a>
## 4.1. Correlation Analysis

In [None]:
# Pearson correlation between all features (the label is excluded), shown as an
# annotated heatmap.
feature_corr = data_best_df.drop(columns='claim').corr()

plt.figure(figsize=(160, 120))
corr_ax = sns.heatmap(feature_corr, cmap="coolwarm", annot=True, fmt='.3f')
corr_ax.set_title('Pearson Correlation for continuous features', fontsize=22)
plt.show()

We cannot see any significant correlation between the features.  
Note: The correlation between "missing_num" and "missing_sign" is quite high, because "missing_sign" (which is used as the group imputation key) is derived from "missing_num".

<a id="4.2"></a>
## 4.2. Mutual Information

In [None]:
def make_mi_scores(X, y, discrete_features):
    """Rank the columns of ``X`` by their mutual information with ``y``.

    The input must not contain NaN values.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; its column names become the index of the result.
    y : array-like
        Target labels.
    discrete_features
        Forwarded unchanged to ``mutual_info_classif`` as ``discrete_features``.

    Returns
    -------
    pandas.Series
        Series named "MI Scores", indexed by feature name and sorted from the
        highest to the lowest score.
    """
    raw_scores = mutual_info_classif(X, y, discrete_features=discrete_features)
    return pd.Series(raw_scores, index=X.columns, name="MI Scores").sort_values(ascending=False)

In [None]:
# Score the features only: the label itself must not be part of X, otherwise "claim"
# is scored against itself and pollutes the ranking.
mi_features = data_best_df.drop(columns='claim')
mi_scores = make_mi_scores(mi_features, data_best_df['claim'], False)
mi_scores[:10]  # show a few features with their MI scores

Important feature discovery:
1. Visualization: **f3, f5, f8, f9, f21, f31, f34, f38, f45, f57, missing_num**
2. Mutual information: **missing_sign, missing_num, neg_num, f70, f11, f78, f52, f75, f118, f40**

<a id="5"></a>
# 5. Modelling

* **data_preprocessed_df**: The original data without imputation.
* **data_preprocessed_dropNaN_df**: All records containing a ***NaN*** value are dropped.
* **data_preprocessed_median_df**: Grouped median imputation on ***missing_sign***.
* **data_preprocessed_mean_df**: Grouped mean imputation on ***missing_sign***.

<a id="5.1"></a>
## 5.1. Train Test Split

In [None]:
# Dataset used for modelling: an independent copy of the mean-imputed data.
data_modelling_df = data_preprocessed_mean_df.copy(deep=True)

In [None]:
# Separate the features (X) from the label (Y), then hold out 25% of the records
# for evaluation. The fixed random_state keeps the split reproducible.
X = data_modelling_df.drop(columns="claim")
Y = data_modelling_df["claim"]
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.25,
    random_state=0,
)

In [None]:
# Evaluation sets passed to the models' fit() calls: the training split first,
# the hold-out split second.
eval_sets = [
    (x_train, y_train),
    (x_test, y_test),
]

<a id="5.2"></a>
## 5.2. Train Models
> Let's use three state-of-the-art ensemble models to make predictions

* [XGBoost](https://xgboost.readthedocs.io/en/latest/)
* [CatBoost](https://catboost.ai/)
* [LightGBM](https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.Booster.html)

<a id="5.2.0"></a>
### 5.2.0. Manual Prediction 

In [None]:
# Baseline: use missing_sign on its own as the prediction for "claim".
train_baseline_auc = roc_auc_score(y_train, x_train["missing_sign"])
test_baseline_auc = roc_auc_score(y_test, x_test["missing_sign"])
print(f'Train AUC: {train_baseline_auc}')
print(f'Test AUC: {test_baseline_auc}')

<a id="5.2.1"></a>
### 5.2.1 XGboost

In [None]:
# Fit XGBoost with default hyper-parameters; the timer covers construction and fit.
start_time = time.time()
xgbc = XGBClassifier(random_state=0, use_label_encoder=False)
xgbc.fit(
    x_train,
    y_train,
    eval_set=eval_sets,
    eval_metric='auc',
    verbose=False,
)
xgbc_time = time.time() - start_time  # training time in seconds

# xgbc.evals_result() returns the evaluation results recorded for eval_sets.
# Score the hold-out split using the probability column of class 1.
predictions = xgbc.predict_proba(x_test)[:, 1]
auc_xgbc = roc_auc_score(y_test, predictions)
print(f'AUC: {auc_xgbc}')

<a id="5.2.2"></a>
### 5.2.2 CatBoost

In [None]:
# Fit CatBoost with default hyper-parameters; the timer covers construction and fit.
start_time = time.time()
catbc = CatBoostClassifier(random_state=0, eval_metric='AUC')
catbc.fit(
    x_train,
    y_train,
    eval_set=eval_sets,
    verbose=False,
)
catbc_time = time.time() - start_time  # training time in seconds

# Score the hold-out split using the probability column of class 1.
predictions = catbc.predict_proba(x_test)[:, 1]
auc_catbc = roc_auc_score(y_test, predictions)
print(f'AUC: {auc_catbc}')

<a id="5.2.3"></a>
### 5.2.3 LGBM

In [None]:
# Fit LightGBM with default hyper-parameters; the timer covers construction and fit.
start_time = time.time()
lgbc = LGBMClassifier(random_state=0)
lgbc.fit(
    x_train,
    y_train,
    eval_set=eval_sets,
    eval_metric='auc',
    verbose=-1,
)
lgbc_time = time.time() - start_time  # training time in seconds

# Score the hold-out split using the probability column of class 1.
predictions = lgbc.predict_proba(x_test)[:, 1]
auc_lgbc = roc_auc_score(y_test, predictions)
print(f'AUC: {auc_lgbc}')

<a id="5.3"></a>
## 5.3. Model Comparison

In [None]:
# Collect the hold-out AUC and the training time of every model, best AUC first.
model_comparison = pd.DataFrame(
    {
        'AUC': [auc_xgbc, auc_catbc, auc_lgbc],
        'Time': [xgbc_time, catbc_time, lgbc_time],
    },
    index=["XGboost", "CatBoost", "LGBM"],
).sort_values(by="AUC", ascending=False)
model_comparison

It seems that the **CatBoost** model outperforms all the other chosen models. **LGBM** performs slightly worse, but its training time is much shorter.

<a id="5.4"></a>
## 5.4. Best Model Explanation

In [None]:
# Keep the ten largest CatBoost feature importances in a one-column DataFrame
# indexed by feature name.
catbc_importances = pd.Series(catbc.feature_importances_, index=x_train.columns, name='Importance')
catbc_importances_df = (
    catbc_importances
    .to_frame()
    .sort_values('Importance', ascending=False)
    .head(10)
)

# Visualize those importances as a horizontal bar chart.
plt.figure(figsize=(10, 10))
feature_importance_graph = sns.barplot(
    y=catbc_importances_df.index,
    x="Importance",
    data=catbc_importances_df,
    orient="h",
)
feature_importance_graph.set_title("Top 10 Feature importance by CatBoost Classification", fontsize=20)
feature_importance_graph.set_ylabel("Features")

# Use eli5 to show the feature importance values with colors.
eli5.show_weights(catbc, feature_names=x_train.columns.tolist())

The top 10 important features are: **missing_num, missing_sign, f40, f70, f47, f57, f35, f1, f106, f5**.

In [None]:
# SHAP: compute the SHAP values of the CatBoost model on the training split
# and draw the summary plot.
explainer = shap.TreeExplainer(catbc)
shap_values = explainer.shap_values(x_train)
shap.summary_plot(shap_values, features=x_train)

<a id="5.5"></a>
## 5.5. Parameter/Feature Tuning  

In [None]:
# Candidate datasets, one per imputation strategy, compared in the loop below.
data_lst = [
    data_preprocessed_df,          # no imputation
    data_preprocessed_dropNaN_df,  # records with missing values dropped
    data_preprocessed_median_df,   # grouped median imputation
    data_preprocessed_mean_df,     # grouped mean imputation
]

In [None]:
# Train the same CatBoost configuration on every candidate dataset and record
# (hold-out AUC, training time in seconds) for each of them, in data_lst order.
tuning_res = []

for dataset_df in data_lst:
    # Train/Test split, with the same proportions and seed as before.
    X = dataset_df.drop(columns="claim")
    Y = dataset_df["claim"]
    x_train, x_test, y_train, y_test = train_test_split(
        X,
        Y,
        test_size=0.25,
        random_state=0,
    )

    # The timer covers construction and fit.
    start_time = time.time()
    catbc = CatBoostClassifier(random_state=0, eval_metric='AUC')
    catbc.fit(x_train, y_train, verbose=False)
    catbc_time = time.time() - start_time

    # Score the hold-out split using the probability column of class 1.
    predictions = catbc.predict_proba(x_test)[:, 1]
    auc_catbc = roc_auc_score(y_test, predictions)
    tuning_res.append((auc_catbc, catbc_time))

In [None]:
# Compare the performance obtained on the different datasets, best AUC first.
dataset_names = [
    "data_preprocessed_df",
    "data_preprocessed_dropNaN_df",
    "data_preprocessed_median_df",
    "data_preprocessed_mean_df",
]
dataset_comparison = pd.DataFrame(
    data=tuning_res,
    index=dataset_names,
    columns=['AUC', 'Time'],
).sort_values(by="AUC", ascending=False)
dataset_comparison

It seems that the data with mean imputation outperforms all the other candidate datasets.

In [None]:
# Release the big variables that are not used in the rest of the notebook,
# then ask the garbage collector to reclaim the memory.
del (
    data_preprocessed_df,
    data_preprocessed_dropNaN_df,
    data_preprocessed_median_df,
    data_best_df,
    data_df,
    data_modelling_df,
    x_train,
    x_test,
)

gc.collect()

In [None]:
# Use the dataset with the best performance: features in X, label in Y.
dataset_df = data_preprocessed_mean_df.copy(deep=True)
X = dataset_df.drop(columns="claim")
Y = dataset_df["claim"]

In [None]:
# X and Y now hold everything needed for training; free the source frames.
del data_preprocessed_mean_df, dataset_df
gc.collect()

In [None]:
# We can not really do the parameter tuning well on Kaggle because of the limitation of resource

# catbc_best = CatBoostClassifier(random_state=0, eval_metric='AUC')
# catbc_grid = {'max_depth': [4, 6, 8, 10],
#               'n_estimators':[100, 300, 500, 1000, 2000],
#               'min_data_in_leaf': [1, 3, 5, 10]
#              }

# catbc_search = GridSearchCV(catbc_best, 
#                             catbc_grid,
#                             scoring="roc_auc",
#                             n_jobs=-1,
#                             cv = 5)
# catbc_search.fit(X,Y, verbose=False)

# # Returns the estimator with the best performance
# print(catbc_search.best_estimator_)

# # Returns the best score
# print(catbc_search.best_score_)

# # Returns the best parameters
# print(catbc_search.best_params_)

In [None]:
# Train the two competitive models on the whole labelled dataset (no hold-out).
catbc_best = CatBoostClassifier(random_state=0, eval_metric='AUC')
catbc_best.fit(
    X,
    Y,
    verbose=False,
)

lgbc_best = LGBMClassifier(random_state=0)
lgbc_best.fit(
    X,
    Y,
    eval_metric='auc',
    verbose=-1,
)

In [None]:
# The training data is no longer needed once both models are fitted.
del X
del Y
gc.collect()

Two best model Generated(on all training dataset):
1. catbc_best
2. lgbc_best

<a id="6"></a>
# 6. Prediction

<a id="6.1"></a>
## 6.1. Load Data

In [None]:
# Read the competition test set into memory.
test_df = pd.read_csv(
    filepath_or_buffer="../input/tabular-playground-series-sep-2021/test.csv"
)
# Summarise the frame: number of entries, column dtypes and memory usage.
test_df.info()

In [None]:
# Preview the first rows of the test data (head() defaults to five rows).
test_df.head(n=5)

<a id="6.2"></a>
## 6.2. Drop Irrelevant Columns

In [None]:
# Drop the same irrelevant columns that were dropped from the training data.
test_preprocessed_df = test_df.drop(columns=irrelevant_columns)

<a id="6.3"></a>
## 6.3. New Feature Generation

In [None]:
# New feature: how many values are missing in each row.
test_preprocessed_df['missing_num'] = test_preprocessed_df.isna().sum(axis="columns")

In [None]:
# New feature: how many values are negative in each record
# (counted over all columns currently in the frame).
test_preprocessed_df['neg_num'] = test_preprocessed_df.lt(0).sum(axis="columns")

In [None]:
# New feature derived from missing_num: 1 if the record has at least one missing
# value, 0 otherwise (stored as an integer rather than a boolean).
test_preprocessed_df['missing_sign'] = test_preprocessed_df['missing_num'].ne(0).astype('int64')

<a id="6.4"></a>
## 6.4. Data Imputation

In [None]:
# Treat empty / single-space entries as missing so they are counted together with NaN.
for blank_token in ("", " "):
    test_preprocessed_df.replace(blank_token, np.nan, inplace=True)

# Percentage of missing values (NaN / None) per column, sorted from most to least
# missing; only the columns that actually contain missing values are kept.
count_missing_value = test_preprocessed_df.isna().sum() / len(test_preprocessed_df) * 100
count_missing_value_df = count_missing_value.sort_values(ascending=False).to_frame('Missing%')
missing_value_df = count_missing_value_df[count_missing_value_df['Missing%'] > 0]

# Names of the features that contain missing values.
missing_features = missing_value_df.index.tolist()

In [None]:
# Group imputation: fill every gap with the mean of the records that share the
# same missing_sign value.
group_key = test_preprocessed_df['missing_sign']
for feature in missing_features:
    column = test_preprocessed_df[feature]
    # transform() returns a Series aligned with the original index, so the result can be
    # used directly by fillna(). groupby(...).apply(...) is avoided because, depending on
    # the pandas version, it prepends the group key to the index and breaks the assignment.
    group_mean = column.groupby(group_key, sort=False).transform('mean')
    test_preprocessed_df[feature] = column.fillna(group_mean)

<a id="6.5"></a>
## 6.5. Make Prediction

In [None]:
# Predict with the CatBoost model trained on all the data, keeping the
# probability column of class 1 in a one-column frame named "claim".
results1 = catbc_best.predict_proba(test_preprocessed_df)[:, 1]
results_df1 = pd.DataFrame({'claim': results1})
#predictions_df1 = pd.concat([test_df['id'], results_df1], axis=1)

In [None]:
# Predict with the LightGBM model trained on all the data, keeping the
# probability column of class 1 in a one-column frame named "claim".
results2 = lgbc_best.predict_proba(test_preprocessed_df)[:, 1]
results_df2 = pd.DataFrame({'claim': results2})
#predictions_df2 = pd.concat([test_df['id'], results_df2], axis=1)

In [None]:
# Aggregate the results from the two best models: put the ids next to both
# prediction columns (both are named "claim") and average them row by row.
aggregate_df = pd.concat([test_df['id'], results_df1, results_df2], axis=1)
# Selecting 'claim' returns both prediction columns, so mean(axis=1) averages the models.
aggregate_df['mean'] = aggregate_df.loc[:, 'claim'].mean(axis=1)

# Submission frame: id plus the averaged prediction, named "claim".
final_df = aggregate_df[['id', 'mean']].rename(columns={'mean': 'claim'})

<a id="6.6"></a>
## 6.6. Save the Prediction to CSV file

In [None]:
# Write the predictions to a CSV file for submission (the index is not written).
final_df.to_csv(path_or_buf='submission.csv', index=False)

# Thanks for reading, have a good day ~