# Kaggle Playground - Insurance Premium Regression

## Import libraries

In [None]:
pip install lightautoml

In [None]:
from lightautoml.automl.presets.tabular_presets import TabularAutoML
from lightautoml.tasks import Task
import torch
import os

In [None]:
import pandas as pd              # For data manipulation and analysis
import numpy as np               # For numerical computing
from datetime import datetime
import scipy.stats as stats      # For statistical analysis
import math
import matplotlib                # For plotting and visualization
import matplotlib.pyplot as plt  
from pandas.plotting import parallel_coordinates
import seaborn as sns            # For statistical data visualization
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")

## Import datasets

In [None]:
# Read the competition's train and test CSV files from the Kaggle input folder.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/playground-series-s4e12/train.csv',
        '/kaggle/input/playground-series-s4e12/test.csv',
    )
)

## Feature Engineering

In [None]:
def extract_date_components(df, date_column):
    """Replace a date column with separate ``Year``, ``Month`` and ``Day`` columns.

    The column is parsed with ``pd.to_datetime``, its three calendar parts are
    appended as new columns, and the original column is removed.

    Args:
        df: DataFrame to modify. It is changed in place.
        date_column: Name of the column holding the dates.

    Returns:
        The same ``df`` object that was passed in.
    """
    df[date_column] = pd.to_datetime(df[date_column])
    date_accessor = df[date_column].dt
    # 'year' -> 'Year', 'month' -> 'Month', 'day' -> 'Day'
    for part in ('year', 'month', 'day'):
        df[part.capitalize()] = getattr(date_accessor, part)
    # The parsed column itself is not kept as a feature.
    df.drop(columns=[date_column], inplace=True)
    return df

def log_transform(df):
    """Swap ``Annual Income`` for its log-transformed version.

    Adds ``Log Annual Income`` computed as ``log(Annual Income + 1)`` and then
    removes the raw ``Annual Income`` column.

    Args:
        df: DataFrame containing an ``Annual Income`` column. It is changed
            in place.

    Returns:
        The same ``df`` object that was passed in.
    """
    income = df['Annual Income']
    df['Log Annual Income'] = np.log(income + 1)
    # Only the transformed column is kept.
    del df['Annual Income']
    return df

# Run the same feature engineering on both frames. Each helper mutates the
# frame in place and returns it, so the calls can be chained.
for df in (df_train, df_test):
    log_transform(extract_date_components(df, 'Policy Start Date'))

In [None]:
# Column groups are derived from the test frame's dtypes.
# NOTE(review): only 'float' and 'int32' dtypes count as numerical, so columns
# of any other integer width are left out of both lists - confirm that is intended.
num_cols = [
    name for name in df_test.columns
    if df_test[name].dtypes == 'float' or df_test[name].dtypes == 'int32'
]
print('Numerical Features \n', num_cols)
cat_cols = [
    name for name in df_test.columns
    if df_test[name].dtypes == 'object'
]
print('Categorical Features \n', cat_cols)

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, FunctionTransformer
# Numerical branch: median imputation -> standard scaling -> cast to float32.
numerical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
    ('convert_to_float32', FunctionTransformer(lambda values: values.astype(np.float32))),
])

# Categorical branch: fill gaps with the literal 'missing', then ordinal-encode.
categorical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('ordinal', OrdinalEncoder()),
])

# Route each column group through its own branch.
preprocessor = ColumnTransformer([
    ('num', numerical_pipeline, num_cols),
    ('cat', categorical_pipeline, cat_cols),
])

# Ask the transformer to return pandas DataFrames; set_output returns the
# transformer itself, which is also what this cell displays.
preprocessor.set_output(transform="pandas")

In [None]:
# Fit the preprocessor on the training frame and transform it, then apply the
# already-fitted preprocessor to the test frame (no refitting on test data).
df_train_processed = preprocessor.fit_transform(df_train)
df_test_processed = preprocessor.transform(df_test)

## Log-transform Target - *Premium Amount*

In [None]:
# Put the raw target column back next to the preprocessed training features.
df_train_automl = pd.concat(
    objs=[df_train_processed, df_train[['Premium Amount']]],
    axis='columns',
)
df_train_automl

In [None]:
# Train on the natural log of the target. Predictions made on this scale are
# mapped back to the original scale with np.exp.
df_train_automl['Premium Amount'] = df_train_automl['Premium Amount'].pipe(np.log)
df_train_automl['Premium Amount']

## LightAutoML

In [None]:
def map_class(x, task, reader):
    """Look ``x`` up in ``reader`` for multiclass tasks; otherwise pass it through.

    Args:
        x: Value to map.
        task: Object with a ``name`` attribute identifying the task type.
        reader: Indexable mapping used only when ``task.name == 'multiclass'``.

    Returns:
        ``reader[x]`` for a multiclass task, else ``x`` unchanged.
    """
    return reader[x] if task.name == 'multiclass' else x

# Element-wise variant of map_class.
mapped = np.vectorize(map_class)

def score(task, y_true, y_pred):
    """Score predictions with the metric that matches the task type.

    Args:
        task: Object with a ``name`` attribute identifying the task type.
        y_true: Ground-truth values.
        y_pred: Predictions; for ``'multiclass'`` the arg-max over axis 1 is
            taken before scoring.

    Returns:
        ROC AUC for ``'binary'``, accuracy for ``'multiclass'``, and median
        absolute error for ``'reg'`` / ``'multi:reg'``.

    Raises:
        ValueError: If ``task.name`` is none of the supported task types.
    """
    # The metric functions are not imported in the visible notebook cells, so
    # each branch imports the one it needs (previously a NameError).
    if task.name == 'binary':
        from sklearn.metrics import roc_auc_score
        return roc_auc_score(y_true, y_pred)
    if task.name == 'multiclass':
        from sklearn.metrics import accuracy_score
        return accuracy_score(y_true, np.argmax(y_pred, 1))
    if task.name in ('reg', 'multi:reg'):
        from sklearn.metrics import median_absolute_error
        return median_absolute_error(y_true, y_pred)
    # Raising a bare string is itself a TypeError; raise a real exception.
    raise ValueError('Task is not correct.')
        
def take_pred_from_task(pred, task):
    """Select the part of a 2-D prediction array that is relevant for the task.

    Args:
        pred: Prediction array indexed as ``pred[:, 0]`` for single-output tasks.
        task: Object with a ``name`` attribute identifying the task type.

    Returns:
        ``pred[:, 0]`` for ``'binary'`` and ``'reg'``; ``pred`` unchanged for
        ``'multiclass'`` and ``'multi:reg'``.

    Raises:
        ValueError: If ``task.name`` is none of the supported task types.
    """
    if task.name in ('binary', 'reg'):
        return pred[:, 0]
    if task.name in ('multiclass', 'multi:reg'):
        return pred
    # Raising a bare string is itself a TypeError; raise a real exception.
    raise ValueError('Task is not correct.')
        
def use_plr(USE_PLR):
    """Return the embedder key ``"plr"`` when the flag is truthy, else ``"cont"``."""
    return "plr" if USE_PLR else "cont"

In [None]:
# Global run configuration shared by the cells below.
RANDOM_STATE = 42
# os.cpu_count() returns None when the CPU count cannot be determined; fall
# back to a single thread so torch.set_num_threads always receives an int.
N_THREADS = os.cpu_count() or 1
TIMEOUT = 9 * 3600  # presumably seconds (9 hours) - confirm against the AutoML timeout unit
N_FOLDS = 10
TARGET = 'Premium Amount'
# Seed NumPy's global RNG and set PyTorch's thread count.
np.random.seed(RANDOM_STATE)
torch.set_num_threads(N_THREADS)

In [None]:
# task = Task('reg') 
# automl = TabularAutoML(
#     task = task, 
#     timeout = 9 * 3600,
#     cpu_limit = os.cpu_count(),
#     nn_params = {
#     'stop_by_metric': True,
#     'verbose_bar': True},
#     nn_pipeline_params = {"use_qnt": False, "use_te": False},
#     reader_params = {'n_jobs': N_THREADS, 'cv': N_FOLDS, 'random_state': RANDOM_STATE, 'advanced_roles': True}
# )

In [None]:
# Regression task using only the neural-network family of the tabular preset.
task = Task('reg')
automl = TabularAutoML(
    task=task,
    timeout=TIMEOUT,
    cpu_limit=N_THREADS,
    # Alternatives noted by the author: ['nn', 'mlp', 'dense', 'denselight',
    # 'resnet', 'snn', 'node', 'autoint', 'fttransformer'] or custom torch model
    general_params=dict(use_algos=[['nn']]),
    nn_params=dict(
        n_epochs=10,
        bs=1024,
        num_workers=0,
        path_to_save=None,
        freeze_defaults=True,
        cont_embedder='plr',
        cat_embedder='weighted',
        hidden_size=128,
        hid_factor=[4, 4],
        block_config=[4, 4],
        embedding_size=64,
        stop_by_metric=True,
        verbose_bar=True,
        snap_params=dict(k=2, early_stopping=True, patience=2, swa=True),
    ),
    nn_pipeline_params=dict(use_qnt=True, use_te=False),
    reader_params=dict(n_jobs=N_THREADS, cv=N_FOLDS, random_state=RANDOM_STATE),
)

In [None]:
# Fit the AutoML model and collect its predictions on the training frame.
# NOTE(review): the 'drop' role names 'id', but the frame is built from the
# preprocessor's num/cat column groups - confirm an 'id' column is present.
out_of_fold_predictions = automl.fit_predict(
    df_train_automl,
    roles={'target': 'Premium Amount', 'drop': 'id'},
    verbose=3,
)

In [None]:
from sklearn.metrics import mean_squared_error

# Mean squared error between the log-scale target and the first prediction
# column of the out-of-fold output.
oof_mse = mean_squared_error(
    df_train_automl[TARGET].values,
    out_of_fold_predictions.data[:, 0],
)
print(f'OOF score: {oof_mse}') #1.1702855470203595

## Prediction

In [None]:
# Predict on the preprocessed test frame and keep the first output column.
y_test_automl = automl.predict(df_test_processed).data[:, 0]
# The model was trained on np.log(Premium Amount), so the matching inverse is
# np.exp. np.expm1 inverts log1p and would make every prediction 1 too low.
y_test_automl = np.exp(y_test_automl)

## Ensemble

In [None]:
# Load three existing submission files that are blended with the AutoML
# predictions below.
y_sub_1, y_sub_2, y_sub_3 = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/rid-train-h2o/submission.csv',
        '/kaggle/input/p04e12-blended-submission/submission.csv',
        '/kaggle/input/push-the-limit-of-blended-submission/submission.csv',
    )
)

In [None]:
# Weighted blend (0.4 / 0.3 / 0.2 / 0.1) written onto a copy of the first
# submission frame.
# NOTE(review): rows are combined by index/position, not by id - this assumes
# every submission file and the test predictions share the same row order.
y_test_ensemble = y_sub_1.assign(**{
    'Premium Amount': (
        y_sub_1['Premium Amount'] * (4/10)
        + y_sub_2['Premium Amount'] * (3/10)
        + y_sub_3['Premium Amount'] * (2/10)
        + y_test_automl * (1/10)
    )
})

## Submission

In [None]:
# Start from the sample submission, overwrite its target column with the
# blended predictions, and write the result to disk.
df_sub = pd.read_csv(
    '/kaggle/input/playground-series-s4e12/sample_submission.csv'
).assign(**{'Premium Amount': y_test_ensemble['Premium Amount']})
df_sub.to_csv('submission_automl.csv', index=False)
df_sub.head()