In [1]:
import torch
import os
import pickle
import matplotlib.pyplot as plt
import numpy as np
import polars as pl
import pandas as pd
import xgboost as xgb

from tqdm import tqdm
from time import time
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.kernel_ridge import KernelRidge
from sklearn.metrics.pairwise import rbf_kernel
from sklearn.metrics import accuracy_score

### Global Variables

In [2]:
# Global variables
RESULTS_PATH = './training_results/kernel_ridge'

VALID_SIZE = 2000

TRAIN_SIZE = 5000
TRAIN_CLASS_1_RATIO = 0.5

### **Load and Preprocess** Data

In [6]:
def preprocess_dataset(data: pl.DataFrame) -> pl.DataFrame:
    # Encode Gender
    enc = OneHotEncoder(handle_unknown='error', sparse_output=False)
    gender_oh_encoded = enc.fit_transform(data['Gender'].to_numpy().reshape(-1, 1))

    # Encode Vehicle_Age
    enc = OrdinalEncoder(categories=[['< 1 Year', '1-2 Year', '> 2 Years']], handle_unknown='error')
    vehicle_age_encoded = enc.fit_transform(data['Vehicle_Age'].to_numpy().reshape(-1, 1))

    # Encode Vehicle_Damage
    enc = OneHotEncoder(handle_unknown='error', sparse_output=False)
    vehicle_damage_encoded = enc.fit_transform(data['Vehicle_Damage'].to_numpy().reshape(-1, 1))

    # Standardize variables
    scaler = StandardScaler()
    age_standard = scaler.fit_transform(data['Age'].to_numpy().reshape(-1, 1))
    annual_premium_standard = scaler.fit_transform(data['Annual_Premium'].to_numpy().reshape(-1, 1))
    vintage_standard = scaler.fit_transform(data['Vintage'].to_numpy().reshape(-1, 1))

    data = data.drop(['Age', 'Annual_Premium', 'Vintage', 'Gender', 'Vehicle_Age', 'Vehicle_Damage']).with_columns(
        [
            pl.Series('Female', values=gender_oh_encoded[:, 0]),
            pl.Series('Male', values=gender_oh_encoded[:, 1]),
            pl.Series('Age', values=age_standard[:, 0]),
            pl.Series('Annual_Premium', values=annual_premium_standard[:, 0]),
            pl.Series('Vintage', values=vintage_standard[:, 0]),
            pl.Series('Vehicle_Age', values=vehicle_age_encoded[:, 0]),
            pl.Series('No_vehicle_damage', values=vehicle_damage_encoded[:, 0]),
            pl.Series('Vehicle_Damage', values=vehicle_damage_encoded[:, 1])
        ]
    )
    return data


In [7]:
def get_validation_split(
        data_only_0: pl.DataFrame,
        data_only_1: pl.DataFrame,
        size: int
) -> tuple[pl.DataFrame]:
    # Balance out dataset
    num_y_1 = size * 50 // 100
    num_y_0 = size - num_y_1
    
    # Select validation slices
    data_only_1_valid = data_only_1[:num_y_1]
    data_only_0_valid = data_only_0[:num_y_0]

    # Remove validation slices from the whole set (avoid overlapping with training set)
    data_only_1 = data_only_1[num_y_1:]
    data_only_0 = data_only_0[num_y_0:]

    data_valid = pl.concat(
        [
            data_only_1_valid,
            data_only_0_valid
        ],
        how='vertical'
    ).sample(frac=1, shuffle=True, seed=83409)

    assert data_valid.filter(pl.col('Response') == 1).shape[0] == data_valid.filter(pl.col('Response') == 0).shape[0]

    return data_valid, data_only_0, data_only_1

In [8]:

def get_training_split(
        data_only_0: pl.DataFrame, 
        data_only_1: pl.DataFrame, 
        size: int, 
        class_1_ratio: float,
        seed: int
) -> tuple[pl.DataFrame]:
    num_y_1 = int(size * class_1_ratio)
    num_y_0 = size - num_y_1

    data_train = pl.concat(
        [
            df_y_1[:num_y_1],
            df_y_0[:num_y_0]
        ],
        how='vertical'
    ).sample(frac=1, shuffle=True, seed=seed)

    # Remove training slices from the whole set (avoid future overlapping)
    data_only_1 = data_only_1[num_y_1:]
    data_only_0 = data_only_0[num_y_0:]

    return data_train, data_only_0, data_only_1

In [9]:
df = pl.read_csv('./health-insurance-data/train.csv')
df = df.sample(frac=1, shuffle=True, seed=68123)

# Preprocess Dataset
df = preprocess_dataset(data=df)

# Split dataset based on the binary label, then shuffle
df_y_0 = df.filter(pl.col('Response') == 0).sample(frac=1, shuffle=True, seed=11897)
df_y_1 = df.filter(pl.col('Response') == 1).sample(frac=1, shuffle=True, seed=4199)

size = 10000

num_y_1 = int(size * 0.5)
num_y_0 = size - num_y_1

data_train = pl.concat(
    [
        df_y_1[:num_y_1],
        df_y_0[:num_y_0]
    ],
    how='vertical'
).sample(frac=1, shuffle=True, seed=41212636)

data_train.write_csv('train_deea.csv')

In [None]:
df = pl.read_csv('train_deea.csv')

In [108]:
df = pl.read_csv('./health-insurance-data/train.csv')
df = df.sample(frac=1, shuffle=True, seed=68123)

# Preprocess Dataset
df = preprocess_dataset(data=df)
print(f'Columns after preprocessing: {df.columns}')

# Split dataset based on the binary label, then shuffle
df_y_0 = df.filter(pl.col('Response') == 0).sample(frac=1, shuffle=True, seed=11897)
df_y_1 = df.filter(pl.col('Response') == 1).sample(frac=1, shuffle=True, seed=4199)

valid_size = VALID_SIZE
# Get validation split and remove it from dataset to avoid overlapping with training set
df_valid, df_y_0, df_y_1 = get_validation_split(data_only_0=df_y_0, data_only_1=df_y_1, size=valid_size)

train_size = TRAIN_SIZE
# Select unbalanced training set
df_train, df_y_0, df_y_1= get_training_split(
    data_only_0=df_y_0,
    data_only_1=df_y_1, 
    size=train_size, 
    class_1_ratio=TRAIN_CLASS_1_RATIO, 
    seed=4128211
)

Columns after preprocessing: ['id', 'Driving_License', 'Region_Code', 'Previously_Insured', 'Policy_Sales_Channel', 'Response', 'Female', 'Male', 'Age', 'Annual_Premium', 'Vintage', 'Vehicle_Age', 'No_vehicle_damage', 'Vehicle_Damage']


Select matrices for model

In [109]:
X_train = df_train.select(pl.exclude(['id', 'Response'])).to_numpy()
y_train =df_train['Response'].to_numpy().reshape(-1, 1)
X_valid = df_valid.select(pl.exclude(['id', 'Response'])).to_numpy()
y_valid = df_valid['Response'].to_numpy().reshape(-1, 1)

In [110]:
X_train.shape, y_train.shape

((5000, 12), (5000, 1))

In [111]:
X_valid.shape, y_valid.shape

((2000, 12), (2000, 1))

### **Linear Regression** Model

In [112]:

start = time()

krr = KernelRidge(alpha=1.0, kernel='linear')
krr.fit(X_train, y_train)

pred_values = krr.predict(X_valid)
pred_labels = np.round(pred_values)

accuracy = accuracy_score(y_valid, pred_labels)
print(f'LRR Acc = {accuracy:.4f}')
print(f'Time taken = {(time() - start):.3f}s')

LRR Acc = 0.7855
Time taken = 2.068s


### **Kernel Ridge Regression** Model

In [113]:
start = time()

krr = KernelRidge(alpha=1.0, kernel='rbf')
krr.fit(X_train, y_train)

pred_values = krr.predict(X_valid)
pred_labels = np.round(pred_values)

accuracy = accuracy_score(y_valid, pred_labels)
print(f'KRR Acc = {accuracy:.4f}')
print(f'Time taken = {(time() - start):.3f}s')

KRR Acc = 0.7695
Time taken = 5.025s


### **XGBoost** Model

In [114]:
start = time()

xgb_model = xgb.XGBRegressor(objective="binary:logistic", random_state=42)
xgb_model.fit(X_train, y_train)

pred_values = xgb_model.predict(X_valid)
pred_labels = np.round(pred_values)

accuracy = accuracy = accuracy_score(y_valid, pred_labels)
print(f'XGBoost Acc = {accuracy:.4f}')
print(f'Time taken = {(time() - start):.3f}s')

XGBoost Acc = 0.7750
Time taken = 0.482s


### Repeat 5 times

In [116]:
comp_time = {
    'lrr': [],
    'krr': [],
    'xgb': []
}
for i in range(0, 5):
    start = time()

    # Linear Ridge
    krr = KernelRidge(alpha=1.0, kernel='linear')
    krr.fit(X_train, y_train)

    pred_values = krr.predict(X_valid)
    pred_labels = np.round(pred_values)

    accuracy = accuracy_score(y_valid, pred_labels)
    delta_time = time() - start
    print(f'LRR Acc = {accuracy:.4f}')
    print(f'Time taken = {delta_time:.3f}s')
    comp_time['lrr'].append(delta_time)

    # RBF Kernel Ridge
    start = time()
    krr = KernelRidge(alpha=1.0, kernel='linear')
    krr.fit(X_train, y_train)
    pred_values = krr.predict(X_valid)
    pred_labels = np.round(pred_values)
    accuracy = accuracy_score(y_valid, pred_labels)
    delta_time = time() - start
    print(f'LRR Acc = {accuracy:.4f}')
    print(f'Time taken = {(time() - start):.3f}s')
    comp_time['krr'].append(delta_time)

    # XGBoost
    start = time()
    xgb_model = xgb.XGBRegressor(objective="binary:logistic")
    xgb_model.fit(X_train, y_train)
    pred_values = xgb_model.predict(X_valid)
    pred_labels = np.round(pred_values)
    accuracy = accuracy = accuracy_score(y_valid, pred_labels)
    print(f'XGBoost Acc = {accuracy:.4f}')
    print(f'Time taken = {(time() - start):.3f}s')
    comp_time['xgb'].append(delta_time)

LRR Acc = 0.7855
Time taken = 2.780s
LRR Acc = 0.7855
Time taken = 2.301s
XGBoost Acc = 0.7750
Time taken = 0.464s
LRR Acc = 0.7855
Time taken = 2.098s
LRR Acc = 0.7855
Time taken = 2.018s
XGBoost Acc = 0.7750
Time taken = 0.463s
LRR Acc = 0.7855
Time taken = 2.367s
LRR Acc = 0.7855
Time taken = 2.842s
XGBoost Acc = 0.7750
Time taken = 0.777s
LRR Acc = 0.7855
Time taken = 2.702s
LRR Acc = 0.7855
Time taken = 2.522s
XGBoost Acc = 0.7750
Time taken = 0.530s
LRR Acc = 0.7855
Time taken = 2.451s
LRR Acc = 0.7855
Time taken = 2.506s
XGBoost Acc = 0.7750
Time taken = 0.534s


In [117]:
np.mean(comp_time['lrr']), np.mean(comp_time['krr']), np.mean(comp_time['xgb'])

(2.479885149002075, 2.437330627441406, 2.437330627441406)