In this notebook I will read all the experiments recorded in MLFlow to create a hybrid using a linear combination of the best performing models in the low and the high range of the target variable.

In [1]:
import warnings
warnings.filterwarnings("ignore")


In [2]:
import pandas as pd
import numpy as np

from sklearn.compose import ColumnTransformer, TransformedTargetRegressor
from sklearn.preprocessing import (
    FunctionTransformer,
    PowerTransformer,
    MinMaxScaler,
    OneHotEncoder,
    Binarizer,
    StandardScaler
)


from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score, KFold, cross_val_predict
from sklearn.metrics import mean_squared_error, mean_absolute_error
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.decomposition import PCA


import mlflow
import mlflow.sklearn

from matplotlib import pyplot as plt
import seaborn as sns
import optuna

In [3]:
# all MLFlow experiment ids
experiment_ids = [
    "345085265373754410", "534270053342576074", "876976438058871924",
    #"236979763080832103", 
    "385714677853406939", "795878298586783034",
    "261075903773380103", "446620739084012107", "829612150114310536"
]


all_runs = []
for exp_id in experiment_ids:
    runs = mlflow.search_runs(experiment_ids=exp_id)
    all_runs.append(runs)

# Combine all runs into a single DataFrame
df_runs = pd.concat(all_runs)

In [4]:
# Ensure numeric types for filtering
df_runs["cv_rmse_low"] = pd.to_numeric(df_runs["metrics.cv_rmse_low"], errors="coerce")
df_runs["cv_rmse_high"] = pd.to_numeric(df_runs["metrics.cv_rmse_high"], errors="coerce")

# Find best runs
best_low_run = df_runs.nsmallest(1, "cv_rmse_low")
best_high_run = df_runs.nsmallest(1, "cv_rmse_high")


Best performing models

In [5]:
# best model on low values 
print('Experiment: ', mlflow.get_experiment(best_low_run['experiment_id'].values[0]).name)
print('Run name: ', mlflow.get_run(best_low_run['run_id'].values[0]).data.tags['mlflow.runName'])

Experiment:  Optuna XGBoost
Run name:  XGBoost optimization


In [6]:
# best model on high values 
print('Experiment: ', mlflow.get_experiment(best_high_run['experiment_id'].values[0]).name)
print('Run name: ', mlflow.get_run(best_high_run['run_id'].values[0]).data.tags['mlflow.runName'])

Experiment:  Optuna LightGBM
Run name:  LightGBM optimization


In [7]:
print(best_low_run['params.regressor__linear_model'].values[0])

XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=0.892038504358537, device=None,
             early_stopping_rounds=None, enable_categorical=False,
             eval_metric=None, feature_types=None, gamma=1.387236164146072,
             grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=0.01892454893969501,
             max_bin=None, max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=6, max_leaves=None,
             min_child_weight=3, missing=nan, monotone_constraints=None,
             multi_strategy=None, n_estimators=200, n_jobs=None,
             num_parallel_tree=None, random_state=None, ...)


In [8]:
# since the field is stripped I recover it from the notebook 
params_best_low_xgb = {
    'n_estimators': 200, 
    'max_depth': 6, 
    'learning_rate': 0.01892454893969501, 
    'min_child_weight': 3, 
    'subsample': 0.7690126267645642, 
    'colsample_bytree': 0.892038504358537, 
    'reg_alpha': 0.8349040782501462, 
    'reg_lambda': 6.202318676654988, 
    'gamma': 1.387236164146072
}


In [9]:
print(best_high_run['params.regressor__linear_model'].values[0])

LGBMRegressor(colsample_bytree=0.6720373519256779, force_col_wise=True,
              lambda_l1=1.1495720160400913, lambda_l2=5.392264763815639,
              learning_rate=0.18175310504801576, max_depth=3, metric='rmse',
              min_child_weight=38, n_estimators=900, num_leaves=173,
              objective='regression', subsample=0.5490561827387797)


In [10]:
params_best_high_lgbm = { 
    'colsample_bytree': 0.6720373519256779, 
    'force_col_wise': True,
    'lambda_l1': 1.1495720160400913, 
    'lambda_l2': 5.392264763815639,
    'learning_rate': 0.18175310504801576, 
    'max_depth': 3, 
    'metric': 'rmse',
    'min_child_weight': 38, 
    'n_estimators': 900, 
    'num_leaves': 173,
    'objective': 'regression', 
    'subsample': 0.5490561827387797
} 

### Training datasets and pipeline as usual 

In [11]:
dataset_train = pd.read_csv('../data/processed/dataset_train.csv', index_col=0)
dataset_test = pd.read_csv('../data/processed/dataset_test.csv', index_col=0)

# column names
cols_event_embeddings = [f'event_feat_{i}' for i in range(768)]
cols_event_dummies = ['BACK STRAIN', 'CLEANING INSTRUMENTS', 'CLEANING LEFT SHOULDER SPLINTER', 'CUT WITH KNIFE', 'DEALING CARDS RIGHT TENDON SYNOVITIS', 'FELL', 'FELL FROM LADDER', 'FELL OFF LADDER', 'FRACTURED LEFT WRIST FRACTURE', 'GRINDING FOREIGN BODY', 'GRINDING STEEL BEAM INFECTION', 'HIT AIR HOSE', 'HIT ELBOW', 'HIT FLOOR SHEET', 'INJURY', 'INJURY LIFTING', 'INJURY LIFTING STRAIN', 'JAMMED RIGHT HAND PUNCTURE', 'KNIFE LACERATION', 'KNIFE SLIPPED', 'LACERATION', 'LEFT LEG SNAKE BITE', 'LIFTING', 'LIFTING BOXES', 'LIFTING PATIENT', 'LIFTING TYRES', 'LOWER BACK STRAIN', 'MOTOR VEHICLE COLLISION', 'PREPARING PIPES FOR PAINTING PUNCTURE', 'PULLING CABLES', 'REDBACK SPIDER BITE', 'SLIPPED AND FELL', 'SLIPPED ON FLOOR', 'SLIPPED ON ROLLER', 'SLIPPED ON STAIRS', 'SLIPPED ON WET FLOOR', 'SLIPPED USING LATHE', 'SORTING ALUMINIUM BARS STRAIN', 'SPRAINED RIGHT ANKLE FRACTURE', 'STRAIN', 'STRAIN LIFTING PARTS', 'STRAIN LIFTING STRAIN', 'STRUCK KNIFE LACERATED', 'STRUCK KNIFE LACERATION', 'STRUCK PALLET', 'STRUCK WITH KNIFE', 'USING AIR HOSE STRAIN', 'USING DRILL']
cols_body_parts_embeddings = [f'body_feat_{i}' for i in range(768)]

In [12]:
log_transformer = Pipeline([
    ('log', FunctionTransformer(np.log, validate=False)),
    ('scaler', StandardScaler()) 
])

# gender to bool 
def gender_to_bool(gender_column):
    """
    - Fill missing values with 'M'
    - Replace 'U' with 'M'
    - Return a boolean-ish column: 1 for 'M', 0 otherwise
    """
    g = pd.Series(gender_column.squeeze(), dtype=str).fillna('M').replace('U', 'M')
    is_male = (g == 'M').astype(int)
    return is_male.values.reshape(-1, 1)

gender_transformer = FunctionTransformer(gender_to_bool, validate=False)

# hours worked per week -> to buckets -> to one_hot
def bucket_hours_worked(dtt_array):
    return pd.cut(
        dtt_array.squeeze(), 
        bins=[-np.inf, 37, 41, np.inf],
        labels=["<=37", "37-41", ">41"]
    ).astype(str).values.reshape(-1, 1)

hours_worked_bucketer = FunctionTransformer(bucket_hours_worked, validate=False)
hours_worked_encoder = OneHotEncoder(drop='first')
hours_worked_pipeline = Pipeline([
    ('bucketizer', hours_worked_bucketer),
    ('encoder', hours_worked_encoder)
])

# DaysToReport (DateReported - DateTimeOfAccident) -> to buckets -> to one_hot
def bucket_days_to_report(dtt_array):
    return pd.cut(
        dtt_array.squeeze(), 
        bins=[-np.inf, 80, 300, 500, np.inf],
        labels=["<=80", "80-300", "300-500", ">500"]
    ).astype(str).values.reshape(-1, 1)

days_to_report_bucketer = FunctionTransformer(bucket_days_to_report, validate=False)
days_to_report_encoder = OneHotEncoder(drop='first')
days_to_report_pipeline = Pipeline([
    ('bucketizer', days_to_report_bucketer),
    ('encoder', days_to_report_encoder)
])

# DaysWorkedPerWeek -> 1 if equals 5, 0 in any other case 
def days_worked_binarize(days_array):
    # Ensure we handle arrays or DataFrames by squeezing to 1D
    days = days_array.squeeze()
    binarized = (days == 5).astype(int)
    # Return as 2D array: (n_samples x 1)
    return binarized.values.reshape(-1, 1) if isinstance(days, pd.Series) else binarized.reshape(-1, 1)

days_worked_transformer = FunctionTransformer(days_worked_binarize, validate=False)

In [13]:
preprocessor = ColumnTransformer(
    transformers=[
        ('log_inc', log_transformer, ['InitialIncurredCalimsCost', 'WeeklyWages']),
        ('minmax_scaler', MinMaxScaler(), ['Age', 'YearAccident']),
        ('gender_bool', gender_transformer, ['Gender']),
        ('hww_bool_onehot', hours_worked_pipeline, ['HoursWorkedPerWeek']),
        ('dtt_bool_onehot', days_to_report_pipeline, ['DaysToReport']),
        ('has_dependent_bool', Binarizer(threshold=0), ['DependentChildren']),
        ('worked_five_days_bool', days_worked_transformer, ['DaysWorkedPerWeek']),
        ('onehot', OneHotEncoder(drop='first'), ['MaritalStatus', 'PartTimeFullTime']),
        ('pca_event_embeddings', PCA(n_components=250), cols_event_embeddings),
        ('passthrough_event_dummies', 'passthrough', cols_event_dummies),
        ('pca_body_parts_embeddings', PCA(n_components=80), cols_body_parts_embeddings),
        ('std_scaler', StandardScaler(), ['event_median_cost'])
    ],
    remainder='drop'
)

In [14]:
regressor_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('linear_model', LGBMRegressor(**params_best_high_lgbm))
])

best_model_high = TransformedTargetRegressor(
    regressor=regressor_pipeline,
    transformer=PowerTransformer(method='box-cox', standardize=False)
)

In [15]:
regressor_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('linear_model', XGBRegressor(**params_best_low_xgb))
])

best_model_low = TransformedTargetRegressor(
    regressor=regressor_pipeline,
    transformer=PowerTransformer(method='box-cox', standardize=False)
)

In [16]:
def train_cv_linear_combination(model_a, model_b, X, y, metric = 'rmse', kfold = 5, weights_a = [.25, .5, .75]):
    if metric != 'rmse':
        raise NotImplemented
    
    kf = KFold(n_splits=kfold, shuffle=True, random_state=42)
    results = []

    for fold, (train_idx, test_idx) in enumerate(kf.split(X)):
        print(f"Fold {fold + 1}")

        X_train, X_val = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[test_idx]

        model_a.fit(X_train, y_train)
        model_b.fit(X_train, y_train)

        preds_a = model_a.predict(X_val)
        preds_b = model_b.predict(X_val)

        for w in weights_a:
            preds = preds_a * w + preds_b * (1 - w)
            rmse = np.sqrt(mean_squared_error(y_val, preds))

            results.append((w, 1 - w, fold + 1, rmse))

    results_df = pd.DataFrame(results, columns=["Weight Model A", "Weight Model B", "Fold", "RMSE"])

    return results_df.groupby(["Weight Model A", "Weight Model B"])["RMSE"].mean().reset_index()


In [37]:
cv_results_linear = train_cv_linear_combination(
    best_model_high, 
    best_model_low, 
    dataset_train, 
    dataset_train['UltimateIncurredClaimCost'],
    weights_a = [.2, .4, .6, .8, .9, .95]
)

Fold 1
[LightGBM] [Info] Total Bins 85035
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.936574
Fold 2
[LightGBM] [Info] Total Bins 85031
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.924491
Fold 3
[LightGBM] [Info] Total Bins 85026
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.933449
Fold 4
[LightGBM] [Info] Total Bins 85034
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.930122
Fold 5
[LightGBM] [Info] Total Bins 85018
[LightGBM] [Info] Number of data points in the train set: 43200, number of used features: 395
[LightGBM] [Info] Start training from score 5.944325


In [38]:
cv_results_linear.columns = ['LGBM (best high)', 'XGB (best low)', 'RMSE']
cv_results_linear

Unnamed: 0,LGBM (best high),XGB (best low),RMSE
0,0.2,0.8,24082.831418
1,0.4,0.6,23970.52354
2,0.6,0.4,23903.391255
3,0.8,0.2,23881.808025
4,0.9,0.1,23888.142823
5,0.95,0.05,23895.589846


Best hybrid (linear combination) is 80% LGBM and 20% XGB

In [19]:
def train_cv_threshold_switch(model_a, model_b, X, y, threshold=100000, metric='rmse', kfold=5):
    if metric != 'rmse':
        raise NotImplementedError("Only RMSE is implemented.")

    kf = KFold(n_splits=kfold, shuffle=True, random_state=42)
    scores = []

    for fold, (train_idx, test_idx) in enumerate(kf.split(X)):
        print(f"Fold {fold + 1}")

        X_train, X_val = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[test_idx]

        model_a.fit(X_train, y_train)
        model_b.fit(X_train, y_train)

        preds_a = model_a.predict(X_val)
        preds_b = model_b.predict(X_val)

        final_preds = np.where(preds_a < threshold, preds_a, preds_b)

        rmse = np.sqrt(mean_squared_error(y_val, final_preds))
        scores.append(rmse)

    return np.mean(scores)


In [20]:
rmse_hybrid_thresh_50k = train_cv_threshold_switch(best_model_high, best_model_low, dataset_train, 
    dataset_train['UltimateIncurredClaimCost'], 50000)

rmse_hybrid_thresh_50k

Fold 1
[LightGBM] [Info] Total Bins 85035
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.936574
Fold 2
[LightGBM] [Info] Total Bins 85031
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.924491
Fold 3
[LightGBM] [Info] Total Bins 85026
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.933449
Fold 4
[LightGBM] [Info] Total Bins 85034
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.930122
Fold 5
[LightGBM] [Info] Total Bins 85018
[LightGBM] [Info] Number of data points in the train set: 43200, number of used features: 395
[LightGBM] [Info] Start training from score 5.944325


np.float64(24169.40174339567)

In [21]:
rmse_hybrid_thresh_50k

np.float64(24169.40174339567)

In [22]:
rmse_hybrid_thresh_30k = train_cv_threshold_switch(best_model_high, best_model_low, dataset_train, 
    dataset_train['UltimateIncurredClaimCost'], 30000)

Fold 1
[LightGBM] [Info] Total Bins 85035
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.936574
Fold 2
[LightGBM] [Info] Total Bins 85031
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.924491
Fold 3
[LightGBM] [Info] Total Bins 85026
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.933449
Fold 4
[LightGBM] [Info] Total Bins 85034
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.930122
Fold 5
[LightGBM] [Info] Total Bins 85018
[LightGBM] [Info] Number of data points in the train set: 43200, number of used features: 395
[LightGBM] [Info] Start training from score 5.944325


In [23]:
rmse_hybrid_thresh_30k

np.float64(24217.801825151408)

In [24]:
rmse_hybrid_thresh_10k = train_cv_threshold_switch(best_model_high, best_model_low, dataset_train, 
    dataset_train['UltimateIncurredClaimCost'], 10000)

Fold 1
[LightGBM] [Info] Total Bins 85035
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.936574
Fold 2
[LightGBM] [Info] Total Bins 85031
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.924491
Fold 3
[LightGBM] [Info] Total Bins 85026
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.933449
Fold 4
[LightGBM] [Info] Total Bins 85034
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.930122
Fold 5
[LightGBM] [Info] Total Bins 85018
[LightGBM] [Info] Number of data points in the train set: 43200, number of used features: 395
[LightGBM] [Info] Start training from score 5.944325


In [25]:
rmse_hybrid_thresh_10k

np.float64(24234.895772132604)

In [26]:
rmse_hybrid_thresh_80k = train_cv_threshold_switch(best_model_high, best_model_low, dataset_train, 
    dataset_train['UltimateIncurredClaimCost'], 80000)

Fold 1
[LightGBM] [Info] Total Bins 85035
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.936574
Fold 2
[LightGBM] [Info] Total Bins 85031
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.924491
Fold 3
[LightGBM] [Info] Total Bins 85026
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.933449
Fold 4
[LightGBM] [Info] Total Bins 85034
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.930122
Fold 5
[LightGBM] [Info] Total Bins 85018
[LightGBM] [Info] Number of data points in the train set: 43200, number of used features: 395
[LightGBM] [Info] Start training from score 5.944325


In [27]:
rmse_hybrid_thresh_80k

np.float64(24080.86036365586)

In [28]:
rmse_hybrid_thresh_100k = train_cv_threshold_switch(best_model_high, best_model_low, dataset_train, 
    dataset_train['UltimateIncurredClaimCost'], 100000)

Fold 1
[LightGBM] [Info] Total Bins 85035
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.936574
Fold 2
[LightGBM] [Info] Total Bins 85031
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.924491
Fold 3
[LightGBM] [Info] Total Bins 85026
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.933449
Fold 4
[LightGBM] [Info] Total Bins 85034
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.930122
Fold 5
[LightGBM] [Info] Total Bins 85018
[LightGBM] [Info] Number of data points in the train set: 43200, number of used features: 395
[LightGBM] [Info] Start training from score 5.944325


In [29]:
rmse_hybrid_thresh_100k

np.float64(23999.466951235314)

In [30]:
rmse_hybrid_thresh_150k = train_cv_threshold_switch(best_model_high, best_model_low, dataset_train, 
    dataset_train['UltimateIncurredClaimCost'], 150000)

Fold 1
[LightGBM] [Info] Total Bins 85035
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.936574
Fold 2
[LightGBM] [Info] Total Bins 85031
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.924491
Fold 3
[LightGBM] [Info] Total Bins 85026
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.933449
Fold 4
[LightGBM] [Info] Total Bins 85034
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.930122
Fold 5
[LightGBM] [Info] Total Bins 85018
[LightGBM] [Info] Number of data points in the train set: 43200, number of used features: 395
[LightGBM] [Info] Start training from score 5.944325


In [31]:
rmse_hybrid_thresh_150k

np.float64(23893.34845962478)

In [32]:
rmse_hybrid_thresh_200k = train_cv_threshold_switch(best_model_high, best_model_low, dataset_train, 
    dataset_train['UltimateIncurredClaimCost'], 200000)

Fold 1
[LightGBM] [Info] Total Bins 85035
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.936574
Fold 2
[LightGBM] [Info] Total Bins 85031
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.924491
Fold 3
[LightGBM] [Info] Total Bins 85026
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.933449
Fold 4
[LightGBM] [Info] Total Bins 85034
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.930122
Fold 5
[LightGBM] [Info] Total Bins 85018
[LightGBM] [Info] Number of data points in the train set: 43200, number of used features: 395
[LightGBM] [Info] Start training from score 5.944325


In [33]:
rmse_hybrid_thresh_200k

np.float64(23895.61309091909)

### Hybrid results 

The best hybrids are:
* Linear combination: 80% LightGBM/20% XGBoost
* Prediction threshold switch: Use prediction of LightGBM if over 150k. If not, use XGBoost.

Train on full dataset and prediction for test 

In [36]:
best_model_high.fit(dataset_train, dataset_train['UltimateIncurredClaimCost'])
best_model_low.fit(dataset_train, dataset_train['UltimateIncurredClaimCost'])

[LightGBM] [Info] Total Bins 85037
[LightGBM] [Info] Number of data points in the train set: 53999, number of used features: 395
[LightGBM] [Info] Start training from score 5.933758


In [39]:
y_test_pred_lgbm = best_model_high.predict(dataset_test)
y_test_pred_lgbm



array([ 6694.60384304,  2670.70323247, 16927.17956708, ...,
        6759.86278276,  7919.74898895,   404.66152841], shape=(36000,))

In [40]:
y_test_pred_xgb = best_model_low.predict(dataset_test)
y_test_pred_xgb

array([ 5850.143  ,  2790.7683 , 19612.176  , ...,  5887.704  ,
        6923.9497 ,   454.76395], shape=(36000,), dtype=float32)

Hybrid 1: linear combination:

In [None]:
# 80% lgbm, 20% xgb
y_pred_hybrid1 = y_test_pred_lgbm * .8 + y_test_pred_xgb * .2
y_pred_hybrid1

array([ 6525.71176096,  2694.7162725 , 17464.1789564 , ...,
        6585.43111976,  7720.58910815,   414.68201203], shape=(36000,))

In [42]:
submission_df = pd.DataFrame(zip(dataset_test.index, y_pred_hybrid1), columns=['ClaimNumber', 'UltimateIncurredClaimCost'])

print(submission_df.shape)
submission_df.head()

(36000, 2)


Unnamed: 0,ClaimNumber,UltimateIncurredClaimCost
0,WC8145235,6525.711761
1,WC2005111,2694.716272
2,WC6899143,17464.178956
3,WC5502023,354.007427
4,WC4785156,2908.186211


In [43]:
submission_df.to_csv('../data/output/submission_hybrid_linear.csv', index=False)

Hybrid 2: switch

In [48]:
# lgbm if pred over 150k
y_pred_hybrid2 = np.where(y_test_pred_lgbm > 150000, y_test_pred_lgbm, y_test_pred_xgb)
y_pred_hybrid1

array([ 6525.71176096,  2694.7162725 , 17464.1789564 , ...,
        6585.43111976,  7720.58910815,   414.68201203], shape=(36000,))

In [50]:
submission_df = pd.DataFrame(zip(dataset_test.index, y_pred_hybrid2), columns=['ClaimNumber', 'UltimateIncurredClaimCost'])

print(submission_df.shape)
submission_df.head()

(36000, 2)


Unnamed: 0,ClaimNumber,UltimateIncurredClaimCost
0,WC8145235,5850.143066
1,WC2005111,2790.768311
2,WC6899143,19612.175781
3,WC5502023,347.624939
4,WC4785156,3707.702148


In [51]:
submission_df.to_csv('../data/output/submission_hybrid_switch.csv', index=False)