# Dataset ensemble

In [None]:
import logging
from logging.handlers import RotatingFileHandler
from pathlib import Path

import h5py
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_log_error
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import HistGradientBoostingRegressor

%cd ..
from ensembleset.dataset import DataSet
import examples.functions.plotting_functions as plot_funcs
import examples.functions.helper_functions as helper_funcs


# Name of this example and the directory its log file is written to.
example_name = 'calories'
log_path = 'examples/logs'

# Create the log directory if needed. delete_old_logs() is a project helper;
# presumably it removes this example's logs from earlier runs - TODO confirm.
Path(log_path).mkdir(parents=True, exist_ok=True)
helper_funcs.delete_old_logs(log_path, example_name)

function_logger = logging.getLogger(f'{__name__}.optimization_run')

# Rotating log file: rolls over at 10 MiB and keeps at most 10 backups.
file_handler = RotatingFileHandler(
    f'{log_path}/{example_name}.log',
    maxBytes=10 * 1024 * 1024,
    backupCount=10,
)

# Send all root-logger output at INFO and above to the rotating file.
logging.basicConfig(
    level=logging.INFO,
    format='%(levelname)s - %(name)s - %(message)s',
    handlers=[file_handler],
)

## 1. Data preparation

In [None]:
# Load the example data and drop the 'id' column.
data_df = pd.read_csv('examples/example_data/calories.csv').drop(columns='id')

# Split the rows in half and give each half a fresh, gap-free index.
train_df, test_df = (
    split.reset_index(drop=True)
    for split in train_test_split(data_df, test_size=0.5)
)

train_df.info()

## 2. Gradient boosting: original data

In [None]:
def _encode_features(frame):
    """Return the model inputs for ``frame``: every column except the label.

    The 'Sex' strings are converted to integer codes on that column only, so
    no other column is touched and the result does not depend on the silent
    downcasting that ``DataFrame.replace`` performs (deprecated in pandas).
    Values other than 'male'/'female' would become NaN.
    """
    sex_codes = {'male': 0, 'female': 1}

    return frame.drop(columns='Calories').assign(
        Sex=lambda features: features['Sex'].map(sex_codes)
    )


# Baseline: a single gradient boosting model fit on the original features.
model = HistGradientBoostingRegressor(loss='gamma')
model.fit(_encode_features(train_df), train_df['Calories'])

# Score the held-out half with root mean squared log error.
predictions = model.predict(_encode_features(test_df))
labels = test_df['Calories']
rmsle = np.sqrt(mean_squared_log_error(labels, predictions))

plot_title = f'HistGradientBoostingRegressor(): RMSLE {rmsle:.4f}'
plot_funcs.model_eval(plot_title, 'calories burned', predictions, labels)

## 3. Gradient boosting ensemble model

### 3.1. Generate ensembleset

In [None]:
# Number of datasets to generate, and therefore of stage I models to fit.
ensemble_members = 100

In [None]:
%%time

# Describe the train/test split to EnsembleSet: the label column, the
# string-valued features, and where the generated file should be written.
dataset_settings = {
    'label': 'Calories',
    'train_data': train_df,
    'test_data': test_df,
    'string_features': ['Sex'],
    'data_directory': 'examples/ensemblesets',
    'ensembleset_base_name': 'calories',
}
data_ensemble = DataSet(**dataset_settings)

# Generate one dataset per ensemble member. The return value is kept because
# it is used later as the file name under 'examples/ensemblesets'.
generation_settings = {
    'n_datasets': ensemble_members,
    'frac_features': 0.7,
    'n_steps': 3,
}
ensembleset_file = data_ensemble.make_datasets(**generation_settings)

### 3.2. Stage I model

In [None]:
# One fitted model and one vector of test-set predictions per ensemble member,
# both keyed by the member's index.
stage_one_models = {}
stage_one_test_predictions = {}

with h5py.File(f'examples/ensemblesets/{ensembleset_file}', 'r') as hdf:

    # Every member is fit against the same training labels, so read them from
    # the file once rather than once per fit.
    train_labels = hdf['train/labels'][:]

    for i in range(ensemble_members):

        print(f'\rFitting model {i}', end='')

        # Load this member's train/test features into memory explicitly.
        train_features = hdf[f'train/{i}'][:]
        test_features = hdf[f'test/{i}'][:]

        stage_one_models[i] = HistGradientBoostingRegressor(loss='gamma')
        stage_one_models[i].fit(train_features, train_labels)
        stage_one_test_predictions[i] = stage_one_models[i].predict(test_features)

    # The stage I test-set predictions (one column per member) plus the test
    # labels form the training data for the stage II model.
    stage_two_training_df = pd.DataFrame.from_dict(stage_one_test_predictions)
    stage_two_training_df['labels'] = hdf['test/labels'][:]

stage_two_training_df.head()

### 3.2. Stage II model

#### 3.2.1. Cross-validation

In [None]:
# Cross-validate the stage II model on the stage I predictions (7 folds).
stage_two_features = stage_two_training_df.drop(columns='labels')
stage_two_labels = stage_two_training_df['labels']

scores = cross_val_score(
    HistGradientBoostingRegressor(loss='gamma'),
    stage_two_features,
    stage_two_labels,
    scoring='neg_mean_squared_log_error',
    cv=7,
    n_jobs=-1,
)

# The scorer reports negated MSLE, so flip the sign before taking the root.
fold_rmsle = np.sqrt(-scores)
print(f'Cross-validation RMSLE: {fold_rmsle.mean():.4f} +/- {fold_rmsle.std():.4f}')

#### 3.2.2. Test set evaluation

In [None]:
# Hold out a quarter of the stage II data for evaluation.
# NOTE(review): this rebinds train_df/test_df from the data preparation cell,
# so earlier cells that expect the 'Calories' column cannot be re-run after
# this one without re-running the notebook from the top.
train_df, test_df = (
    split.reset_index(drop=True)
    for split in train_test_split(stage_two_training_df, test_size=0.25)
)

# Fit the stage II model on the stage I predictions.
model = HistGradientBoostingRegressor(loss='gamma')
model.fit(train_df.drop(columns='labels'), train_df['labels'])

# Score the held-out rows with root mean squared log error.
labels = test_df['labels']
predictions = model.predict(test_df.drop(columns='labels'))
rmsle = np.sqrt(mean_squared_log_error(labels, predictions))

plot_title = f'Stage II EnsembleSet model: RMSLE {rmsle:.4f}'
plot_funcs.model_eval(
    plot_title,
    'calories burned',
    predictions,
    labels,
)