In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
import pickle

from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from catboost import CatBoostRegressor

from sklearn.model_selection import cross_val_score, GroupKFold


from skopt.space import Real, Integer, Categorical
from skopt.utils import use_named_args
from skopt import gp_minimize

In [2]:
def load_data(dataset):
    
    if dataset == 'Nasa':
        B0005 = pd.read_csv('../Data & Outputs/Nasa/B0005_pro.csv')
        B0006 = pd.read_csv('../Data & Outputs/Nasa/B0006_pro.csv')
        B0007 = pd.read_csv('../Data & Outputs/Nasa/B0007_pro.csv')
        B0018 = pd.read_csv('../Data & Outputs/Nasa/B0018_pro.csv')
        df = pd.concat([B0005, B0006, B0007, B0018], axis=0)
        
        # Define the mapping between cell names and group numbers
        cell_to_group = {'B0005': 1, 'B0006': 2, 'B0007': 3, 'B0018': 4}

    elif dataset == 'Calce':
        CS2_35 = pd.read_csv('../Data & Outputs/Calce/CS2_35_pro.csv')
        CS2_36 = pd.read_csv('../Data & Outputs/Calce/CS2_36_pro.csv')
        CS2_37 = pd.read_csv('../Data & Outputs/Calce/CS2_37_pro.csv')
        CS2_38 = pd.read_csv('../Data & Outputs/Calce/CS2_38_pro.csv')

        df = pd.concat([CS2_35, CS2_36, CS2_37, CS2_38], axis=0)

        # Define the mapping between cell names and group numbers
        cell_to_group = {'CS2_35': 1, 'CS2_36': 2, 'CS2_37': 3, 'CS2_38': 4}
    
    # Apply lambda function to 'cell' column to convert each cell name to its group number
    df['group'] = df['cell'].apply(lambda x: cell_to_group[x])

    # Create a list of group numbers by sorting the 'group' column of the dataframe
    groups = list(df['group'].sort_values())

    return df, groups

def create_model(regressor):
    if regressor == 'RF':
        model = RandomForestRegressor(n_jobs=-1, random_state=42)
    elif regressor == 'XGB':
        model = XGBRegressor(n_jobs=-1, random_state=42)
    elif regressor == 'LGBM':
        model = LGBMRegressor(n_jobs=-1, random_state=42)
    elif regressor == 'CAT':
        model = CatBoostRegressor(silent=True, random_state=42)
    print(f'Model: {model.__class__.__name__}')
    return model

def save_model(model, dataset, regressor):
    
    with open(f'../Data & Outputs/{dataset}/{regressor}/{regressor}_best_model.pkl', 'wb') as f:
        pickle.dump(model, f)

In [6]:
datasets = ['Nasa', 'Calce']
regressors = ['RF', 'XGB', 'LGBM', 'CAT']


for dataset in datasets:
    for regressor in regressors:

        df, groups = load_data(dataset)
        model = create_model(regressor)

        print(f'Dataset: {dataset}')
        print(f'Regressor: {regressor}')
        print(f'Groups: {groups}')

        
        if dataset == 'Nasa':
            with open(f'../Data & Outputs/Nasa/{regressor}/{regressor}_selected_features.pkl', 'rb') as f:
                corr_features, mi_features, seq_fea_sel_features, rfe_cv_features = pickle.load(f)

            X = df.drop(['cycle', 'cell', 'SoH', 'group'], axis=1)
            y = df['SoH']
            features = seq_fea_sel_features
            X = X[features]

        elif dataset == 'Calce':
            
            X = df.drop(['cycle', 'cell', 'SoH', 'group'], axis=1)
            y = df['SoH']

            features = list(X.columns)
            X = X[features]

        gkf=GroupKFold(n_splits=4)

        # Set parameter grids for each model
        if regressor == 'RF':
            space = [Integer(10, 1000, name='n_estimators'),
                    Integer(1, 20, name='max_depth'),
                    Real(10**-5, 10**0, "log-uniform", name='min_samples_split'),
                    Real(10**-5, 10**-1, "log-uniform", name='min_samples_leaf')]
        
        elif regressor == 'XGB':
            space  = [Integer(1, 20, name='max_depth'),
            Real(10**-5, 10**0, "log-uniform", name='learning_rate'),
            Real(0.5, 1,"uniform", name='subsample'),
            Real(10**-5, 10**1, "uniform", name='gamma'),
            Real(10**-5, 10**0, "uniform", name='alpha')
            ]

        elif regressor == 'LGBM':
            space  = [Integer(2, 20, name='max_depth'),
            Real(10**-5, 10**0, "log-uniform", name='learning_rate'),
            Real(0.5, 1, "uniform", name='subsample'),
            Real(10**-5, 10**1, "uniform", name='reg_alpha'),
            Real(10**-5, 10**1, "uniform", name='reg_lambda')]
            
        elif regressor == 'CAT':
            space  = [Integer(1, 10, name='depth'),
            Real(0.01, 0.5, "uniform", name='learning_rate'),
            Real(0.1, 1, "uniform", name='subsample'),
            Real(0.1, 10, "uniform", name='l2_leaf_reg')]

        @use_named_args(space)
        def objective(**params):
            model.set_params(**params)

            return -np.mean(cross_val_score(model, X, y, cv=gkf, groups=groups, n_jobs=-1,
                                            scoring="neg_root_mean_squared_error"))


        res_gp = gp_minimize(objective, 
                            space, n_calls=200, 
                            random_state=42,
                            n_jobs=-1,
                            verbose=True)

        # Print best hyperparameters
        print('Best hyperparameters:', model.get_params())

        # Save best model with tuned parameters
        save_model(model, dataset, regressor)


Model: RandomForestRegressor
Dataset: Nasa
Regressor: RF
Groups: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3