In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
import time
import pickle

from lightgbm import LGBMRegressor
from xgboost import XGBRegressor

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error
from catboost import CatBoostRegressor

import matplotlib.ticker as mtick
import matplotlib.patches as mpatches
from pandas.plotting import table
import dataframe_image as dfi

from sklearn.model_selection import  GroupKFold, cross_validate
from sklearn.metrics import make_scorer

In [2]:
def evaluate_performance(df, features, dataset, groups, model=RandomForestRegressor(random_state=42)):
    
    if dataset == 'Nasa':
            
        X = df.drop(['cycle', 'cell', 'SoH', 'group'], axis=1)
        y = df['SoH']

        X = X[features]
    
    elif dataset == 'Calce':
        X = df.drop(['cycle', 'cell', 'SoH', 'group'], axis=1)
        y = df['SoH']
        X = X[features]
    
    gkf = GroupKFold(n_splits=4)

    scoring_metrics = {
        'mse': 'neg_mean_squared_error',
        'rmse': make_scorer(mean_squared_error, squared=False),
        'r2': 'r2',
        'mae': 'neg_mean_absolute_error',
        'mape': make_scorer(mean_absolute_percentage_error)
    }

    start_time = time.time()
    
    # Replace special characters in column names
    X.columns = X.columns.str.replace('[^\w\s]', '')


    results = cross_validate(model, X, y, cv=gkf, groups=groups, scoring=scoring_metrics, n_jobs=-1, return_train_score=False)

    end_time = time.time()

    result = {}
    result['mse'] = -np.mean(results['test_mse'])
    result['rmse'] = np.mean(results['test_rmse'])
    result['r2'] = np.mean(results['test_r2'])
    result['mae'] = -np.mean(results['test_mae'])
    result['mape'] = np.mean(results['test_mape'])
    result['time'] = end_time - start_time
    
    return result


def load_data(dataset, regressor):
    
    if dataset == 'Nasa':
        B0005 = pd.read_csv('../Data & Outputs/Nasa/B0005_pro.csv')
        B0006 = pd.read_csv('../Data & Outputs/Nasa/B0006_pro.csv')
        B0007 = pd.read_csv('../Data & Outputs/Nasa/B0007_pro.csv')
        B0018 = pd.read_csv('../Data & Outputs/Nasa/B0018_pro.csv')
        df = pd.concat([B0005, B0006, B0007, B0018], axis=0)
        cell_to_group = {'B0005': 1, 'B0006': 2, 'B0007': 3, 'B0018': 4}

    elif dataset == 'Calce':
        CS2_35 = pd.read_csv('../Data & Outputs/Calce/CS2_35_pro.csv')
        CS2_36 = pd.read_csv('../Data & Outputs/Calce/CS2_36_pro.csv')
        CS2_37 = pd.read_csv('../Data & Outputs/Calce/CS2_37_pro.csv')
        CS2_38 = pd.read_csv('../Data & Outputs/Calce/CS2_38_pro.csv')

        df = pd.concat([CS2_35, CS2_36, CS2_37, CS2_38], axis=0)
        cell_to_group = {'CS2_35': 1, 'CS2_36': 2, 'CS2_37': 3, 'CS2_38': 4}
    df['group'] = df['cell'].apply(lambda x: cell_to_group[x])
    groups = list(df['group'].sort_values())

    return df, groups

def create_model(dataset, regressor, type):

    if type == 'Hyper':
        with open(f'../Data & Outputs/{dataset}/{regressor}/{regressor}_best_model.pkl', 'rb') as f:
            model = pickle.load(f)
    
    elif type == 'Regular': 
        if regressor == 'RF':
            model = RandomForestRegressor(n_jobs=-1, random_state=42)
        elif regressor == 'XGB':
            model = XGBRegressor(n_jobs=-1, random_state=42)
        elif regressor == 'LGBM':
            model = LGBMRegressor(n_jobs=-1, random_state=42)
        elif regressor == 'CAT':
            model = CatBoostRegressor(silent=True, random_state=42)

    return model

def save_results(df_result, dataset, regressor, type):
    if type == 'Hyper':
        df_result.to_csv(f'../Data & Outputs/{dataset}/{regressor}/{regressor}_results_hyper.csv')

        df_styled = df_result.style.background_gradient()
        df_styled.export_png(f'../Data & Outputs/{dataset}/{regressor}/{regressor}_results_hyper.png', dpi=100)

    elif type == 'Regular':
        df_result.to_csv(f'../Data & Outputs/{dataset}/{regressor}/{regressor}_results_regular.csv')

        df_styled = df_result.style.background_gradient()
        df_styled.export_png(f'../Data & Outputs/{dataset}/{regressor}/{regressor}_results_regular.png', dpi=100)

In [3]:
datasets = ['Nasa', 'Calce']
regressors = ['RF', 'XGB', 'LGBM', 'CAT']
types = ['Hyper', 'Regular']

for dataset in datasets:
    for regressor in regressors:
        for type in types:
        

            df, groups = load_data(dataset, regressor)
            model = create_model(dataset, regressor, type)

            print(f'Dataset: {dataset}')
            print(f'Regressor: {regressor}')
            print(f'Type: {type}')
            print(f'Groups: {groups}')

            if dataset == 'Nasa':
                with open(f'../Data & Outputs/Nasa/{regressor}/{regressor}_selected_features.pkl', 'rb') as f:
                    corr_features, mi_features, seq_fea_sel_features, rfe_cv_features = pickle.load(f)

                X = df.drop(['cycle', 'cell', 'SoH', 'group'], axis=1)
                y = df['SoH']
            
            elif dataset == 'Calce':
                X = df.drop(['cycle', 'cell', 'SoH', 'group'], axis=1)
                y = df['SoH']

            all_features = list(X.columns)
            
            if dataset == 'Nasa':
                ep_all = evaluate_performance(df, all_features, dataset, groups, model)
                ep_corr = evaluate_performance(df, corr_features, dataset,groups, model)
                ep_chi2 = evaluate_performance(df, mi_features, dataset,groups, model)
                ep_sfs = evaluate_performance(df, seq_fea_sel_features, dataset,groups, model)
                ep_rfe = evaluate_performance(df, rfe_cv_features, dataset,groups, model)
            
            elif dataset == 'Calce':
                ep_all = evaluate_performance(df, all_features, dataset, groups, model)
                ep_CCCT = evaluate_performance(df, ['CCCT'], dataset, groups, model)
                ep_CVCT = evaluate_performance(df, ['CVCT'], dataset, groups, model)

            if dataset == 'Nasa':
                
                # create a list of the dictionaries
                dict_list = [ep_all, ep_corr, ep_chi2, ep_sfs, ep_rfe]
                dict_list_name = ['base', 'corr', 'mi', 'sfs', 'rfe']
                df_result = pd.DataFrame(dict_list, index=[d for d in dict_list_name])
                save_results(df_result, dataset, regressor, type)
            
            elif dataset == 'Calce':
                dict_list = [ep_all, ep_CCCT, ep_CVCT]
                dict_list_name = ['base', 'CCCT', 'CVCT']
                df_result = pd.DataFrame(dict_list, index=[d for d in dict_list_name])
                save_results(df_result, dataset, regressor, type)

Dataset: Nasa
Regressor: RF
Type: Hyper
Groups: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 