# Step 4: Create models with synthetic data (SD)

In [4]:
import pandas as pd 
import os 
import sys 
import pickle
import re

from pycaret.classification import ClassificationExperiment
from pycaret.containers.models.classification import get_all_model_containers

from sklearn.metrics import (classification_report, 
                             roc_auc_score, 
                             matthews_corrcoef,
                             cohen_kappa_score)
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit, cross_validate

# Import help methods
sys.path.append('../src')
from utils import (getExperimentConfig, 
                   getPicklesFromDir, 
                   run_pycaret_setup, 
                   translate_model_name,
                   get_synthetic_filepaths_from_original_data_id,
                   convert_and_clean_dict)

from tuning_grids import Grids

# Load the experiment-wide configuration via the project helper in ../src/utils.
# Later cells read config['clf'] and config['run_dataset'] from it.
config = getExperimentConfig()
# Mapping of named directories / file paths used throughout the notebook
# (e.g. 'settings_dir', 'real_dir', 'sd_dir', 'log_dir', 'model_perf_filepath').
folders = config['folders']
# Load the per-dataset settings objects that were pickled for the real data.
# Each entry is later indexed with 'meta' and 'setup_param'
# (presumably one pickle per original dataset -- confirm against utils.getPicklesFromDir).
dataset_settings = getPicklesFromDir(folders['settings_dir'])

In [7]:
import numpy as np


def _sample_stratified_test_data(data, target, fraction):
    """Draw a shuffled, class-stratified sample of ``data``.

    Args:
        data: DataFrame to sample from.
        target: Name of the column whose values define the strata.
        fraction: Fraction of each stratum to draw; the per-stratum row
            count is ``fraction * len(stratum)`` rounded to the nearest integer.

    Returns:
        A new DataFrame with the sampled rows in random order and a fresh
        0..n-1 index.
    """
    # NOTE(review): no random_state is passed, so the sample depends on the
    # state of the global NumPy RNG -- confirm whether it is seeded elsewhere.
    per_class = data.groupby(target, group_keys=False).apply(
        lambda grp: grp.sample(int(np.rint(fraction * len(grp)))))
    # Shuffle so the rows are no longer ordered by class.
    return per_class.sample(frac=1).reset_index(drop=True)


def _score_predictions(y_true, y_pred):
    """Compute the hold-out metrics stored for every trained model.

    Args:
        y_true: Ground-truth labels.
        y_pred: Labels predicted by the model.

    Returns:
        dict mapping metric name to value: accuracy, macro and weighted
        precision/recall/F1, Matthews correlation coefficient, Cohen's kappa.
    """
    report = classification_report(y_true=y_true, y_pred=y_pred, output_dict=True, digits=4)
    return {
        "Accuracy": report['accuracy'],
        "Precision_macro": report['macro avg']['precision'],
        "Recall_macro": report['macro avg']['recall'],
        "F1_macro": report['macro avg']['f1-score'],
        "Precision_weighted": report['weighted avg']['precision'],
        "Recall_weighted": report['weighted avg']['recall'],
        "F1_weighted": report['weighted avg']['f1-score'],
        "MCC": matthews_corrcoef(y_true=y_true, y_pred=y_pred),
        "Kappa": cohen_kappa_score(y1=y_true, y2=y_pred),
    }


# read performance data from Step 2
model_performance_df = pd.read_csv(folders['model_perf_filepath'])
# Target metric configured for model tuning. Not used in this cell while the
# custom tuning call is disabled (see the tune_model call below).
sort_by = config['clf']['tuning_param']['optimize']

# Optional allow-list of dataset ids; None means "process every dataset".
run_dataset = config['run_dataset']

for settings in dataset_settings:
    dataset_id = settings['meta']['id']
    if run_dataset is not None and dataset_id not in run_dataset:
        continue
    print(f"Starting model training for {dataset_id}.")

    # Alias of the same dict object: the updates below are visible via `settings`.
    setup_param = settings['setup_param']
    setup_param['fold'] = config['clf']['cv_folds']
    # update system_log name
    setup_param['system_log'] = folders['log_dir']+"Step4_SD"
    # disable saving train-test split data (to save space)
    setup_param['log_data'] = False
    target_label = settings['meta']['target']

    # load original dataset; a cols_dtype of None lets pandas infer the dtypes
    original_data = pd.read_csv(f"{folders['real_dir']}{settings['meta']['filename']}",
                                dtype=settings['meta']['cols_dtype'])

    # Models to train are the ones already recorded for the original dataset.
    filtered_df = model_performance_df[model_performance_df["Dataset id"] == dataset_id]
    # De-duplicate (order preserved) so each model is trained at most once per
    # synthetic dataset even if the original dataset has several rows for it.
    candidate_models = list(dict.fromkeys(filtered_df['model']))

    synthetic_datasets = get_synthetic_filepaths_from_original_data_id(dataset_id)

    for sd_filename in synthetic_datasets:
        sd_id = os.path.splitext(sd_filename)[0]
        print(f"Starting training for synthetic dataset {sd_id}.")
        # Quality tag embedded in the file name, e.g. "Q10" in "SD00Q10_0".
        quality = re.findall(r'Q\d+', sd_id)[0]
        sd_path = folders['sd_dir']+sd_filename

        # Check which models have already been evaluated for this synthetic
        # dataset, so the (expensive) setup is skipped when nothing is left to do.
        done_mask = ((model_performance_df['Dataset id'] == sd_id) &
                     (model_performance_df['Tuned on'] == 'synthetic') &
                     (model_performance_df['Trained on'] == 'synthetic'))
        evaluated_models = set(model_performance_df.loc[done_mask, 'model'])
        pending_models = [m for m in candidate_models if m not in evaluated_models]
        if not pending_models:
            continue

        setup_param['verbose'] = False

        # retrieve test data: stratified sample from the original data
        # NOTE(review): the sampled fraction is taken from 'train_size' --
        # confirm this is intended rather than 1 - train_size.
        train_size = setup_param['train_size']
        test_data = _sample_stratified_test_data(original_data, target_label, train_size)
        setup_param['test_data'] = test_data
        setup_param['index'] = False
        s = run_pycaret_setup(sd_path, setup_param, meta=settings['meta'])

        new_rows = []
        for ml_model in pending_models:
            # Add custom tags to the logg, defining dataset type, and Id
            logg_tags = {
                'Dataset id': sd_id,
                'model': ml_model,
                'Quality': quality,
                'Trained on': 'synthetic',
                'Tuned on': 'synthetic',
                'SDG': sd_id.split("_")[0],
            }

            print(f"Training {ml_model}...", end="")
            # train the model on synthetic data
            model = s.create_model(ml_model)
            print("done.")
            print(f"Tuning {ml_model}...", end="")
            # The custom grid (Grids.get_tuning_grid + config['clf']['tuning_param'])
            # was reported as buggy, so PyCaret's default tuning is used instead.
            model = s.tune_model(model)
            print("done.")

            # Evaluate on the hold-out sample drawn from the original data.
            x_test = s.get_config("X_test_transformed")
            y_test = s.get_config("y_test_transformed")
            y_pred = model.predict(x_test)
            test_metrics = _score_predictions(y_test, y_pred)

            # save results
            performance_row = {**logg_tags, **test_metrics}
            performance_row['Params'] = model.get_params()
            new_rows.append(performance_row)

        # Build the frame from a list of records: passing the dict itself would
        # expand the nested 'Params' dict into one row per hyperparameter.
        model_performance_df = pd.concat(
            [model_performance_df, pd.DataFrame(new_rows)], ignore_index=True)

        # update model performance to csv after each sd_id
        model_performance_df.to_csv(folders['model_perf_filepath'], index=False)

Starting model training for D00.
Starting training for synthetic dataset SD00Q10_0.
Starting training for synthetic dataset SD00Q10_1.
Starting training for synthetic dataset SD00Q10_2.
Starting training for synthetic dataset SD00Q10_3.
Starting training for synthetic dataset SD00Q10_4.
Starting training for synthetic dataset SD00Q10_5.
Starting training for synthetic dataset SD00Q10_6.
Starting training for synthetic dataset SD00Q10_7.
Starting training for synthetic dataset SD00Q10_8.
Starting training for synthetic dataset SD00Q10_9.
