# Step 2: Preprocessing & Classification model
This section loads the previously defined dataset settings from the pickles directory, runs the machine learning pipeline with the help of the `pycaret` library, and saves the resulting data.

In [None]:
# importing all packages needed in this section
import pandas as pd
import os
import sys 

from sklearn.metrics import (classification_report,  
                             matthews_corrcoef,
                             cohen_kappa_score)
from sklearn.model_selection import train_test_split

from pycaret.classification import *
from pycaret.containers.models.classification import get_all_model_containers

# utility functions for the experiment
sys.path.append('../src')

from tuning_grids import Grids
from utils import getPicklesFromDir, getExperimentConfig, run_pycaret_setup, translate_model_name
# Get global experiment settings; the result is indexed by key throughout this
# notebook (e.g. config['folders'], config['clf'], config['run_dataset'])
config = getExperimentConfig()
# 'folders' holds the directory and file locations used by the cells below
folders = config['folders']
# get a list of all settings for the datasets prepared beforehand,
# read from the location stored under folders['settings_dir']
dataset_settings = getPicklesFromDir(folders['settings_dir'])  

Each `dataset_settings` pickle is saved with the following structure:
```
"meta_data": meta_dataset,  # contains information about the dataset, including path
"setup_param": setup_param, # contains all the setup parameters for pycaret setup() function
"sdg_param": sdg_param,     # contains all sdg parameters for the CTGAN() function

```

In [None]:
# CSV file for model performance: if it already exists, load it so that the new
# results are added to the earlier ones; otherwise start from an empty frame.
if os.path.isfile(folders['model_perf_filepath']):
    model_performance_df = pd.read_csv(folders['model_perf_filepath'])
else:
    model_performance_df = pd.DataFrame()

# dictionary to save model performance (rebuilt for every dataset/model pair below)
performance_row = {}

# Optional filter of dataset ids; None means "run on every dataset".
run_dataset = config['run_dataset']

for settings in dataset_settings:

    if run_dataset is not None and settings['meta']['id'] not in run_dataset:
        # run_dataset contains dataset ids: run the experiment only on those
        continue

    # get path
    dataset_path = f"{folders['real_dir']}{settings['meta']['filename']}"
    settings['setup_param']['fold'] = config['clf']['cv_folds']
    # run setup function; `s` is the experiment object used for all pycaret calls below
    s = run_pycaret_setup(dataset_path, settings['setup_param'], meta=settings['meta'])

    print(f"Dataset: {settings['meta']['id']}-{settings['meta']['name']}")

    logg_tags = {
        'Dataset id': settings['meta']['id'],
        'Tuned on': 'original',
        'Trained on': 'original'
    }

    # get the holdout data (transformed by the pycaret pipeline) and persist the
    # untransformed test split next to the real dataset
    y_test = s.get_config('y_test_transformed')
    x_test_transformed = s.get_config('X_test_transformed')
    s.test.to_csv(f"{folders['real_dir']}{settings['meta']['id']}-{settings['meta']['name']}_test.csv", index=False)
    print(x_test_transformed.dtypes)

    # Rows collected for this dataset; they are concatenated once after the model
    # loop instead of growing the DataFrame row by row.
    dataset_rows = []

    # for each defined model in the global config
    # create specified model and tune it
    for ml_model in config['clf']['ml_models']:

        model_name = f"{settings['meta']['id']}-{translate_model_name(ml_model)}"
        print(model_name)

        logg_tags['model'] = ml_model

        # Create the model through the experiment object returned by setup, so it
        # is bound to the same experiment as tune_model()/pull() below.
        model = s.create_model(ml_model)

        tune_grid = Grids.get_tuning_grid(ml_model)
        model = s.tune_model(model, custom_grid=tune_grid, **config['clf']['tuning_param'])

        # get validation results (the score grid produced by the tune_model call)
        val_dict = s.pull().to_dict()

        y_pred = model.predict(x_test_transformed)
        print(y_pred[:5])
        print(y_test[:5])

        metrics = classification_report(y_true=y_test, y_pred=y_pred, output_dict=True, digits=4)
        holdout_score = pd.DataFrame.from_dict(metrics).transpose()

        print(holdout_score)
        test_metrics = {
            "Accuracy": metrics['accuracy'],
            "Precision_macro": metrics['macro avg']['precision'],
            "Recall_macro": metrics['macro avg']['recall'],
            "F1_macro": metrics['macro avg']['f1-score'],
            "Precision_weighted": metrics['weighted avg']['precision'],
            "Recall_weighted": metrics['weighted avg']['recall'],
            "F1_weighted": metrics['weighted avg']['f1-score'],
            "MCC": matthews_corrcoef(y_true=y_test, y_pred=y_pred),
            "Kappa": cohen_kappa_score(y1=y_test, y2=y_pred),
            "sklearn-report": metrics,
            "val_score": val_dict
        }
        # save results; the tuned estimator is bound to `model`
        performance_row = {**logg_tags, **test_metrics}
        performance_row['Params'] = model.get_params()
        dataset_rows.append(performance_row)

    # Add this dataset's rows in one step (DataFrame.append no longer exists in
    # pandas >= 2.0), then checkpoint the results to csv after every dataset.
    if dataset_rows:
        model_performance_df = pd.concat(
            [model_performance_df, pd.DataFrame(dataset_rows)],
            ignore_index=True,
        )
    # Save model performance to csv
    model_performance_df.to_csv(folders['model_perf_filepath'], index=False)

In [None]:
#%notify

### Testing that the provided hyperparameters work with pycaret and the system
#settings = dataset_settings[0]
#dataset_path = f"{folders['real_dir']}{settings['meta']['filename']}"
#s = run_pycaret_setup(dataset_path, settings['setup_param'])
#for ml_model in ['rf', 'gbc', 'mlp']: #config['clf']['ml_models']:
#    # create & tune model
#    #model = s.create_model(ml_model)
#    #Quickfix for efficiency
#    all_models = get_all_model_containers(s)
#    model = all_models[ml_model].class_def()

#    tune_grid = Grids.get_tuning_grid(ml_model)
#    tuned_model = s.tune_model(model, 
#                               **config['clf']['tuning_param'], 
#                               custom_grid=tune_grid
#                              )

In [None]:

#type_of_target(y_df)

#from sklearn.preprocessing import LabelEncoder

#label_encoder = LabelEncoder()
#y = label_encoder.fit_transform(y_df)

#display(type(y))
#display(type_of_target(y))