In [1]:
import os

import numpy as np
from scipy.stats import randint as sp_randint
from scipy.stats import uniform
from sklearn.ensemble import GradientBoostingClassifier, BaggingClassifier
from sklearn.ensemble import (
    RandomForestClassifier,
    ExtraTreesClassifier,
    AdaBoostClassifier,
)
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

from bclassification.utils_fc import (
    print_dataset,
    obs_to_vect_with_tc,
    obs_vects_to_vect,
)
from bclassification.utils_skl import train, train_model
from experience import load_experience
from lib.action_space import is_do_nothing_action
from lib.constants import Constants as Const
from lib.data_utils import make_dir, env_pf, extract_target_windows, moving_window
from lib.dc_opf import TopologyConverter
from lib.visualizer import Visualizer, pprint

# Instantiated only for its side effects; the returned object is discarded.
# NOTE(review): presumably configures global plotting defaults - confirm in lib.visualizer.
Visualizer()

# --- Run configuration -------------------------------------------------------
agent_name = "agent-mip"
case_name = "l2rpn_2019_art"
env_dc = True
verbose = False

# --- Directories -------------------------------------------------------------
# Experience is read from the "performance-aug" results folder. The alternative
# source that was previously used is kept here for reference:
#   experience_dir = make_dir(os.path.join(Const.EXPERIENCE_DIR, "data-aug"))
experience_dir = make_dir(os.path.join(Const.RESULTS_DIR, "performance-aug"))
results_dir = make_dir(os.path.join(Const.RESULTS_DIR, "bc-fc"))

# One results sub-folder per (case, power-flow mode) combination.
case_results_dir = make_dir(
    os.path.join(results_dir, f"{case_name}-{env_pf(env_dc)}")
)

# --- Load the recorded agent experience --------------------------------------
case, collector = load_experience(
    case_name, agent_name, experience_dir, env_dc=env_dc
)

pprint("    - Number of chronics:", len(collector.chronic_ids))


L2RPN_2019_ART (dc)


--------------------------------------------------------------------------------
                                        Loading Experience
--------------------------------------------------------------------------------
    - Loading chronics:                 ./results/performance-aug/l2rpn_2019_art-dc/agent-mip-chronic-****
    - Number of chronics:               67


In [2]:
"""
    Parameters
"""
from sklearn.svm import SVC

# Single seed reused for NumPy sampling, the train/test splits and every estimator.
random_seed = 1

model_type = "skl"

# Window sizes handed to extract_history_windows / moving_window when the
# datasets are assembled.
n_window_targets = 0
n_window_history = 1

# Fraction held out by each train_test_split call, and the Bernoulli rate used
# to subsample negative examples.
test_frac = 0.10
downsampling_rate = 0.05

# One entry per classifier family:
#   "param_dist" - distributions sampled by the randomized hyper-parameter search
#   "cls"        - the (unfitted) estimator those parameters are applied to
# scipy conventions: uniform(loc, scale) covers [loc, loc + scale] and
# randint(low, high) excludes `high`, so sp_randint(1, 3) yields 1 or 2.
param_dist = dict(
    svm=dict(
        param_dist=dict(C=uniform(0.2, 1.2)),
        cls=SVC(class_weight="balanced", random_state=random_seed, probability=True),
    ),
    et=dict(
        param_dist=dict(
            criterion=["gini", "entropy"],
            max_depth=sp_randint(1, 3),
            n_estimators=sp_randint(10, 20),
        ),
        cls=ExtraTreesClassifier(
            random_state=random_seed, n_jobs=-1, class_weight="balanced"
        ),
    ),
    ada=dict(
        param_dist=dict(n_estimators=sp_randint(10, 20)),
        cls=AdaBoostClassifier(learning_rate=0.1, random_state=random_seed),
    ),
    rf=dict(
        param_dist=dict(
            criterion=["gini", "entropy"],
            max_depth=sp_randint(1, 3),
            n_estimators=sp_randint(10, 20),
        ),
        cls=RandomForestClassifier(
            class_weight="balanced", random_state=random_seed, n_jobs=-1
        ),
    ),
    dtc=dict(
        param_dist=dict(
            max_depth=sp_randint(1, 3),
            min_samples_split=uniform(0, 1.0),
        ),
        cls=DecisionTreeClassifier(
            class_weight="balanced", random_state=random_seed
        ),
    ),
)

In [None]:
"""
    Datasets
"""
from lib.data_utils import indices_to_hot
from lib.data_utils import extract_history_windows

# NOTE: the builtin bool / float / int are used as dtypes throughout this cell.
# The NumPy aliases np.bool, np.float and np.int were deprecated in NumPy 1.20
# and removed in 1.24; they were plain aliases of the builtins, so the
# resulting arrays are unchanged.
np.random.seed(random_seed)

labels = []
mask_targets = []
Y_all = []
X_all = []

# Observation -> feature-vector function built from the environment topology.
obs_to_vect = obs_to_vect_with_tc(TopologyConverter(case.env))

for chronic_idx, chronic_data in collector.data.items():
    # The last observation is dropped before pairing with the actions.
    # NOTE(review): presumably there is one more observation than actions - confirm.
    chronic_obses = chronic_data["obses"][:-1]
    chronic_labels = is_do_nothing_action(chronic_data["actions"], case.env, dtype=bool)

    # Negatives are subsampled with probability `downsampling_rate`, excluding
    # every step flagged by extract_history_windows around a positive label.
    mask_positives = extract_history_windows(chronic_labels, n_window=n_window_targets)
    mask_negatives = np.logical_and(
        np.random.binomial(1, downsampling_rate, len(chronic_labels)).astype(bool),
        ~mask_positives
    )
    # Keep every positive plus the sampled negatives.
    chronic_mask_targets = np.logical_or(chronic_labels, mask_negatives)

    # Observation history features
    chronic_X_obses = moving_window(
        chronic_obses,
        n_window=n_window_history,
        process_fn=obs_to_vect,
        combine_fn=obs_vects_to_vect,
        padding=np.zeros_like(obs_to_vect(chronic_obses[0])),
    )

    # Action history features: the label of the previous step, with the first
    # step (which has no predecessor) forced to 0.
    chronic_actions = np.roll(chronic_labels, 1).astype(float)
    chronic_actions[0] = 0.0

    chronic_X_actions = moving_window(
            chronic_actions,
            n_window=n_window_history,
            # Two-class one-hot encoding of the previous label, scaled by 10.
            process_fn=lambda x: 10.0 * indices_to_hot([int(x)], length=2, dtype=float),
            combine_fn=lambda x: np.concatenate(x),
            padding=np.zeros((2, )),
    )

    chronic_X = np.hstack((chronic_X_obses, chronic_X_actions))

    labels.extend(chronic_labels)
    mask_targets.extend(chronic_mask_targets)
    X_all.extend(chronic_X)
    Y_all.extend(chronic_labels)

labels = np.array(labels)
mask_targets = np.array(mask_targets)
X_all = np.vstack(X_all).astype(float)
Y_all = np.array(Y_all).astype(int)

# Restrict to the selected targets (all positives + subsampled negatives).
X = X_all[mask_targets, :]
Y = Y_all[mask_targets]

# Hold out `test_frac` for testing, then the same fraction of the remainder
# for validation.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=test_frac, random_state=random_seed
)
X_train, X_val, Y_train, Y_val = train_test_split(
    X_train, Y_train, test_size=test_frac, random_state=random_seed
)

pprint(
    "    - Labels:",
    f"{labels.sum()}/{labels.size}",
    "{:.2f} %".format(100 * labels.mean()),
)

print_dataset(X_all, Y_all, "All data")
print_dataset(X, Y, "Data")
print_dataset(X_train, Y_train, "Train")
print_dataset(X_val, Y_val, "Validation")
print_dataset(X_test, Y_test, "Test")

In [None]:
# Run the project-level `train` helper (imported from bclassification.utils_skl)
# on the train/test split, with both standard scaling and power scaling enabled.
# Note that `random_search` receives the *whole* `param_dist` mapping (every
# classifier family) together with the value 10, and `cross_validation` receives
# (True, 5, "balanced_accuracy").
# NOTE(review): presumably `train` iterates over the families itself and treats
# 10 as the number of search iterations - confirm against utils_skl.
train(
    train_data=(X_train, Y_train),
    test_data=(X_test, Y_test),
    scaling=True,
    power_scaling=True,
    random_search=(True, param_dist, 10,),
    cross_validation=(True, 5, "balanced_accuracy"),
)

In [8]:
import numpy as np
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import (
    accuracy_score,
    matthews_corrcoef,
    precision_recall_fscore_support,
)
from sklearn.model_selection import RandomizedSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler, PowerTransformer

from lib.visualizer import pprint

def describe_dataset(x, y, name):
    """Print the shapes and class balance of a dataset and return the statistics.

    Parameters
    ----------
    x : array-like exposing ``.shape``
        Feature matrix; only its shape is reported.
    y : array-like exposing ``.shape``
        Targets; the distinct values are taken from ``np.unique(y)``.
    name : str
        Heading printed for this dataset.

    Returns
    -------
    tuple
        ``(labels, count, weights)``: the sorted distinct target values, the
        number of occurrences of each one, and each count divided by the
        total of all counts.
    """
    classes = np.unique(y)

    # Occurrences of every distinct target value, in the order of `classes`.
    occurrences = []
    for cls in classes:
        occurrences.append(np.count_nonzero(y == cls))

    total = sum(occurrences)
    fractions = [n_cls / total for n_cls in occurrences]

    pprint(f"    - {name}", "X" + str(x.shape), "Y" + str(y.shape))
    pprint("        - Labels:", str(classes))
    pprint("        - Count:", str(occurrences))
    pprint("        - Weights:", str(fractions))

    return classes, occurrences, fractions


def train_model(
        train_data,
        test_data,
        model,
        scaling=True,
        power_scaling=False,
        random_search=(False,),
        cross_validation=(False, 5, "balanced_accuracy"),
):
    """Fit a classifier, report train/test MCC and optionally cross-validate it.

    Parameters
    ----------
    train_data, test_data : tuple
        ``(x, y)`` pairs. Targets may be 1-D or column vectors; they are
        flattened before being handed to the estimator and the metrics.
    model : estimator
        Object exposing ``fit`` / ``predict``.
    scaling : bool
        Apply a ``StandardScaler`` fitted on the training features.
    power_scaling : bool
        Apply a ``PowerTransformer`` fitted on the (possibly already
        standardized) training features.
    random_search : tuple
        ``(enabled, param_distributions, n_iter[, scoring])``. Only the first
        element is needed when the search is disabled. ``scoring`` is optional
        and defaults to ``None`` (the estimator's own score).
    cross_validation : tuple
        ``(enabled, n_folds[, ...])``. Elements after the second are ignored:
        the reported metrics are always f1, accuracy, balanced_accuracy,
        recall and precision.

    Returns
    -------
    None
        All results are reported through ``pprint``.

    Notes
    -----
    Cross-validation runs on the stacked, already transformed train + test
    data, so the scalers are not re-fitted inside each fold.
    """
    x_train, y_train = train_data
    x_test, y_test = test_data

    # Load data
    describe_dataset(x_train, y_train, "Train")
    describe_dataset(x_test, y_test, "Test")

    # Flatten the targets once so fitting, scoring and the cross-validation
    # stacking below all see the same 1-D layout.
    y_train = np.asarray(y_train).ravel()
    y_test = np.asarray(y_test).ravel()

    # Feature scaling
    if scaling:
        scaler = StandardScaler()
        scaler.fit(x_train)
        x_train = scaler.transform(x_train)
        x_test = scaler.transform(x_test)

    if power_scaling:
        ptransformer = PowerTransformer()
        ptransformer.fit(x_train)
        x_train = ptransformer.transform(x_train)
        x_test = ptransformer.transform(x_test)

    # Training
    if random_search[0]:
        # The scoring entry is optional so that 3-element tuples keep working.
        scoring = random_search[3] if len(random_search) > 3 else None
        rs = RandomizedSearchCV(
            model,
            param_distributions=random_search[1],
            n_iter=random_search[2],
            cv=5,
            n_jobs=-1,
            random_state=0,
            scoring=scoring,
        )
        rs.fit(x_train, y_train)
        # With the default refit=True the best estimator has already been
        # fitted on the full training set, so it is used as is. It is taken
        # unconditionally instead of being truth-tested, because estimators
        # that define __len__ (ensembles, pipelines) have no reliable truth value.
        model = rs.best_estimator_
        pprint("    - Random Search best:", str(model))
    else:
        model.fit(x_train, y_train)

    y_train_pred = model.predict(x_train)
    y_test_pred = model.predict(x_test)

    pprint("    - MCC:", "")
    pprint("        - Train:", matthews_corrcoef(y_train, y_train_pred))
    pprint("        - Test:", matthews_corrcoef(y_test, y_test_pred))

    if cross_validation[0]:
        pprint("    - CV:", f"{cross_validation[1]}-fold")

        # The pooled data does not depend on the metric: build it once.
        x_all = np.vstack((x_train, x_test))
        y_all = np.concatenate((y_train, y_test))

        for metric in ["f1", "accuracy", "balanced_accuracy", "recall", "precision"]:
            score = cross_val_score(
                model,
                x_all,
                y_all,
                cv=cross_validation[1],
                scoring=metric,
                n_jobs=-1,
            )
            pprint("    - Metric:", metric)
            pprint("        - Mean:", score.mean())
            pprint("        - Std:", score.std())

In [None]:
# Tune and evaluate every classifier family declared in `param_dist`
# (SVM included): 5 random-search draws scored on F1, then 5-fold CV.
for model_name in param_dist:
    model_params = param_dist[model_name]

    pprint("Model:", model_name.upper())
    train_model(
        (X_train, Y_train),
        (X_test, Y_test),
        model_params["cls"],
        scaling=True,
        power_scaling=True,
        random_search=(True, model_params["param_dist"], 5, "f1"),
        cross_validation=(True, 5),
    )

Model:                                  SVM
    - Train                             X(20208, 496)	Y(20208,)
        - Labels:                       [0 1]
        - Count:                        [16837, 3371]
        - Weights:                      [0.8331848772763262, 0.1668151227236738]
    - Test                              X(2495, 496)	Y(2495,)
        - Labels:                       [0 1]
        - Count:                        [2054, 441]
        - Weights:                      [0.823246492985972, 0.17675350701402806]


divide by zero encountered in log


    - Random Search best:               SVC(C=1.0582272396469035, class_weight='balanced', probability=True,
    random_state=1)
    - MCC:                              
        - Train:                        0.5319812482799416
        - Test:                         0.444545988772654
    - CV:                               5-fold
    - Metric:                           f1
        - Mean:                         0.5392353378630034
        - Std:                          0.007308380237401398
    - Metric:                           accuracy
        - Mean:                         0.7703826031449145
        - Std:                          0.007437372312604266


In [None]:
# Tune and evaluate every classifier family in `param_dist` except the SVM.
for model_name, model_params in param_dist.items():
    # The keys of `param_dist` carry no trailing colon ("svm", "et", ...), so
    # the comparison must be against "svm" for the skip to take effect. The
    # check comes first so no header is printed for a model that is not run.
    if model_name == "svm":
        continue

    pprint("Model:", model_name.upper())
    train_model(
        train_data=(X_train, Y_train),
        test_data=(X_test, Y_test),
        model=model_params["cls"],
        scaling=True,
        power_scaling=True,
        random_search=(True, model_params["param_dist"], 5, "f1"),
        cross_validation=(True, 5),
    )