# Load packages

In [None]:
from core.DataLoader import *
from core.models.MLP import * 
from core.models.GAM import *
from core.models.SVM import *
# from core.models.EBM import *

from core.models.RandomForest import *
from core.Visualizer import *
from core.models.Classifier import *
from core.models.XGB import *

In [None]:
# Test accuracies collected per model label; every label starts with an empty
# list of its own. For the RF/DT labels the token in parentheses is the
# max-depth setting parsed by the training loop ("U" maps to max_depth=None).
model_accs = {
    label: []
    for label in (
        'MLP',
        'RF(5)', 'RF(10)', 'RF(U)',
        'DT(5)', 'DT(10)', 'DT(U)',
        'LR', 'SVM', 'XGB',
    )
}

In [None]:
# Lookup from a model family name (the part of a model label before any "(")
# to the callable that constructs the corresponding classifier.
weighted_clf_model_list = dict(
    MLP=MLPClassifier,
    DT=m_DecisionTreeClassifier,
    RF=m_RandomForestClassifier,
    LR=m_LogisticRegression,
    SVM=SVMClassifier,
    XGB=XGBOOSTClassifier,
)
# Labels of the models that the training loop actually fits.
weighted_clf_model_name_list = ['MLP']

# Load data

In [None]:
# Engine name handed to the DataLoader; it is also used later in figure filenames.
engine = 'mssql'  # change to postgres if needed
dl = DataLoader(engine)

# Load the datasets together with their names (returned as two parallel sequences).
one_file_dss, one_file_names = dl.get_one_file_ds(
    datasets=['ssb', 'imdb'],
    return_type='ds and names',
)

# Positional indices of all loaded datasets.
present_idxs = [*range(len(one_file_dss))]

In [None]:
# Target column name as exposed by the data loader.
classification_target = dl.classification_target

# Input columns used for training; with exactly two features the training
# loop also plots 2-D decision boundaries.
features = [
    'sel_of_pred_on_indexed_attr',
    'left_cardinality',
]
print("Using features: ", features)

# Train and visualize models

In [None]:
# Imported once per cell run instead of once per loop iteration.
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

# For each selected dataset: split into train/test, standardize the features,
# fit every model named in weighted_clf_model_name_list with per-sample
# weights, print and record its test accuracy, and plot its decision
# boundaries when exactly two features are in use.
for i in [0, 10]:  # the two binary joins shown in paper are number 0 and number 10

    ds = one_file_dss[i]
    ds_name = one_file_names[i]

    # =========================
    X = ds[features]
    y = ds['optimal_decision']
    X_costs = ds[dl.regression_targets]

    # A single call applies one shuffle to features, labels and costs, so the
    # three stay row-aligned without depending on two calls sharing a seed.
    X_train, X_test, y_train, y_test, X_train_costs, X_test_costs = train_test_split(
        X, y, X_costs, train_size=0.8, random_state=1)
    X_train, X_test, y_train, y_test = \
        X_train.to_numpy(), X_test.to_numpy(), y_train.to_numpy(), y_test.to_numpy()

    # Per-sample training weights computed from the training costs, then
    # min-max rescaled before being handed to fit().
    X_train_weights = calculate_importance_from_costs(X_train_costs.to_numpy())
    X_train_weights = preprocessing.MinMaxScaler().fit_transform(X_train_weights.reshape(-1, 1)).flatten()

    # Fit the standardizer on the training split only and reuse it for the
    # test split, so no test statistics leak into the preprocessing.
    scaler = preprocessing.StandardScaler().fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)
    # =========================

    for model_name in weighted_clf_model_name_list:
        # Labels look like "RF(10)": a family name, optionally followed by a
        # max-depth setting in parentheses.
        model = weighted_clf_model_list[model_name.split('(')[0]]
        if 'DT' in model_name or 'RF' in model_name:
            max_depth = model_name.split('(')[-1].split(')')[0]
            # "U" (any case) is passed on as max_depth=None.
            max_depth = None if max_depth.lower() == 'u' else int(max_depth)
            clf = model().fit(X_train, y_train, sample_weight=X_train_weights, max_depth=max_depth)
        else:
            clf = model().fit(X_train, y_train, sample_weight=X_train_weights, max_iter=1, weight_decay=0.000001)

        # Score once and reuse the value for both the report and the record.
        test_acc = clf.score(X_test, y_test)
        print(ds_name)
        # Report the model label (e.g. "RF(10)") rather than the class object,
        # so differently configured variants of one class can be told apart.
        print(f"Accuracy of {model_name}: {test_acc}")

        if len(features) == 2:
            x_label = features[0]  # sel_on_indexed_attr
            # Only dataset 0 on postgres gets a y-axis label.
            y_label = 'left_cardinality' if (i == 0 and engine == 'postgres') else None
            # The color bar is disabled for every figure.
            colorbar = False
            plot_2d_decision_boundaries(
                clf,
                # First 800 training points, mapped back to unscaled feature values.
                scaler.inverse_transform(X_train[0:800, :]),
                X_train_costs.to_numpy()[0:800, :],
                y_train[0:800],
                title=' ',
                x_label=x_label, y_label=y_label, scaler=scaler,
                filename=f'./figures/exp2-viz-decision-space-{engine}-(random_left)-{i}',
                plot_colorbar=colorbar)

        model_accs[model_name].append(test_acc)
    