In [1]:
import os
import dask.dataframe as dd
import numpy as np
import wandb
import pickle
from src.utility.constants import *
from src.utility.util import load_data
import pandas as pd
from src.heuristic.parsing import parse_heuristic
from dask.distributed import Client, LocalCluster, wait
from sklearn.linear_model import LogisticRegression
from cross_validation import cross_validation
import time
from dask import delayed

from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score


In [2]:
# Start a local Dask cluster and attach a client to it.
cluster = LocalCluster()  # Launches a scheduler and workers locally
# The client is the handle used to talk to that cluster from this notebook.
client = Client(cluster)
# Print the dashboard URL so task progress can be followed in a browser.
print(client.dashboard_link)

http://127.0.0.1:8787/status


In [3]:
# Load the pickled map-elites tables.
# NOTE(security): pickle.load can execute arbitrary code while deserializing;
# only open artifacts that come from a trusted source.
with open("artifacts/map-elites-v1/tables.pkl", 'rb') as table_file:
    tables = pickle.load(table_file)

# Fetch the stored heuristics (requested with strip_nan=True) and parse each
# of them with the dask flag enabled.
heuristics, _ = tables.get_stored_data(strip_nan=True)
heuristics = [parse_heuristic(raw, dask=True) for raw in heuristics]

In [4]:
@delayed
def delayed_load_data(file):
    """Lazily read one parquet file and rename its columns.

    Args:
        file: Path (or other source accepted by ``pd.read_parquet``) to load.

    Returns:
        The loaded DataFrame. Every column name that appears as a key in
        ``NORMALIZED_COLUMN_NAMES_MAPPING`` is replaced by its mapped value;
        all other column names are kept as they are.
    """
    frame = pd.read_parquet(file)
    frame.columns = [
        NORMALIZED_COLUMN_NAMES_MAPPING[name] if name in NORMALIZED_COLUMN_NAMES_MAPPING else name
        for name in frame.columns
    ]
    return frame

# Schedule one lazy load per parquet file; nothing is read until the graph is computed.
dfs = [delayed_load_data(file) for file in COMBINED_DATA_FILES]

In [5]:
@delayed(nout=2)
def delayed_execute_heuristic(df, heuristics):
    """Lazily evaluate every heuristic on `df` and split features from target.

    Args:
        df: Frame the heuristics are executed on.
        heuristics: Iterable of objects exposing ``execute(df)``; ``str(h)``
            of each one becomes the name of the column holding its result.

    Returns:
        A ``(X, y)`` pair: ``X`` holds the ``NORMALIZED_COLUMN_NAMES`` columns
        followed by one column per heuristic, ``y`` is the
        ``CLASSES_2_Y_COLUMN`` column of the same frame.
    """
    heuristic_features = {}
    for heuristic in heuristics:
        heuristic_features[str(heuristic)] = heuristic.execute(df)

    augmented = df.assign(**heuristic_features)
    feature_names = NORMALIZED_COLUMN_NAMES + list(heuristic_features)

    features = augmented[feature_names]
    target = augmented[CLASSES_2_Y_COLUMN]
    return features, target

# Execute the heuristics on every lazily loaded frame, collecting the
# delayed feature frames and the delayed targets in parallel lists.
Xs, ys = [], []
for partition in dfs:
    features, target = delayed_execute_heuristic(partition, heuristics)
    Xs.append(features)
    ys.append(target)


In [12]:
@delayed
def delayed_mean(df):
    """Lazily compute ``df.mean()`` for one partition."""
    partition_mean = df.mean()
    return partition_mean

@delayed
def delayed_variance(df):
    """Lazily compute ``df.var()`` for one partition."""
    partition_variance = df.var()
    return partition_variance

@delayed
def delayed_pooled_mean_and_var(means, variances, lens):
    """Combine per-partition statistics into dataset-wide mean and variance.

    The combined sample variance has two parts: the spread inside every
    partition and the spread of the partition means around the grand mean::

        var = (sum_i (n_i - 1) * var_i + sum_i n_i * (mean_i - mean)^2) / (N - 1)

    Dropping the second term underestimates the variance whenever the
    partitions do not share the same mean, which in turn inflates values
    that are later divided by the resulting standard deviation.

    Args:
        means: One vector of column means per partition.
        variances: One vector of column sample variances per partition,
            in the same partition and column order as `means`.
        lens: Row count of every partition, in the same partition order.

    Returns:
        A ``(mean, variance)`` tuple of arrays with one entry per column.
    """
    means = np.array(means)
    variances = np.array(variances)
    # Column vector so the counts broadcast across the feature axis.
    lens = np.array(lens)[:, np.newaxis]

    total = np.sum(lens)
    grand_mean = np.sum(means * lens, axis=0) / total

    # Sum of squared deviations inside each partition ...
    within = np.sum((lens - 1) * variances, axis=0)
    # ... plus the squared distance of each partition mean from the grand mean.
    between = np.sum(lens * (means - grand_mean) ** 2, axis=0)

    return grand_mean, (within + between) / (total - 1)

# Per-partition statistics for every feature frame in Xs: row counts,
# column means and column variances.
lens = [features.shape[0] for features in Xs]
means = list(map(delayed_mean, Xs))
variances = list(map(delayed_variance, Xs))

# Combine the per-partition statistics into dataset-wide ones (still lazy).
data = delayed_pooled_mean_and_var(means, variances, lens)

In [14]:
# Run the statistics graph and unpack the resulting (mean, variance) pair.
# NOTE: `vars` shadows the Python builtin of the same name; the name is kept
# because a later cell reads `means` and `vars`.
pooled_stats = data.compute()
means, vars = pooled_stats

(82,)
[8.12441901e+04 2.12826046e+11 8.00222458e+01 1.96536822e+02
 9.91264316e+09 9.91264316e+09 5.25733927e-05 7.90771298e-02
 1.64346635e-01 8.23133936e-02 8.00495741e-02 1.08195738e-01
 1.47793183e-06 7.28257273e-07 8.20427430e-02 4.40277311e-01
 1.07004854e-01 5.16275329e+03 1.05874712e+05 4.59074739e-02
 5.20630637e-02 1.30662402e-04 2.14193362e-08 6.42580123e-08
 4.09306725e-05 1.49935373e-07 2.44547454e-01 1.67007849e-01
 1.71354430e-06 6.61813965e-05 1.36916240e-01 1.12674480e-04
 1.12674480e-04 6.82932528e+06 1.95147021e+04 2.74607318e+05
 5.80766703e+04 2.57074373e+04 5.83458514e+04 2.90610834e+14
 6.71008769e-01 7.44520624e+01 5.14242597e+04 1.04788145e+11
 5.42891800e-02 4.43871485e+02 5.42891800e-02 3.71812410e+01
 3.08392869e-02 1.01125675e+00 4.28749174e+01 4.29052640e-01
 1.05874712e+05 4.22541387e+01 3.71812410e+01 4.35468537e+06
 4.36242947e-01 4.90584295e+01 1.08195738e-01 4.35463634e+06
 1.46546321e-01 7.71974831e-02 5.01385737e+01 6.00393736e+00
 8.71439702e-02 4.

In [8]:
# Estimator used as the starting point of the training cell.
model = LogisticRegression(
    warm_start=True,
    n_jobs=-1,
)

In [19]:
@delayed
def delayed_normalize(df, mu, std):
    """Lazily standardize `df` as ``(df - mu) / std``.

    Args:
        df: Data to normalize.
        mu: Value(s) subtracted from `df`.
        std: Value(s) the centered data is divided by. Entries equal to
            zero are treated as 1 so the division cannot produce NaN/inf.

    Returns:
        The centered and scaled data.
    """
    # A zero scale marks a constant feature: centering already maps it to
    # zeros, while dividing by 0 would turn it into NaN/inf.
    if isinstance(std, (pd.Series, pd.DataFrame)):
        # Stay in pandas so label alignment with `df` is preserved.
        safe_std = std.mask(std == 0, 1.0)
    else:
        std_values = np.asarray(std)
        safe_std = np.where(std_values == 0, 1.0, std_values)
    return (df - mu) / safe_std

@delayed
def delayed_train(X, y, model):
    """Lazily fit `model` on ``(X, y)`` and return the result of ``fit``."""
    fitted = model.fit(X, y)
    return fitted

@delayed
def delayed_predict(X, model):
    """Lazily return ``model.predict(X)``."""
    predictions = model.predict(X)
    return predictions

@delayed
def delayed_scores(y_true, y_pred):
    """Lazily score predictions against the true labels.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels.

    Returns:
        A Series indexed by ``accuracy``, ``precision``, ``recall`` and
        ``f1`` holding the corresponding metric values.
    """
    metrics = (
        ('accuracy', accuracy_score),
        ('precision', precision_score),
        ('recall', recall_score),
        ('f1', f1_score),
    )
    values = [metric(y_true, y_pred) for _, metric in metrics]
    labels = [label for label, _ in metrics]
    return pd.Series(values, index=labels)

@delayed
def delayed_mean_scores(scores):
    """Lazily average a list of per-partition score Series.

    Args:
        scores: Score Series sharing the same metric index.

    Returns:
        A Series with the mean of every metric. The mean is unweighted:
        each entry of `scores` counts equally.
    """
    # One column per partition, one row per metric; average across columns.
    per_partition = pd.concat(scores, axis=1)
    return per_partition.mean(axis=1)

# Train and evaluate the model. Everything in this cell only builds a lazy
# graph; nothing runs until `scores` is computed.

# The standard deviation does not change between partitions, so derive it once.
stds = vars ** 0.5

# Normalize each partition a single time and reuse the same delayed node for
# both training and evaluation instead of scheduling the work twice.
Xs_normalized = [delayed_normalize(X, means, stds) for X in Xs]

# Train model: chain the fits partition by partition. The chain starts from
# the estimator `model` but is accumulated under a separate name, so `model`
# is never rebound to a Delayed and re-running this cell rebuilds the same
# graph instead of stacking a new chain on top of the previous one.
# NOTE(review): `warm_start=True` presumably only reuses the previous solution
# as a starting point while each `fit` optimizes on the partition it is
# given -- confirm this is the intended training scheme.
trained_model = model
for X_norm, y_true in zip(Xs_normalized, ys):
    trained_model = delayed_train(X_norm, y_true, trained_model)

# Test model: score the final model on every partition.
# NOTE(review): these are the same partitions the model was trained on, so
# the scores describe fit on seen data.
partition_scores = [
    delayed_scores(y_true, delayed_predict(X_norm, trained_model))
    for X_norm, y_true in zip(Xs_normalized, ys)
]

# Average the per-partition metrics into one delayed result.
scores = delayed_mean_scores(partition_scores)


In [20]:
# Execute the lazy training/evaluation graph; as the last expression of the
# cell, the computed result is displayed.
scores.compute()