In [1]:
import matplotlib.pyplot as plt
import ray
import sklearn
from mlflow.tracking import MlflowClient
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, balanced_accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

from vflow import Vset, init_args, dict_to_df, perturbation_stats
from vflow.pipeline import build_graph


In [2]:
# Fixed seed so that Restart & Run All regenerates the same synthetic dataset
# and the same train/test split (both calls are random when left unseeded).
RANDOM_STATE = 0

X, y = make_classification(random_state=RANDOM_STATE)  # make sample dataset

# init_args wraps each split so it can be passed through Vsets; the names are
# used as labels for the wrapped arguments.
X_train, X_test, y_train, y_test = init_args(
    train_test_split(X, y, test_size=0.2, random_state=RANDOM_STATE),
    names=["X_train", "X_test", "y_train", "y_test"],  # optionally name the args
)

In [3]:
# Display the training features as wrapped by init_args: the recorded output is
# a dict keyed by (X_train,) plus a '__prev__' entry.
X_train

{(X_train,): array([[-0.92391161, -0.36116888,  1.33713111, ...,  0.04755302,
          1.16148162,  0.31391534],
        [ 1.05004849, -0.17554274,  0.39877443, ..., -0.82537544,
          0.83709694, -1.05517875],
        [ 1.13991663,  0.22212712,  1.0174011 , ..., -0.65693381,
          0.41189582, -0.70887163],
        ...,
        [-1.01071731, -0.14508865,  1.35044439, ...,  1.11046153,
         -0.75591959,  0.56970427],
        [ 0.5075978 ,  0.6147053 ,  0.53535289, ..., -1.00350005,
          0.90777755, -0.27290012],
        [ 1.07571739, -0.02717265,  1.45488906, ..., -0.47514019,
          1.54323294, -0.43500394]]),
 '__prev__': ('init',)}

In [4]:
# Three references to the same resampling function; the Vset labels them
# subsampling_0, subsampling_1 and subsampling_2 in its output keys.
subsampling_funcs = [sklearn.utils.resample] * 3

# output_matching=True so that the Vset will have the same outputs on training
# and test data
subsampling_set = Vset(
    name="subsampling",
    vfuncs=subsampling_funcs,
    output_matching=True,
)
subsampling_set

<vflow.vset.Vset at 0x7fac52e49860>

In [5]:
# Run every subsampling function on the training features and labels.
# Each result is a dict with one entry per subsample (keys subsampling_0..2).
X_trains, y_trains = subsampling_set(X_train, y_train)
X_trains  # display the subsampled training features

{(X_train,
  subsampling_0): array([[-0.61124669, -0.34016947, -0.21589863, ..., -0.5680841 ,
          0.03494742,  0.14135262],
        [ 1.13269772, -1.52763505, -1.3022241 , ...,  1.5394207 ,
          0.63048568, -1.07441884],
        [ 1.47789373, -0.42590758,  0.26434261, ...,  0.97098091,
          0.19530823, -1.13025104],
        ...,
        [ 1.22388508, -1.5947687 ,  0.32812612, ..., -0.32182171,
          0.51623768, -1.07703761],
        [ 1.52317824,  0.98099065, -0.47296172, ..., -0.21478086,
         -1.07523587, -0.82055207],
        [-1.71286685,  0.12970712,  0.4770702 , ..., -0.15261702,
         -0.05698782,  1.43882766]]),
 (X_train,
  subsampling_1): array([[-1.80797577,  1.84193122,  0.39572607, ..., -0.05453989,
          1.26918215,  1.46764011],
        [-0.92391161, -0.36116888,  1.33713111, ...,  0.04755302,
          1.16148162,  0.31391534],
        [ 1.05154853,  1.26221203,  0.3150243 , ...,  0.17852263,
         -1.14815092, -0.70433194],
        ...

In [9]:
# Environment check: report which Python interpreter backs this kernel.
import sys

interpreter_path = sys.executable
print(interpreter_path)

/opt/miniconda3/envs/215a/bin/python


[2m[33m(raylet)[0m [2024-11-04 14:49:35,214 E 53595 2217346] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2024-11-04_14-44-59_566559_53558 is over 95% full, available space: 2712297472; capacity: 250685575168. Object creation will fail if spilling is required.
[2m[33m(raylet)[0m [2024-11-04 14:49:45,282 E 53595 2217346] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2024-11-04_14-44-59_566559_53558 is over 95% full, available space: 2712051712; capacity: 250685575168. Object creation will fail if spilling is required.
[2m[33m(raylet)[0m [2024-11-04 14:49:55,358 E 53595 2217346] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2024-11-04_14-44-59_566559_53558 is over 95% full, available space: 2711781376; capacity: 250685575168. Object creation will fail if spilling is required.
[2m[33m(raylet)[0m [2024-11-04 14:50:05,438 E 53595 2217346] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2024-11-04_14-44-59_566559_53558 is over 95% full, available sp

In [6]:
# Display the subsampled training labels: one label array per subsampling
# function, plus a '__prev__' entry pointing back at the subsampling Vset.
y_trains

{(y_train,
  subsampling_0): array([1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1,
        0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1,
        1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0,
        0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0]),
 (y_train,
  subsampling_1): array([0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1,
        0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0,
        0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1,
        1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1]),
 (y_train,
  subsampling_2): array([1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1,
        1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
        0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0,
        0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1]),
 '__prev__': (<vflow.vset.Vset at 0x7fac52e49860>, ('init',))}

In [7]:
# define a Vset of models
models = [LogisticRegression(), DecisionTreeClassifier()]

# create a Vset of models. is_async=True so that the models are fit in parallel
modeling_set = Vset(
    name="modeling", vfuncs=models, vfunc_keys=["LR", "DT"], is_async=True
)

# create a Ray cluster with 4 cpus
ray.init(num_cpus=4)
try:
    # fit the models on every subsample, then predict on the held-out split
    modeling_set.fit(X_trains, y_trains)
    preds_test = modeling_set.predict(X_test)
finally:
    # Always tear down the local Ray instance, even when fit/predict raises;
    # otherwise the cluster keeps running in the background and a re-run of
    # this cell hits an already-initialized Ray.
    ray.shutdown()

# NOTE(review): the recorded run of this cell failed with
# "TypeError: Attempting to call `get` on the value [...], which is not an
# ray.ObjectRef". The try/finally above only guarantees cleanup; the root cause
# (possibly a vflow/ray version mismatch in the async path) still needs to be
# confirmed and fixed.
preds_test

2024-11-04 14:45:03,380	INFO worker.py:1625 -- Started a local Ray instance.


TypeError: Attempting to call `get` on the value [1 0 0 0 0 1 0 0 1 0 0 0 0 0 1 0 1 1 1 0], which is not an ray.ObjectRef.

In [None]:
# Metric functions and the short keys used to label them in the results.
metric_funcs = [accuracy_score, balanced_accuracy_score]
metric_keys = ["Acc", "Bal_Acc"]

# get metrics
binary_metrics_set = Vset(
    name="binary_metrics", vfuncs=metric_funcs, vfunc_keys=metric_keys
)

# NOTE(review): sklearn metrics are declared as (y_true, y_pred), while the
# predictions are passed first here. Confirm how Vset.evaluate forwards its
# arguments, since balanced accuracy is not symmetric in its two inputs.
binary_metrics = binary_metrics_set.evaluate(preds_test, y_test)
binary_metrics

In [None]:
# Convert the metrics dict into tabular form for display.
# NOTE(review): presumably a pandas DataFrame, judging by the helper's name —
# confirm against the vflow documentation.
df = dict_to_df(binary_metrics)
df

In [None]:
# Summarize the 'binary_metrics' column of df, grouped by the 'modeling' Vset.
# NOTE(review): which statistics are reported is not visible here — presumably
# aggregates across the subsampling perturbations; confirm in the vflow docs.
perturbation_stats(df, 'modeling', 'binary_metrics')

In [None]:
# Build the pipeline graph that produced binary_metrics; draw=True asks
# build_graph to render it as well (presumably via matplotlib, hence the
# explicit plt.show() — confirm).
G = build_graph(binary_metrics, draw=True)
plt.show()

[2m[33m(raylet)[0m [2024-11-04 14:45:13,389 E 53595 2217346] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2024-11-04_14-44-59_566559_53558 is over 95% full, available space: 3807969280; capacity: 250685575168. Object creation will fail if spilling is required.
[2m[33m(raylet)[0m [2024-11-04 14:45:23,461 E 53595 2217346] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2024-11-04_14-44-59_566559_53558 is over 95% full, available space: 3807174656; capacity: 250685575168. Object creation will fail if spilling is required.
[2m[33m(raylet)[0m [2024-11-04 14:45:33,534 E 53595 2217346] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2024-11-04_14-44-59_566559_53558 is over 95% full, available space: 3806486528; capacity: 250685575168. Object creation will fail if spilling is required.
[2m[33m(raylet)[0m [2024-11-04 14:45:43,606 E 53595 2217346] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2024-11-04_14-44-59_566559_53558 is over 95% full, available sp