# ASBE - Automatic Stopping for Batch Experiments

> Core API: the `ASLearner` class, the `ITEEstimator` class and the `random_batch_sampling` query strategy.

In [None]:
#hide
from nbdev import *

In [None]:
%nbdev_default_export core

Cells will be exported to asbe.core,
unless a different module is specified after an export flag: `%nbdev_export special.module`


In [None]:
%nbdev_export
import numpy as np

from modAL.models.base import BaseLearner
from sklearn.base import BaseEstimator, ClassifierMixin, RegressorMixin
from typing import Union, Optional
from copy import deepcopy
from pylift.eval import UpliftEval

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt

In [None]:
%nbdev_export
def random_batch_sampling(classifier, X_pool, n2):
    """Draw a random batch of `n2` distinct rows from a pool of unlabeled samples.

    `classifier` is accepted so the function fits the query-strategy call
    convention, but it is not used. Returns a tuple of the selected rows
    of `X_pool` and the indices of those rows.
    """
    pool_size = len(X_pool)
    candidates = np.arange(pool_size)
    # Sampling without replacement guarantees no row is picked twice.
    picked = np.random.choice(candidates, size=n2, replace=False)
    batch = X_pool[picked]
    return batch, picked

# Type used in annotations for the wrapped model.
estimator_type = ClassifierMixin
class ASLearner(BaseLearner):
    """A(ctively)S(topping)Learner class for automatic stopping in batch-mode AL.

    Holds the labelled training data (features `X_training`, treatments
    `t_training`, outcomes `y_training`), an unlabelled pool `X_pool`, the
    estimator that is fitted on the training data and the query strategy
    used to select new instances from the pool.
    """
    def __init__(self,
                 estimator: estimator_type=None,
                 query_strategy=None,
                 assignment_fc=None,
                 X_training: np.ndarray = None,
                 t_training: np.ndarray = None,
                 y_training: np.ndarray = None,
                 X_pool: np.ndarray = None
                ) -> None:
        # NOTE(review): BaseLearner.__init__ is not invoked; all state is set
        # here. Confirm this is intended.
        self.estimator = estimator
        self.query_strategy = query_strategy
        self.assignment_fc = assignment_fc
        self.X_training = X_training
        self.y_training = y_training
        self.t_training = t_training
        self.X_pool     = X_pool

    def _add_queried_data_class(self, X, t, y):
        """Append newly labelled rows to the stored training arrays."""
        self.X_training = np.vstack((self.X_training, X))
        self.t_training = np.concatenate((self.t_training, t))
        self.y_training = np.concatenate((self.y_training, y))

    def _update_estimator_values(self):
        """Copy the current training data and pool onto the estimator's attributes."""
        self.estimator.__dict__.update(X_training = self.X_training,
                                       y_training = self.y_training,
                                       t_training = self.t_training,
                                       X_test     = self.X_pool)

    def teach(self, X_new, t_new, y_new):
        r"""Teach new instances (selected by the query strategy) to the estimator.

        If no `assignment_fc` is set, all supplied samples are appended to the
        training data and the estimator is refitted on the enlarged set.

        If an assignment function is set, the intent is to use only those
        instances where $\hat{T} = T$; that filtering is not implemented yet,
        so the call currently leaves the learner unchanged.
        """
        if self.assignment_fc is None:
            self._add_queried_data_class(X_new, t_new, y_new)
            # Refit via self.fit() so the estimator first receives the
            # enlarged training arrays; vstack/concatenate create new arrays
            # that the estimator would otherwise never see.
            self.fit()

    def fit(self):
        """Push the current data to the estimator and fit it."""
        self._update_estimator_values()
        self.estimator.fit()

    def predict(self, X=None):
        """Predict with the estimator and cache the result in `self.preds`.

        X: instances to predict for. When omitted, the stored `X_pool` is used.

        Raises ValueError (a subclass of Exception) when neither `X` nor a
        pool is available.
        """
        if X is None:
            if self.X_pool is None:
                raise ValueError("You need to supply an unlabeled pool of instances (with shape (-1,{}))".format(self.X_training.shape[1]))
            X = self.X_pool
        self.preds = self.estimator.predict(X)
        return self.preds

    def score(self, preds=None, y_true=None, t_true=None, metric = "Qini"):
        """
        Scoring the predictions - either ITE or observed outcomes are needed.

        If observed outcomes are provided, the accompanying treatments are also needed.

        preds: predictions to evaluate. When omitted, the first element of the
            cached `self.preds` from the last `predict` call is used.
        y_true, t_true: observed outcomes and treatments passed to `UpliftEval`.
        metric: only "Qini" is supported.

        Returns the `q1_aqini` attribute of the `UpliftEval` object, which is
        also stored in `self.scores`.

        Raises ValueError for an unsupported `metric`.
        """
        if metric != "Qini":
            # Previously this fell through and returned a stale score (or
            # failed with AttributeError if nothing had been scored yet).
            raise ValueError("Unsupported metric: {!r}".format(metric))
        if preds is None:
            preds = self.preds[0]
        self.scores = UpliftEval(t_true, y_true, preds)
        return self.scores.q1_aqini

In [None]:
%nbdev_export
class ITEEstimator(BaseEstimator):
    """Class for building a naive estimator for ITE estimation.

    model: classifier exposing `fit` and `predict_proba`.
    two_model: if True, `model` is fitted on the control rows (treatment == 0)
        and a copy of it (`m1`) on the remaining rows. Otherwise a single
        model is fitted on the features with the treatment appended as an
        extra column.
    **kwargs: accepted and ignored.
    """
    def __init__(self,
                 model: estimator_type = None,
                 two_model: bool = False,
                 **kwargs
                ) -> None:
        self.model = model
        self.two_model = two_model

    def fit(self,X_training: np.ndarray = None,
                 t_training: np.ndarray = None,
                 y_training: np.ndarray = None,
                 X_test: np.ndarray = None):
        """Fit the underlying model(s).

        When `X_training` is given, all four arguments are stored on the
        instance first; otherwise the attributes already set on the instance
        are used. Returns `self`.
        """
        if X_training is not None:
            self.X_training = X_training
            self.y_training = y_training
            self.t_training = t_training
            self.X_test = X_test
        self.N_training = self.X_training.shape[0]
        if self.two_model:
            # Copy before fitting so the treated model starts from the same
            # state as the control model.
            self.m1 = deepcopy(self.model)
            # Boolean masks split the rows; negating an integer index array
            # would NOT give the complement, it would index from the end.
            control_mask = np.asarray(self.t_training).ravel() == 0
            treated_mask = ~control_mask
            self.model.fit(self.X_training[control_mask, :],
                           self.y_training[control_mask])
            self.m1.fit(self.X_training[treated_mask, :],
                        self.y_training[treated_mask])
        else:
            # Treatment indicator becomes the last feature column.
            self.model.fit(np.hstack((self.X_training,
                                      self.t_training.reshape((self.N_training, -1)))),
                           self.y_training)
        return self

    def predict(self, X=None):
        """Predict outcomes under treatment and control.

        X: instances to predict for; defaults to the stored `X_test`.

        Returns a tuple `(y1_preds - y0_preds, y1_preds, y0_preds)`, where the
        predictions are column 1 of `predict_proba`. They are also stored as
        `self.y1_preds` and `self.y0_preds`.
        """
        if X is None:
            X = self.X_test
        if self.two_model:
            self.y1_preds = self.m1.predict_proba(X)[:,1]
            self.y0_preds = self.model.predict_proba(X)[:,1]
        else:
            N_test = X.shape[0]
            # Same rows scored twice: once with treatment set to 1, once to 0.
            X_treated = np.hstack((X, np.ones((N_test, 1))))
            X_control = np.hstack((X, np.zeros((N_test, 1))))
            self.y1_preds = self.model.predict_proba(X_treated)[:,1]
            self.y0_preds = self.model.predict_proba(X_control)[:,1]
        return self.y1_preds - self.y0_preds, self.y1_preds, self.y0_preds

In [None]:
# Simulated data: 500 training rows and 100 test rows with 2 features each,
# a Bernoulli(0.5) treatment and a binary outcome drawn from a logistic model
# in the second feature and the treatment.
X = np.random.normal(size = 1000).reshape((500,2))
t = np.random.binomial(n = 1, p = 0.5, size = 500)
y = np.random.binomial(n = 1, p = 1/(1+np.exp(X[:, 1]*2 + t*3)))
X_test = np.random.normal(size = 200).reshape((100,2))
t_test = np.random.binomial(n = 1, p = 0.5, size = 100)
y_test = np.random.binomial(n = 1, p = 1/(1+np.exp(X_test[:, 1]*2 + t_test*3)))
a = ITEEstimator(LogisticRegression(solver="lbfgs"), two_model = True)
a.fit(X, t, y)
assert isinstance(a.model, LogisticRegression)  # test assigning a model
assert a.X_training.shape  == (500,2)       # test data passing for class
assert a.model.intercept_ is not None       # the model has been fitted

In [None]:
# a = ITEEstimator(RandomForestClassifier(), X, t, y, X_test, two_model = False )

In [None]:
# Single-model ITE estimator wrapped in an ASLearner with random batch queries.
asl = ASLearner(
    estimator=ITEEstimator(model=RandomForestClassifier()),
    query_strategy=random_batch_sampling,
    X_training=X,
    t_training=t,
    y_training=y,
    X_pool=X_test,
)
asl.fit()

# predict() returns the ITE estimates plus the two potential-outcome predictions.
ite_pred, y1_pred, y0_pred = asl.predict()
assert ite_pred.shape[0] == 100

# Ask the query strategy for a batch of 10 pool instances.
X_sel, query_sel = asl.query(asl.X_pool, n2=10)
assert X_sel.shape == (10, 2)

100




In [None]:
# Hand the queried instances, with their treatments and outcomes, to the learner.
asl.teach(X_sel, t_test[query_sel], y_test[query_sel])

In [None]:
# preds=None makes score() fall back to the predictions cached by the last predict().
asl.score(preds=None, y_true = y_test, t_true = t_test)

0.17830908588830968

In [None]:
# Experiment set-up: small labelled set, large pool, fixed batch size per round.
n_train = 100
p       = 5
n_test  = 10000
n2      = 100
X_train = np.random.normal(size = n_train*p).reshape((n_train,p))
t_train = np.random.binomial(n = 1, p = 0.5, size = n_train)
y_train = np.random.binomial(n = 1, p = 1/(1+np.exp(-1*(X_train[:, 1]*2 + t_train*3))))
X_test = np.random.normal(size = n_test*p).reshape((n_test,p))
t_test = np.random.binomial(n = 1, p = 0.5, size = n_test)
y_test = np.random.binomial(n = 1, p = 1/(1+np.exp(-1*(X_test[:, 1]*2 + t_test*3))))
asl = ASLearner(estimator = ITEEstimator(model = RandomForestClassifier(),two_model=True),
         query_strategy=random_batch_sampling,
         X_training = X_train,
         t_training = t_train,
         y_training = y_train,
         X_pool     = X_test)
asl.fit()
p_ite, p_y1, p_y0 = asl.predict()
print("Qini before AL: {}".format(asl.score(preds=p_ite, y_true=y_test, t_true=t_test)))
qini_vals = []
# A named loop variable is used because the index is read below and `_`
# would shadow IPython's last-result variable.
for round_ix in range(20):
    X_query, ix = asl.query(asl.X_pool, n2=n2)
    asl.teach(X_query, t_test[ix], y_test[ix])
    # Drop the queried rows from the pool and from the matching
    # treatment/outcome arrays so all three stay aligned.
    asl.X_pool = np.delete(asl.X_pool,ix, axis=0)
    t_test     = np.delete(t_test,ix, axis=0)
    y_test     = np.delete(y_test,ix, axis=0)
    p_ite, p_y1, p_y0 = asl.predict()
    qini_vals.append(asl.score(preds=p_ite, y_true=y_test, t_true=t_test))
    print("Qini after round {} of AL: {}".format(round_ix, qini_vals[-1]))




Qini before AL: 0.0865249263039237
Qini after round 0 of AL: 0.07969085535285128
Qini after round 1 of AL: 0.030711758911886804
Qini after round 2 of AL: 0.09015372672568754
Qini after round 3 of AL: 0.0791642466933008
Qini after round 4 of AL: 0.0636461787882324
Qini after round 5 of AL: 0.061155291557273304
Qini after round 6 of AL: 0.09191226949719326
Qini after round 7 of AL: 0.09802369165247767
Qini after round 8 of AL: 0.10056081509457777
Qini after round 9 of AL: 0.05421006027385339
Qini after round 10 of AL: 0.08981421305961766
Qini after round 11 of AL: 0.039792676387982395
Qini after round 12 of AL: 0.0912898335604538
Qini after round 13 of AL: 0.07914796743456291
Qini after round 14 of AL: 0.07541826402322561
Qini after round 15 of AL: 0.07643302998556972
Qini after round 16 of AL: 0.09195039121576942
Qini after round 17 of AL: 0.08277269178144729
Qini after round 18 of AL: 0.062422460062035855
Qini after round 19 of AL: 0.06573821256013185
