In [1]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import DataLoader, Subset, ConcatDataset, Dataset
from sklearn.model_selection import ShuffleSplit
from torch import nn, optim
from torcheval import metrics
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder
import requests
import os
from sklearn.datasets import load_svmlight_file

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Getting the data
n_features = 180
budget = 200
n_classes = 3

In [3]:
class Dna(Dataset):
    dataset_name = "dna"
    feature_encoder =  FunctionTransformer(lambda x: x)
    target_encoder = OneHotEncoder(sparse_output=False)
    visualize = False
    configs = None
    random_seed = 42
    urls_dict = {"train":"https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/multiclass/dna.scale.tr",
                "val":"https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/multiclass/dna.scale.val",
                "test":"https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/multiclass/dna.scale.t"}
    def __init__(self, split_name):
        super().__init__()
        self.location = "datasets/data/" + self.dataset_name
        if not os.path.exists(self.location):
            os.makedirs(self.location)
        self.split_name = split_name

        if not self.file_exists():
            data = self.obtain()
            data = self.split(data)
            data = self.preprocess(data)
            self.save_npz(data)            

        self.load_clean()
        
    def __len__(self):
        return self.x.shape[0]
        
    def __getitem__(self, idx):
        return self.x[idx], self.y[idx]
    def file_exists(self):
        splits = ["train.npz", "test.npz"]
        existed_files = os.listdir(self.location)
        return all([elem in existed_files for elem in splits])
    
    def load_clean(self):
        with np.load(f"{self.location}/{self.split_name}.npz", allow_pickle=True) as file:
            self.x = file["x"].astype(np.float32)
            self.y = file["y"].astype(np.float32)
    
    def save_npz(self, data_dict):
        for split_name, data in data_dict.items():
            with open(f"{self.location}/{split_name}.npz", "wb") as f:
                np.savez(f, x=data["x"], y=data["y"])   

    def preprocess(self, data):
        for data_shard in data.values():
            x, y = data_shard["x"], data_shard["y"]
            self.feature_encoder.fit(x)
            self.target_encoder.fit(y)
            
        for shard_name, data_shard in data.items():
            x, y = data_shard["x"], data_shard["y"]
            data[shard_name]["x"] = self.feature_encoder.transform(x)
            data[shard_name]["y"] = self.target_encoder.transform(y)
        return data
    def split(self, data):
        data["train"]["x"] = np.concatenate((data["train"]["x"], data["val"]["x"].copy()))
        data["train"]["y"] = np.concatenate((data["train"]["y"], data["val"]["y"].copy()))

        data.pop("val", None)
        return data
    def obtain(self):
        data = {}
        for split_name, url in self.urls_dict.items():
            file_path = f"{self.location}_{split_name}_raw"
            with open(file_path, 'w') as f:
                r = requests.get(url)
                f.writelines(r.content.decode("utf-8"))
            x, y  = load_svmlight_file(file_path, n_features=n_features)
            data[split_name] = {"x": np.asarray(x.todense(), dtype=np.float32), "y": y.reshape(-1, 1)}
            os.remove(file_path)
        
        return data
    @classmethod
    def get_data_dict(cls):
        return {"train": cls(split_name="train"), 
                "test": cls(split_name="test")}
    
    @staticmethod
    def conv_split(array_size, shares=[0.6, 0.2], seed=42):
        indices = np.arange(array_size)
        idx_to_split = (np.cumsum(shares)*array_size).astype(int)
        np.random.seed(seed)
        permutated_idx = np.random.choice(indices, array_size, replace=False)
        return np.split(permutated_idx, idx_to_split)

    @staticmethod
    def step_split(array_size, val_chunk):
        indices = np.arange(array_size)
        train_idx = np.random.choice(indices[:-val_chunk], indices[:-val_chunk].shape[0], replace=False)
        val_idx = indices[-val_chunk:]
        return train_idx, val_idx

In [12]:
# the pool
class Pool:
    def __init__(self, data, random_seed = 42, val_share=0.25, n_initially_labeled=1000, batch_size=32):
        self.random_seed = random_seed
        self.set_seed(self.random_seed)
        self.data = data
        self.batch_size = batch_size
        self.idx_abs = np.arange(len(self.data["train"].x)) # Absolute index attribute
        self.val_share = val_share
        self.n_initially_labeled = n_initially_labeled
        self.set_seed(self.random_seed)
        self.idx_unviolated_lb = np.random.choice(self.idx_abs, size=self.n_initially_labeled, replace=False)
        self.idx_new_lb = np.array([], dtype=int)
        self.set_seed(self.random_seed)
        self.test_loader = DataLoader(data["test"], batch_size=self.batch_size, shuffle=False)
        self.set_seed(self.random_seed)
        self.splitter = ShuffleSplit(n_splits=6, 
                        test_size=self.val_share,
                        random_state=self.random_seed)

    def __getitem__(self, idx):
        return self.data["train"][idx]
    
    @property
    def drop_last(self):
        # drop last if the number of labeled instances is bigger than the batch_size
        return int(self.get_len("unviolated")*(1-self.val_share)) + self.get_len("new_labeled") > self.batch_size 

    @property
    def idx_all_labeled(self):
        return np.append(self.idx_unviolated_lb, self.idx_new_lb)
    
    @property
    def new_lb_dataset(self):
        return Subset(self.data['train'], self.idx_new_lb)
    
    @property
    def unviolated_lb_dataset(self):
        return Subset(self.data['train'], self.idx_unviolated_lb)
    
    @property
    def all_lb_dataset(self):
        return Subset(self.data['train'], self.idx_all_labeled)
    
    @property
    def idx_ulb(self):
        return np.delete(self.idx_abs, self.idx_all_labeled) 
    
    def one_split(self):
        self.set_seed(seed=self.random_seed)
        return next(self.splitter.split(self.unviolated_lb_dataset))
    
    def CV_splits(self):
        self.set_seed(seed=self.const_seed)
        return self.splitter.split(self.unviolated_lb_dataset)
    def get_train_val_loaders(self, unviolated_train_idx, unviolated_val_idx):
        unviolated_train_ds = Subset(self.unviolated_lb_dataset, unviolated_train_idx)
        unviolated_val_ds = Subset(self.unviolated_lb_dataset, unviolated_val_idx)

        self.set_seed(seed=self.random_seed)
        train_loader = DataLoader(ConcatDataset((unviolated_train_ds, self.new_lb_dataset)),
                                batch_size=self.batch_size, 
                                drop_last=self.drop_last,
                                shuffle=True)
        
        val_loader = DataLoader(unviolated_val_ds, batch_size=self.batch_size, shuffle=False)
        return train_loader, val_loader

    def get_len(self, pool="total"):
        return len(self.get(pool)[0])
    
    def add_new_inst(self, idx):
        assert len(self.idx_ulb)
        self.idx_new_lb = np.append(self.idx_new_lb, idx)

    def get(self, pool):
        if pool == "all_labeled":
            return self[self.idx_all_labeled]
        elif pool == "unviolated":
            return self[self.idx_unviolated_lb] 
        elif pool == "new_labeled":
            return self[self.idx_new_lb] 
        elif pool == "unlabeled":
            return self[self.idx_ulb] 
        elif pool == "total":
            return self[:]
        elif pool == "test":
            return self.data["test"][:]
        else:
            raise NameError("There is no such name in the pool")
        
    def set_seed(self, seed=None):
        seed = self.random_seed
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False
        torch.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
        np.random.seed(seed)
    

In [5]:

# The Acquisition function
class Random():
    def __init__(self, 
                clf,
                pool,
                random_seed = 42,
                budget = budget):
        self.clf = clf       
        self.pool = pool
        self.random_seed = random_seed
        self.budget = budget
    def get_scores(self, values=None):
        if values is None:
            values = self.pool.get_len("unlabeled")
        else:
            values = values[:, 0].ravel().shape[0]
        return np.random.random(values)
    def query(self):
        all_scores = self.get_scores()
        max_scores = np.argwhere(np.isclose(all_scores, all_scores.max())).ravel()            
        self.pool.set_seed(self.random_seed)
        idx = np.random.choice(max_scores, 1)[0]
        return self.pool.idx_ulb[idx]

In [24]:
class NN(nn.Module):
    def __init__(self):
        super().__init__()
        self.layers = nn.Sequential()
        self.layers.add_module(f"dense_0", nn.Linear(n_features, n_features//5))
        self.layers.add_module(f"activation_0", nn.ReLU())
        self.layers.add_module(f"dense_1", nn.Linear(n_features//5, n_features//10))
        self.layers.add_module(f"activation_1", nn.ReLU())
        self.layers.add_module(f"dense_2", nn.Linear(n_features//10, n_classes))
        self.layers.add_module(f"activation_2", nn.Softmax())
        self.criterion = nn.CrossEntropyLoss()
        self.optimizer = torch.optim.SGD(self.parameters(), lr=0.01, weight_decay=0.01)
        self.metric = metrics.MulticlassAccuracy(num_classes=n_classes)


    def forward(self, x):
        return self.layers(x)
    
    def calculate_accuracy(self, y_pred, y_true):
        self.metric.update(y_pred, y_true)

        # compute the metric
        accuracy = self.metric.compute()

        return accuracy

In [22]:
class OnlineAvg:
    def __init__(self, val=0):
        self.n = 1
        self.val = float(val)
        
    def __add__(self, other):
        self.val = self.val + (other-self.val)/self.n
        self.n += 1
        return self
    
    def __repr__(self):
        return str(self.val)
    
    def __float__(self):
        return self.val
    
    def __int__(self):
        return self.n
    
    def __sub__(self, other):
        return float(self.val - float(other))
    
    def __le__(self, other):
        return self.val <= float(other)
    
    def __lt__(self, other):
        return self.val < float(other)

    def __truediv__(self, other):
        return self.val / float(other)

In [15]:
class Learnable():
    def __init__(self, 
                pool,
                random_seed = 42,
                n_warmup_epochs=100,
                patience=20,
                epochs=200):
        
        self.random_seed = random_seed
        self.n_warmup_epochs = n_warmup_epochs
        self.epochs = epochs
        self.patience = patience
        self.pool = pool
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model = NN()   
        
    def __call__(self, x, mc_dropout=False):
        if mc_dropout:
            self.model.train()
        else:
            self.model.eval()
        with torch.no_grad():
            return self.model(x.to(self.device))
    
    def train_model(self):
        unviolated_train_idx, unviolated_val_idx = self.pool.one_split()
        train_loader, val_loader = self.pool.get_train_val_loaders(unviolated_train_idx, unviolated_val_idx)
        train_perf, val_perf = self.fit(train_loader=train_loader, val_loader=val_loader)
        test_perf, _ = self.eval(loader=self.pool.test_loader)
        return train_perf, val_perf, test_perf

    def eval(self, loader):
        self.model.eval()
        total_loss = OnlineAvg()
        with torch.no_grad():
            for inputs, targets in loader:
                targets = targets.to(self.device)
                inputs = inputs.to(self.device)
                predictions = self(inputs.float())
                batch_loss = self.model.criterion(predictions, targets)
                total_loss += batch_loss.item()
                acc = self.model.calculate_accuracy(predictions, targets)  
        return total_loss, acc
    
    def fit(self, train_loader, val_loader):
        self.model.train()      
        train_loss = OnlineAvg()  
        for epoch_num in range(self.epochs):
            for inputs, targets in train_loader:
                targets = targets.to(self.device)
                inputs = inputs.to(self.device)
                predictions = self.model(inputs.float())
                batch_loss = self.model.criterion(predictions, targets.float())
                train_loss += batch_loss.item()
                self.model.zero_grad()
                batch_loss.backward()
                self.model.optimizer.step()

            train_loss, train_metrics = self.eval(loader=train_loader)
            val_loss, val_metrics = self.eval(val_loader)
        return (train_loss, train_metrics),  (val_loss, val_metrics)


In [10]:
class ActiveLearning():
    def __init__(self):
        self.budget = budget
        self.random_seed = 42
        self.pool = Pool(data=Dna.get_data_dict())
        self.clf = Learnable(pool=self.pool)
        self.acq = Random(clf=self.clf, pool=self.pool)
    
    def show_intermediate_results(self, abs_idx, train_perf, val_perf, test_perf):
        print(f'{abs_idx} {self.pool.get_len("all_labeled")} {self.pool.get_len("unlabeled")}\n{train_perf}\n{val_perf}\n{test_perf}')
    
    def train_first_hypers(self):
        train_perf, val_perf, test_perf = self.clf.train_model()
        print(f"Initial {train_perf}, {val_perf}, {test_perf}")
        return train_perf, val_perf, test_perf
    
    def run(self):
        abs_idx = None
        train_perf, val_perf, test_perf = self.train_first_hypers()
        for iteration in range(0, self.budget):
            abs_idx = self.acq.query()
            self.pool.add_new_inst(abs_idx)
            train_perf, val_perf, test_perf = self.clf.train_model()
        print(f"final {train_perf}, {val_perf}, {test_perf}, {abs_idx}")

In [25]:
ActiveLearning().run()

  return self._call_impl(*args, **kwargs)


ValueError: target should be a one-dimensional tensor, got shape torch.Size([32, 3]).