# Baseline


In [1]:
!pip install openml



In [2]:
from __future__ import annotations

from dataclasses import dataclass
from typing import Iterable

import numpy as np
import openml
import pandas as pd
from openml import OpenMLDataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# OpenML ids of the ten multilabel datasets benchmarked in this notebook.
# NOTE: despite the name, every id is handed to `Dataset.from_openml`, which
# calls `openml.datasets.get_dataset`, i.e. they are used as *dataset* ids.
task_ids = (
    40588,
    40589,
    40590,
    40591,
    40592,
    40593,
    40594,
    40595,
    40596,
    40597,
)


@dataclass
class Split:
    """One partition of a dataset, as produced by `Dataset.split`."""

    # Feature rows belonging to this partition
    X: np.ndarray
    # Label rows for the same samples as `X`
    y: np.ndarray


@dataclass
class Dataset:
    """A multilabel OpenML dataset separated into features and boolean labels.

    Attributes
    ----------
    name : str
        The name reported by OpenML
    id : int
        The id the dataset was fetched with
    features : pd.DataFrame
        The non-target columns, with categorical columns integer-encoded
    labels : pd.DataFrame
        The target columns, cast to bool
    openml : OpenMLDataset
        The underlying OpenML dataset object
    encoders : dict[str, LabelEncoder]
        The fitted encoder of every feature column that was encoded,
        keyed by column name
    """

    name: str
    id: int
    features: pd.DataFrame
    labels: pd.DataFrame
    openml: OpenMLDataset
    encoders: dict[str, LabelEncoder]

    def split(
        self,
        splits: Iterable[float],
        seed: int | None = 1,
    ) -> tuple[Split, ...]:
        """Create splits of the data

        Parameters
        ----------
        splits : Iterable[float]
            The fraction of the data to put in each split, must sum to 1

        seed : int | None = 1
            The seed to use for the splits

        Returns
        -------
        tuple[Split, ...]
            The collected splits, in the same order as ``splits``
        """
        splits = list(splits)
        assert abs(1 - sum(splits)) <= 1e-6, "Splits must sum to 1"

        # Absolute sample counts; truncation leftovers end up in the last split
        sample_sizes = tuple(int(s * len(self.features)) for s in splits)

        collected_splits = []

        next_xs = self.features.to_numpy()
        next_ys = self.labels.to_numpy()

        # Repeatedly carve `size` samples off what is left; whatever remains
        # after the final carve is the last split.
        for size in sample_sizes[:-1]:
            xs, next_xs, ys, next_ys = train_test_split(
                next_xs, next_ys, train_size=size, random_state=seed
            )
            collected_splits.append(Split(X=xs, y=ys))
        collected_splits.append(Split(X=next_xs, y=next_ys))

        return tuple(collected_splits)

    @staticmethod
    def _needs_encoding(dtype) -> bool:
        """Whether a column of this dtype (object, category or string) must be integer-encoded."""
        return (
            isinstance(dtype, pd.CategoricalDtype)
            or pd.api.types.is_object_dtype(dtype)
            or pd.api.types.is_string_dtype(dtype)
        )

    @staticmethod
    def from_openml(id: int) -> Dataset:
        """Processes a multilabel OpenMLDataset into its features and targets

        Parameters
        ----------
        id: int
            The id of the dataset

        Returns
        -------
        Dataset
        """
        dataset = openml.datasets.get_dataset(id)
        print(dataset.name, id)
        # The target attribute is a comma separated list of label columns
        targets = dataset.default_target_attribute.split(",")
        data, _, _, _ = dataset.get_data()

        assert isinstance(data, pd.DataFrame)

        # Process the features and turn all categorical columns into ints
        features = data.drop(columns=targets)
        encoders: dict[str, LabelEncoder] = {}

        # Iterate over a snapshot of the column names: `DataFrame.iteritems`
        # no longer exists in pandas >= 2.0, and columns are replaced in place.
        for name in list(features.columns):
            col = features[name]
            if Dataset._needs_encoding(col.dtype):
                encoder = LabelEncoder()
                features[name] = encoder.fit_transform(col)
                encoders[name] = encoder

        labels = data[targets]

        # Since we assume binary multilabel data, we convert the labels
        # to all be boolean types
        labels = labels.astype(bool)

        return Dataset(
            name=dataset.name,
            id=id,
            features=features,
            labels=labels,
            openml=dataset,
            encoders=encoders,
        )


# Smoke test of the data pipeline; the recorded cell output shows the
# resulting shapes for the first dataset.
if __name__ == "__main__":
    # Open the first dataset in a browser
    first = task_ids[0]
    dataset = Dataset.from_openml(first)
    dataset.openml.open_in_browser()

    # 60/20/20 split using the default seed of `Dataset.split`.
    # NOTE: `train` and `test` are module-level names here; the PyTorch
    # section later rebinds both of them to functions.
    train, val, test = dataset.split(splits=(0.6, 0.2, 0.2))
    print(train.X.shape, val.X.shape, test.X.shape)

from sklearn.metrics import f1_score as sklearn_f1_score
import numpy as np

def f1_score(y_true: np.ndarray, y_pred: np.ndarray) -> float:
    """Score predictions with scikit-learn's F1, macro-averaged.

    Parameters
    ----------
    y_true : np.ndarray
        The ground-truth labels

    y_pred : np.ndarray
        The predicted labels

    Returns
    -------
    float
        The value returned by scikit-learn's ``f1_score`` when called with
        ``average="macro"`` and ``zero_division=0``
    """
    scoring_options = {"average": "macro", "zero_division": 0}
    return sklearn_f1_score(y_true, y_pred, **scoring_options)

birds 40588
(387, 260) (129, 260) (129, 260)


In [3]:
from __future__ import annotations

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier


class RandomForestBaseline:
    """Baseline model: a `RandomForestClassifier` with all other settings left at their defaults.

    Attributes
    ----------
    seed : int | None
        The seed handed to the forest as ``random_state``
    estimator : RandomForestClassifier
        The wrapped scikit-learn estimator
    """

    def __init__(self, seed: int | None = 1):
        forest = RandomForestClassifier(random_state=seed)
        self.seed = seed
        self.estimator = forest

    def fit(self, X: np.ndarray, y: np.ndarray) -> None:
        """Fit the wrapped forest on features ``X`` and labels ``y``."""
        self.estimator.fit(X, y)

    def predict(self, X: np.ndarray) -> np.ndarray:
        """Return the wrapped forest's predictions for ``X``."""
        predicted = self.estimator.predict(X)
        return predicted


# Benchmark the random forest baseline on every dataset in `task_ids`.
if __name__ == "__main__":
    # The same seed drives both the data split and the forest
    seed = 42
    # Dataset name -> score on the held-out split
    scores: dict[str, float] = {}

    for id in task_ids:
        dataset = Dataset.from_openml(id)
        print(f"Training on {dataset.name}")

        # 75% of the rows for training, the remaining 25% for evaluation
        train, test = dataset.split(splits=(0.75, 0.25), seed=seed)
        rf = RandomForestBaseline(seed=seed)

        rf.fit(train.X, train.y)

        predictions = rf.predict(test.X)
        # `f1_score` is the notebook's macro-averaged wrapper, not sklearn's default
        score = f1_score(test.y, predictions)
        print(f"Baseline score = {score}")

        scores[dataset.name] = score

    # Summarise all scores as a Series indexed by dataset name
    results = pd.Series(scores)
    print(results)


birds 40588
Training on birds
Baseline score = 0.13217721723664
emotions 40589
Training on emotions
Baseline score = 0.6246485178080441
enron 40590
Training on enron
Baseline score = 0.15427559488991852
genbase 40591
Training on genbase
Baseline score = 0.7160493827160493
image 40592
Training on image
Baseline score = 0.4717961385901015
langLog 40593
Training on langLog
Baseline score = 0.007548387096774194
reuters 40594
Training on reuters
Baseline score = 0.5495908695133818
scene 40595
Training on scene
Baseline score = 0.6922819362654442
slashdot 40596
Training on slashdot
Baseline score = 0.2335916043894742
yeast 40597
Training on yeast
Baseline score = 0.32679408884648337
birds       0.132177
emotions    0.624649
enron       0.154276
genbase     0.716049
image       0.471796
langLog     0.007548
reuters     0.549591
scene       0.692282
slashdot    0.233592
yeast       0.326794
dtype: float64


# Neural Network multi-label classification

In [4]:
# mlp for multi-label classification
from numpy import mean
from numpy import std
from sklearn.datasets import make_multilabel_classification
from sklearn.model_selection import RepeatedKFold
from keras.models import Sequential
from keras.layers import Dense
from sklearn.metrics import accuracy_score

# get the dataset
def get_dataset():
    """Generate the synthetic multilabel problem used to sanity-check the MLP.

    Returns the ``(X, y)`` pair produced by ``make_multilabel_classification``
    for 1000 samples, 10 features, 3 classes, 2 labels and ``random_state=1``.
    """
    generator_options = dict(
        n_samples=1000,
        n_features=10,
        n_classes=3,
        n_labels=2,
        random_state=1,
    )
    features, labels = make_multilabel_classification(**generator_options)
    return features, labels

# get the model
def get_model(n_inputs, n_outputs):
    """Build and compile the multilabel MLP.

    The network has two hidden layers of 20 ReLU units (He-uniform
    initialisation) followed by a sigmoid layer with one unit per label.
    It is compiled with binary cross-entropy loss and the Adam optimizer.
    """
    hidden_options = {"kernel_initializer": "he_uniform", "activation": "relu"}
    stack = (
        Dense(20, input_dim=n_inputs, **hidden_options),
        Dense(20, **hidden_options),
        Dense(n_outputs, activation="sigmoid"),
    )

    network = Sequential()
    for layer in stack:
        network.add(layer)
    network.compile(loss="binary_crossentropy", optimizer="adam")
    return network

# evaluate a model using repeated k-fold cross-validation
def evaluate_model(X, y):
    """Score the MLP with repeated k-fold cross-validation.

    Uses 10 folds repeated 3 times (``random_state=1``). For every fold a
    fresh model is trained for 100 epochs, its predicted probabilities are
    rounded to 0/1 labels, and the notebook's ``f1_score`` is computed on the
    held-out fold. Each fold's score is printed and the list of all scores is
    returned.
    """
    n_inputs = X.shape[1]
    n_outputs = y.shape[1]
    folds = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)

    fold_scores = []
    for train_idx, test_idx in folds.split(X):
        # Slice out this fold's training and held-out rows
        X_fit, X_held_out = X[train_idx], X[test_idx]
        y_fit, y_held_out = y[train_idx], y[test_idx]

        # A new, untrained network for every fold
        network = get_model(n_inputs, n_outputs)
        network.fit(X_fit, y_fit, verbose=0, epochs=100)

        # Sigmoid probabilities -> hard 0/1 label predictions
        predicted_labels = network.predict(X_held_out).round()

        fold_score = f1_score(y_held_out, predicted_labels)
        print('>%.3f' % fold_score)
        fold_scores.append(fold_score)
    return fold_scores



ModuleNotFoundError: No module named 'tensorflow'

In [None]:
# Benchmark the Keras MLP on every dataset in `task_ids`.
# NOTE: `seed` is only passed to `dataset.split` below; nothing in this cell
# seeds the network's weight initialisation.
seed = 42
# Dataset name -> score on the held-out split
scores: dict[str, float] = {}

for id in task_ids:
    dataset = Dataset.from_openml(id)
    print(f"Training on {dataset.name}")

    # 75% of the rows for training, the remaining 25% for evaluation
    train, test = dataset.split(splits=(0.75, 0.25), seed=seed)
    # One input unit per feature column, one output unit per label column
    n_inputs, n_outputs = len(dataset.features.columns), len(dataset.labels.columns)

    model = get_model(n_inputs, n_outputs)
    # fit model
    model.fit(train.X,train.y, verbose=0, epochs=1000)
    # make a prediction on the test set
    yhat = model.predict(test.X)
    # round probabilities to class labels
    yhat = yhat.round()
    # compute the macro-averaged F1 (the notebook's `f1_score` wrapper), not accuracy
    score = f1_score(test.y, yhat)
    print(f"Baseline score = {score}")

    scores[dataset.name] = score

# Summarise all scores as a Series indexed by dataset name
results = pd.Series(scores)
print(results)


birds 40588
Training on birds
Baseline score = 0.04758107389686337
emotions 40589
Training on emotions
Baseline score = 0.6430151745000587
enron 40590
Training on enron
Baseline score = 0.18795163737722445
genbase 40591
Training on genbase
Baseline score = 0.7138707334785765
image 40592
Training on image
Baseline score = 0.4772824633980804
langLog 40593
Training on langLog
Baseline score = 0.05308005673949399
reuters 40594
Training on reuters
Baseline score = 0.6360635900215447
scene 40595
Training on scene


KeyboardInterrupt: ignored

# PyTorch version


In [None]:
import torch
from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets
from torchvision.transforms import ToTensor
import pandas as pd

In [None]:
# Train on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using {device} device")

# Define model
class NeuralNetwork(nn.Module):
    """Multilabel MLP: two hidden layers of 20 ReLU units and a sigmoid output.

    Because the last layer is a sigmoid, `forward` returns per-label
    probabilities in (0, 1) rather than raw logits.
    """

    def __init__(self, input_dim, output_dim):
        super().__init__()
        hidden_units = 20
        self.flatten = nn.Flatten()
        layers = [
            nn.Linear(input_dim, hidden_units),
            nn.ReLU(),
            nn.Linear(hidden_units, hidden_units),
            nn.ReLU(),
            nn.Linear(hidden_units, output_dim),
            nn.Sigmoid(),
        ]
        self.linear_relu_stack = nn.Sequential(*layers)

    def forward(self, x):
        """Flatten every sample to a vector and return the per-label probabilities."""
        return self.linear_relu_stack(self.flatten(x))



Using cuda device


In [None]:
from torch.functional import split
class MyDataset(torch.utils.data.Dataset):
    """Map-style PyTorch dataset over one `Split` of the data.

    Subclasses `torch.utils.data.Dataset` explicitly: the bare name `Dataset`
    in this notebook refers to the OpenML dataclass defined earlier (torch's
    `Dataset` is never imported), so `class MyDataset(Dataset)` inherited from
    the wrong class.

    Parameters
    ----------
    split
        An object exposing array-likes `X` (features) and `y` (labels)
    """

    def __init__(self, split):
        features = np.array(split.X)
        labels = np.array(split.y)

        # Both are stored as float32: the labels are compared against
        # sigmoid outputs by `nn.BCELoss`, which requires float targets.
        self.x_train = torch.tensor(features, dtype=torch.float32)
        self.y_train = torch.tensor(labels, dtype=torch.float32)

    def __len__(self):
        """Number of samples in the split."""
        return len(self.y_train)

    def __getitem__(self, idx):
        """Return the `(features, labels)` tensors of sample `idx`."""
        return self.x_train[idx], self.y_train[idx]

In [None]:
def train(dataloader, model, loss_fn, optimizer):
    """Run one optimisation pass (one epoch) over `dataloader`.

    Parameters
    ----------
    dataloader
        Iterable yielding `(X, y)` tensor batches
    model
        The network to train; it is switched to training mode
    loss_fn
        Callable computing the loss from `(predictions, targets)`
    optimizer
        The optimizer that updates the parameters of `model`

    Notes
    -----
    Batches are moved to the module-level `device` before the forward pass.
    """
    model.train()
    for X, y in dataloader:
        X, y = X.to(device), y.to(device)

        # Forward pass and prediction error
        loss = loss_fn(model(X), y)

        # Backpropagation: clear stale gradients, compute new ones, step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

def test(dataloader, model, loss_fn):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    #print("num batches: "+str(num_batches))
    #print("size: "+str(size))
    model.eval()
    f1 = 0
    with torch.no_grad():
        for X, y in dataloader:
            X, y = X.to(device), y.to(device)
            pred = model(X)
            yhat = pred.round()
            # calculate accuracy
            f1 = f1_score(y.detach().cpu().numpy(), yhat.detach().cpu().numpy())
    f1 /= num_batches
    print("F1: " + str(f1))
    return f1
    

In [None]:
# Benchmark the PyTorch MLP on every dataset in `task_ids`.
seed = 42
# Dataset name -> score on the held-out split
scores: dict[str, float] = {}

# Dataset id -> {"dataset": name, "score": score}
data = dict()

loss_fn = nn.BCELoss()

# Training hyper-parameters, identical for every dataset
batch_size = 64
epochs = 1000

for id in task_ids:
    dataset = Dataset.from_openml(id)
    print(f"Training on {dataset.name}")

    data[dataset.id] = dict()
    data[dataset.id]["dataset"] = dataset.name

    train_split, test_split = dataset.split(splits=(0.75, 0.25), seed=seed)
    n_inputs, n_outputs = len(dataset.features.columns), len(dataset.labels.columns)

    # Seed torch before the model is created: otherwise `seed` only controls
    # the data split and the weight initialisation differs on every run.
    torch.manual_seed(seed)
    model = NeuralNetwork(n_inputs, n_outputs).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

    train_dataset = MyDataset(train_split)
    test_dataset = MyDataset(test_split)

    # Create data loaders. The whole test split is evaluated as one batch.
    train_dataloader = DataLoader(train_dataset, batch_size=batch_size)
    test_dataloader = DataLoader(test_dataset, batch_size=len(test_dataset))

    for t in range(epochs):
        train(train_dataloader, model, loss_fn, optimizer)
    score = test(test_dataloader, model, loss_fn)
    scores[dataset.name] = score

    data[dataset.id]["score"] = score

# Summarise all scores as a Series indexed by dataset name
results = pd.Series(scores)
print(results)

birds 40588
Training on birds
F1: 0.19133835385912948
emotions 40589
Training on emotions
F1: 0.6669559412654514
enron 40590
Training on enron
F1: 0.18228530642129812
genbase 40591
Training on genbase
F1: 0.7242798353909465
image 40592
Training on image
F1: 0.5384395657761082
langLog 40593
Training on langLog
F1: 0.06936227481997208
reuters 40594
Training on reuters
F1: 0.6474094064979523
scene 40595
Training on scene
F1: 0.6965709355692465
slashdot 40596
Training on slashdot
F1: 0.2960032043864161
yeast 40597
Training on yeast
F1: 0.40822282176965224
birds       0.191338
emotions    0.666956
enron       0.182285
genbase     0.724280
image       0.538440
langLog     0.069362
reuters     0.647409
scene       0.696571
slashdot    0.296003
yeast       0.408223
dtype: float64
