In [None]:
import pandas as pd
import numpy as np
import datetime as dt
from tqdm import tqdm
import sys
import os
import time

# plotting imports
import matplotlib.pyplot as plt

# ml imports
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.datasets import load_breast_cancer

# random forest
from sklearn.ensemble import RandomForestClassifier
# k nearest neighbors
from sklearn.neighbors import KNeighborsClassifier

# metric imports

# cnn imports
import torch
from torch import nn
from torchvision import transforms, datasets
from torch.utils.data import DataLoader, random_split
import torch.optim as optim
from torchvision.transforms import ToTensor

sys.path.append('../')
from model import model3

# Load Data

In [None]:
# Settings for the shuffle and the train/test split.
random_state = 42
test_size = 0.2
train_size = 1 - test_size

# Fetch the breast-cancer dataset bundled with scikit-learn.
cancer = load_breast_cancer()
X, y = cancer.data, cancer.target

# Shuffle samples and labels with one shared, seeded permutation.
np.random.seed(random_state)
indices = np.random.permutation(len(X))
X, y = X[indices], y[indices]

# Hold out `test_size` of the samples for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=test_size, random_state=random_state
)

# Standardize features; the scaler is fitted on the training split only and
# then applied unchanged to the test split.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

## Global Variables

In [None]:
# Shared run configuration used by the helper and benchmarking cells.
dataset_name = "cancer"  # results are written under results/<dataset_name>/
repeat = 5  # number of times each benchmark is repeated
datetime = f"{dt.datetime.now():%Y%m%d_%H%M}"  # run timestamp embedded in output file names
data_sizes = [50, 100, 500, 1000, None]  # training-set sizes; None means the whole training split
benchmarking = dict()  # filled by the model cells with each model's results
info_length = 3  # metrics recorded per run: train accuracy, validation accuracy, time

## Helper functions

In [None]:
def save_data(data, model_name, data_size, info_type, iteration, val=False):
    """
    Save an array as a timestamped .npy file in the results folder.

    The file is written to
    results/{dataset_name}/{model_name}/npy_files/
        d{data_size}_{train|val}_{info_type}_i{iteration}_d{datetime}.npy
    where `dataset_name` and `datetime` are the notebook-level globals.
    The target directory is created if it does not exist yet.

    Parameters:
    data (array-like): Values handed to np.save.
    model_name (str): Name of the model; selects the results sub-folder.
    data_size: Size label placed after the leading "d" in the file name.
    info_type (str): Kind of information stored (the callers use "acc" and "time").
    iteration (int): Repetition index placed after "i" in the file name.
    val (bool): If True the file is tagged "val", otherwise "train".
    """
    train = "train" if not val else "val"
    directory = f"results/{dataset_name}/{model_name}/npy_files"
    # np.save does not create missing parent directories, so a fresh checkout
    # (or a new model name) would otherwise fail with FileNotFoundError.
    os.makedirs(directory, exist_ok=True)
    path = f"{directory}/d{data_size}_{train}_{info_type}_i{iteration}_d{datetime}.npy"
    np.save(path, data)

def load_data(model_name, data_size, info_type, iteration, val=False):
    """
    Load the most recent array written by save_data for the given key.

    Looks in results/{dataset_name}/{model_name}/npy_files for files named
    d{data_size}_{train|val}_{info_type}_i{iteration}_d{timestamp}.npy and
    loads the one with the latest timestamp.

    Parameters:
    model_name (str): Name of the model; selects the results sub-folder.
    data_size: Size label that follows the leading "d" in the file name.
    info_type (str): Kind of information stored (e.g. "acc" or "time").
    iteration (int): Repetition index that follows "i" in the file name.
    val (bool): If True look for the "val" file, otherwise the "train" file.

    Returns:
    numpy.ndarray or None: The loaded array, or None when the directory is
    missing, no matching file exists, or the file cannot be read.
    """
    train = "train" if not val else "val"
    date_format = "%Y%m%d_%H%M"
    directory = f"results/{dataset_name}/{model_name}/npy_files"
    # The trailing "_d" keeps iteration 1 from also matching iteration 10, 11, ...
    prefix = f"d{data_size}_{train}_{info_type}_i{iteration}_d"
    suffix = ".npy"

    try:
        names = os.listdir(directory)
    except (FileNotFoundError, NotADirectoryError):
        return None

    # The timestamp itself contains an underscore, so it is recovered by
    # stripping the known prefix/suffix rather than by splitting on "_".
    timestamps = []
    for name in names:
        if not (name.startswith(prefix) and name.endswith(suffix)):
            continue
        try:
            timestamps.append(dt.datetime.strptime(name[len(prefix):-len(suffix)], date_format))
        except ValueError:
            # Same prefix but not a timestamp in the expected format; ignore it.
            continue
    if not timestamps:
        return None

    latest_datetime = max(timestamps).strftime(date_format)
    path = os.path.join(directory, f"{prefix}{latest_datetime}{suffix}")
    try:
        return np.load(path)
    except (OSError, ValueError, EOFError):
        # Best effort, as before: an unreadable file yields None.
        return None

# Model Training

In [None]:
def benchmark_ml(model_name):
    """
    Benchmark one classifier on growing prefixes of the cancer training split.

    For each of the `repeat` repetitions and each entry of `data_sizes`
    (None meaning the whole training split), a fresh model is fitted on
    X_train[:size] / y_train[:size]. Train accuracy is measured on that same
    prefix and validation accuracy on X_test. The recorded time covers fitting
    plus predicting X_test. After each repetition the three arrays are written
    with save_data.

    NOTE(review): each saved array holds one entry per data size, but the file
    name carries only the last size of `data_sizes`. This naming is unchanged
    from the previous behaviour; confirm it is what load_data callers expect.

    Parameters:
    model_name (str): The name of the model to train. Can be "randomforest", "knn", or "ours".

    Returns:
    results_dict (dict): {model_name: {iteration: {"train_acc": ndarray,
        "val_acc": ndarray, "time": ndarray}}}, one array element per data size.

    Raises:
    ValueError: If model_name is not one of the supported names, or if
        data_sizes is empty.
    """
    # Constructors are wrapped so that only the selected model is ever built.
    model_factories = {
        "randomforest": lambda: RandomForestClassifier(n_jobs=-1),
        "knn": lambda: KNeighborsClassifier(n_jobs=-1),
        "ours": lambda: model3(),
    }
    if model_name not in model_factories:
        raise ValueError(
            f"Unknown model_name {model_name!r}; expected one of {sorted(model_factories)}"
        )
    make_model = model_factories[model_name]

    if len(data_sizes) == 0:
        raise ValueError("data_sizes must contain at least one entry")
    # None in data_sizes stands for the whole training split.
    sizes = [len(X_train) if size is None else size for size in data_sizes]
    # Size label used in the saved file names (the last configured size).
    size_label = sizes[-1]

    results_dict = {model_name: {}}

    # The context manager closes the progress bar even if a fit fails.
    with tqdm(total=repeat * len(sizes), desc=f"Benchmarking {model_name}") as progress:
        for i in range(repeat):
            time_list = []
            train_acc = []
            val_acc = []

            for size in sizes:
                # A new instance per run guarantees no fitted state is carried
                # over from a previous size or repetition.
                clf = make_model()

                # timed region: fit on the prefix and predict the test split
                start_time = time.perf_counter()
                clf.fit(X_train[:size], y_train[:size])
                y_pred = clf.predict(X_test)
                end_time = time.perf_counter()

                # compute accuracy (the test predictions above are reused)
                y_pred_train = clf.predict(X_train[:size])
                acc_train = accuracy_score(y_train[:size], y_pred_train)
                acc_test = accuracy_score(y_test, y_pred)

                # update lists
                time_list.append(end_time - start_time)
                train_acc.append(acc_train)
                val_acc.append(acc_test)
                progress.update(1)

            # save data
            train_acc = np.array(train_acc)
            val_acc = np.array(val_acc)
            time_list = np.array(time_list)
            save_data(train_acc, model_name, size_label, "acc", i)
            save_data(val_acc, model_name, size_label, "acc", i, val=True)
            save_data(time_list, model_name, size_label, "time", i)
            results_dict[model_name][i] = {"train_acc": train_acc, "val_acc": val_acc, "time": time_list}
    return results_dict

## Random Forest

In [None]:
# Random Forest does not use epochs and does not compute loss, so train_loss
# and val_loss are not available for it.
model_name = 'randomforest'
rfr_results = benchmark_ml(model_name)
# benchmark_ml already returns {model_name: {iteration: ...}}; store only the
# per-iteration dict so that `benchmarking` maps model -> iteration -> metrics,
# which is the nesting the evaluation loop iterates over.
benchmarking[model_name] = rfr_results[model_name]

## K Nearest Neighbors

In [None]:
# K Nearest Neighbors does not use epochs and does not compute loss, so
# train_loss and val_loss are not available for it.
model_name = 'knn'
knn_results = benchmark_ml(model_name)
# benchmark_ml already returns {model_name: {iteration: ...}}; store only the
# per-iteration dict so that `benchmarking` maps model -> iteration -> metrics,
# which is the nesting the evaluation loop iterates over.
benchmarking[model_name] = knn_results[model_name]

## Convolutional Neural Network

In [None]:
# Placeholder cell: it only selects the model name; no CNN benchmark is run here.
model_name = "cnn"

## Metric Learning

In [None]:
# Placeholder cell: it only selects the model name; no metric-learning benchmark is run here.
model_name = "metric"

## Our Model

In [None]:
# Our model does not use epochs and does not compute loss, so train_loss and
# val_loss are not available for it.
model_name = 'ours'
our_results = benchmark_ml(model_name)
# benchmark_ml already returns {model_name: {iteration: ...}}; store only the
# per-iteration dict so that `benchmarking` maps model -> iteration -> metrics,
# which is the nesting the evaluation loop iterates over.
benchmarking[model_name] = our_results[model_name]

# Model Evaluation

In [None]:
# Plot every recorded metric against the training-set size: one subplot per
# metric, one line per model and repetition.

# Effective number of training samples behind each entry of data_sizes.
# None means the whole training split, and X_train[:size] can never contain
# more than len(X_train) rows, so larger requests are shown at that cap.
plot_sizes = [
    len(X_train) if size is None else min(size, len(X_train))
    for size in data_sizes
]

fig, axes = plt.subplots(info_length, 1, figsize=(20, 10), squeeze=False)
for model_name, model_results in benchmarking.items():
    # benchmark_ml returns {model_name: {iteration: ...}}; unwrap that extra
    # level when the whole return value was stored in `benchmarking`.
    model_results = model_results.get(model_name, model_results)
    for iteration, iteration_results in model_results.items():
        # enumerate yields (index, (key, value)), hence the nested unpacking
        for i, (info_type, info) in enumerate(iteration_results.items()):
            ax = axes[i, 0]
            ax.plot(plot_sizes, info, label=f"{model_name}_i{iteration}")
            ax.set_xlabel("Data Size")
            ax.set_ylabel(info_type)
            ax.legend()
# suptitle labels the whole figure; plt.title would only title the last subplot
fig.suptitle("Model Benchmarking")
fig.tight_layout()

# savefig does not create missing directories
charts_dir = f"results/{dataset_name}/charts"
os.makedirs(charts_dir, exist_ok=True)
fig.savefig(f"{charts_dir}/benchmarking_{datetime}.png")