In [1]:
from sklearn.ensemble import RandomForestClassifier
import numpy as np
import random
import torchvision.transforms as transforms
import torchvision
import torch
from sklearn.metrics import accuracy_score, precision_score, recall_score
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn.utils import resample

In [2]:
from scipy import stats

import torch.nn as nn
import torch.nn.functional as F

import matplotlib.pyplot as plt

import random

In [3]:
class UNet(nn.Module):
    def __init__(self, embedding_size=64):
        super(UNet, self).__init__()

        # Define the encoder part
        self.enc_conv1 = nn.Conv2d(1, 16, kernel_size=3, padding=1)
        self.enc_conv2 = nn.Conv2d(16, 32, kernel_size=3, padding=1)
        self.pool = nn.MaxPool2d(2, 2)

        # Define the bottleneck part
        self.bottleneck_conv = nn.Conv2d(32, 64, kernel_size=3, padding=1)
        self.flatten = nn.Flatten()
        self.fc = nn.Linear(7*7*64, embedding_size)

    def forward(self, x):
        # Encoder
        x = F.relu(self.enc_conv1(x))
        x = self.pool(x)
        x = F.relu(self.enc_conv2(x))
        x = self.pool(x)

        # Bottleneck
        x = F.relu(self.bottleneck_conv(x))
        x = self.flatten(x)
        x = self.fc(x)

        return x

model = UNet()
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5,), (0.5,))
])

train_dataset = torchvision.datasets.MNIST(root='./data', train=True, download=True, transform=transform)
test_dataset = torchvision.datasets.MNIST(root='./data', train=False, download=True, transform=transform)

train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=64, shuffle=False)
def create_binary_dataset(train_dataset, prop_1_to_0):
    """
    prop_1_to_0:  number of images of 1 / number of images of 0
    """
    if prop_1_to_0 > 1:
        raise ValueError("There should be more zeros than ones")
    zero_data = [data for data in train_dataset if data[1] == 0]
    one_data = [data for data in train_dataset if data[1] == 1]
    num_ones = int(len(zero_data) * prop_1_to_0)
    combined_data = zero_data + one_data[:num_ones]
    random.shuffle(combined_data)
    return combined_data
def split_data(train_dataset):
    data = {}
    for x in range(10):
        data[x] = [data for data in train_dataset if data[1] == x]
        random.shuffle(data[x])
    return data
def create_biased_dataset(data, probs):
    '''
    probs: [p_0, p_1, ..., p_9]
    returns a biased dataset according to probs
    '''
    max_p = max(probs)
    max_x = np.argmax(probs)
    max_n = 0.9 * len(data[max_x])
    res = []
    for x, lis in data.items():
        n = int(max_n * probs[x] / max_p)
        res += data[x][:n]
    random.shuffle(res)
    return res
def encode_data(dataset):
  model.eval() # UNet
  return [(model(image.unsqueeze(0)), label) for image, label in dataset]

Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz to ./data/MNIST/raw/train-images-idx3-ubyte.gz


100%|███████████████████████████| 9912422/9912422 [00:00<00:00, 10919327.64it/s]


Extracting ./data/MNIST/raw/train-images-idx3-ubyte.gz to ./data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz to ./data/MNIST/raw/train-labels-idx1-ubyte.gz


100%|███████████████████████████████| 28881/28881 [00:00<00:00, 33817893.31it/s]


Extracting ./data/MNIST/raw/train-labels-idx1-ubyte.gz to ./data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz to ./data/MNIST/raw/t10k-images-idx3-ubyte.gz


100%|███████████████████████████| 1648877/1648877 [00:00<00:00, 15387933.76it/s]


Extracting ./data/MNIST/raw/t10k-images-idx3-ubyte.gz to ./data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz to ./data/MNIST/raw/t10k-labels-idx1-ubyte.gz


100%|██████████████████████████████████| 4542/4542 [00:00<00:00, 9102020.43it/s]


Extracting ./data/MNIST/raw/t10k-labels-idx1-ubyte.gz to ./data/MNIST/raw



In [4]:
def evaluate_model(data):
    # Load and process the dataset
    

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Balance the training set
    class_counts = np.bincount(y_train.astype(int))
    minority_class = np.argmin(class_counts)
    resample_count = class_counts[1 - minority_class] - class_counts[minority_class]
    X_train_minority = X_train[y_train == minority_class]
    y_train_minority = y_train[y_train == minority_class]
    X_train_minority_upsampled, y_train_minority_upsampled = resample(X_train_minority, y_train_minority, replace=True, n_samples=resample_count, random_state=42)
    X_train_balanced = np.vstack((X_train, X_train_minority_upsampled))
    y_train_balanced = np.append(y_train, y_train_minority_upsampled)

    # Train the RandomForestClassifier
    classifier = RandomForestClassifier()
    classifier.fit(X_train_balanced, y_train_balanced)

    # Adjust classifier probabilities
    target_class_counts = np.bincount(y_test.astype(int))
    target_majority_proportion = target_class_counts[1 - minority_class] / len(y_test)
    target_minority_proportion = target_class_counts[minority_class] / len(y_test)
    majority_proba_ratio = target_majority_proportion / 0.5  # Training set is balanced
    minority_proba_ratio = target_minority_proportion / 0.5  # Training set is balanced
    y_proba = classifier.predict_proba(X_test)
    y_proba[:, 1 - minority_class] *= majority_proba_ratio
    y_proba[:, minority_class] *= minority_proba_ratio
    y_proba /= y_proba.sum(axis=1, keepdims=True)
    y_pred = np.argmax(y_proba, axis=1)

    # Calculate and return evaluation metrics
    accuracy = accuracy_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)

    return accuracy, recall, precision

In [5]:
evaluate_model(train_dataset)

NameError: name 'X' is not defined