In [1]:
import sys 
sys.path.append('../..')
# sys.path.append('/Users/patroklos/cox')
sys.path.append('/opt/anaconda3/lib/python3.7/site-packages')
from cox.utils import Parameters
from cox.store import Store
from cox.readers import CollectionReader
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import math
import itertools
import numpy as np
import torch as ch
from torch import Tensor
from torch import sigmoid as sig
import torch.nn as nn
from torch.distributions import Gumbel, Uniform
from torch.distributions.multivariate_normal import MultivariateNormal
from torch.distributions.transforms import SigmoidTransform
from torch.distributions.transformed_distribution import TransformedDistribution
from torch.utils.data import TensorDataset, DataLoader
import torch.nn.functional as F
import datetime
from delphi.stats import truncated_logistic_regression
from delphi.oracle import oracle
from delphi import train
from delphi.utils import constants as consts

# set default tensor type 
# ch.set_default_tensor_type(ch.cuda.FloatTensor)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  method='lar', copy_X=True, eps=np.finfo(np.float).eps,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  method='lar', copy_X=True, eps=np.finfo(np.float).eps,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  eps=np.finfo(np.float).eps, copy_Gram=True, verbose=0,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  eps=np.finfo(np.float).eps, copy_X=True, fit_path=True,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  eps=np.finfo(np.float).eps, copy_X=True, fit_path=True,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes

# Constants

In [2]:
# Root directory under which every cox store for this notebook lives.
# Machine-specific: edit this single line to relocate all outputs.
STORE_ROOT = '/Users/patroklos/'

# training logs (one store per loss variant)
TRUNCATED_STORE_PATH = STORE_ROOT + 'MultinomialLogisticRegressionTruncated/'
STANDARD_STORE_PATH = STORE_ROOT + 'MultinomialLogisticRegressionStandard/'

# evaluation logs on the held-out (truncated-away) samples
TRUNCATED_EVAL_STORE_PATH = STORE_ROOT + 'MultinomialLogisticRegressionTruncatedTest/'
STANDARD_EVAL_STORE_PATH = STORE_ROOT + 'MultinomialLogisticRegressionStandardTest/'

# Helper Functions

In [3]:
# membership oracles
class DNN_Lower(oracle):
    """
    Membership oracle that thresholds logits from below.

    Calling the oracle yields a float tensor that is 1.0 wherever the input is
    strictly greater than `lower` and 0.0 everywhere else.
    """
    def __init__(self, lower):
        # threshold the input is compared against, element-wise
        self.lower = lower

    def __call__(self, x):
        is_above = x > self.lower
        return is_above.float()
    
class DNN_Logit_Ball(oracle):
    """
    Membership oracle defined by a [lower, upper] band on the logits.

    Calling the oracle yields a float tensor that is 1.0 wherever the input is
    strictly below `lower` or strictly above `upper`, and 0.0 for values inside
    the band (bounds included).

    NOTE(review): the original intuition note was left unfinished ("logits that
    are neither very large nor very small insinuate that the classification is
    not ..."); presumably it was meant to end with "confident" - confirm.
    """
    def __init__(self, lower, upper):
        # band edges, compared element-wise against the input
        self.lower = lower
        self.upper = upper

    def __call__(self, x):
        too_small = x < self.lower
        too_large = x > self.upper
        return (too_small | too_large).float()
        

class Identity(oracle):
    """
    Membership oracle that accepts everything (no truncation).

    Calling the oracle returns a tensor of ones with the same shape as the
    input.
    """
    def __call__(self, x):
        # Allocate on x's device so the result can be combined with x-derived
        # tensors, matching the other oracles (which return `(...).float()` on
        # the input's device).
        return ch.ones(x.size(), device=x.device)
    
def gen_data(): 
    """
    Generate a synthetic dataset for the truncated multinomial logistic
    regression experiment.

    Repeatedly draws a random linear ground-truth model and a batch of inputs,
    labels each input with the argmax of its Gumbel-noised logits, and applies
    the truncation oracle `args.phi` to the noise-free logits. A draw is
    accepted as soon as the surviving fraction is at least `args.ALPHA_THRESH`.

    Reads from the global `args`: lower, upper, ALPHA_THRESH, IN_FEATURES, K,
    bias, samples, phi, num_workers, batch_size.

    Returns:
        ground_truth: the `nn.Linear` model that generated the labels.
        (train_loader, val_loader): loaders over an 80/20 random split of the
            samples that survived truncation.
        test_loader: loader over the samples that were truncated away (empty
            when the oracle accepts every sample).
    """
    # distributions
    gumbel = Gumbel(0, 1)
    weight_dist = Uniform(args.lower, args.upper)  # generates ground-truth parameters
    input_dist = Uniform(-5, 5)  # generates the independent variable

    # no grad required for dataset
    with ch.no_grad():
        # redraw until the survival probability reaches args.ALPHA_THRESH
        alpha = None
        while alpha is None or alpha < args.ALPHA_THRESH:
            # generate ground-truth from uniform distribution
            ground_truth = nn.Linear(in_features=args.IN_FEATURES, out_features=args.K, bias=args.bias)
            ground_truth.weight = nn.Parameter(weight_dist.sample(ch.Size([args.K, args.IN_FEATURES])))
            if ground_truth.bias is not None:
                ground_truth.bias = nn.Parameter(weight_dist.sample(ch.Size([args.K,])))
            # independent variable
            X = input_dist.sample(ch.Size([args.samples, args.IN_FEATURES]))
            # base model logits
            z = ground_truth(X)
            # the label is the argmax of the Gumbel-noised logits
            noised = z + gumbel.sample(z.size())
            y = ch.argmax(noised, dim=1)

            # TRUNCATE: a sample survives only if every one of its logits is accepted.
            # NOTE(review): the oracle is applied to the noise-free logits here, while
            # TruncatedGumbelCE.backward applies it to the noised logits - confirm
            # which one the model is supposed to use.
            keep = ch.all(args.phi(z).bool(), dim=1)
            alpha = keep.sum().item() / X.size(0)

    # `keep` is a boolean mask, so `~keep` is the true complement. (Integer
    # positions must not be used here: `~` on integers is bitwise NOT, i.e.
    # -(i + 1), which would select mirrored rows instead of the rejected ones.)
    x_trunc, y_trunc = X[keep], y[keep]
    x_test, y_test = X[~keep], y[~keep]

    # surviving samples: 80% training, 20% validation
    ds = TensorDataset(x_trunc, y_trunc)
    train_length = int(len(ds)*.8)
    val_length = len(ds) - train_length
    train_ds, val_ds = ch.utils.data.random_split(ds, [train_length, val_length])
    train_loader = DataLoader(train_ds, num_workers=args.num_workers, batch_size=args.batch_size)
    val_loader = DataLoader(val_ds, num_workers=args.num_workers, batch_size=args.batch_size)

    # test dataset: the samples removed by truncation
    test_ds = TensorDataset(x_test, y_test)
    test_loader = DataLoader(test_ds, num_workers=args.num_workers, batch_size=args.batch_size)

    return ground_truth, (train_loader, val_loader), test_loader

def _read_table(store_path, table):
    """Return `table` from the cox stores under `store_path` as a DataFrame, always closing the reader."""
    reader = CollectionReader(store_path)
    try:
        return reader.df(table)
    finally:
        # close even if reading fails so the reader is not leaked
        reader.close()


def plot():
    """
    Plot training/validation curves for the truncated and the standard (naive)
    cross-entropy runs, then print both test accuracies.

    Reads the training logs from TRUNCATED_STORE_PATH / STANDARD_STORE_PATH and
    the evaluation logs from TRUNCATED_EVAL_STORE_PATH / STANDARD_EVAL_STORE_PATH.
    Shows two figures (loss per epoch, accuracy per epoch) and returns nothing.
    """
    # training logs
    trunc_logs = _read_table(TRUNCATED_STORE_PATH, consts.LOGS_TABLE)
    standard_logs = _read_table(STANDARD_STORE_PATH, consts.LOGS_TABLE)

    # test set results
    trunc_test_results = _read_table(TRUNCATED_EVAL_STORE_PATH, consts.EVAL_LOGS_TABLE)
    standard_test_results = _read_table(STANDARD_EVAL_STORE_PATH, consts.EVAL_LOGS_TABLE)

    # Loss curves. No colour is forced so the four curves get distinct colours
    # (both validation curves used to be red and could not be told apart).
    sns.lineplot(data=trunc_logs, x='epoch', y='train_loss', label='Trunc Train Loss')
    sns.lineplot(data=standard_logs, x='epoch', y='train_loss', label='Naive Train Loss')
    sns.lineplot(data=trunc_logs, x='epoch', y='val_loss', label='Trunc Val Loss')
    ax = sns.lineplot(data=standard_logs, x='epoch', y='val_loss', label='Naive Val Loss')
    ax.set(xlabel='epoch', ylabel='CE Loss')
    plt.show()

    # accuracy curves
    sns.lineplot(data=trunc_logs, x='epoch', y='train_prec1', label='Trunc Train Acc')
    sns.lineplot(data=standard_logs, x='epoch', y='train_prec1', label='Naive Train Acc')
    sns.lineplot(data=trunc_logs, x='epoch', y='val_prec1', label='Trunc Val Acc')
    ax = sns.lineplot(data=standard_logs, x='epoch', y='val_prec1', label='Naive Val Acc')
    ax.set(xlabel='epoch', ylabel='Accuracy')
    plt.show()

    print("Standard Test Accuracy: {}".format(standard_test_results['test_prec1']))
    print("Truncated Test Accuracy: {}".format(trunc_test_results['test_prec1']))

# CE Latent Variable Model Loss

In [21]:
class GumbelCE(ch.autograd.Function):
    """
    Cross-entropy loss whose gradient w.r.t. the logits is a Monte Carlo
    estimate built from Gumbel(0, 1) noise instead of the analytic gradient.

    forward returns the usual mean-reduced cross-entropy; backward draws
    `args.num_samples` noise tensors per example (read from the global `args`).
    """
    @staticmethod
    def forward(ctx, pred, targ):
        """
        Args:
            pred: logits; backward treats it as 2-D (batch, classes).
            targ: class index for every row of `pred`.
        Returns:
            `ch.nn.CrossEntropyLoss()` of `pred` against `targ`.
        """
        ctx.save_for_backward(pred, targ)
        ce_loss = ch.nn.CrossEntropyLoss()
        return ce_loss(pred, targ)

    @staticmethod
    def backward(ctx, grad_output):
        """
        Args:
            grad_output: gradient flowing into the scalar loss.
        Returns:
            (gradient w.r.t. `pred`, None) - `targ` receives no gradient.
        """
        pred, targ = ctx.saved_tensors
        # gumbel distribution
        gumbel = Gumbel(0, 1)
        # make num_samples copies of pred logits
        stacked = pred[None, ...].repeat(args.num_samples, 1, 1)
        # add gumbel noise to logits (moved to pred's device/dtype; no-op on CPU float32)
        rand_noise = gumbel.sample(stacked.size()).to(stacked)
        noised = stacked + rand_noise
        noised_labs = noised.argmax(-1)
        # keep only the trials in which the target class has the largest noised logit
        good_mask = noised_labs.eq(targ)[..., None]
        inner_exp = 1 - ch.exp(-rand_noise)
        # average over the kept trials (1e-5 avoids 0/0), then over the batch
        avg = (inner_exp * good_mask).sum(0) / (good_mask.sum(0) + 1e-5) / pred.size(0)
        # chain rule: scale by the incoming gradient so a rescaled loss gets a rescaled gradient
        return -avg * grad_output, None
    
class TruncatedGumbelCE(ch.autograd.Function):
    """
    Cross-entropy loss whose gradient w.r.t. the logits is a Monte Carlo
    estimate that accounts for truncation by the oracle `args.phi`.

    forward returns the usual mean-reduced cross-entropy; backward draws
    `args.num_samples` Gumbel(0, 1) noise tensors per example and only uses the
    trials whose noised logits are all accepted by `args.phi` (both read from
    the global `args`).
    """
    @staticmethod
    def forward(ctx, pred, targ):
        """
        Args:
            pred: logits; backward treats it as 2-D (batch, classes).
            targ: class index for every row of `pred`.
        Returns:
            `ch.nn.CrossEntropyLoss()` of `pred` against `targ`.
        """
        ctx.save_for_backward(pred, targ)
        ce_loss = ch.nn.CrossEntropyLoss()
        return ce_loss(pred, targ)

    @staticmethod
    def backward(ctx, grad_output):
        """
        Args:
            grad_output: gradient flowing into the scalar loss.
        Returns:
            (gradient w.r.t. `pred`, None) - one entry per forward input.
        """
        pred, targ = ctx.saved_tensors
        # initialize gumbel distribution
        gumbel = Gumbel(0, 1)
        # make num_samples copies of pred logits
        stacked = pred[None, ...].repeat(args.num_samples, 1, 1)
        # add gumbel noise to logits (moved to pred's device/dtype; no-op on CPU float32)
        rand_noise = gumbel.sample(stacked.size()).to(stacked)
        noised = stacked + rand_noise
        # truncate - a trial counts only if every noised logit falls inside the truncation set
        # NOTE(review): gen_data applies the oracle to noise-free logits instead - confirm.
        filtered = ch.all(args.phi(noised).bool(), dim=2).float().unsqueeze(2)
        noised_labs = noised.argmax(-1)
        # mask keeps trials that both predict the target class and survive truncation
        mask = noised_labs.eq(targ)[..., None] * filtered
        inner_exp = 1 - ch.exp(-rand_noise)

        # (mean over target-consistent surviving trials) - (mean over all surviving trials);
        # 1e-5 avoids 0/0 when no trial qualifies
        target_term = (inner_exp * mask).sum(0) / (mask.sum(0) + 1e-5)
        survival_term = (inner_exp * filtered).sum(0) / (filtered.sum(0) + 1e-5)
        avg = target_term - survival_term
        # chain rule: scale by the incoming gradient; return exactly one value per forward input
        return -avg / pred.size(0) * grad_output, None
    
# Loss criteria as plain callables: a custom autograd.Function is invoked
# through its `.apply`, so these can be called as `criterion(pred, targ)`.
gumbel_ce = GumbelCE.apply
trunc_ce = TruncatedGumbelCE.apply

# Default Experiment Parameters

In [5]:
# procedure hyperparameters
# NOTE: 'shuffle' used to be listed twice (False, then True); in a dict literal the
# last value wins, so only the effective entry (True) is kept.
args = Parameters({ 
    'epochs': 25,
    'num_workers': 0, 
    'batch_size': 100,
    'bias': True,
    'num_samples': 1000,  # Gumbel noise draws per example in the custom backward passes
    'clamp': True, 
    'radius': 5.0, 
    'lr': 1e-2,
    'samples': 10000,  # number of samples to generate for ground truth
    'in_features': 2, # number of in-features to multi-log-reg
    'k': 2, # number of classes
    'lower': -1, # lower bound for generating ground truth weights
    'upper': 1,  # upper bound for generating ground truth weights
    'trials': 1,
    'log_iters': 1,    
    'should_save_ckpt': True,
    'save_ckpt_iters': -1,
    'validation_split': .8,
    'momentum': 0.0,
    'weight_decay': 0.0,
    'custom_lr_multiplier': consts.COSINE, 
    'shuffle': True,
    'device': 'cpu',
    'alpha_thresh': .2,  # minimum surviving fraction accepted by gen_data
})

# if ch.cuda.is_available(): 
#     args.__setattr__('device', 'cuda:1')
# else: 
#     args.__setattr__('device', 'cpu')
# # set default device to device
# ch.cuda.set_device(args.device)
# args

# Truncated Multinomial Logistic Regression Experiments

In [6]:
# Alternative oracles (kept for reference):
# phi = DNN_Lower(ch.full(ch.Size([args.K,]), -2, dtype=ch.float32))
# phi = DNN_Lower(Tensor([-2, -3, -2, -3, -4, -5, -6, -7, -6, -5]))
# phi = Identity()

# Active oracle: one (-2, 2) bound pair per class logit.
phi = DNN_Logit_Ball(
    ch.full((args.K,), -2, dtype=ch.float32),
    ch.full((args.K,), 2, dtype=ch.float32),
)
# expose the oracle to gen_data and the truncated loss through args
args.phi = phi

# Experiment

In [7]:
# Run `args.trials` independent experiments. Each trial draws a fresh dataset,
# trains one model with the truncated loss and one with the default loss on the
# same loaders, then evaluates both on the samples removed by truncation.
for _ in range(args.trials):
    # generate data for exp
    ground_truth, loaders, test_loader = gen_data()

    # --- truncated cross-entropy ---
    # new classifier model at the beginning of each trial
    trunc_multi_log_reg = nn.Linear(in_features=args.IN_FEATURES, out_features=args.K, bias=args.bias)
    out_store = Store(TRUNCATED_STORE_PATH)
    args.__setattr__('custom_criterion', trunc_ce)  # truncated ce loss
    train.train_model(args, trunc_multi_log_reg, loaders, store=out_store, device=args.device)

    # --- naive cross-entropy ---
    standard_multi_log_reg = nn.Linear(in_features=args.IN_FEATURES, out_features=args.K, bias=args.bias)
    out_store = Store(STANDARD_STORE_PATH)
    args.__setattr__('custom_criterion', None) # default ce loss
    train.train_model(args, standard_multi_log_reg, loaders, store=out_store, device=args.device)

    # --- evaluation ---
    # Both models are evaluated under the same condition: only when there is a
    # held-out set, i.e. when the oracle actually truncates. (Previously only the
    # standard model's evaluation was guarded.)
    if not isinstance(phi, Identity):
        # truncated multinomial logistic regression eval
        out_store = Store(TRUNCATED_EVAL_STORE_PATH)
        train.eval_model(args, trunc_multi_log_reg, test_loader, out_store)

        # standard multinomial logistic regression eval
        out_store = Store(STANDARD_EVAL_STORE_PATH)
        train.eval_model(args, standard_multi_log_reg, test_loader, out_store)

Logging in: /Users/patroklos/MultinomialLogisticRegressionTruncated/5d5a2571-ddee-42da-859e-2247a0cf1e97


  0%|          | 0/24 [00:00<?, ?it/s]

weight grad: tensor([[ 0.3475,  0.8378],
        [-0.3947, -1.6196]])
bias grad: tensor([0.0054, 0.3095])


  0%|          | 0/6 [00:00<?, ?it/s]

weight grad: tensor([[ 0.3475,  0.8378],
        [-0.3947, -1.6196]])
bias grad: tensor([0.0054, 0.3095])


  0%|          | 0/24 [00:00<?, ?it/s]

weight grad: tensor([[ 0.2052,  0.4978],
        [-0.1496, -0.4699]])
bias grad: tensor([0.1076, 0.0975])


  0%|          | 0/6 [00:00<?, ?it/s]

weight grad: tensor([[ 0.2052,  0.4978],
        [-0.1496, -0.4699]])
bias grad: tensor([0.1076, 0.0975])


  0%|          | 0/24 [00:00<?, ?it/s]

weight grad: tensor([[ 0.1427,  0.3148],
        [-0.2051, -0.4242]])
bias grad: tensor([0.0444, 0.0981])


  0%|          | 0/6 [00:00<?, ?it/s]

weight grad: tensor([[ 0.1427,  0.3148],
        [-0.2051, -0.4242]])
bias grad: tensor([0.0444, 0.0981])


  0%|          | 0/24 [00:00<?, ?it/s]

weight grad: tensor([[ 0.0873,  0.2062],
        [-0.2093, -0.3477]])
bias grad: tensor([0.0337, 0.0768])


  0%|          | 0/6 [00:00<?, ?it/s]

weight grad: tensor([[ 0.0873,  0.2062],
        [-0.2093, -0.3477]])
bias grad: tensor([0.0337, 0.0768])


  0%|          | 0/24 [00:00<?, ?it/s]

weight grad: tensor([[ 0.0409,  0.0845],
        [-0.1416, -0.1790]])
bias grad: tensor([0.0102, 0.0448])


  0%|          | 0/6 [00:00<?, ?it/s]

weight grad: tensor([[ 0.0409,  0.0845],
        [-0.1416, -0.1790]])
bias grad: tensor([0.0102, 0.0448])


  0%|          | 0/24 [00:00<?, ?it/s]

weight grad: tensor([[ 0.0359,  0.0655],
        [-0.1395, -0.1457]])
bias grad: tensor([0.0070, 0.0389])


  0%|          | 0/6 [00:00<?, ?it/s]

weight grad: tensor([[ 0.0359,  0.0655],
        [-0.1395, -0.1457]])
bias grad: tensor([0.0070, 0.0389])


  0%|          | 0/24 [00:00<?, ?it/s]

weight grad: tensor([[ 0.0225,  0.0612],
        [-0.0784, -0.0854]])
bias grad: tensor([0.0136, 0.0216])


  0%|          | 0/6 [00:00<?, ?it/s]

weight grad: tensor([[ 0.0225,  0.0612],
        [-0.0784, -0.0854]])
bias grad: tensor([0.0136, 0.0216])


  0%|          | 0/24 [00:00<?, ?it/s]

weight grad: tensor([[ 0.0299,  0.0546],
        [-0.0642, -0.0719]])
bias grad: tensor([0.0074, 0.0172])


  0%|          | 0/6 [00:00<?, ?it/s]

weight grad: tensor([[ 0.0299,  0.0546],
        [-0.0642, -0.0719]])
bias grad: tensor([0.0074, 0.0172])


  0%|          | 0/24 [00:00<?, ?it/s]

weight grad: tensor([[ 0.0139,  0.0393],
        [-0.0467, -0.0656]])
bias grad: tensor([0.0097, 0.0136])


  0%|          | 0/6 [00:00<?, ?it/s]

weight grad: tensor([[ 0.0139,  0.0393],
        [-0.0467, -0.0656]])
bias grad: tensor([0.0097, 0.0136])


  0%|          | 0/24 [00:00<?, ?it/s]

weight grad: tensor([[ 0.0120,  0.0367],
        [-0.0329, -0.0420]])
bias grad: tensor([0.0088, 0.0091])


  0%|          | 0/6 [00:00<?, ?it/s]

weight grad: tensor([[ 0.0120,  0.0367],
        [-0.0329, -0.0420]])
bias grad: tensor([0.0088, 0.0091])


  0%|          | 0/24 [00:00<?, ?it/s]

weight grad: tensor([[ 0.0191,  0.0378],
        [-0.0329, -0.0487]])
bias grad: tensor([0.0049, 0.0104])


  0%|          | 0/6 [00:00<?, ?it/s]

weight grad: tensor([[ 0.0191,  0.0378],
        [-0.0329, -0.0487]])
bias grad: tensor([0.0049, 0.0104])


  0%|          | 0/24 [00:00<?, ?it/s]

weight grad: tensor([[ 0.0117,  0.0257],
        [-0.0315, -0.0414]])
bias grad: tensor([0.0040, 0.0097])


  0%|          | 0/6 [00:00<?, ?it/s]

weight grad: tensor([[ 0.0117,  0.0257],
        [-0.0315, -0.0414]])
bias grad: tensor([0.0040, 0.0097])


  0%|          | 0/24 [00:00<?, ?it/s]

weight grad: tensor([[ 0.0144,  0.0310],
        [-0.0226, -0.0312]])
bias grad: tensor([0.0056, 0.0062])


  0%|          | 0/6 [00:00<?, ?it/s]

weight grad: tensor([[ 0.0144,  0.0310],
        [-0.0226, -0.0312]])
bias grad: tensor([0.0056, 0.0062])


  0%|          | 0/24 [00:00<?, ?it/s]

weight grad: tensor([[ 0.0118,  0.0264],
        [-0.0166, -0.0222]])
bias grad: tensor([0.0048, 0.0046])


  0%|          | 0/6 [00:00<?, ?it/s]

weight grad: tensor([[ 0.0118,  0.0264],
        [-0.0166, -0.0222]])
bias grad: tensor([0.0048, 0.0046])


  0%|          | 0/24 [00:00<?, ?it/s]

weight grad: tensor([[ 0.0187,  0.0324],
        [-0.0314, -0.0431]])
bias grad: tensor([0.0028, 0.0101])


  0%|          | 0/6 [00:00<?, ?it/s]

weight grad: tensor([[ 0.0187,  0.0324],
        [-0.0314, -0.0431]])
bias grad: tensor([0.0028, 0.0101])


  0%|          | 0/24 [00:00<?, ?it/s]

weight grad: tensor([[ 0.0064,  0.0228],
        [-0.0236, -0.0282]])
bias grad: tensor([0.0073, 0.0054])


  0%|          | 0/6 [00:00<?, ?it/s]

weight grad: tensor([[ 0.0064,  0.0228],
        [-0.0236, -0.0282]])
bias grad: tensor([0.0073, 0.0054])


  0%|          | 0/24 [00:00<?, ?it/s]

weight grad: tensor([[ 0.0119,  0.0246],
        [-0.0318, -0.0406]])
bias grad: tensor([0.0043, 0.0096])


  0%|          | 0/6 [00:00<?, ?it/s]

weight grad: tensor([[ 0.0119,  0.0246],
        [-0.0318, -0.0406]])
bias grad: tensor([0.0043, 0.0096])


  0%|          | 0/24 [00:00<?, ?it/s]

weight grad: tensor([[ 0.0064,  0.0201],
        [-0.0136, -0.0195]])
bias grad: tensor([0.0055, 0.0029])


  0%|          | 0/6 [00:00<?, ?it/s]

weight grad: tensor([[ 0.0064,  0.0201],
        [-0.0136, -0.0195]])
bias grad: tensor([0.0055, 0.0029])


  0%|          | 0/24 [00:00<?, ?it/s]

weight grad: tensor([[ 0.0060,  0.0236],
        [-0.0132, -0.0246]])
bias grad: tensor([0.0077, 0.0035])


  0%|          | 0/6 [00:00<?, ?it/s]

weight grad: tensor([[ 0.0060,  0.0236],
        [-0.0132, -0.0246]])
bias grad: tensor([0.0077, 0.0035])


  0%|          | 0/24 [00:00<?, ?it/s]

weight grad: tensor([[ 0.0200,  0.0352],
        [-0.0263, -0.0315]])
bias grad: tensor([0.0038, 0.0067])


  0%|          | 0/6 [00:00<?, ?it/s]

weight grad: tensor([[ 0.0200,  0.0352],
        [-0.0263, -0.0315]])
bias grad: tensor([0.0038, 0.0067])


  0%|          | 0/24 [00:00<?, ?it/s]

weight grad: tensor([[ 0.0083,  0.0178],
        [-0.0179, -0.0249]])
bias grad: tensor([0.0031, 0.0050])


  0%|          | 0/6 [00:00<?, ?it/s]

weight grad: tensor([[ 0.0083,  0.0178],
        [-0.0179, -0.0249]])
bias grad: tensor([0.0031, 0.0050])


  0%|          | 0/24 [00:00<?, ?it/s]

weight grad: tensor([[ 0.0118,  0.0228],
        [-0.0162, -0.0255]])
bias grad: tensor([0.0024, 0.0051])


  0%|          | 0/6 [00:00<?, ?it/s]

weight grad: tensor([[ 0.0118,  0.0228],
        [-0.0162, -0.0255]])
bias grad: tensor([0.0024, 0.0051])


  0%|          | 0/24 [00:00<?, ?it/s]

weight grad: tensor([[ 0.0095,  0.0206],
        [-0.0174, -0.0266]])
bias grad: tensor([0.0032, 0.0044])


  0%|          | 0/6 [00:00<?, ?it/s]

weight grad: tensor([[ 0.0095,  0.0206],
        [-0.0174, -0.0266]])
bias grad: tensor([0.0032, 0.0044])


  0%|          | 0/24 [00:00<?, ?it/s]

weight grad: tensor([[ 0.0094,  0.0228],
        [-0.0182, -0.0260]])
bias grad: tensor([0.0033, 0.0057])


  0%|          | 0/6 [00:00<?, ?it/s]

weight grad: tensor([[ 0.0094,  0.0228],
        [-0.0182, -0.0260]])
bias grad: tensor([0.0033, 0.0057])


  0%|          | 0/24 [00:00<?, ?it/s]

weight grad: tensor([[ 0.0174,  0.0349],
        [-0.0246, -0.0300]])
bias grad: tensor([0.0048, 0.0068])


  0%|          | 0/6 [00:00<?, ?it/s]

weight grad: tensor([[ 0.0174,  0.0349],
        [-0.0246, -0.0300]])
bias grad: tensor([0.0048, 0.0068])
avg loss: 0.014636502982402532
avg top 1: 99.78457641601562
Logging in: /Users/patroklos/MultinomialLogisticRegressionStandard/5062b81e-7253-4bd5-88ec-e1da3870fd3c


  0%|          | 0/24 [00:00<?, ?it/s]

weight grad: tensor([[ 0.2210,  0.4062],
        [-0.2210, -0.4062]])
bias grad: tensor([-0.0254,  0.0254])


  0%|          | 0/6 [00:00<?, ?it/s]

weight grad: tensor([[ 0.2210,  0.4062],
        [-0.2210, -0.4062]])
bias grad: tensor([-0.0254,  0.0254])


  0%|          | 0/24 [00:00<?, ?it/s]

weight grad: tensor([[ 0.1212,  0.2302],
        [-0.1212, -0.2302]])
bias grad: tensor([-0.0128,  0.0128])


  0%|          | 0/6 [00:00<?, ?it/s]

weight grad: tensor([[ 0.1212,  0.2302],
        [-0.1212, -0.2302]])
bias grad: tensor([-0.0128,  0.0128])


  0%|          | 0/24 [00:00<?, ?it/s]

weight grad: tensor([[ 0.0835,  0.1606],
        [-0.0835, -0.1606]])
bias grad: tensor([-0.0077,  0.0077])


  0%|          | 0/6 [00:00<?, ?it/s]

weight grad: tensor([[ 0.0835,  0.1606],
        [-0.0835, -0.1606]])
bias grad: tensor([-0.0077,  0.0077])


  0%|          | 0/24 [00:00<?, ?it/s]

weight grad: tensor([[ 0.0640,  0.1239],
        [-0.0640, -0.1239]])
bias grad: tensor([-0.0052,  0.0052])


  0%|          | 0/6 [00:00<?, ?it/s]

weight grad: tensor([[ 0.0640,  0.1239],
        [-0.0640, -0.1239]])
bias grad: tensor([-0.0052,  0.0052])


  0%|          | 0/24 [00:00<?, ?it/s]

weight grad: tensor([[ 0.0523,  0.1015],
        [-0.0523, -0.1015]])
bias grad: tensor([-0.0037,  0.0037])


  0%|          | 0/6 [00:00<?, ?it/s]

weight grad: tensor([[ 0.0523,  0.1015],
        [-0.0523, -0.1015]])
bias grad: tensor([-0.0037,  0.0037])


  0%|          | 0/24 [00:00<?, ?it/s]

weight grad: tensor([[ 0.0445,  0.0865],
        [-0.0445, -0.0865]])
bias grad: tensor([-0.0027,  0.0027])


  0%|          | 0/6 [00:00<?, ?it/s]

weight grad: tensor([[ 0.0445,  0.0865],
        [-0.0445, -0.0865]])
bias grad: tensor([-0.0027,  0.0027])


  0%|          | 0/24 [00:00<?, ?it/s]

weight grad: tensor([[ 0.0390,  0.0759],
        [-0.0390, -0.0759]])
bias grad: tensor([-0.0021,  0.0021])


  0%|          | 0/6 [00:00<?, ?it/s]

weight grad: tensor([[ 0.0390,  0.0759],
        [-0.0390, -0.0759]])
bias grad: tensor([-0.0021,  0.0021])


  0%|          | 0/24 [00:00<?, ?it/s]

weight grad: tensor([[ 0.0350,  0.0681],
        [-0.0350, -0.0681]])
bias grad: tensor([-0.0016,  0.0016])


  0%|          | 0/6 [00:00<?, ?it/s]

weight grad: tensor([[ 0.0350,  0.0681],
        [-0.0350, -0.0681]])
bias grad: tensor([-0.0016,  0.0016])


  0%|          | 0/24 [00:00<?, ?it/s]

weight grad: tensor([[ 0.0319,  0.0621],
        [-0.0319, -0.0621]])
bias grad: tensor([-0.0013,  0.0013])


  0%|          | 0/6 [00:00<?, ?it/s]

weight grad: tensor([[ 0.0319,  0.0621],
        [-0.0319, -0.0621]])
bias grad: tensor([-0.0013,  0.0013])


  0%|          | 0/24 [00:00<?, ?it/s]

weight grad: tensor([[ 0.0295,  0.0575],
        [-0.0295, -0.0575]])
bias grad: tensor([-0.0010,  0.0010])


  0%|          | 0/6 [00:00<?, ?it/s]

weight grad: tensor([[ 0.0295,  0.0575],
        [-0.0295, -0.0575]])
bias grad: tensor([-0.0010,  0.0010])


  0%|          | 0/24 [00:00<?, ?it/s]

weight grad: tensor([[ 0.0277,  0.0538],
        [-0.0277, -0.0538]])
bias grad: tensor([-0.0008,  0.0008])


  0%|          | 0/6 [00:00<?, ?it/s]

weight grad: tensor([[ 0.0277,  0.0538],
        [-0.0277, -0.0538]])
bias grad: tensor([-0.0008,  0.0008])


  0%|          | 0/24 [00:00<?, ?it/s]

weight grad: tensor([[ 0.0262,  0.0509],
        [-0.0262, -0.0509]])
bias grad: tensor([-0.0007,  0.0007])


  0%|          | 0/6 [00:00<?, ?it/s]

weight grad: tensor([[ 0.0262,  0.0509],
        [-0.0262, -0.0509]])
bias grad: tensor([-0.0007,  0.0007])


  0%|          | 0/24 [00:00<?, ?it/s]

weight grad: tensor([[ 0.0250,  0.0486],
        [-0.0250, -0.0486]])
bias grad: tensor([-0.0006,  0.0006])


  0%|          | 0/6 [00:00<?, ?it/s]

weight grad: tensor([[ 0.0250,  0.0486],
        [-0.0250, -0.0486]])
bias grad: tensor([-0.0006,  0.0006])


  0%|          | 0/24 [00:00<?, ?it/s]

weight grad: tensor([[ 0.0240,  0.0467],
        [-0.0240, -0.0467]])
bias grad: tensor([-0.0005,  0.0005])


  0%|          | 0/6 [00:00<?, ?it/s]

weight grad: tensor([[ 0.0240,  0.0467],
        [-0.0240, -0.0467]])
bias grad: tensor([-0.0005,  0.0005])


  0%|          | 0/24 [00:00<?, ?it/s]

weight grad: tensor([[ 0.0232,  0.0451],
        [-0.0232, -0.0451]])
bias grad: tensor([-0.0004,  0.0004])


  0%|          | 0/6 [00:00<?, ?it/s]

weight grad: tensor([[ 0.0232,  0.0451],
        [-0.0232, -0.0451]])
bias grad: tensor([-0.0004,  0.0004])


  0%|          | 0/24 [00:00<?, ?it/s]

weight grad: tensor([[ 0.0226,  0.0439],
        [-0.0226, -0.0439]])
bias grad: tensor([-0.0004,  0.0004])


  0%|          | 0/6 [00:00<?, ?it/s]

weight grad: tensor([[ 0.0226,  0.0439],
        [-0.0226, -0.0439]])
bias grad: tensor([-0.0004,  0.0004])


  0%|          | 0/24 [00:00<?, ?it/s]

weight grad: tensor([[ 0.0221,  0.0430],
        [-0.0221, -0.0430]])
bias grad: tensor([-0.0003,  0.0003])


  0%|          | 0/6 [00:00<?, ?it/s]

weight grad: tensor([[ 0.0221,  0.0430],
        [-0.0221, -0.0430]])
bias grad: tensor([-0.0003,  0.0003])


  0%|          | 0/24 [00:00<?, ?it/s]

weight grad: tensor([[ 0.0217,  0.0422],
        [-0.0217, -0.0422]])
bias grad: tensor([-0.0003,  0.0003])


  0%|          | 0/6 [00:00<?, ?it/s]

weight grad: tensor([[ 0.0217,  0.0422],
        [-0.0217, -0.0422]])
bias grad: tensor([-0.0003,  0.0003])


  0%|          | 0/24 [00:00<?, ?it/s]

weight grad: tensor([[ 0.0214,  0.0417],
        [-0.0214, -0.0417]])
bias grad: tensor([-0.0003,  0.0003])


  0%|          | 0/6 [00:00<?, ?it/s]

weight grad: tensor([[ 0.0214,  0.0417],
        [-0.0214, -0.0417]])
bias grad: tensor([-0.0003,  0.0003])


  0%|          | 0/24 [00:00<?, ?it/s]

weight grad: tensor([[ 0.0212,  0.0413],
        [-0.0212, -0.0413]])
bias grad: tensor([-0.0002,  0.0002])


  0%|          | 0/6 [00:00<?, ?it/s]

weight grad: tensor([[ 0.0212,  0.0413],
        [-0.0212, -0.0413]])
bias grad: tensor([-0.0002,  0.0002])


  0%|          | 0/24 [00:00<?, ?it/s]

weight grad: tensor([[ 0.0211,  0.0410],
        [-0.0211, -0.0410]])
bias grad: tensor([-0.0002,  0.0002])


  0%|          | 0/6 [00:00<?, ?it/s]

weight grad: tensor([[ 0.0211,  0.0410],
        [-0.0211, -0.0410]])
bias grad: tensor([-0.0002,  0.0002])


  0%|          | 0/24 [00:00<?, ?it/s]

weight grad: tensor([[ 0.0210,  0.0408],
        [-0.0210, -0.0408]])
bias grad: tensor([-0.0002,  0.0002])


  0%|          | 0/6 [00:00<?, ?it/s]

weight grad: tensor([[ 0.0210,  0.0408],
        [-0.0210, -0.0408]])
bias grad: tensor([-0.0002,  0.0002])


  0%|          | 0/24 [00:00<?, ?it/s]

weight grad: tensor([[ 0.0209,  0.0407],
        [-0.0209, -0.0407]])
bias grad: tensor([-0.0002,  0.0002])


  0%|          | 0/6 [00:00<?, ?it/s]

weight grad: tensor([[ 0.0209,  0.0407],
        [-0.0209, -0.0407]])
bias grad: tensor([-0.0002,  0.0002])


  0%|          | 0/24 [00:00<?, ?it/s]

weight grad: tensor([[ 0.0209,  0.0407],
        [-0.0209, -0.0407]])
bias grad: tensor([-0.0002,  0.0002])


  0%|          | 0/6 [00:00<?, ?it/s]

weight grad: tensor([[ 0.0209,  0.0407],
        [-0.0209, -0.0407]])
bias grad: tensor([-0.0002,  0.0002])


  0%|          | 0/24 [00:00<?, ?it/s]

weight grad: tensor([[ 0.0209,  0.0406],
        [-0.0209, -0.0406]])
bias grad: tensor([-0.0002,  0.0002])


  0%|          | 0/6 [00:00<?, ?it/s]

weight grad: tensor([[ 0.0209,  0.0406],
        [-0.0209, -0.0406]])
bias grad: tensor([-0.0002,  0.0002])
avg loss: 0.019688974247377835
avg top 1: 99.78457641601562
Logging in: /Users/patroklos/MultinomialLogisticRegressionTruncatedTest/263853a3-9ef9-4d55-b970-63bfe3154363


  0%|          | 0/30 [00:00<?, ?it/s]

weight grad: tensor([[ 0.0174,  0.0349],
        [-0.0246, -0.0300]])
bias grad: tensor([0.0048, 0.0068])
Logging in: /Users/patroklos/MultinomialLogisticRegressionStandardTest/8f8a7855-a058-453c-9e76-24b0b35421f9


  0%|          | 0/30 [00:00<?, ?it/s]

weight grad: tensor([[ 0.0209,  0.0406],
        [-0.0209, -0.0406]])
bias grad: tensor([-0.0002,  0.0002])


In [None]:
# plot results: loss and accuracy curves from the logged stores, then print both test accuracies
plot()

# Cosine Similarity and L2 Distance

In [11]:
# row-wise cosine similarity between the truncated model's weights and the ground truth
F.cosine_similarity(trunc_multi_log_reg.weight, ground_truth.weight, dim=1)

tensor([0.7753, 0.5477], grad_fn=<DivBackward0>)

In [13]:
# row-wise cosine similarity between the standard model's weights and the ground truth
F.cosine_similarity(standard_multi_log_reg.weight, ground_truth.weight, dim=1)

tensor([0.9117, 0.9972], grad_fn=<DivBackward0>)

In [19]:
# cosine similarity between the truncated model's bias vector and the ground-truth bias
# (each bias is lifted to a single row so the similarity runs over its entries)
F.cosine_similarity(trunc_multi_log_reg.bias[None, :], ground_truth.bias[None, :], dim=1)

tensor([0.9778], grad_fn=<DivBackward0>)

In [20]:
# cosine similarity between the standard model's bias vector and the ground-truth bias
# (each bias is lifted to a single row so the similarity runs over its entries)
F.cosine_similarity(standard_multi_log_reg.bias[None, :], ground_truth.bias[None, :], dim=1)

tensor([0.9950], grad_fn=<DivBackward0>)