# Model Output

In [None]:
import torch
import numpy as np
import matplotlib.pyplot as plt
from torchvision.models import resnet18

In [None]:
model = resnet18().eval()

In [None]:
list(model.modules())[-1]

In [None]:
out = model(torch.randn(1, 3, 32, 32))

In [None]:
out.shape

In [None]:
np.max(out.detach().numpy()), np.min(out.detach().numpy())

In [None]:
# Bar chart of the raw model outputs, one bar per output unit.
logits = out.detach().numpy().reshape(-1)
plt.figure(figsize=(30, 10))
# Size the x-axis from the data instead of assuming exactly 1000 outputs.
plt.bar(range(len(logits)), logits)
plt.show()

# Configuration

In [None]:
import json
from KD_Lib.models.resnet import ResNet18, ResNet50, ResNet152

In [None]:
class Cfg:
    """Flat experiment configuration exposed as instance attributes.

    Called without an argument, the instance is populated with the built-in
    defaults listed below. Called with a mapping, the mapping's entries
    become the instance's ONLY attributes: no defaults are filled in for
    keys that are absent.
    """

    def __init__(self, dict=None):
        # NOTE: the parameter shadows the builtin ``dict``; the name is kept
        # because callers may pass it by keyword.
        if dict is None:
            mode = 'shake'  # 'kd' or 'dml' or 'shake'
            dataset = 'cifar100'  # 'cifar10' or 'cifar100'
            # Insertion order defines the attribute order seen in __dict__.
            values = {
                'MODE': mode,
                'DATASET': dataset,
                'CLASSES': 100,
                'DATA_PATH': '../Knowledge-Distillation-Zoo/datasets/',
                'BATCH_SIZE': 128,
                'TEACHER': 'resnet152',
                'STUDENT': 'resnet18',
                'LR': 0.1,
                'LR_MIN': 1e-6,  # 1e-5
                'T': 1.0,
                'W': 0.5,
                'EPOCHS': 200,
                'SCHEDULER': 'cos',  # 'cos' or 'step'
                # Derived from dataset and mode.
                'TEACHER_WEIGHTS': f'./models/teacher_{dataset}_{mode}.pt',
                'PARALLEL': False,
                'EXP': f"{mode}_{dataset}",
            }
        else:
            values = dict

        for name, value in values.items():
            setattr(self, name, value)

In [None]:
cfg = Cfg()
cfg.__dict__

In [None]:
# Round-trip the configuration through cfg.json to check it serialises cleanly.
with open("cfg.json", "w") as file:
    file.write(json.dumps(vars(cfg)))

with open("cfg.json", "r") as file:
    loaded_cfg = json.loads(file.read())

print(loaded_cfg)

In [None]:
cfg = Cfg(loaded_cfg)

In [None]:
cfg.__dict__

# Visualize Scheduler

In [None]:
import torch
from torch.optim.lr_scheduler import CosineAnnealingLR, LinearLR, MultiStepLR
import matplotlib.pyplot as plt

In [None]:
STEPS = 200
LR = 0.1
ETA = 1e-5


def _trace_lr(build_scheduler, steps=STEPS, lr=LR):
    """Simulate a training run and record the learning rate at every step.

    Args:
        build_scheduler: callable that receives the optimizer and returns the
            scheduler to trace.
        steps: number of optimizer/scheduler steps to simulate.
        lr: initial learning rate given to the optimizer.

    Returns:
        ``(scheduler, lrs)`` where ``lrs[i]`` is the learning rate reported
        by the scheduler before its ``i``-th ``step()`` call.
    """
    # One dummy tensor is enough: the optimizer only exists to drive the scheduler.
    optimizer = torch.optim.SGD([torch.tensor(1)], lr=lr)
    scheduler = build_scheduler(optimizer)
    lrs = []
    for _ in range(steps):
        optimizer.step()
        # get_last_lr() returns one value per param group; there is exactly one here.
        lrs.append(scheduler.get_last_lr()[0])
        scheduler.step()
    return scheduler, lrs


# One factory per schedule to compare; each is traced on a fresh optimizer.
scheduler_factories = [
    lambda opt: CosineAnnealingLR(opt, STEPS, eta_min=ETA, last_epoch=-1),
    lambda opt: LinearLR(opt, total_iters=STEPS, start_factor=1, end_factor=ETA/LR),
    lambda opt: MultiStepLR(opt, [60, 120, 180], gamma=0.1),
]
for build_scheduler in scheduler_factories:
    scheduler, lrs = _trace_lr(build_scheduler)
    plt.plot(lrs, label=scheduler.__class__.__name__)

# Optional: uncomment for a logarithmic y-axis.
#plt.semilogy()
plt.legend()
plt.show()

## CUB200

In [None]:
import torch
import numpy as np
from torchvision import datasets, transforms
from KD_Lib.datasets import Cub200

DATASET = 'cub200'
DATA_PATH = '../Knowledge-Distillation-Zoo/datasets/'
BATCH_SIZE = 128

In [None]:
# Per-dataset settings: dataset class, normalisation statistics and crop size.
if DATASET == 'cifar100':
    dataset = datasets.CIFAR100
    mean = (0.5071, 0.4865, 0.4409)
    std  = (0.2673, 0.2564, 0.2762)
    imsize = 32
elif DATASET == 'cub200':
    dataset = Cub200
    # Channel means given on the 0-255 scale, rescaled to the [0, 1] range of ToTensor.
    mean = (104/255.0, 117/255.0, 128/255.0)
    std = (1/255.0, 1/255.0, 1/255.0)
    imsize = 227
else:
    # Fail here with a clear message rather than with a NameError further down.
    raise ValueError(f"Unsupported DATASET {DATASET!r}; expected 'cifar100' or 'cub200'")

# Training pipeline: padded random crop and horizontal flip, then tensor conversion and normalisation.
train_transform = transforms.Compose([
    transforms.RandomCrop(imsize, padding=4),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize(mean, std)
    ])
# Evaluation pipeline: no augmentation, only tensor conversion and normalisation.
test_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean, std)
    ])

# download=False: the data is expected to be present under DATA_PATH already.
trainset = dataset(root=DATA_PATH, train=True, download=False, transform=train_transform)
train_loader = torch.utils.data.DataLoader(trainset, batch_size=BATCH_SIZE, shuffle=True, num_workers=4, pin_memory=True)

In [None]:
trainset[0][0].shape

In [None]:
trainset[0][0].shape

In [None]:
mean

## ResNet

In [None]:
from torchvision.models import resnet18 as ResNet18, resnet50 as ResNet50, resnet152 as ResNet152

def get_n_params(model):
    """Return the total number of scalar parameters in ``model``.

    Args:
        model: any object exposing ``parameters()`` that yields tensors
            (e.g. a ``torch.nn.Module``).

    Returns:
        int: sum of ``numel()`` over all parameters; no filtering on
        ``requires_grad`` is applied.
    """
    return sum(p.numel() for p in model.parameters())

In [None]:
model = ResNet18(weights=None, num_classes=100)

In [None]:
get_n_params(model)

In [None]:
from KD_Lib.models.resnet import ResNet18, ResNet50, ResNet152

In [None]:
model_new = ResNet18(num_classes=100)

In [None]:
len(list(model_new.parameters()))

In [None]:
get_n_params(model_new)

In [None]:
model

In [None]:
model_new

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import torch
from KD_Lib.models.resnet_torch import get_ResNet

In [None]:
model = get_ResNet('resnet18', 10).eval()

In [None]:
test = torch.randn(1, 3, 32, 32)

In [None]:
model.forward(test, norm_feats=False)

In [None]:
# numpy has no `hist` function (np.hist raises AttributeError); draw the histogram with matplotlib.
# NOTE(review): assumes forward(..., norm_feats=True) returns a single tensor — confirm.
plt.hist(model.forward(test, norm_feats=True).detach().numpy().reshape(-1))
plt.show()

In [None]:
out, feats, weight, bias = model.forward(torch.randn(2, 3, 32, 32), return_feats=True)

# Track Experiments

In [None]:
import os
import json
import pandas as pd

In [None]:
DIR = './exp/'

# Collect every experiment record stored under DIR (searched recursively).
# NOTE(review): every file found is parsed as JSON — assumes DIR holds only JSON files.
experiments = []
for dirname, _, filenames in os.walk(DIR):
    for filename in filenames:
        # The context manager closes each handle; a bare open() here leaked one per file.
        with open(os.path.join(dirname, filename), 'r') as fh:
            experiments.append(json.load(fh))

df = pd.DataFrame(experiments)
# Reference time: TIME of the first 'kd_cifar100_new' record.
T = df[df['EXP'] == 'kd_cifar100_new'].iloc[0]['TIME']


In [None]:
T

In [None]:
# CRD
T_KD = 109.26
T_CRD = 156.58
T_CRD / T_KD

In [None]:
# Flatten the per-run VACC record into one column per metric.
for metric in ('T_LAST', 'T_BEST', 'S_LAST', 'S_BEST'):
    df[metric] = df['VACC'].map(lambda vacc, metric=metric: vacc[metric])
# Express each run time relative to the reference time T.
df['TIME'] = df['TIME'].map(lambda seconds: seconds / T)
# Summary table, best student accuracy first.
summary_columns = ['EXP', 'T', 'W', 'FEAT_NORM', 'T_LAST', 'T_BEST', 'S_LAST', 'S_BEST', 'TIME']
df[summary_columns].sort_values(by='S_BEST', ascending=False)

# Model Parameters

In [1]:
import torch
from KD_Lib.models import model_dict
from KD_Lib.models.resnet_torch import monkey_patch
from KD_Lib.models.resnet import BasicBlock

In [2]:
# Teacher: the 'resnet110' entry of model_dict, built for 100 classes.
teacher = model_dict['resnet110'](num_classes=100)
# Keep the patched model bound to `teacher`. Assigning it to `student` (which is
# rebuilt below) would lose the patch if monkey_patch does not modify in place.
teacher = monkey_patch(teacher, custom=True)
pytorch_total_params = sum(p.numel() for p in teacher.parameters())
print(f"Teacher: {pytorch_total_params:.1e} params")
# map_location='cpu' lets a checkpoint saved on GPU load on a CPU-only machine;
# load_state_dict copies the values into the model's own parameters.
teacher.load_state_dict(torch.load('./models/resnet110_cifar100.pt', map_location='cpu'))

# Student: the 'resnet20' entry of model_dict, patched the same way; no weights are loaded here.
student = model_dict['resnet20'](num_classes=100)
student = monkey_patch(student, custom=True)
pytorch_total_params = sum(p.numel() for p in student.parameters())
print(f"Student: {pytorch_total_params:.1e} params")

Teacher: 1.7e+06 params
Student: 2.8e+05 params


In [3]:
len(teacher.layer1), len(student.layer1) # 18/3 per layer -> copy 1/6 of the blocks
len(teacher.layer2), len(student.layer2) # 18/3 per layer -> copy 1/6 of the blocks
len(teacher.layer3), len(student.layer3) # 18/3 per layer -> copy 1/6 of the blocks

(18, 3)

In [4]:
blocks = [(0,0),(1,9),(2,17)]

In [7]:
# Initialise the student from the teacher, walking the direct children of both models in lockstep.
for lt, ls in zip(teacher.children(), student.children()):
    # print(type(lt), type(ls))
    if isinstance(lt, torch.nn.Sequential):
        # Sequential child: copy only the blocks listed in `blocks`, whose entries
        # are unpacked as (student index, teacher index).
        for s, t in blocks:
            # print(type(lt[t]), type(ls[s]))
            ls[s].load_state_dict(lt[t].state_dict())
    else:
        # Any other child: copy the teacher's state dict wholesale.
        # NOTE(review): assumes these children have identical shapes in both models — confirm.
        ls.load_state_dict(lt.state_dict())

In [8]:
torch.save(student.state_dict(), './models/resnet20_cifar100.pt')

In [None]:
from KD_Lib.models.resnet_test import ResNet50
from KD_Lib.models.shake import ShakeHead

In [None]:
model = ResNet50(num_classes=1000)

In [None]:
pytorch_total_params = sum(p.numel() for p in model.parameters())
print(f"Parameters: {pytorch_total_params}")

In [None]:
data = torch.randn(2, 3, 224, 224)
feat_t, out_t = model(data, is_feat=True)
shake = ShakeHead(feat_t)

In [None]:
pytorch_total_params = sum(p.numel() for p in shake.parameters())
print(f"Parameters: {pytorch_total_params}")

# Distillation Loss

In [None]:
import torch

In [None]:
# Toy 10-class example: one score vector peaked at the true label, one peaked at a random class.
label = torch.randint(0, 10, (1,))
pred_t, pred_f = torch.zeros(10), torch.zeros(10)
# pred_t: 0.8 on the labelled class, then 0.02 added to every class.
pred_t[label] = 0.8
pred_t += 0.2 / 10
# pred_f: same construction, but the 0.8 goes to an independently drawn class
# (which may happen to equal `label`).
pred_f[torch.randint(0, 10, (1,))] = 0.8
pred_f += 0.2 / 10
# Both vectors divided by 4 and passed through softmax.
pred_t_s = torch.softmax(pred_t/4, dim=-1)
pred_f_s = torch.softmax(pred_f/4, dim=-1)

In [None]:
pred_t, pred_f, pred_t_s, pred_f_s

In [None]:
torch.nn.functional.cross_entropy(pred_t.unsqueeze(0), label), torch.nn.functional.cross_entropy(pred_f.unsqueeze(0), label)

In [None]:
torch.nn.functional.cross_entropy(pred_t_s.unsqueeze(0), label), torch.nn.functional.cross_entropy(pred_f_s.unsqueeze(0), label)

# HP Search

In [None]:
import os, joblib
import optuna
import torch

In [None]:
file = None
file = './hp_search/hp_search_240126111134.pkl' # old
#file = '../ray_results/hp_search_240127091935/searcher-state-2024-01-27_09-19-39.pkl' # new

In [None]:
# SECURITY: joblib.load unpickles its input — only open result files you trust.
if file is not None:
    # Load once; the file holds either a container with the study under
    # '_ot_study' or the pickled study object itself.
    loaded = joblib.load(file)
    try:
        study = loaded['_ot_study']
    except (TypeError, KeyError):
        # Not subscriptable, or no such key: treat the object as the study.
        study = loaded
    print(f"Best trial until now ({len(study.trials)} trials):")
    print(" Value: ", study.best_trial.value)
    print(" Params: ")
    for key, value in study.best_trial.params.items():
        print(f"   {key}: {value}")
else:
    # No single file selected: merge the trials of every .pkl study in ./hp_search/.
    path = './hp_search/'
    studies = [joblib.load(os.path.join(path, f)) for f in os.listdir(path)
            if os.path.isfile(os.path.join(path, f))
            and f.endswith('.pkl')]

    study = optuna.create_study(direction='maximize')
    for s in studies:
        study.add_trials(s.get_trials())

In [None]:
df = study.trials_dataframe()[['number', 'datetime_start', 'value', 'params_Lc', 'params_Ld', 'params_Le', 'params_Lf']]
df.sort_values(by='value', ascending=False)

In [None]:
fig = optuna.visualization.plot_parallel_coordinate(study)
fig.show()

In [None]:
fig = optuna.visualization.plot_param_importances(study)
fig.show()

In [None]:
fig = optuna.visualization.plot_intermediate_values(study)
fig.show()

# Jocor Loss

In [None]:
import numpy as np
import matplotlib.pyplot as plt

In [None]:
EPOCHS = 240
GRADUAL = 150
FORGET_RATE = 0.1

# Per-epoch forget rate: linear ramp from FORGET_RATE down to 0 over the first
# GRADUAL epochs, then 0 for the remaining epochs.
forget_scheduler = np.concatenate((
    np.linspace(FORGET_RATE, 0, GRADUAL),
    np.zeros(EPOCHS - GRADUAL),
))

In [None]:
plt.plot(forget_scheduler)
plt.show()

In [None]:
kld = torch.nn.KLDivLoss(reduction='batchmean')
kld_test = torch.nn.KLDivLoss(reduction='none')
mse = torch.nn.MSELoss(reduction='mean')
mse_test = torch.nn.MSELoss(reduction='none')

In [None]:
t = torch.tensor([[0.1, 0.2, 0.7], [0.3, 0.2, 0.5]])
s = torch.tensor([[0.3, 0.2, 0.5], [0.1, 0.2, 0.7]])

In [None]:
loss = kld(s.log(), t)
loss_test = kld_test(s.log(), t).sum(dim=1).mean()

In [None]:
loss, loss_test

In [None]:
loss = mse(s, t)
loss_test = mse_test(s, t).mean(dim=1).mean()

In [None]:
loss, loss_test

In [None]:
import loralib

In [None]:
# Parameter count of a loralib linear layer with 64 inputs and 100 outputs.
# NOTE(review): the third positional argument (16) is presumably the LoRA rank — confirm against loralib.
model = loralib.Linear(64, 100, 16)
pytorch_total_params = sum(p.numel() for p in model.parameters())
print(f"Parameters: {pytorch_total_params}")

# Same-sized plain torch linear layer for comparison.
model = torch.nn.Linear(64, 100)
pytorch_total_params = sum(p.numel() for p in model.parameters())
print(f"Parameters: {pytorch_total_params}")

In [2]:
import numpy as np

In [6]:
kd = np.array([[71.99, 72.00], [71.36, 71.72], [71.32, 71.50]])
shake = np.array([[71.15, 71.60], [71.62, 71.93], [71.18, 71.44], [71.73, 71.89]])
smooth = np.array([[71.48, 71.81], [71.90, 71.91], [71.23, 71.43], [71.19, 71.34]])

In [8]:
print(kd.mean(axis=0), shake.mean(axis=0), smooth.mean(axis=0))
print(kd.std(axis=0), shake.std(axis=0), smooth.std(axis=0))

[71.55666667 71.74      ] [71.42  71.715] [71.45   71.6225]
[0.30684777 0.20461346] [0.25816661 0.20353132] [0.28257742 0.24221633]
