In [1]:
%load_ext autoreload
%autoreload 2

import copy
import math
import random
import time
from collections import OrderedDict, defaultdict
from typing import Union, List
from utils import *

import numpy as np
import torch
import torch.nn as nn
from torch.optim import *
from torch.utils.data import DataLoader
from torchprofile import profile_macs
from tqdm.auto import tqdm

assert torch.cuda.is_available(), \
"CUDA support is not available."

import pickle

import LiveTune as lt
import timm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Set device
device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu")

In [3]:
# Set dataset
dataloader = get_dataloader("imagenet", 256, model_name="not mixer")

In [4]:
base_mixer = timm.create_model("mixer_b16_224.goog_in21k_ft_in1k", pretrained=True).to(device)

In [6]:
evaluate_model(base_mixer, dataloader=dataloader, device=device)



model has test accuracy=76.47%
model has top5 accuracy=0.92%
model has size=228.43 MiB
model has macs=12.61 Gmacs
average inference time is 0.0042 seconds
model has 59.88 M parameters


In [7]:
collapsible_mixer = get_collapsible_model(base_mixer, fraction=.1, device=device)
collapsible_mixer.load_state_dict(torch.load("./models_archive/mixer/base/new_finetuned_5epoch_frac1_lc2.pth", map_location=device))
collapse_model(collapsible_mixer, fraction=.1, device=device)

Collapsing layer blocks.11.mlp_channels
Collapsing layer blocks.11.mlp_tokens
Collapsing layer blocks.11.mlp_channels
Collapsing layer blocks.11.mlp_tokens


In [8]:
evaluate_model(collapsible_mixer, dataloader=dataloader, device=device)



model has test accuracy=76.11%
model has top5 accuracy=0.92%
model has size=212.24 MiB
model has macs=11.71 Gmacs
average inference time is 0.0040 seconds
model has 55.64 M parameters


In [5]:
collapsible_mixer.load_state_dict(torch.load("./models_archive/mixer/base/finetuned_46epoch_frac02_lc.pth", map_location=device))

<All keys matched successfully>

In [13]:
evaluate(collapsible_mixer, dataloader=dataloader['val'], device=device)

                                                       

74.20399475097656

In [6]:
get_model_collapsible_slopes(collapsible_mixer)

blocks.11.mlp_channels 0.9999966025352478
blocks.11.mlp_tokens 0.999999463558197
blocks.10.mlp_channels 0.9998729228973389
blocks.10.mlp_tokens 0.9999915361404419


In [7]:
collapse_model(collapsible_mixer, fraction=.2, device=device, threshold=0.01)

Collapsing layer blocks.11.mlp_channels
Collapsing layer blocks.11.mlp_tokens
Collapsing layer blocks.10.mlp_channels
Collapsing layer blocks.10.mlp_tokens


In [8]:
evaluate(collapsible_mixer, dataloader=dataloader['val'], device=device)

eval:   0%|          | 0/196 [00:00<?, ?it/s]                                                       

74.04800415039062

In [9]:
get_num_parameters(base_mixer)

59880472

In [10]:
get_num_parameters(collapsible_mixer)

51391800

In [10]:
evaluate_model(collapsible_mixer, dataloader=dataloader, device=device)



model has test accuracy=74.05%
model has top5 accuracy=0.91%
model has size=196.04 MiB
model has macs=10.81 Gmacs
average inference time is 0.0057 seconds
model has 51.39 M parameters


In [11]:
evaluate_model(base_mixer, dataloader=dataloader, device=device)

                                                      

KeyboardInterrupt: 

# Large Depricated!

In [4]:
base_mixer = timm.create_model("mixer_l16_224.goog_in21k_ft_in1k", pretrained=True).to(device)

In [5]:
evaluate(base_mixer, dataloader=dataloader['val'], device=device)

eval:   0%|          | 0/196 [00:00<?, ?it/s]                                                       

68.33999633789062

# Getting the loaded data from mixer_async_runner.py
 Located at models_archive

In [9]:
collapsible_mixer_l16_224 = torch.load("./models_archive/mixer/large/collapsible_mixer_l16_224_5epoch_collapse0.01_frac0.5_lr0.0005.pth", map_location=device)

In [11]:
collapsible_mixer_l16_224.to(device)

MlpMixer(
  (stem): PatchEmbed(
    (proj): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
    (norm): Identity()
  )
  (blocks): Sequential(
    (0): MixerBlock(
      (norm1): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
      (mlp_tokens): CollapsibleMlp(
        (fc1): Linear(in_features=196, out_features=384, bias=True)
        (act): PReLU(num_parameters=1)
        (drop1): Dropout(p=0.0, inplace=False)
        (norm): Identity()
        (fc2): Linear(in_features=384, out_features=196, bias=True)
        (drop2): Dropout(p=0.0, inplace=False)
      )
      (drop_path): Identity()
      (norm2): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
      (mlp_channels): CollapsibleMlp(
        (fc1): Linear(in_features=768, out_features=3072, bias=True)
        (act): PReLU(num_parameters=1)
        (drop1): Dropout(p=0.0, inplace=False)
        (norm): Identity()
        (fc2): Linear(in_features=3072, out_features=768, bias=True)
        (drop2): Dropout(p=0.0, i

In [13]:
base_mixer

MlpMixer(
  (stem): PatchEmbed(
    (proj): Conv2d(3, 1024, kernel_size=(16, 16), stride=(16, 16))
    (norm): Identity()
  )
  (blocks): Sequential(
    (0): MixerBlock(
      (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
      (mlp_tokens): Mlp(
        (fc1): Linear(in_features=196, out_features=512, bias=True)
        (act): GELU(approximate='none')
        (drop1): Dropout(p=0.0, inplace=False)
        (norm): Identity()
        (fc2): Linear(in_features=512, out_features=196, bias=True)
        (drop2): Dropout(p=0.0, inplace=False)
      )
      (drop_path): Identity()
      (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
      (mlp_channels): Mlp(
        (fc1): Linear(in_features=1024, out_features=4096, bias=True)
        (act): GELU(approximate='none')
        (drop1): Dropout(p=0.0, inplace=False)
        (norm): Identity()
        (fc2): Linear(in_features=4096, out_features=1024, bias=True)
        (drop2): Dropout(p=0.0, inplace=False)
 

In [None]:
evaluate(base_mixer_b16_224, dataloader=dataloader['val'], device=device)

                                                     

76.4739990234375

In [10]:
evaluate(collapsible_mixer_l16_224, dataloader=dataloader['val'], device=device)

eval:   0%|          | 0/196 [00:00<?, ?it/s]                                                       

73.59199523925781

# get all collapsible layers slopes

In [13]:
def get_model_collapsible_slopes(model, fraction=1.0):
    num_mlp_layers = len(list(model.named_modules()))
    for name, module in list(model.named_modules())[::-1][:int(num_mlp_layers * fraction)]:
        if isinstance(module, CollapsibleMlp):
            print(name, module.act.weight.item())

In [14]:
get_model_collapsible_slopes(collapsible_mixer_l16_224, fraction=.5)

blocks.11.mlp_channels 0.0018727623391896486
blocks.11.mlp_tokens 0.6893177032470703
blocks.10.mlp_channels -0.06573034822940826
blocks.10.mlp_tokens 0.7889662981033325
blocks.9.mlp_channels -0.09059245884418488
blocks.9.mlp_tokens 0.7854459881782532
blocks.8.mlp_channels -0.10836826264858246
blocks.8.mlp_tokens 0.7143239378929138
blocks.7.mlp_channels -0.11396287381649017
blocks.7.mlp_tokens 0.6591113805770874
blocks.6.mlp_channels -0.08409617841243744
blocks.6.mlp_tokens 0.3677079677581787


In [6]:
finetune(collapsible_mixer_l16_224, lc=0, fraction=0, lr=0.0005, dataloader=dataloader, epochs=1, device=device)

5005it [1:47:42,  1.29s/it]<?, ?it/s]
100%|██████████| 1/1 [1:47:43<00:00, 6463.10s/it]

Epoch: 0, Train Loss: 0.8646, Train Acc: 0.8231, Train top5: 0.9101





In [8]:
# save model in ./models_achive
torch.save(collapsible_mixer_l16_224.state_dict(), "./models_archive/" + "collapsible_mixer_l16_224_prelu_1epoch" + ".pth")

In [None]:
evaluate(collapsible_mixer_l16_224, dataloader=dataloader['val'], device=device)

NameError: name 'evaluate' is not defined

In [12]:
finetune(collapsible_mixer_l16_224, lc=0, fraction=0, lr=0.0005, dataloader=dataloader, epochs=1, device=device)

  0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
evaluate(collapsible_mixer_l16_224, dataloader=dataloader['val'], device=device)

In [5]:
evaluate_model(base_mixer_l16_224, dataloader, count_nonzero_only=False, device=device)

                                            

RuntimeError: Input type (torch.cuda.FloatTensor) and weight type (torch.FloatTensor) should be the same