[reference](https://github.com/bentrevett/pytorch-sentiment-analysis)

In [0]:
# Importing Libraries
import os
import copy
import time
import torch
import torchtext
from torchtext import data
from torchtext import datasets
from torchtext.vocab import GloVe
import torch.nn.functional as F
from torch.autograd import Variable
import torch.optim as optim
import numpy as np
import pandas as pd
from tqdm import tqdm

In [2]:
!pwd

/content


In [3]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
path = '/content/drive/My Drive/lt-module'
os.chdir(path)

In [0]:
# Custom Libraries
import DataLoader
import Model

## Define functions - to load dataset & model

In [0]:
# load dataset
def load_dataset(dataset, architecture, batch_size, device, path):
    if dataset == "imdb":
      if architecture == "cnn":
        data = DataLoader.IMDB_CNN_CUSTOM(batch_size, device, path)
      elif architecture == "lstm":
        data = DataLoader.IMDB_LSTM(batch_size, device, path)

    elif dataset == "agnews":
        data = DataLoader.AGNEWS(batch_size, device, path)

    else:
        raise ValueError(dataset + "is not supported")

    return data

# load model and set hyperparameters
def load_model(architecture, data_choice, batch_size):

    if architecture == "cnn":
      # hyperparameters
      vocab_size = len(dataset.TEXT.vocab)
      embedding_dim = 100
      n_filters = 100
      filter_sizes = [3,4,5]
      dropout = 0.5
      pad_idx = dataset.TEXT.vocab.stoi[dataset.TEXT.pad_token]
      
      if data_choice == "imdb":
        # binary-class
        output_dim = 1
        model = Model.binaryCNN(vocab_size, embedding_dim, n_filters, filter_sizes, output_dim, dropout, pad_idx)
      
      elif data_choice == "agnews":
        # multi-class
        output_dim = 4
        model = Model.multiCNN(vocab_size, embedding_dim, n_filters, filter_sizes, output_dim, dropout, pad_idx)

      unk_idx = dataset.TEXT.vocab.stoi[dataset.TEXT.unk_token]
      model.embedding.weight.data[unk_idx] = torch.zeros(embedding_dim)
      model.embedding.weight.data[pad_idx] = torch.zeros(embedding_dim)

      return model

    elif architecture == "lstm":
      # hyperparameters
      vocab_size = len(dataset.TEXT.vocab)
      embedding_dim = 100
      hidden_dim = 256
      output_dim = 1
      n_layers = 2
      bidirectional = True
      dropout = 0.5
      pad_idx = dataset.TEXT.vocab.stoi[dataset.TEXT.pad_token]
      
      if data_choice == "imdb":
        # binary-class
        output_dim = 1
      
      elif data_choice == "agnews":
        # multi-class
        output_dim = 4
      
      model = Model.LSTM(vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, bidirectional, dropout, pad_idx)

      unk_idx = dataset.TEXT.vocab.stoi[dataset.TEXT.unk_token]
      model.embedding.weight.data[unk_idx] = torch.zeros(embedding_dim)
      model.embedding.weight.data[pad_idx] = torch.zeros(embedding_dim)

      return model

    # temporary
    elif architecture == "bert":
        return None

    else:
        raise ValueError(architecture + "is not supported")

## Weight initialization

In [0]:
# weight initializtion
def initialize_xavier_normal(m):
    
  """
	Function to initialize a layer by picking weights from a xavier normal distribution
	Arguments
	---------
	m : The layer of the neural network
	Returns
	-------
	None
	"""
  
  if type(m) == torch.nn.Conv2d:
    torch.nn.init.xavier_normal_(m.weight.data)
    m.bias.data.fill_(0)

  elif isinstance(m, torch.nn.Linear):
    torch.nn.init.xavier_normal_(m.weight.data)
    m.bias.data.fill_(0)

  elif type(m) in [torch.nn.GRU, torch.nn.LSTM, torch.nn.RNN]:
    for name, param in m.named_parameters():
        if 'weight_ih' in name:
          torch.nn.init.xavier_normal_(param.data)
        elif 'weight_hh' in name:
          torch.nn.init.orthogonal_(param.data)
        elif 'bias' in name:
          param.data.fill_(0)

## train and test functions

In [0]:
def binary_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """

    #round predictions to the closest integer
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float() #convert into float for division 
    acc = correct.sum() / len(correct)
    return acc

In [0]:
def train(model, iterator, optimizer, criterion):
    # EPS = 1e-6
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
    
    for batch in iterator:
      
      text = batch.text[0]
      target = batch.label
        
      optimizer.zero_grad()
      predictions = model(text).squeeze(1)
      loss = criterion(predictions, target)
      acc = binary_accuracy(predictions, target)
      loss.backward()
      step = 0

      # Freezing Pruned weights by making their gradients Zero
      for name, p in model.named_parameters():
        weight_dev = param.device
        tensor = p.data.cpu().numpy()
        grad_tensor = p.grad.data.cpu().numpy()
        grad_tensor = np.where(mask[step] == 0, 0, grad_tensor)
        p.grad.data = torch.from_numpy(grad_tensor).to(device)
        step += 1
      
      step = 0

      optimizer.step()
        
      epoch_loss += loss.item()
      epoch_acc += acc.item()
      
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

def evaluate(model, iterator, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.eval()
    
    with torch.no_grad():
    
        for batch in iterator:
          text = batch.text[0]
          target = batch.label

          predictions = model(batch.text[0]).squeeze(1)
            
          loss = criterion(predictions, batch.label)
            
          acc = binary_accuracy(predictions, batch.label)

          epoch_loss += loss.item()
          epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

## Pruning Functions

In [0]:
# Prune by Percentile module
def prune_by_percentile(percent):
  global step
  global mask
  global model
  # Calculate percentile value
  step = 0
  for name, param in model.named_parameters():
    # if 'weight' in name:
    tensor = param.data.cpu().numpy()
    nz_count = np.count_nonzero(tensor)

    # bias가 all pruned 되는 경우 발생
    if nz_count == 0:
      step += 1

    else: 
      alive = tensor[np.nonzero(tensor)] # flattened array of nonzero values
      percentile_value = np.percentile(abs(alive), percent)

      # Convert Tensors to numpy and calculate
      weight_dev = param.device
      new_mask = np.where(abs(tensor) < percentile_value, 0, mask[step])
                
      # Apply new weight and mask
      param.data = torch.from_numpy(tensor * new_mask).to(weight_dev)
      mask[step] = new_mask
      step += 1

  step = 0

In [0]:
# Function to make an empty mask of the same size as the model
def make_mask(model):
  global step
  global mask
  
  step = 0
  for name, param in model.named_parameters(): 
    # if 'weight' in name:
    step += 1
  mask = [None]* step 
  step = 0
  for name, param in model.named_parameters(): 
    # if 'weight' in name:
    tensor = param.data.cpu().numpy()
    mask[step] = np.ones_like(tensor)
    step += 1
  step = 0

In [0]:
def original_initialization(mask_temp, initial_state_dict):
    global step
    global model
    
    step = 0
    for name, param in model.named_parameters(): 
        # if "weight" in name: 
            weight_dev = param.device
            param.data = torch.from_numpy(mask_temp[step] * initial_state_dict[name].cpu().numpy()).to(weight_dev)
            step = step + 1
        # if "bias" in name:
            # param.data = initial_state_dict[name]
    step = 0

In [0]:
# ANCHOR Print table of zeros and non-zeros count
def print_nonzeros(model):
    nonzero = total = 0
    for name, p in model.named_parameters():
        tensor = p.data.cpu().numpy()
        nz_count = np.count_nonzero(tensor)
        total_params = np.prod(tensor.shape)
        nonzero += nz_count
        total += total_params
        print(f'{name:20} | nonzeros = {nz_count:7} / {total_params:7} ({100 * nz_count / total_params:6.2f}%) | total_pruned = {total_params - nz_count :7} | shape = {tensor.shape}')
    print(f'alive: {nonzero}, pruned : {total - nonzero}, total: {total}, Compression rate : {total/nonzero:10.2f}x  ({100 * (total-nonzero) / total:6.2f}% pruned)')
    return (round(((total-nonzero)/total)*100,1))

## Choose dataset & model

In [15]:
# colab에서 돌리기 때문에 우선 arg가 아니라 변수로 넘겨주기
data_choice = "imdb" # 데이터셋 선택
arch_choice = "cnn" # 모델 선택
batch_size = 64 # batch size 선택 - 32 or 64
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using {device} device.')

Using cuda device.


### load dataset

In [0]:
dataset = load_dataset(data_choice, arch_choice, batch_size, device, path)

### model setting

In [0]:
# check the trial number
trial = 601

# pruning setting
iteration = 20
percent = 20 # 20% prune
N_EPOCH = 20
learning_rate = 5e-3

step = 0

## 1) random initialization

In [0]:
reinit = True # random이면 True, lt면 False
mode = 'random'

In [0]:
# load model and set hyperparameters
model = load_model(arch_choice, data_choice, batch_size)

# embedding 같은 것 주기 위해
embedding_pretrained_weight = model.embedding.weight

# model initialization and save the model
model.apply(initialize_xavier_normal)

binaryCNN(
  (embedding): Embedding(50002, 100, padding_idx=1)
  (convs): ModuleList(
    (0): Conv2d(1, 100, kernel_size=(3, 100), stride=(1, 1))
    (1): Conv2d(1, 100, kernel_size=(4, 100), stride=(1, 1))
    (2): Conv2d(1, 100, kernel_size=(5, 100), stride=(1, 1))
  )
  (fc): Linear(in_features=300, out_features=1, bias=True)
  (dropout): Dropout(p=0.5)
)

In [0]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 5,120,801 trainable parameters


In [0]:
# initial state 저장
torch.save(model.state_dict(), f'{data_choice}-{arch_choice}-{trial}-{mode}-initial.pt')

In [0]:
model = model.to(device)
make_mask(model)

optimizer = optim.Adam(model.parameters(), lr=learning_rate)
criterion = torch.nn.BCEWithLogitsLoss()

criterion = criterion.to(device)

In [23]:
for name, param in model.named_parameters():
  print(name, param.size())

embedding.weight torch.Size([50002, 100])
convs.0.weight torch.Size([100, 1, 3, 100])
convs.0.bias torch.Size([100])
convs.1.weight torch.Size([100, 1, 4, 100])
convs.1.bias torch.Size([100])
convs.2.weight torch.Size([100, 1, 5, 100])
convs.2.bias torch.Size([100])
fc.weight torch.Size([1, 300])
fc.bias torch.Size([1])


In [0]:
print("Iterative Pruning started")

performance = [] # 결과값을 담을 리스트
valid_losses = []

for pruning_iter in range(0,iteration+1):
  print(f"Running pruning iteration {pruning_iter}/{iteration}")

  # 첫 iter에는 no model compression
  if not pruning_iter == 0:

    # pruning
    prune_by_percentile(percent)

    # random initialization
    if reinit:
      model.apply(initialize_xavier_normal) # random initialization
      model.embedding.weight = embedding_pretrained_weight # embedding은 공통적으로 초기 임베딩으로 초기화
    
      step = 0
      for name, param in model.named_parameters():
      # if 'weight' in name: 
         weight_dev = param.device
         param.data = torch.from_numpy(param.data.cpu().numpy() * mask[step]).to(weight_dev)
         step = step + 1
      step = 0
    
    # lt initialization
    else:
      original_initialization(mask, initial_state_dict)

  optimizer = optim.Adam(model.parameters(), lr=learning_rate)

  # train 
  best_valid_loss = float('inf')
  # early stopping
  count = 0
  for epoch in range(N_EPOCH):
    train_loss, train_acc = train(model, dataset.train_iter, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, dataset.valid_iter, criterion)
    print(epoch, valid_loss, valid_acc)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        best_state_dict = copy.deepcopy(model.state_dict())

    else:
        count +=1
        
    if count == 2:
        print(f'Early Stopping at {epoch}')
        break

  model.load_state_dict(best_state_dict)
  test_loss, test_acc = evaluate(model, dataset.test_iter, criterion)
  torch.save(model.state_dict(), f'{data_choice}-{arch_choice}-{trial}-{mode}-{pruning_iter}.pt')
  print(best_valid_loss, test_acc)

  valid_losses.append(valid_loss)
  performance.append(100*test_acc)

  print_nonzeros(model)

Iterative Pruning started
Running pruning iteration 0/20
0 0.4287474274318269 0.7951296542553191
1 0.31409712987852856 0.8680186170212766
2 0.32625685447945874 0.8732546542553191
3 0.4349267495439408 0.8681848404255319
Early Stopping at 3
0.31409712987852856 0.8645501592356688
embedding.weight     | nonzeros = 5000100 / 5000200 (100.00%) | total_pruned =     100 | shape = (50002, 100)
convs.0.weight       | nonzeros =   30000 /   30000 (100.00%) | total_pruned =       0 | shape = (100, 1, 3, 100)
convs.0.bias         | nonzeros =     100 /     100 (100.00%) | total_pruned =       0 | shape = (100,)
convs.1.weight       | nonzeros =   40000 /   40000 (100.00%) | total_pruned =       0 | shape = (100, 1, 4, 100)
convs.1.bias         | nonzeros =     100 /     100 (100.00%) | total_pruned =       0 | shape = (100,)
convs.2.weight       | nonzeros =   50000 /   50000 (100.00%) | total_pruned =       0 | shape = (100, 1, 5, 100)
convs.2.bias         | nonzeros =     100 /     100 (100.00%) 

0 0.5607617875917795 0.8478224734042553
1 0.6483960889596888 0.8525598404255319
2 0.7746037793286303 0.8483211436170213
Early Stopping at 2
0.5607617875917795 0.8528065286624203
embedding.weight     | nonzeros = 1048597 / 5000200 ( 20.97%) | total_pruned = 3951603 | shape = (50002, 100)
convs.0.weight       | nonzeros =    6291 /   30000 ( 20.97%) | total_pruned =   23709 | shape = (100, 1, 3, 100)
convs.0.bias         | nonzeros =       1 /     100 (  1.00%) | total_pruned =      99 | shape = (100,)
convs.1.weight       | nonzeros =    8388 /   40000 ( 20.97%) | total_pruned =   31612 | shape = (100, 1, 4, 100)
convs.1.bias         | nonzeros =       6 /     100 (  6.00%) | total_pruned =      94 | shape = (100,)
convs.2.weight       | nonzeros =   10485 /   50000 ( 20.97%) | total_pruned =   39515 | shape = (100, 1, 5, 100)
convs.2.bias         | nonzeros =       4 /     100 (  4.00%) | total_pruned =      96 | shape = (100,)
fc.weight            | nonzeros =      61 /     300 ( 20.3

0 0.4510242946128896 0.827376994680851
1 0.4657111521413986 0.8236369680851063
2 0.48448096374247934 0.8226396276595744
Early Stopping at 2
0.4510242946128896 0.8279259554140127
embedding.weight     | nonzeros =  219906 / 5000200 (  4.40%) | total_pruned = 4780294 | shape = (50002, 100)
convs.0.weight       | nonzeros =    1319 /   30000 (  4.40%) | total_pruned =   28681 | shape = (100, 1, 3, 100)
convs.0.bias         | nonzeros =       0 /     100 (  0.00%) | total_pruned =     100 | shape = (100,)
convs.1.weight       | nonzeros =    1758 /   40000 (  4.39%) | total_pruned =   38242 | shape = (100, 1, 4, 100)
convs.1.bias         | nonzeros =       0 /     100 (  0.00%) | total_pruned =     100 | shape = (100,)
convs.2.weight       | nonzeros =    2198 /   50000 (  4.40%) | total_pruned =   47802 | shape = (100, 1, 5, 100)
convs.2.bias         | nonzeros =       1 /     100 (  1.00%) | total_pruned =      99 | shape = (100,)
fc.weight            | nonzeros =      13 /     300 (  4.3

In [0]:
# random
print(performance)
print(valid_losses)

reinit_performance = performance
reinit_valid_losses = valid_losses

[86.45501592356688, 85.62898089171973, 86.92277070063695, 86.79339171974523, 86.06687898089172, 86.23606687898089, 85.73845541401273, 85.28065286624204, 85.51950636942675, 84.87261146496814, 84.86265923566879, 84.74323248407643, 84.00676751592357, 83.98686305732484, 82.79259554140127, 82.65326433121018, 83.25039808917197, 77.34872611464968, 82.03622611464968, 81.40923566878982, 80.83200636942675]
[0.4349267495439408, 0.45010001866265814, 0.5681587474301775, 0.665766840008028, 0.6950758562443105, 0.7426990697834086, 0.7701606823408857, 0.7746037793286303, 0.7274400180483118, 0.7117057705813266, 0.6661746925575301, 0.6333495900827519, 0.5623685133425479, 0.5195528150714458, 0.48448096374247934, 0.46336893673906937, 0.44753413195622727, 0.4934343784413439, 0.4393732081702415, 0.4422741903269545, 0.4601249094022081]


## 2) lt

In [0]:
reinit = False # random이면 True, lt면 False
mode = 'lt'

In [0]:
# load model and set hyperparameters
model = load_model(arch_choice, data_choice, batch_size)

# model initialization and save the state
model.apply(initialize_xavier_normal)
initial_state_dict = copy.deepcopy(model.state_dict())

# initial state 저장
torch.save(model.state_dict(), f'{data_choice}-{arch_choice}-{trial}-{mode}-initial.pt')

In [0]:
model = model.to(device)
make_mask(model)

optimizer = optim.Adam(model.parameters(), lr=learning_rate)
criterion = torch.nn.BCEWithLogitsLoss()

criterion = criterion.to(device)

In [0]:
print("Iterative Pruning started")

performance = [] # 결과값을 담을 리스트
valid_losses = []

for pruning_iter in range(0,iteration+1):
  print(f"Running pruning iteration {pruning_iter}/{iteration}")

  # 첫 iter에는 no model compression
  if not pruning_iter == 0:

    # pruning
    prune_by_percentile(percent)

    # random initialization
    if reinit:
      model.apply(initialize_xavier_normal) # random initialization
      model.embedding.weight = embedding_pretrained_weight # embedding은 공통적으로 초기 임베딩으로 초기화
    
      step = 0
      for name, param in model.named_parameters():
      # if 'weight' in name: 
         weight_dev = param.device
         param.data = torch.from_numpy(param.data.cpu().numpy() * mask[step]).to(weight_dev)
         step = step + 1
      step = 0
    
    # lt initialization
    else:
      original_initialization(mask, initial_state_dict)

  optimizer = optim.Adam(model.parameters(), lr=learning_rate)

  # train 
  best_valid_loss = float('inf')
  # early stopping
  count = 0
  for epoch in range(N_EPOCH):
    train_loss, train_acc = train(model, dataset.train_iter, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, dataset.valid_iter, criterion)
    print(epoch, valid_loss, valid_acc)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        best_state_dict = copy.deepcopy(model.state_dict())

    else:
        count +=1
        
    if count == 2:
        print(f'Early Stopping at {epoch}')
        break

  model.load_state_dict(best_state_dict)
  test_loss, test_acc = evaluate(model, dataset.test_iter, criterion)
  torch.save(model.state_dict(), f'{data_choice}-{arch_choice}-{trial}-{mode}-{pruning_iter}.pt')
  print(best_valid_loss, test_acc)

  valid_losses.append(valid_loss)
  performance.append(100*test_acc)

  print_nonzeros(model)

Iterative Pruning started
Running pruning iteration 0/20
0 0.39476124014943204 0.8192320478723404
1 0.3072912580868665 0.8663563829787234
2 0.3191868526504395 0.8785738031914894
3 0.45132775422423443 0.8617021276595744
Early Stopping at 3
0.3072912580868665 0.870421974522293
embedding.weight     | nonzeros = 5000100 / 5000200 (100.00%) | total_pruned =     100 | shape = (50002, 100)
convs.0.weight       | nonzeros =   30000 /   30000 (100.00%) | total_pruned =       0 | shape = (100, 1, 3, 100)
convs.0.bias         | nonzeros =     100 /     100 (100.00%) | total_pruned =       0 | shape = (100,)
convs.1.weight       | nonzeros =   40000 /   40000 (100.00%) | total_pruned =       0 | shape = (100, 1, 4, 100)
convs.1.bias         | nonzeros =     100 /     100 (100.00%) | total_pruned =       0 | shape = (100,)
convs.2.weight       | nonzeros =   50000 /   50000 (100.00%) | total_pruned =       0 | shape = (100, 1, 5, 100)
convs.2.bias         | nonzeros =     100 /     100 (100.00%) | 

0 0.3402963256106732 0.8535571808510638
1 0.3563147897019665 0.8505651595744681
2 0.3501608201322403 0.8561336436170213
Early Stopping at 2
0.3402963256106732 0.8577826433121019
embedding.weight     | nonzeros = 1048597 / 5000200 ( 20.97%) | total_pruned = 3951603 | shape = (50002, 100)
convs.0.weight       | nonzeros =    6291 /   30000 ( 20.97%) | total_pruned =   23709 | shape = (100, 1, 3, 100)
convs.0.bias         | nonzeros =       2 /     100 (  2.00%) | total_pruned =      98 | shape = (100,)
convs.1.weight       | nonzeros =    8388 /   40000 ( 20.97%) | total_pruned =   31612 | shape = (100, 1, 4, 100)
convs.1.bias         | nonzeros =       6 /     100 (  6.00%) | total_pruned =      94 | shape = (100,)
convs.2.weight       | nonzeros =   10485 /   50000 ( 20.97%) | total_pruned =   39515 | shape = (100, 1, 5, 100)
convs.2.bias         | nonzeros =       6 /     100 (  6.00%) | total_pruned =      94 | shape = (100,)
fc.weight            | nonzeros =      61 /     300 ( 20.3

0.3630644922402311 0.841062898089172
embedding.weight     | nonzeros =  274883 / 5000200 (  5.50%) | total_pruned = 4725317 | shape = (50002, 100)
convs.0.weight       | nonzeros =    1649 /   30000 (  5.50%) | total_pruned =   28351 | shape = (100, 1, 3, 100)
convs.0.bias         | nonzeros =       1 /     100 (  1.00%) | total_pruned =      99 | shape = (100,)
convs.1.weight       | nonzeros =    2198 /   40000 (  5.50%) | total_pruned =   37802 | shape = (100, 1, 4, 100)
convs.1.bias         | nonzeros =       1 /     100 (  1.00%) | total_pruned =      99 | shape = (100,)
convs.2.weight       | nonzeros =    2748 /   50000 (  5.50%) | total_pruned =   47252 | shape = (100, 1, 5, 100)
convs.2.bias         | nonzeros =       1 /     100 (  1.00%) | total_pruned =      99 | shape = (100,)
fc.weight            | nonzeros =      16 /     300 (  5.33%) | total_pruned =     284 | shape = (1, 300)
fc.bias              | nonzeros =       1 /       1 (100.00%) | total_pruned =       0 | shap

0 0.5224242795338022 0.757064494680851
1 0.5053726122417348 0.7670378989361702
2 0.5018606729646946 0.7677027925531915
3 0.5002951487264735 0.7708610372340425
4 0.4999566266828395 0.7674534574468085
5 0.49938332051672835 0.773188164893617
6 0.4994521475535758 0.7743517287234043
7 0.5012094351522466 0.7746841755319149
Early Stopping at 7
0.49938332051672835 0.7746815286624203
embedding.weight     | nonzeros =   72058 / 5000200 (  1.44%) | total_pruned = 4928142 | shape = (50002, 100)
convs.0.weight       | nonzeros =     432 /   30000 (  1.44%) | total_pruned =   29568 | shape = (100, 1, 3, 100)
convs.0.bias         | nonzeros =       1 /     100 (  1.00%) | total_pruned =      99 | shape = (100,)
convs.1.weight       | nonzeros =     576 /   40000 (  1.44%) | total_pruned =   39424 | shape = (100, 1, 4, 100)
convs.1.bias         | nonzeros =       1 /     100 (  1.00%) | total_pruned =      99 | shape = (100,)
convs.2.weight       | nonzeros =     720 /   50000 (  1.44%) | total_pruned

In [0]:
# lt
print(performance)
print(valid_losses)
lt_performance = performance
lt_valid_losses = valid_losses

[87.0421974522293, 86.56449044585987, 87.54976114649682, 86.95262738853503, 86.92277070063695, 86.21616242038218, 86.30573248407643, 85.77826433121018, 85.75835987261146, 85.17117834394905, 84.93232484076432, 84.30533439490446, 83.93710191082803, 84.1062898089172, 83.51910828025477, 81.88694267515923, 81.5187101910828, 80.56329617834395, 79.85668789808918, 77.46815286624204, 76.09474522292994]
[0.45132775422423443, 0.4467437599012826, 0.44423610296972255, 0.38790360077264463, 0.38274828308915837, 0.3724279914923171, 0.3551655703006273, 0.3501608201322403, 0.3604895598156021, 0.38633824781851567, 0.3661482798609328, 0.36344673746126765, 0.36716622019067724, 0.3754595856955077, 0.3825408729942555, 0.407268854531836, 0.4147799167227238, 0.43417300640585577, 0.4564403978751061, 0.5012094351522466, 0.5196707792421604]


## 3) lt+lr (first epoch)

In [0]:
reinit = False # random이면 True, lt면 False
mode = 'lr_f'

In [0]:
# load model and set hyperparameters
model = load_model(arch_choice, data_choice, batch_size)

# model initialization and save the state
model.apply(initialize_xavier_normal)

# initial state 저장
torch.save(model.state_dict(), f'{data_choice}-{arch_choice}-{trial}-{mode}-initial.pt')

In [0]:
model = model.to(device)
make_mask(model)

optimizer = optim.Adam(model.parameters(), lr=learning_rate)
criterion = torch.nn.BCEWithLogitsLoss()

criterion = criterion.to(device)

In [0]:
print("Iterative Pruning started")

performance = [] # 결과값을 담을 리스트
valid_losses = []

for pruning_iter in range(0,iteration+1):
  print(f"Running pruning iteration {pruning_iter}/{iteration}")

  # 첫 iter에는 no model compression
  if not pruning_iter == 0:

    # pruning
    prune_by_percentile(percent)

    # random initialization
    if reinit:
      model.apply(initialize_xavier_normal) # random initialization
      model.embedding.weight = embedding_pretrained_weight # embedding은 공통적으로 초기 임베딩으로 초기화
    
      step = 0
      for name, param in model.named_parameters():
      # if 'weight' in name: 
         weight_dev = param.device
         param.data = torch.from_numpy(param.data.cpu().numpy() * mask[step]).to(weight_dev)
         step = step + 1
      step = 0
    
    # lt initialization
    else:
      original_initialization(mask, initial_state_dict)

  optimizer = optim.Adam(model.parameters(), lr=learning_rate)

  # train 
  best_valid_loss = float('inf')
  # early stopping
  count = 0

  # train 
  for epoch in range(N_EPOCH):
    train_loss, train_acc = train(model, dataset.train_iter, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, dataset.valid_iter, criterion)
    print(epoch, valid_loss, valid_acc)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        best_state_dict = copy.deepcopy(model.state_dict())

    else:
        count +=1
        
    if count == 2:
        print(f'Early Stopping at {epoch}')
        break
    
    # late rewinding after the first epoch
    if pruning_iter == 0 and epoch == 0:
        initial_state_dict = copy.deepcopy(model.state_dict())

  model.load_state_dict(best_state_dict)
  test_loss, test_acc = evaluate(model, dataset.test_iter, criterion)
  torch.save(model.state_dict(), f'{data_choice}-{arch_choice}-{trial}-{mode}-{pruning_iter}.pt')
  print(best_valid_loss, test_acc)

  valid_losses.append(valid_loss)
  performance.append(100*test_acc)

  print_nonzeros(model)

Iterative Pruning started
Running pruning iteration 0/20
0 0.3924252216486221 0.8179022606382979
1 0.29334738533547583 0.879654255319149
2 0.4285250069930198 0.8422539893617021
3 0.41461936713374675 0.8729222074468085
Early Stopping at 3
0.29334738533547583 0.8790804140127388
embedding.weight     | nonzeros = 5000100 / 5000200 (100.00%) | total_pruned =     100 | shape = (50002, 100)
convs.0.weight       | nonzeros =   30000 /   30000 (100.00%) | total_pruned =       0 | shape = (100, 1, 3, 100)
convs.0.bias         | nonzeros =     100 /     100 (100.00%) | total_pruned =       0 | shape = (100,)
convs.1.weight       | nonzeros =   40000 /   40000 (100.00%) | total_pruned =       0 | shape = (100, 1, 4, 100)
convs.1.bias         | nonzeros =     100 /     100 (100.00%) | total_pruned =       0 | shape = (100,)
convs.2.weight       | nonzeros =   50000 /   50000 (100.00%) | total_pruned =       0 | shape = (100, 1, 5, 100)
convs.2.bias         | nonzeros =     100 /     100 (100.00%) |

0 0.3260470411561905 0.8597905585106383
1 0.31722149498602176 0.8698470744680851
2 0.33349435142380124 0.866938164893617
3 0.35906831650657856 0.8660239361702128
Early Stopping at 3
0.31722149498602176 0.8697253184713376
embedding.weight     | nonzeros = 1048597 / 5000200 ( 20.97%) | total_pruned = 3951603 | shape = (50002, 100)
convs.0.weight       | nonzeros =    6291 /   30000 ( 20.97%) | total_pruned =   23709 | shape = (100, 1, 3, 100)
convs.0.bias         | nonzeros =      21 /     100 ( 21.00%) | total_pruned =      79 | shape = (100,)
convs.1.weight       | nonzeros =    8388 /   40000 ( 20.97%) | total_pruned =   31612 | shape = (100, 1, 4, 100)
convs.1.bias         | nonzeros =      21 /     100 ( 21.00%) | total_pruned =      79 | shape = (100,)
convs.2.weight       | nonzeros =   10485 /   50000 ( 20.97%) | total_pruned =   39515 | shape = (100, 1, 5, 100)
convs.2.bias         | nonzeros =      21 /     100 ( 21.00%) | total_pruned =      79 | shape = (100,)
fc.weight      

3 0.3552327867835126 0.8464095744680851
4 0.35538632153196537 0.8454122340425532
5 0.3579772476186144 0.8460771276595744
Early Stopping at 5
0.3552327867835126 0.8466361464968153
embedding.weight     | nonzeros =  274883 / 5000200 (  5.50%) | total_pruned = 4725317 | shape = (50002, 100)
convs.0.weight       | nonzeros =    1649 /   30000 (  5.50%) | total_pruned =   28351 | shape = (100, 1, 3, 100)
convs.0.bias         | nonzeros =       5 /     100 (  5.00%) | total_pruned =      95 | shape = (100,)
convs.1.weight       | nonzeros =    2198 /   40000 (  5.50%) | total_pruned =   37802 | shape = (100, 1, 4, 100)
convs.1.bias         | nonzeros =       5 /     100 (  5.00%) | total_pruned =      95 | shape = (100,)
convs.2.weight       | nonzeros =    2748 /   50000 (  5.50%) | total_pruned =   47252 | shape = (100, 1, 5, 100)
convs.2.bias         | nonzeros =       5 /     100 (  5.00%) | total_pruned =      95 | shape = (100,)
fc.weight            | nonzeros =      16 /     300 (  5.

0 0.5309719719151234 0.7675365691489362
1 0.50820035519118 0.7712765957446809
2 0.48718267171940904 0.7840757978723404
3 0.4898674873278496 0.7884807180851063
4 0.4884064126204937 0.7838264627659575
Early Stopping at 4
0.48718267171940904 0.790406050955414
embedding.weight     | nonzeros =   72058 / 5000200 (  1.44%) | total_pruned = 4928142 | shape = (50002, 100)
convs.0.weight       | nonzeros =     432 /   30000 (  1.44%) | total_pruned =   29568 | shape = (100, 1, 3, 100)
convs.0.bias         | nonzeros =       1 /     100 (  1.00%) | total_pruned =      99 | shape = (100,)
convs.1.weight       | nonzeros =     576 /   40000 (  1.44%) | total_pruned =   39424 | shape = (100, 1, 4, 100)
convs.1.bias         | nonzeros =       1 /     100 (  1.00%) | total_pruned =      99 | shape = (100,)
convs.2.weight       | nonzeros =     720 /   50000 (  1.44%) | total_pruned =   49280 | shape = (100, 1, 5, 100)
convs.2.bias         | nonzeros =       1 /     100 (  1.00%) | total_pruned =     

In [0]:
# lr_f
print(performance)
print(valid_losses)

lr_f_performance = performance
lr_f_valid_losses = valid_losses

[87.90804140127389, 87.46019108280255, 87.26114649681529, 87.37062101910828, 87.54976114649682, 86.3953025477707, 86.83320063694268, 86.97253184713377, 86.51472929936305, 86.40525477707006, 86.08678343949045, 85.92754777070064, 85.1015127388535, 84.66361464968153, 83.57882165605095, 83.04140127388536, 82.52388535031847, 80.94148089171973, 79.98606687898089, 79.04060509554141, 78.56289808917197]
[0.41461936713374675, 0.40466037313354775, 0.38330680508404336, 0.35390813276171684, 0.39517574503700786, 0.3877002265938419, 0.3444429377212803, 0.35906831650657856, 0.3409604744074192, 0.3314934919409929, 0.33766223054299965, 0.3381820966271644, 0.34879168400422056, 0.3579772476186144, 0.37559928047530194, 0.3929621987044811, 0.4125264781586667, 0.4475723406735887, 0.45869700911831346, 0.4884064126204937, 0.5054004363557125]


## 4) lt+lr (last epoch)

In [0]:
reinit = False # random이면 True, lt면 False
mode = 'lr_l'

In [0]:
# load model and set hyperparameters
model = load_model(arch_choice, data_choice, batch_size)

# model initialization and save the state
model.apply(initialize_xavier_normal)

# initial state 저장
torch.save(model.state_dict(), f'{data_choice}-{arch_choice}-{trial}-{mode}-initial.pt')

In [0]:
model = model.to(device)
make_mask(model)

optimizer = optim.Adam(model.parameters(), lr=learning_rate)
criterion = torch.nn.BCEWithLogitsLoss()

criterion = criterion.to(device)

In [25]:
print("Iterative Pruning started")

performance = [] # 결과값을 담을 리스트
valid_losses = []

for pruning_iter in range(0,iteration+1):
  print(f"Running pruning iteration {pruning_iter}/{iteration}")

  # 첫 iter에는 no model compression
  if not pruning_iter == 0:

    # pruning
    prune_by_percentile(percent)

    # random initialization
    if reinit:
      model.apply(initialize_xavier_normal) # random initialization
      model.embedding.weight = embedding_pretrained_weight # embedding은 공통적으로 초기 임베딩으로 초기화
    
      step = 0
      for name, param in model.named_parameters():
      # if 'weight' in name: 
         weight_dev = param.device
         param.data = torch.from_numpy(param.data.cpu().numpy() * mask[step]).to(weight_dev)
         step = step + 1
      step = 0
    
    # lt initialization
    else:
      original_initialization(mask, initial_state_dict)

  optimizer = optim.Adam(model.parameters(), lr=learning_rate)

  # train 
  best_valid_loss = float('inf')
  # early stopping
  count = 0

  # train 
  for epoch in range(N_EPOCH):
    train_loss, train_acc = train(model, dataset.train_iter, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, dataset.valid_iter, criterion)
    print(epoch, valid_loss, valid_acc)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        best_state_dict = copy.deepcopy(model.state_dict())

    else:
        count +=1
        
    if count == 2:
        print(f'Early Stopping at {epoch}')
        break

  model.load_state_dict(best_state_dict)
  test_loss, test_acc = evaluate(model, dataset.test_iter, criterion)
  torch.save(model.state_dict(), f'{data_choice}-{arch_choice}-{trial}-{mode}-{pruning_iter}.pt')
  print(best_valid_loss, test_acc)

  valid_losses.append(valid_loss)
  performance.append(100*test_acc)

  # late rewinding after the last epoch
  if pruning_iter == 0:
     initial_state_dict = copy.deepcopy(model.state_dict())

  print_nonzeros(model)

0 0.35906656458973885 0.8467420212765957
1 0.30482809443740133 0.8738364361702128
2 0.30886234339088836 0.8804853723404256
3 0.4418924398958049 0.866688829787234
Early Stopping at 3
0.30482809443740133 0.870421974522293
embedding.weight     | nonzeros = 5000100 / 5000200 (100.00%) | total_pruned =     100 | shape = (50002, 100)
convs.0.weight       | nonzeros =   30000 /   30000 (100.00%) | total_pruned =       0 | shape = (100, 1, 3, 100)
convs.0.bias         | nonzeros =     100 /     100 (100.00%) | total_pruned =       0 | shape = (100,)
convs.1.weight       | nonzeros =   40000 /   40000 (100.00%) | total_pruned =       0 | shape = (100, 1, 4, 100)
convs.1.bias         | nonzeros =     100 /     100 (100.00%) | total_pruned =       0 | shape = (100,)
convs.2.weight       | nonzeros =   50000 /   50000 (100.00%) | total_pruned =       0 | shape = (100, 1, 5, 100)
convs.2.bias         | nonzeros =     100 /     100 (100.00%) | total_pruned =       0 | shape = (100,)
fc.weight       

KeyboardInterrupt: ignored

In [0]:
# lr_l
print(performance)
print(valid_losses)

lr_l_performance = performance
lr_l_valid_losses = valid_losses