[reference](https://github.com/bentrevett/pytorch-sentiment-analysis)

In [0]:
# Importing Libraries
import os
import copy
import time
import torch
import torch.nn as nn
import torchtext
from torchtext import data
from torchtext import datasets
from torchtext.vocab import GloVe
import torch.nn.functional as F
from torch.autograd import Variable
import torch.optim as optim
import numpy as np
import pandas as pd
from tqdm import tqdm

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
path = '/content/drive/My Drive/lt-module'
os.chdir(path)

In [0]:
# Custom Libraries
import DataLoader
import Model

## Define functions - to load dataset & model

In [0]:
# load dataset
def load_dataset(dataset, architecture, batch_size, device, path):
    if dataset == "imdb":
      if architecture == "cnn":
        data = DataLoader.IMDB_CNN_CUSTOM(batch_size, device, path)
      elif architecture == "lstm":
        data = DataLoader.IMDB_LSTM(batch_size, device, path)

    elif dataset == "agnews":
        data = DataLoader.AGNEWS(batch_size, device, path)

    else:
        raise ValueError(dataset + "is not supported")

    return data

# load model and set hyperparameters
def load_model(architecture, data_choice, batch_size):

    if architecture == "cnn":
      # hyperparameters
      vocab_size = len(dataset.TEXT.vocab)
      embedding_dim = 100
      n_filters = 100
      filter_sizes = [3,4,5]
      dropout = 0.5
      pad_idx = dataset.TEXT.vocab.stoi[dataset.TEXT.pad_token]
      
      if data_choice == "imdb":
        # binary-class
        output_dim = 1
        model = Model.binaryCNN(vocab_size, embedding_dim, n_filters, filter_sizes, output_dim, dropout, pad_idx)
      
      elif data_choice == "agnews":
        # multi-class
        output_dim = 4
        model = Model.multiCNN(vocab_size, embedding_dim, n_filters, filter_sizes, output_dim, dropout, pad_idx)

      unk_idx = dataset.TEXT.vocab.stoi[dataset.TEXT.unk_token]
      model.embedding.weight.data[unk_idx] = torch.zeros(embedding_dim)
      model.embedding.weight.data[pad_idx] = torch.zeros(embedding_dim)

      return model

    elif architecture == "lstm":
      # hyperparameters
      vocab_size = len(dataset.TEXT.vocab)
      embedding_dim = 100
      hidden_dim = 256
      output_dim = 1
      n_layers = 2
      bidirectional = True
      dropout = 0.5
      pad_idx = dataset.TEXT.vocab.stoi[dataset.TEXT.pad_token]
      
      if data_choice == "imdb":
        # binary-class
        output_dim = 1
      
      elif data_choice == "agnews":
        # multi-class
        output_dim = 4
      
      model = Model.LSTM(vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, bidirectional, dropout, pad_idx)

      unk_idx = dataset.TEXT.vocab.stoi[dataset.TEXT.unk_token]
      model.embedding.weight.data[unk_idx] = torch.zeros(embedding_dim)
      model.embedding.weight.data[pad_idx] = torch.zeros(embedding_dim)

      return model

    # temporary
    elif architecture == "bert":
        return None

    else:
        raise ValueError(architecture + "is not supported")

## Weight initialization

In [0]:
# weight initializtion
def initialize_xavier_normal(m):
    
  """
	Function to initialize a layer by picking weights from a xavier normal distribution
	Arguments
	---------
	m : The layer of the neural network
	Returns
	-------
	None
	"""
  
  if type(m) == torch.nn.Conv2d:
    torch.nn.init.xavier_normal_(m.weight.data)
    m.bias.data.fill_(0)

  elif isinstance(m, torch.nn.Linear):
    torch.nn.init.xavier_normal_(m.weight.data)
    m.bias.data.fill_(0)

  elif type(m) in [torch.nn.GRU, torch.nn.LSTM, torch.nn.RNN]:
    for name, param in m.named_parameters():
        if 'weight_ih' in name:
          torch.nn.init.xavier_normal_(param.data)
        elif 'weight_hh' in name:
          torch.nn.init.orthogonal_(param.data)
        elif 'bias' in name:
          param.data.fill_(0)

## train and test functions

In [0]:
def categorical_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """
    max_preds = preds.argmax(dim = 1, keepdim = True) # get the index of the max probability
    correct = max_preds.squeeze(1).eq(y)
    return correct.sum() / torch.FloatTensor([y.shape[0]])

In [0]:
def train(model, iterator, optimizer, criterion):
    # EPS = 1e-6
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
    
    for batch in iterator:
      
      optimizer.zero_grad()
      text, text_lengths = batch.text
      target = batch.label
      predictions = model(text, text_lengths).squeeze(1)
      loss = criterion(predictions, target)
      acc = categorical_accuracy(predictions, target)
      loss.backward()
      step = 0

      # Freezing Pruned weights by making their gradients Zero
      for name, p in model.named_parameters():
        weight_dev = param.device
        tensor = p.data.cpu().numpy()
        grad_tensor = p.grad.data.cpu().numpy()
        grad_tensor = np.where(mask[step] == 0, 0, grad_tensor)
        p.grad.data = torch.from_numpy(grad_tensor).to(device)
        step += 1
      
      step = 0

      optimizer.step()
        
      epoch_loss += loss.item()
      epoch_acc += acc.item()
      
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

def evaluate(model, iterator, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.eval()
    
    with torch.no_grad():
    
        for batch in iterator:
          text, text_lengths = batch.text
          target = batch.label
          
          predictions = model(text, text_lengths).squeeze(1)
            
          loss = criterion(predictions, batch.label)
            
          acc = categorical_accuracy(predictions, batch.label)

          epoch_loss += loss.item()
          epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

## Pruning Functions

In [0]:
# Prune by Percentile module
def prune_by_percentile(percent):
  global step
  global mask
  global model
  # Calculate percentile value
  step = 0
  for name, param in model.named_parameters():
    # if 'weight' in name:
    tensor = param.data.cpu().numpy()
    nz_count = np.count_nonzero(tensor)

    # bias가 all pruned 되는 경우 발생
    if nz_count == 0:
      step += 1

    else: 
      alive = tensor[np.nonzero(tensor)] # flattened array of nonzero values
      percentile_value = np.percentile(abs(alive), percent)

      # Convert Tensors to numpy and calculate
      weight_dev = param.device
      new_mask = np.where(abs(tensor) < percentile_value, 0, mask[step])
                
      # Apply new weight and mask
      param.data = torch.from_numpy(tensor * new_mask).to(weight_dev)
      mask[step] = new_mask
      step += 1

  step = 0

In [0]:
# Function to make an empty mask of the same size as the model
def make_mask(model):
  global step
  global mask
  
  step = 0
  for name, param in model.named_parameters(): 
    # if 'weight' in name:
    step += 1
  mask = [None]* step 
  step = 0
  for name, param in model.named_parameters(): 
    # if 'weight' in name:
    tensor = param.data.cpu().numpy()
    mask[step] = np.ones_like(tensor)
    step += 1
  step = 0

In [0]:
def original_initialization(mask_temp, initial_state_dict):
    global step
    global model
    
    step = 0
    for name, param in model.named_parameters(): 
        # if "weight" in name: 
            weight_dev = param.device
            param.data = torch.from_numpy(mask_temp[step] * initial_state_dict[name].cpu().numpy()).to(weight_dev)
            step = step + 1
        # if "bias" in name:
            # param.data = initial_state_dict[name]
    step = 0

In [0]:
# ANCHOR Print table of zeros and non-zeros count
def print_nonzeros(model):
    nonzero = total = 0
    for name, p in model.named_parameters():
        tensor = p.data.cpu().numpy()
        nz_count = np.count_nonzero(tensor)
        total_params = np.prod(tensor.shape)
        nonzero += nz_count
        total += total_params
        print(f'{name:20} | nonzeros = {nz_count:7} / {total_params:7} ({100 * nz_count / total_params:6.2f}%) | total_pruned = {total_params - nz_count :7} | shape = {tensor.shape}')
    print(f'alive: {nonzero}, pruned : {total - nonzero}, total: {total}, Compression rate : {total/nonzero:10.2f}x  ({100 * (total-nonzero) / total:6.2f}% pruned)')
    return (round(((total-nonzero)/total)*100,1))

## Choose dataset & model

In [12]:
# colab에서 돌리기 때문에 우선 arg가 아니라 변수로 넘겨주기
data_choice = "agnews" # 데이터셋 선택
arch_choice = "lstm" # 모델 선택
batch_size = 64 # batch size 선택 - 32 or 64
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using {device} device.')

Using cuda device.


### load dataset

In [0]:
dataset = load_dataset(data_choice, arch_choice, batch_size, device, path)

### model setting

In [0]:
# check the trial number
trial = 201

# pruning setting
iteration = 20
percent = 20 # 20% prune
N_EPOCH = 10
learning_rate = 1e-3

step = 0

## 1) random initialization

In [0]:
reinit = True # random이면 True, lt면 False
mode = 'random'

In [0]:
# load model and set hyperparameters
model = load_model(arch_choice, data_choice, batch_size)

# embedding 같은 것 주기 위해
embedding_pretrained_weight = model.embedding.weight

# model initialization and save the model
model.apply(initialize_xavier_normal)

LSTM(
  (embedding): Embedding(25002, 100, padding_idx=1)
  (rnn): LSTM(100, 256, num_layers=2, dropout=0.5, bidirectional=True)
  (fc): Linear(in_features=512, out_features=4, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)

In [0]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 4,812,396 trainable parameters


In [0]:
# initial state 저장
torch.save(model.state_dict(), f'{data_choice}-{arch_choice}-{trial}-{mode}-initial.pt')

In [0]:
model = model.to(device)
make_mask(model)

optimizer = optim.Adam(model.parameters(), lr=learning_rate)
criterion = nn.CrossEntropyLoss()

criterion = criterion.to(device)

In [0]:
for name, param in model.named_parameters():
  print(name, param.size())

embedding.weight torch.Size([25002, 100])
rnn.weight_ih_l0 torch.Size([1024, 100])
rnn.weight_hh_l0 torch.Size([1024, 256])
rnn.bias_ih_l0 torch.Size([1024])
rnn.bias_hh_l0 torch.Size([1024])
rnn.weight_ih_l0_reverse torch.Size([1024, 100])
rnn.weight_hh_l0_reverse torch.Size([1024, 256])
rnn.bias_ih_l0_reverse torch.Size([1024])
rnn.bias_hh_l0_reverse torch.Size([1024])
rnn.weight_ih_l1 torch.Size([1024, 512])
rnn.weight_hh_l1 torch.Size([1024, 256])
rnn.bias_ih_l1 torch.Size([1024])
rnn.bias_hh_l1 torch.Size([1024])
rnn.weight_ih_l1_reverse torch.Size([1024, 512])
rnn.weight_hh_l1_reverse torch.Size([1024, 256])
rnn.bias_ih_l1_reverse torch.Size([1024])
rnn.bias_hh_l1_reverse torch.Size([1024])
fc.weight torch.Size([4, 512])
fc.bias torch.Size([4])


In [0]:
print(list(model.named_parameters()))

[('embedding.weight', Parameter containing:
tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.4133, -0.1122, -0.5961,  ...,  1.1373,  0.7257, -0.3588],
        ...,
        [ 0.3156,  0.5453, -0.4841,  ..., -0.6731, -0.0976,  0.4093],
        [-0.7371, -0.1087,  0.1321,  ..., -0.1678,  0.1071,  0.3718],
        [ 1.2929, -0.9231, -1.1145,  ...,  0.2666, -1.6894,  0.2001]],
       device='cuda:0', requires_grad=True)), ('rnn.weight_ih_l0', Parameter containing:
tensor([[-0.0182, -0.0061,  0.0979,  ...,  0.0072, -0.0296,  0.0094],
        [-0.0077,  0.0336,  0.0634,  ..., -0.0570,  0.0152,  0.0356],
        [-0.0317,  0.0618,  0.0462,  ...,  0.0386,  0.0099,  0.0388],
        ...,
        [ 0.0419, -0.0476,  0.0191,  ...,  0.0508, -0.0260,  0.0005],
        [ 0.1052,  0.0882, -0.0405,  ...,  0.0046, -0.0149,  0.0206],
        [ 0.0042,  0.0425,  0.0306,  ...,  0.0008, -0.0264, -0.0441]],

In [0]:
print("Iterative Pruning started")

performance = [] # 결과값을 담을 리스트
valid_losses = []

for pruning_iter in range(0,iteration+1):
  print(f"Running pruning iteration {pruning_iter}/{iteration}")

  # 첫 iter에는 no model compression
  if not pruning_iter == 0:

    # pruning
    prune_by_percentile(percent)

    # random initialization
    if reinit:
      model.apply(initialize_xavier_normal) # random initialization
      model.embedding.weight = embedding_pretrained_weight # embedding은 공통적으로 초기 임베딩으로 초기화
    
      step = 0
      for name, param in model.named_parameters():
      # if 'weight' in name: 
         weight_dev = param.device
         param.data = torch.from_numpy(param.data.cpu().numpy() * mask[step]).to(weight_dev)
         step = step + 1
      step = 0
    
    # lt initialization
    else:
      original_initialization(mask, initial_state_dict)

  optimizer = optim.Adam(model.parameters(), lr=learning_rate)

  # train 
  for epoch in range(N_EPOCH):
    train_loss, train_acc = train(model, dataset.train_iter, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, dataset.valid_iter, criterion)
    print(epoch, valid_loss, valid_acc)

  test_loss, test_acc = evaluate(model, dataset.test_iter, criterion)
  torch.save(model.state_dict(), f'{data_choice}-{arch_choice}-{trial}-{mode}-{pruning_iter}.pt')
  print(test_acc)

  valid_losses.append(valid_loss)
  performance.append(100*test_acc)

  print_nonzeros(model)

Iterative Pruning started
Running pruning iteration 0/20
0 0.46065060562763505 0.8416130106571936
1 0.3530784971965725 0.8776642984014209
2 0.3157103949955947 0.8911800621669627
3 0.288157759588311 0.90089365008881
4 0.2639991523727748 0.9088310390763765
5 0.263289320274367 0.9116896092362344
6 0.25418194656658977 0.9139376110124334
7 0.2661443572378921 0.9131882770870338
8 0.2517628838766373 0.9156860568383659
9 0.2579735633396075 0.9164076376554174
0.9167104343406293
embedding.weight     | nonzeros = 2500100 / 2500200 (100.00%) | total_pruned =     100 | shape = (25002, 100)
rnn.weight_ih_l0     | nonzeros =  102400 /  102400 (100.00%) | total_pruned =       0 | shape = (1024, 100)
rnn.weight_hh_l0     | nonzeros =  262144 /  262144 (100.00%) | total_pruned =       0 | shape = (1024, 256)
rnn.bias_ih_l0       | nonzeros =    1024 /    1024 (100.00%) | total_pruned =       0 | shape = (1024,)
rnn.bias_hh_l0       | nonzeros =    1024 /    1024 (100.00%) | total_pruned =       0 | shap

In [0]:
# random
print(performance)
print(valid_losses)

reinit_performance = performance
reinit_valid_losses = valid_losses

[91.67104343406292, 91.99929973658394, 91.72356444246628, 91.54849438106312, 91.4434523642564, 91.11957284582763, 91.08018208952511, 90.79569327731093, 90.83508403361344, 90.83508403361344, 90.8263305155169, 90.58998597770179, 90.6862745264999, 90.58123250969318, 90.46306024078562, 90.15231092436974, 90.06477589366817, 89.49579831932773, 89.63147757434044, 89.25070026341606, 88.81740194408833]
[0.2579735633396075, 0.27358067152793414, 0.2978634451372056, 0.334012394810681, 0.36997072915769913, 0.44524972337640517, 0.5073511471730354, 0.481611964190263, 0.4754104249023867, 0.45782866703634206, 0.4740623618454588, 0.4622044965143433, 0.4747044229119037, 0.45975155888710895, 0.4330999290464747, 0.43187101953373447, 0.4221988604296508, 0.4296411497429093, 0.43029266677558636, 0.4259929670580062, 0.44278307254450994]


## 2) lt

In [0]:
reinit = False # random이면 True, lt면 False
mode = 'lt'

In [0]:
# load model and set hyperparameters
model = load_model(arch_choice, data_choice, batch_size)

# model initialization and save the state
model.apply(initialize_xavier_normal)
initial_state_dict = copy.deepcopy(model.state_dict())

# initial state 저장
torch.save(model.state_dict(), f'{data_choice}-{arch_choice}-{trial}-{mode}-initial.pt')

In [0]:
model = model.to(device)
make_mask(model)

optimizer = optim.Adam(model.parameters(), lr=learning_rate)
criterion = nn.CrossEntropyLoss()

criterion = criterion.to(device)

In [0]:
for name, param in model.named_parameters():
  print(name, param.size())

embedding.weight torch.Size([25002, 100])
rnn.weight_ih_l0 torch.Size([1024, 100])
rnn.weight_hh_l0 torch.Size([1024, 256])
rnn.bias_ih_l0 torch.Size([1024])
rnn.bias_hh_l0 torch.Size([1024])
rnn.weight_ih_l0_reverse torch.Size([1024, 100])
rnn.weight_hh_l0_reverse torch.Size([1024, 256])
rnn.bias_ih_l0_reverse torch.Size([1024])
rnn.bias_hh_l0_reverse torch.Size([1024])
rnn.weight_ih_l1 torch.Size([1024, 512])
rnn.weight_hh_l1 torch.Size([1024, 256])
rnn.bias_ih_l1 torch.Size([1024])
rnn.bias_hh_l1 torch.Size([1024])
rnn.weight_ih_l1_reverse torch.Size([1024, 512])
rnn.weight_hh_l1_reverse torch.Size([1024, 256])
rnn.bias_ih_l1_reverse torch.Size([1024])
rnn.bias_hh_l1_reverse torch.Size([1024])
fc.weight torch.Size([4, 512])
fc.bias torch.Size([4])


In [0]:
print("Iterative Pruning started")

performance = [] # 결과값을 담을 리스트
valid_losses = []

for pruning_iter in range(0,iteration+1):
  print(f"Running pruning iteration {pruning_iter}/{iteration}")

  # 첫 iter에는 no model compression
  if not pruning_iter == 0:

    # pruning
    prune_by_percentile(percent)

    # random initialization
    if reinit:
      model.apply(initialize_xavier_normal) # random initialization
      model.embedding.weight = embedding_pretrained_weight # embedding은 공통적으로 초기 임베딩으로 초기화
    
      step = 0
      for name, param in model.named_parameters():
      # if 'weight' in name: 
         weight_dev = param.device
         param.data = torch.from_numpy(param.data.cpu().numpy() * mask[step]).to(weight_dev)
         step = step + 1
      step = 0
    
    # lt initialization
    else:
      original_initialization(mask, initial_state_dict)

  optimizer = optim.Adam(model.parameters(), lr=learning_rate)

  # train 
  count = 0
  for epoch in range(N_EPOCH):
    train_loss, train_acc = train(model, dataset.train_iter, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, dataset.valid_iter, criterion)
    print(epoch, valid_loss, valid_acc)

  test_loss, test_acc = evaluate(model, dataset.test_iter, criterion)
  torch.save(model.state_dict(), f'{data_choice}-{arch_choice}-{trial}-{mode}-{pruning_iter}.pt')
  print(test_acc)

  valid_losses.append(valid_loss)
  performance.append(100*test_acc)

  print_nonzeros(model)

Iterative Pruning started
Running pruning iteration 0/20
0 0.4425300905914385 0.8448046181172292
1 0.34368463854436127 0.8834646980461812
2 0.2967500912875837 0.8958425843694494
3 0.2879317391193993 0.9011156749555951
4 0.2766686928810859 0.9068883214920072
5 0.2941314962727297 0.9080261989342806
6 0.2592108170890628 0.9151032415630551
7 0.24688043241880397 0.916934946714032
8 0.24764901322526048 0.9180450710479574
9 0.25478339226019847 0.9182115896980462
0.9174982494666797
embedding.weight     | nonzeros = 2500100 / 2500200 (100.00%) | total_pruned =     100 | shape = (25002, 100)
rnn.weight_ih_l0     | nonzeros =  102400 /  102400 (100.00%) | total_pruned =       0 | shape = (1024, 100)
rnn.weight_hh_l0     | nonzeros =  262144 /  262144 (100.00%) | total_pruned =       0 | shape = (1024, 256)
rnn.bias_ih_l0       | nonzeros =    1024 /    1024 (100.00%) | total_pruned =       0 | shape = (1024,)
rnn.bias_hh_l0       | nonzeros =    1024 /    1024 (100.00%) | total_pruned =       0 |

In [0]:
# lt
print(performance)
print(valid_losses)
lt_performance = performance
lt_valid_losses = valid_losses

[91.74982494666797, 91.87675068358413, 91.82422967518077, 91.88988093568497, 91.78046218487394, 91.29464285714286, 91.65353639786986, 91.68417368616376, 91.87237394957984, 91.49159663865547, 91.32090336134453, 91.36029411764706, 90.86572127181942, 90.51120446509674, 90.33613445378151, 89.85031512605042, 89.21130950711354, 88.12587536683604, 87.53939075630252, 86.31827731092437, 85.66176470588235]
[0.25478339226019847, 0.25439854130665, 0.25063009309991, 0.26199866418245366, 0.25844997642677586, 0.2707624379512366, 0.25693596065289226, 0.2641534540623585, 0.2589743698419437, 0.265988693944779, 0.26559453369589725, 0.2676521922688316, 0.2753369511397916, 0.28526257577999026, 0.2939089261124417, 0.29952694776552347, 0.3082628239667437, 0.3295927064536308, 0.34389051455561054, 0.3646020691587092, 0.39220759259672505]


## 3) lt+lr (last epoch)

In [0]:
reinit = False # random이면 True, lt면 False
mode = 'lr_l'

In [0]:
# load model and set hyperparameters
model = load_model(arch_choice, data_choice, batch_size)

# model initialization and save the state
model.apply(initialize_xavier_normal)

# initial state 저장
torch.save(model.state_dict(), f'{data_choice}-{arch_choice}-{trial}-{mode}-initial.pt')

In [0]:
model = model.to(device)
make_mask(model)

optimizer = optim.Adam(model.parameters(), lr=learning_rate)
criterion = nn.CrossEntropyLoss()

criterion = criterion.to(device)

In [20]:
for name, param in model.named_parameters():
  print(name, param.size())

embedding.weight torch.Size([25002, 100])
rnn.weight_ih_l0 torch.Size([1024, 100])
rnn.weight_hh_l0 torch.Size([1024, 256])
rnn.bias_ih_l0 torch.Size([1024])
rnn.bias_hh_l0 torch.Size([1024])
rnn.weight_ih_l0_reverse torch.Size([1024, 100])
rnn.weight_hh_l0_reverse torch.Size([1024, 256])
rnn.bias_ih_l0_reverse torch.Size([1024])
rnn.bias_hh_l0_reverse torch.Size([1024])
rnn.weight_ih_l1 torch.Size([1024, 512])
rnn.weight_hh_l1 torch.Size([1024, 256])
rnn.bias_ih_l1 torch.Size([1024])
rnn.bias_hh_l1 torch.Size([1024])
rnn.weight_ih_l1_reverse torch.Size([1024, 512])
rnn.weight_hh_l1_reverse torch.Size([1024, 256])
rnn.bias_ih_l1_reverse torch.Size([1024])
rnn.bias_hh_l1_reverse torch.Size([1024])
fc.weight torch.Size([4, 512])
fc.bias torch.Size([4])


In [21]:

print("Iterative Pruning started")

performance = [] # 결과값을 담을 리스트
valid_losses = []

for pruning_iter in range(0,iteration+1):
  print(f"Running pruning iteration {pruning_iter}/{iteration}")

  # 첫 iter에는 no model compression
  if not pruning_iter == 0:

    # pruning
    prune_by_percentile(percent)

    # random initialization
    if reinit:
      model.apply(initialize_xavier_normal) # random initialization
      model.embedding.weight = embedding_pretrained_weight # embedding은 공통적으로 초기 임베딩으로 초기화
    
      step = 0
      for name, param in model.named_parameters():
      # if 'weight' in name: 
         weight_dev = param.device
         param.data = torch.from_numpy(param.data.cpu().numpy() * mask[step]).to(weight_dev)
         step = step + 1
      step = 0
    
    # lt initialization
    else:
      original_initialization(mask, initial_state_dict)

  optimizer = optim.Adam(model.parameters(), lr=learning_rate)
  
  # train 
  for epoch in range(N_EPOCH):
    train_loss, train_acc = train(model, dataset.train_iter, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, dataset.valid_iter, criterion)
    print(epoch, valid_loss, valid_acc)

  test_loss, test_acc = evaluate(model, dataset.test_iter, criterion)
  torch.save(model.state_dict(), f'{data_choice}-{arch_choice}-{trial}-{mode}-{pruning_iter}.pt')
  print(test_acc)

  valid_losses.append(valid_loss)
  performance.append(100*test_acc)

  # late rewinding after the last epoch
  if pruning_iter == 0:
     initial_state_dict = copy.deepcopy(model.state_dict())

  print_nonzeros(model)

Iterative Pruning started
Running pruning iteration 0/20
0 0.43145262798357287 0.8448601243339254
1 0.3526589562934468 0.8822990674955595
2 0.3373533128156967 0.8864897868561279
3 0.29217160249304286 0.9013654529307282
4 0.269145242290879 0.9060002220248667
5 0.2561660493669459 0.9128829928952042
6 0.25366810691041697 0.9154085257548845
7 0.2626210951823642 0.9137433392539964
8 0.25327873740140117 0.9176287744227354
9 0.2571922519159889 0.9182115896980462
0.9155287116515536
embedding.weight     | nonzeros = 2500100 / 2500200 (100.00%) | total_pruned =     100 | shape = (25002, 100)
rnn.weight_ih_l0     | nonzeros =  102400 /  102400 (100.00%) | total_pruned =       0 | shape = (1024, 100)
rnn.weight_hh_l0     | nonzeros =  262144 /  262144 (100.00%) | total_pruned =       0 | shape = (1024, 256)
rnn.bias_ih_l0       | nonzeros =    1024 /    1024 (100.00%) | total_pruned =       0 | shape = (1024,)
rnn.bias_hh_l0       | nonzeros =    1024 /    1024 (100.00%) | total_pruned =       0 |

In [22]:
# lr_l
print(performance)
print(valid_losses)

lr_l_performance = performance
lr_l_valid_losses = valid_losses

[91.55287116515535, 92.20938377019738, 92.26190477860075, 92.35381654330662, 91.95553219618917, 92.29691875081102, 92.35819327731093, 92.30129553490326, 92.16999301389485, 92.02118345669338, 92.17874648190346, 91.85924369747899, 91.9292716919875, 91.64478292986125, 91.66666664997068, 91.31214984324801, 90.66876749030682, 89.98161764705883, 89.62710084033614, 88.7955182239789, 88.047093854231]
[0.2571922519159889, 0.27851588419947915, 0.2837045765192402, 0.2766826276933745, 0.2947630960490971, 0.2813820012582079, 0.2825452032173135, 0.2697640933777945, 0.25813463290898564, 0.2541146230061752, 0.2540732260783522, 0.24683570512446373, 0.24654309607737865, 0.2500740755713325, 0.26294100680099025, 0.26627229705624955, 0.2836239071691915, 0.2972433092502702, 0.30318846749257605, 0.3397329295563576, 0.3405035078942405]
