[reference](https://github.com/bentrevett/pytorch-sentiment-analysis)

In [0]:
# Importing Libraries
import os
import copy
import time
import torch
import torchtext
from torchtext import data
from torchtext import datasets
from torchtext.vocab import GloVe
import torch.nn.functional as F
from torch.autograd import Variable
import torch.optim as optim
import numpy as np
import pandas as pd
from tqdm import tqdm

In [0]:
!pwd

/content


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
path = '/content/drive/My Drive/lt-module'
os.chdir(path)

In [0]:
# Custom Libraries
import DataLoader
import Model

## Define functions - to load dataset & model

In [0]:
# load dataset
def load_dataset(dataset, architecture, batch_size, device, path):
    if dataset == "imdb":
      if architecture == "cnn":
        data = DataLoader.IMDB_CNN_CUSTOM(batch_size, device, path)
      elif architecture == "lstm":
        data = DataLoader.IMDB_LSTM(batch_size, device, path)

    elif dataset == "agnews":
        data = DataLoader.AGNEWS(batch_size, device, path)

    else:
        raise ValueError(dataset + "is not supported")

    return data

# load model and set hyperparameters
def load_model(architecture, data_choice, batch_size):

    if architecture == "cnn":
      # hyperparameters
      vocab_size = len(dataset.TEXT.vocab)
      embedding_dim = 100
      n_filters = 100
      filter_sizes = [3,4,5]
      dropout = 0.5
      pad_idx = dataset.TEXT.vocab.stoi[dataset.TEXT.pad_token]
      
      if data_choice == "imdb":
        # binary-class
        output_dim = 1
        model = Model.binaryCNN(vocab_size, embedding_dim, n_filters, filter_sizes, output_dim, dropout, pad_idx)
      
      elif data_choice == "agnews":
        # multi-class
        output_dim = 4
        model = Model.multiCNN(vocab_size, embedding_dim, n_filters, filter_sizes, output_dim, dropout, pad_idx)

      unk_idx = dataset.TEXT.vocab.stoi[dataset.TEXT.unk_token]
      model.embedding.weight.data[unk_idx] = torch.zeros(embedding_dim)
      model.embedding.weight.data[pad_idx] = torch.zeros(embedding_dim)

      return model

    elif architecture == "lstm":
      # hyperparameters
      vocab_size = len(dataset.TEXT.vocab)
      embedding_dim = 100
      hidden_dim = 256
      output_dim = 1
      n_layers = 2
      bidirectional = True
      dropout = 0.5
      pad_idx = dataset.TEXT.vocab.stoi[dataset.TEXT.pad_token]
      
      if data_choice == "imdb":
        # binary-class
        output_dim = 1
      
      elif data_choice == "agnews":
        # multi-class
        output_dim = 4
      
      model = Model.LSTM(vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, bidirectional, dropout, pad_idx)

      unk_idx = dataset.TEXT.vocab.stoi[dataset.TEXT.unk_token]
      model.embedding.weight.data[unk_idx] = torch.zeros(embedding_dim)
      model.embedding.weight.data[pad_idx] = torch.zeros(embedding_dim)

      return model

    # temporary
    elif architecture == "bert":
        return None

    else:
        raise ValueError(architecture + "is not supported")

## Weight initialization

In [0]:
# weight initializtion
def initialize_xavier_normal(m):
    
  """
	Function to initialize a layer by picking weights from a xavier normal distribution
	Arguments
	---------
	m : The layer of the neural network
	Returns
	-------
	None
	"""
  
  if type(m) == torch.nn.Conv2d:
    torch.nn.init.xavier_normal_(m.weight.data)
    m.bias.data.fill_(0)

  elif isinstance(m, torch.nn.Linear):
    torch.nn.init.xavier_normal_(m.weight.data)
    m.bias.data.fill_(0)

  elif type(m) in [torch.nn.GRU, torch.nn.LSTM, torch.nn.RNN]:
    for name, param in m.named_parameters():
        if 'weight_ih' in name:
          torch.nn.init.xavier_normal_(param.data)
        elif 'weight_hh' in name:
          torch.nn.init.orthogonal_(param.data)
        elif 'bias' in name:
          param.data.fill_(0)

## train and test functions

In [0]:
def binary_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """

    #round predictions to the closest integer
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float() #convert into float for division 
    acc = correct.sum() / len(correct)
    return acc

In [0]:
def train(model, iterator, optimizer, criterion):
    # EPS = 1e-6
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
    
    for batch in iterator:
      
      optimizer.zero_grad()
      text, text_lengths = batch.text
      target = batch.label
      predictions = model(text, text_lengths).squeeze(1)
      loss = criterion(predictions, target)
      acc = binary_accuracy(predictions, target)
      loss.backward()
      step = 0

      # Freezing Pruned weights by making their gradients Zero
      for name, p in model.named_parameters():
        weight_dev = param.device
        tensor = p.data.cpu().numpy()
        grad_tensor = p.grad.data.cpu().numpy()
        grad_tensor = np.where(mask[step] == 0, 0, grad_tensor)
        p.grad.data = torch.from_numpy(grad_tensor).to(device)
        step += 1
      
      step = 0

      optimizer.step()
        
      epoch_loss += loss.item()
      epoch_acc += acc.item()
      
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

def evaluate(model, iterator, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.eval()
    
    with torch.no_grad():
    
        for batch in iterator:
          text, text_lengths = batch.text
          target = batch.label
          
          predictions = model(text, text_lengths).squeeze(1)
            
          loss = criterion(predictions, batch.label)
            
          acc = binary_accuracy(predictions, batch.label)

          epoch_loss += loss.item()
          epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

## Pruning Functions

In [0]:
# Prune by Percentile module
def prune_by_percentile(percent):
  global step
  global mask
  global model
  # Calculate percentile value
  step = 0
  for name, param in model.named_parameters():
    # if 'weight' in name:
    tensor = param.data.cpu().numpy()
    nz_count = np.count_nonzero(tensor)

    # bias가 all pruned 되는 경우 발생
    if nz_count == 0:
      step += 1

    else: 
      alive = tensor[np.nonzero(tensor)] # flattened array of nonzero values
      percentile_value = np.percentile(abs(alive), percent)

      # Convert Tensors to numpy and calculate
      weight_dev = param.device
      new_mask = np.where(abs(tensor) < percentile_value, 0, mask[step])
                
      # Apply new weight and mask
      param.data = torch.from_numpy(tensor * new_mask).to(weight_dev)
      mask[step] = new_mask
      step += 1

  step = 0

In [0]:
# Function to make an empty mask of the same size as the model
def make_mask(model):
  global step
  global mask
  
  step = 0
  for name, param in model.named_parameters(): 
    # if 'weight' in name:
    step += 1
  mask = [None]* step 
  step = 0
  for name, param in model.named_parameters(): 
    # if 'weight' in name:
    tensor = param.data.cpu().numpy()
    mask[step] = np.ones_like(tensor)
    step += 1
  step = 0

In [0]:
def original_initialization(mask_temp, initial_state_dict):
    global step
    global model
    
    step = 0
    for name, param in model.named_parameters(): 
        # if "weight" in name: 
            weight_dev = param.device
            param.data = torch.from_numpy(mask_temp[step] * initial_state_dict[name].cpu().numpy()).to(weight_dev)
            step = step + 1
        # if "bias" in name:
            # param.data = initial_state_dict[name]
    step = 0

In [0]:
# ANCHOR Print table of zeros and non-zeros count
def print_nonzeros(model):
    nonzero = total = 0
    for name, p in model.named_parameters():
        tensor = p.data.cpu().numpy()
        nz_count = np.count_nonzero(tensor)
        total_params = np.prod(tensor.shape)
        nonzero += nz_count
        total += total_params
        print(f'{name:20} | nonzeros = {nz_count:7} / {total_params:7} ({100 * nz_count / total_params:6.2f}%) | total_pruned = {total_params - nz_count :7} | shape = {tensor.shape}')
    print(f'alive: {nonzero}, pruned : {total - nonzero}, total: {total}, Compression rate : {total/nonzero:10.2f}x  ({100 * (total-nonzero) / total:6.2f}% pruned)')
    return (round(((total-nonzero)/total)*100,1))

## Choose dataset & model

In [13]:
# colab에서 돌리기 때문에 우선 arg가 아니라 변수로 넘겨주기
data_choice = "imdb" # 데이터셋 선택
arch_choice = "lstm" # 모델 선택
batch_size = 64 # batch size 선택 - 32 or 64
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using {device} device.')

Using cuda device.


### load dataset

In [0]:
dataset = load_dataset(data_choice, arch_choice, batch_size, device, path)

### model setting

In [0]:
# check the trial number
trial = 9901

# pruning setting
iteration = 20
percent = 20 # 20% prune
N_EPOCH = 10
learning_rate = 5e-3

step = 0

## 1) random initialization

In [0]:
reinit = True # random이면 True, lt면 False
mode = 'random'

In [17]:
# load model and set hyperparameters
model = load_model(arch_choice, data_choice, batch_size)

# embedding 같은 것 주기 위해
embedding_pretrained_weight = model.embedding.weight

# model initialization and save the model
model.apply(initialize_xavier_normal)

LSTM(
  (embedding): Embedding(25002, 100, padding_idx=1)
  (rnn): LSTM(100, 256, num_layers=2, dropout=0.5, bidirectional=True)
  (fc): Linear(in_features=512, out_features=1, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)

In [0]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 4,810,857 trainable parameters


In [0]:
# initial state 저장
torch.save(model.state_dict(), f'{data_choice}-{arch_choice}-{trial}-{mode}-initial.pt')

In [0]:
model = model.to(device)
make_mask(model)

optimizer = optim.Adam(model.parameters(), lr=learning_rate)
criterion = torch.nn.BCEWithLogitsLoss()

criterion = criterion.to(device)

In [0]:
for name, param in model.named_parameters():
  print(name, param.size())

embedding.weight torch.Size([25002, 100])
rnn.weight_ih_l0 torch.Size([1024, 100])
rnn.weight_hh_l0 torch.Size([1024, 256])
rnn.bias_ih_l0 torch.Size([1024])
rnn.bias_hh_l0 torch.Size([1024])
rnn.weight_ih_l0_reverse torch.Size([1024, 100])
rnn.weight_hh_l0_reverse torch.Size([1024, 256])
rnn.bias_ih_l0_reverse torch.Size([1024])
rnn.bias_hh_l0_reverse torch.Size([1024])
rnn.weight_ih_l1 torch.Size([1024, 512])
rnn.weight_hh_l1 torch.Size([1024, 256])
rnn.bias_ih_l1 torch.Size([1024])
rnn.bias_hh_l1 torch.Size([1024])
rnn.weight_ih_l1_reverse torch.Size([1024, 512])
rnn.weight_hh_l1_reverse torch.Size([1024, 256])
rnn.bias_ih_l1_reverse torch.Size([1024])
rnn.bias_hh_l1_reverse torch.Size([1024])
fc.weight torch.Size([1, 512])
fc.bias torch.Size([1])


In [0]:
print(list(model.named_parameters()))

[('embedding.weight', Parameter containing:
tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.7289, -0.7336,  1.5624,  ..., -0.5592, -0.4480, -0.6476],
        ...,
        [ 0.0914,  1.5196,  0.4670,  ...,  0.6393, -0.0332,  0.0185],
        [-0.6290,  0.4650, -0.7165,  ..., -1.3171,  2.0381, -2.0497],
        [-1.1222, -0.0240, -1.0878,  ..., -0.4948, -0.3874,  0.0339]],
       device='cuda:0', requires_grad=True)), ('rnn.weight_ih_l0', Parameter containing:
tensor([[-0.0007, -0.0342, -0.0056,  ...,  0.0398, -0.0260,  0.0581],
        [-0.0651, -0.0441,  0.0244,  ..., -0.0132, -0.0476, -0.0412],
        [-0.0146,  0.0274, -0.0125,  ..., -0.0522, -0.0511,  0.0084],
        ...,
        [ 0.0020,  0.0523,  0.0376,  ..., -0.1092, -0.0735,  0.0090],
        [ 0.0571,  0.0342, -0.0489,  ..., -0.0208,  0.0213, -0.0137],
        [-0.0753,  0.0224,  0.0323,  ...,  0.0165, -0.0369,  0.0058]],

In [0]:
print("Iterative Pruning started")

performance = [] # 결과값을 담을 리스트
valid_losses = []

for pruning_iter in range(0,iteration+1):
  print(f"Running pruning iteration {pruning_iter}/{iteration}")

  # 첫 iter에는 no model compression
  if not pruning_iter == 0:

    # pruning
    prune_by_percentile(percent)

    # random initialization
    if reinit:
      model.apply(initialize_xavier_normal) # random initialization
      model.embedding.weight = embedding_pretrained_weight # embedding은 공통적으로 초기 임베딩으로 초기화
    
      step = 0
      for name, param in model.named_parameters():
      # if 'weight' in name: 
         weight_dev = param.device
         param.data = torch.from_numpy(param.data.cpu().numpy() * mask[step]).to(weight_dev)
         step = step + 1
      step = 0
    
    # lt initialization
    else:
      original_initialization(mask, initial_state_dict)

  optimizer = optim.Adam(model.parameters(), lr=learning_rate)

  # train 
  for epoch in range(N_EPOCH):
    train_loss, train_acc = train(model, dataset.train_iter, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, dataset.valid_iter, criterion)
    print(epoch, valid_loss, valid_acc)

  test_loss, test_acc = evaluate(model, dataset.test_iter, criterion)
  torch.save(model.state_dict(), f'{data_choice}-{arch_choice}-{trial}-{mode}-{pruning_iter}.pt')
  print(test_acc)

  valid_losses.append(valid_loss)
  performance.append(100*test_acc)

  print_nonzeros(model)

Iterative Pruning started
Running pruning iteration 0/20
0 0.662527102739253 0.5914228723404256
1 0.3716509703626024 0.8367686170212766
2 0.30710537533493754 0.87109375
3 0.3145540512781194 0.8773271276595744
4 0.2643658808888273 0.8996010638297872
5 0.36935136752559783 0.866688829787234
6 0.2801299524751115 0.8993517287234043
7 0.28347136072338897 0.9019281914893617
8 0.2799913494233438 0.8994348404255319
9 0.28661345952051753 0.9040059840425532
0.9014729299363057
embedding.weight     | nonzeros = 2500100 / 2500200 (100.00%) | total_pruned =     100 | shape = (25002, 100)
rnn.weight_ih_l0     | nonzeros =  102400 /  102400 (100.00%) | total_pruned =       0 | shape = (1024, 100)
rnn.weight_hh_l0     | nonzeros =  262144 /  262144 (100.00%) | total_pruned =       0 | shape = (1024, 256)
rnn.bias_ih_l0       | nonzeros =    1024 /    1024 (100.00%) | total_pruned =       0 | shape = (1024,)
rnn.bias_hh_l0       | nonzeros =    1024 /    1024 (100.00%) | total_pruned =       0 | shape = 

In [0]:
# random
print(performance)
print(valid_losses)

reinit_performance = performance
reinit_valid_losses = valid_losses

[90.14729299363057, 89.37101910828027, 88.51512738853503, 88.76393312101911, 88.34593949044586, 87.96775477707006, 88.14689490445859, 88.12699044585987, 88.05732484076432, 87.98765923566879, 86.91281847133759, 87.45023885350318, 87.54976114649682, 87.5, 86.75358280254777, 86.22611464968153, 85.718550955414, 85.80812101910828, 84.62380573248409, 84.156050955414, 83.23049363057325]
[0.28661345952051753, 0.41716557053571685, 0.45840316618535115, 0.7687182424867407, 0.6587125841290393, 0.7442224729568401, 0.7605741213452308, 0.7588291754747959, 0.7431053927445666, 0.620290422376166, 0.6481864425016844, 0.5886521193575351, 0.5730034785464089, 0.6326802684430112, 0.6607087347260181, 0.7203212646727867, 0.6760027712171383, 0.8521833796291909, 0.8220768962451752, 0.775856504097898, 0.7892035386188233]


## 2) lt

In [0]:
reinit = False # random이면 True, lt면 False
mode = 'lt'

In [0]:
# load model and set hyperparameters
model = load_model(arch_choice, data_choice, batch_size)

# model initialization and save the state
model.apply(initialize_xavier_normal)
initial_state_dict = copy.deepcopy(model.state_dict())

# initial state 저장
torch.save(model.state_dict(), f'{data_choice}-{arch_choice}-{trial}-{mode}-initial.pt')

In [0]:
model = model.to(device)
make_mask(model)

optimizer = optim.Adam(model.parameters(), lr=learning_rate)
criterion = torch.nn.BCEWithLogitsLoss()

criterion = criterion.to(device)

In [0]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 4,810,857 trainable parameters


In [0]:
for name, param in model.named_parameters():
  print(name, param.size())

embedding.weight torch.Size([25002, 100])
rnn.weight_ih_l0 torch.Size([1024, 100])
rnn.weight_hh_l0 torch.Size([1024, 256])
rnn.bias_ih_l0 torch.Size([1024])
rnn.bias_hh_l0 torch.Size([1024])
rnn.weight_ih_l0_reverse torch.Size([1024, 100])
rnn.weight_hh_l0_reverse torch.Size([1024, 256])
rnn.bias_ih_l0_reverse torch.Size([1024])
rnn.bias_hh_l0_reverse torch.Size([1024])
rnn.weight_ih_l1 torch.Size([1024, 512])
rnn.weight_hh_l1 torch.Size([1024, 256])
rnn.bias_ih_l1 torch.Size([1024])
rnn.bias_hh_l1 torch.Size([1024])
rnn.weight_ih_l1_reverse torch.Size([1024, 512])
rnn.weight_hh_l1_reverse torch.Size([1024, 256])
rnn.bias_ih_l1_reverse torch.Size([1024])
rnn.bias_hh_l1_reverse torch.Size([1024])
fc.weight torch.Size([1, 512])
fc.bias torch.Size([1])


In [0]:
print("Iterative Pruning started")

performance = [] # 결과값을 담을 리스트
valid_losses = []

for pruning_iter in range(0,iteration+1):
  print(f"Running pruning iteration {pruning_iter}/{iteration}")

  # 첫 iter에는 no model compression
  if not pruning_iter == 0:

    # pruning
    prune_by_percentile(percent)

    # random initialization
    if reinit:
      model.apply(initialize_xavier_normal) # random initialization
      model.embedding.weight = embedding_pretrained_weight # embedding은 공통적으로 초기 임베딩으로 초기화
    
      step = 0
      for name, param in model.named_parameters():
      # if 'weight' in name: 
         weight_dev = param.device
         param.data = torch.from_numpy(param.data.cpu().numpy() * mask[step]).to(weight_dev)
         step = step + 1
      step = 0
    
    # lt initialization
    else:
      original_initialization(mask, initial_state_dict)

  optimizer = optim.Adam(model.parameters(), lr=learning_rate)

  # train 
  for epoch in range(N_EPOCH):
    train_loss, train_acc = train(model, dataset.train_iter, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, dataset.valid_iter, criterion)
    print(epoch, valid_loss, valid_acc)

  test_loss, test_acc = evaluate(model, dataset.test_iter, criterion)
  torch.save(model.state_dict(), f'{data_choice}-{arch_choice}-{trial}-{mode}-{pruning_iter}.pt')
  print(test_acc)

  valid_losses.append(valid_loss)
  performance.append(100*test_acc)

  print_nonzeros(model)

Iterative Pruning started
Running pruning iteration 0/20
0 0.6275445514536918 0.6356382978723404
1 0.29055116547549026 0.8817320478723404
2 0.2515067496198289 0.9027593085106383
3 0.2698628051563146 0.9006815159574468
4 0.29401789498614506 0.9046708776595744
5 0.25580672650261127 0.9069148936170213
6 0.2908192324511548 0.9043384308510638
7 0.25336884845920066 0.905751329787234
8 0.29126042993541096 0.9030086436170213
9 0.2709320707920384 0.9074135638297872
0.9060509554140127
embedding.weight     | nonzeros = 2500100 / 2500200 (100.00%) | total_pruned =     100 | shape = (25002, 100)
rnn.weight_ih_l0     | nonzeros =  102400 /  102400 (100.00%) | total_pruned =       0 | shape = (1024, 100)
rnn.weight_hh_l0     | nonzeros =  262144 /  262144 (100.00%) | total_pruned =       0 | shape = (1024, 256)
rnn.bias_ih_l0       | nonzeros =    1024 /    1024 (100.00%) | total_pruned =       0 | shape = (1024,)
rnn.bias_hh_l0       | nonzeros =    1024 /    1024 (100.00%) | total_pruned =       0 

In [0]:

# lt
print(performance)
print(valid_losses)
lt_performance = performance
lt_valid_losses = valid_losses

[90.60509554140127, 90.06767515923568, 89.04259554140127, 89.65963375796179, 89.46058917197452, 89.91839171974523, 90.07762738853503, 89.02269108280255, 88.80374203821655, 90.23686305732484, 89.70939490445859, 89.97810509554141, 89.83877388535032, 89.32125796178345, 89.30135350318471, 87.73885350318471, 88.87340764331209, 88.02746815286623, 87.2312898089172, 86.84315286624204, 86.33558917197452]
[0.2709320707920384, 0.3209210311042819, 0.355762850988577, 0.37667565362805383, 0.4281917932899074, 0.37324392522110583, 0.33273945213790906, 0.4613666643012077, 0.34915847057833316, 0.3047537452917784, 0.2992147037244223, 0.29310907639483824, 0.3200863677849795, 0.3248282623615988, 0.300750439055264, 0.3323654755157359, 0.3597956737899717, 0.35731010701745114, 0.3436133074078788, 0.3453334332939158, 0.35425657735384525]


## 3) lt+lr (last epoch)

In [0]:
reinit = False # random이면 True, lt면 False
mode = 'lr_l'

In [0]:
# load model and set hyperparameters
model = load_model(arch_choice, data_choice, batch_size)

# model initialization and save the state
model.apply(initialize_xavier_normal)

# initial state 저장
torch.save(model.state_dict(), f'{data_choice}-{arch_choice}-{trial}-{mode}-initial.pt')

In [0]:
model = model.to(device)
make_mask(model)

optimizer = optim.Adam(model.parameters(), lr=learning_rate)
criterion = torch.nn.BCEWithLogitsLoss()

criterion = criterion.to(device)

In [39]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 4,810,857 trainable parameters


In [40]:
for name, param in model.named_parameters():
  print(name, param.size())

embedding.weight torch.Size([25002, 100])
rnn.weight_ih_l0 torch.Size([1024, 100])
rnn.weight_hh_l0 torch.Size([1024, 256])
rnn.bias_ih_l0 torch.Size([1024])
rnn.bias_hh_l0 torch.Size([1024])
rnn.weight_ih_l0_reverse torch.Size([1024, 100])
rnn.weight_hh_l0_reverse torch.Size([1024, 256])
rnn.bias_ih_l0_reverse torch.Size([1024])
rnn.bias_hh_l0_reverse torch.Size([1024])
rnn.weight_ih_l1 torch.Size([1024, 512])
rnn.weight_hh_l1 torch.Size([1024, 256])
rnn.bias_ih_l1 torch.Size([1024])
rnn.bias_hh_l1 torch.Size([1024])
rnn.weight_ih_l1_reverse torch.Size([1024, 512])
rnn.weight_hh_l1_reverse torch.Size([1024, 256])
rnn.bias_ih_l1_reverse torch.Size([1024])
rnn.bias_hh_l1_reverse torch.Size([1024])
fc.weight torch.Size([1, 512])
fc.bias torch.Size([1])


In [41]:
print("Iterative Pruning started")

performance = [] # 결과값을 담을 리스트
valid_losses = []

for pruning_iter in range(0,iteration+1):
  print(f"Running pruning iteration {pruning_iter}/{iteration}")

  # 첫 iter에는 no model compression
  if not pruning_iter == 0:

    # pruning
    prune_by_percentile(percent)

    # random initialization
    if reinit:
      model.apply(initialize_xavier_normal) # random initialization
      model.embedding.weight = embedding_pretrained_weight # embedding은 공통적으로 초기 임베딩으로 초기화
    
      step = 0
      for name, param in model.named_parameters():
      # if 'weight' in name: 
         weight_dev = param.device
         param.data = torch.from_numpy(param.data.cpu().numpy() * mask[step]).to(weight_dev)
         step = step + 1
      step = 0
    
    # lt initialization
    else:
      original_initialization(mask, initial_state_dict)

  optimizer = optim.Adam(model.parameters(), lr=learning_rate)

  # train  
  for epoch in range(N_EPOCH):
    train_loss, train_acc = train(model, dataset.train_iter, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, dataset.valid_iter, criterion)
    print(epoch, valid_loss, valid_acc)

  test_loss, test_acc = evaluate(model, dataset.test_iter, criterion)
  torch.save(model.state_dict(), f'{data_choice}-{arch_choice}-{trial}-{mode}-{pruning_iter}.pt')
  print(test_acc)

  valid_losses.append(valid_loss)
  performance.append(100*test_acc)

  # late rewinding after the last epoch
  if pruning_iter == 0:
     initial_state_dict = copy.deepcopy(model.state_dict())

  print_nonzeros(model)

Iterative Pruning started
Running pruning iteration 0/20
0 0.6665587637652742 0.586685505319149
1 0.6272801439812843 0.6527593085106383
2 0.3273678425144642 0.863031914893617
3 0.28569440496094684 0.882563164893617
4 0.34947340483678146 0.8648603723404256
5 0.2674269433668319 0.8961103723404256
6 0.29116612423132077 0.8877992021276596
7 0.25458272473212884 0.9029255319148937
8 0.25717981492585323 0.897689494680851
9 0.2673580652261351 0.9003490691489362
0.8995820063694268
embedding.weight     | nonzeros = 2500100 / 2500200 (100.00%) | total_pruned =     100 | shape = (25002, 100)
rnn.weight_ih_l0     | nonzeros =  102400 /  102400 (100.00%) | total_pruned =       0 | shape = (1024, 100)
rnn.weight_hh_l0     | nonzeros =  262144 /  262144 (100.00%) | total_pruned =       0 | shape = (1024, 256)
rnn.bias_ih_l0       | nonzeros =    1024 /    1024 (100.00%) | total_pruned =       0 | shape = (1024,)
rnn.bias_hh_l0       | nonzeros =    1024 /    1024 (100.00%) | total_pruned =       0 | s

In [42]:
# lr_l
print(performance)
print(valid_losses)

lr_l_performance = performance
lr_l_valid_losses = valid_losses

[89.95820063694268, 89.98805732484077, 90.25676751592357, 89.77906050955414, 90.38614649681529, 89.92834394904459, 89.74920382165605, 90.07762738853503, 90.43590764331209, 90.48566878980891, 89.7890127388535, 89.84872611464968, 89.42078025477707, 89.25159235668791, 89.85867834394905, 88.58479299363057, 89.55015923566879, 88.4952229299363, 87.99761146496814, 86.83320063694268, 87.2312898089172]
[0.2673580652261351, 0.3663525728984399, 0.3904006472848197, 0.3048384489610474, 0.3555393433198333, 0.33361136421878285, 0.3963029908531524, 0.3866792597035144, 0.41103954054415226, 0.37622772929991816, 0.40375970010744766, 0.40714911014792765, 0.40477936673275333, 0.41857303094435877, 0.3695231836606213, 0.4643802063024424, 0.40817442465018716, 0.4097749140272115, 0.4255295708934043, 0.47524133705078286, 0.3880528197722866]


# Plot

In [43]:
print(reinit_performance)
print(lt_performance)
print(lr_l_performance)

NameError: ignored