### Import Dependencies

In [1]:
import numpy as np
import plotly
import plotly.graph_objs as go
import plotly.express as px
import plotly.io as pio
import plotly.offline as pyo
# Make the white Plotly theme the default template for figures created in this notebook.
pio.templates.default='plotly_white'

import torch
import torch.nn as nn

import transformers 
from transformers import AutoModel
from transformers import AdamW
from transformers import (  
    get_constant_schedule, 
    get_constant_schedule_with_warmup, 
    get_cosine_schedule_with_warmup, 
    get_cosine_with_hard_restarts_schedule_with_warmup,
    get_linear_schedule_with_warmup,
    get_polynomial_decay_schedule_with_warmup
)

# Number of optimizer/scheduler steps simulated in each cell below; also the length
# of the x-axis of every learning-rate plot.
epochs = 10

### Model

In [2]:
class Net(nn.Module):
    """Pretrained transformer encoder followed by a single-output linear head.

    Args:
        model_name: checkpoint name or path passed to ``AutoModel.from_pretrained``.
    """

    def __init__(self, model_name):
        super(Net, self).__init__()
        self.roberta = AutoModel.from_pretrained(model_name)
        # Size the head from the checkpoint's config rather than hard-coding 768,
        # so checkpoints with a different hidden width (e.g. roberta-large) work too.
        self.classifier = nn.Linear(self.roberta.config.hidden_size, 1)

    def forward(self, input_ids):
        """Run the encoder on ``input_ids`` and return the head's output.

        Args:
            input_ids: token-id tensor forwarded unchanged to the encoder.

        Returns:
            The result of applying the linear head to element 1 of the encoder output.
        """
        outputs = self.roberta(input_ids)
        # Element 1 of the encoder output; for RoBERTa-style models this is
        # presumably the pooled first-token representation -- confirm for other models.
        pooled_output = outputs[1]
        return self.classifier(pooled_output)

### Differential / Discriminative Learning Rate

#### Introduction
1. The intuitive idea behind a **Differential Learning Rate** is that the embeddings start out in the first layer with no contextual information. As the embeddings move deeper into the network, they pick up more general and contextual information with each layer. As we approach the final layers, however, we start picking up information that is specific to the Transformer's pre-training tasks (e.g. BERT's "Masked Language Model" (MLM) and "Next Sentence Prediction" (NSP) objectives; RoBERTa is pre-trained with MLM only).


2. Thus we can fine-tune the layers with different learning rates i.e. lower learning rates in the early layers, slightly higher in middle layers and higher at the top layers. 


3. We can also train the task-specific layers with a completely different and much higher learning rate than the transformer model, since they have not been trained before and need to learn faster.



#### Strategies for setting Learning Rate

1. `Unified Learning Rate for Complete Model (s)` - In this strategy we set a single learning rate (5e-5 in this case) for the complete model, which is the usual approach.

2. `Differential Learning Rate for Transformer and Task-Specific Layer (i)` - In this strategy we will have two different learning rates: one unified rate for the complete transformer model (RoBERTa) and another one for the task-specific layer (Regressor).

3. `Differential Learning Rate for Transformer Layers and Task-Specific Layer (a)` - Here we also set different learning rates for different transformer layers. I have grouped layers 1-4, 5-8 and 9-12 and set their learning rates as discussed above. In addition, I set a different and higher learning rate for the regressor.

In [3]:
def get_optimizer_params(model, type='s'):
    """Build the parameter specification that is handed to the optimizer.

    Three strategies are available, selected by ``type``:

    * ``'s'`` - every trainable parameter in one flat list, so the optimizer's
      own ``lr`` applies to the whole model.
    * ``'i'`` - three groups: encoder parameters with weight decay, encoder
      parameters without weight decay, and the task head at ``lr=1e-3``.
    * ``'a'`` - nine groups: the encoder is split by weight decay (0.01 / 0.0)
      and by depth (parameters outside the numbered layers, then layers 0-3,
      4-7 and 8-11 at ``lr/2.6``, ``lr`` and ``lr*2.6`` with ``lr = 5e-5``),
      followed by the task head at ``lr=1e-3``. Group order is fixed:
      indices 1-3 are the decayed layer groups and index 8 is the head.

    Args:
        model: module with a ``roberta`` sub-module. Every parameter whose
            qualified name does not contain ``"roberta"`` is treated as part
            of the task head. Strategy ``'a'`` assumes encoder parameters are
            named ``...layer.<N>....`` with N in 0-11.
        type: one of ``'s'``, ``'i'`` or ``'a'``.

    Returns:
        A list of parameters (``'s'``) or a list of parameter-group dicts
        (``'i'`` and ``'a'``).

    Raises:
        ValueError: if ``type`` is not one of the supported strategies.
    """
    learning_rate = 5e-5
    head_lr = 1e-3
    # 'LayerNorm.weight' is the name Hugging Face checkpoints use for LayerNorm
    # scales; 'gamma'/'beta' are kept for checkpoints that still use the old names.
    no_decay = ['bias', 'LayerNorm.weight', 'gamma', 'beta']

    def skips_decay(name):
        return any(nd in name for nd in no_decay)

    if type == 's':
        # A real list (not a one-shot iterator) so the result can be reused.
        return [p for p in model.parameters() if p.requires_grad]

    if type not in ('i', 'a'):
        raise ValueError("type must be one of 's', 'i' or 'a', got %r" % (type,))

    backbone = list(model.roberta.named_parameters())
    head = [p for n, p in model.named_parameters() if "roberta" not in n]

    # NOTE: the groups use 'weight_decay', the key AdamW actually reads; an
    # unrecognised key such as 'weight_decay_rate' is silently ignored.
    if type == 'i':
        return [
            {'params': [p for n, p in backbone if not skips_decay(n)],
             'weight_decay': 0.01},
            {'params': [p for n, p in backbone if skips_decay(n)],
             'weight_decay': 0.0},
            {'params': head,
             'lr': head_lr,
             'weight_decay': 0.01},
        ]

    group1 = ['layer.0.', 'layer.1.', 'layer.2.', 'layer.3.']
    group2 = ['layer.4.', 'layer.5.', 'layer.6.', 'layer.7.']
    group3 = ['layer.8.', 'layer.9.', 'layer.10.', 'layer.11.']
    group_all = group1 + group2 + group3

    def in_any(name, prefixes):
        return any(prefix in name for prefix in prefixes)

    # (name selector, learning rate); None means "use the optimizer's default lr".
    depth_buckets = [
        (lambda n: not in_any(n, group_all), None),
        (lambda n: in_any(n, group1), learning_rate / 2.6),
        (lambda n: in_any(n, group2), learning_rate),
        (lambda n: in_any(n, group3), learning_rate * 2.6),
    ]

    optimizer_parameters = []
    # Decayed groups come first (indices 0-3), then the no-decay ones (4-7).
    for weight_decay, want_no_decay in ((0.01, False), (0.0, True)):
        for selects, lr in depth_buckets:
            group = {
                'params': [p for n, p in backbone
                           if skips_decay(n) == want_no_decay and selects(n)],
                'weight_decay': weight_decay,
            }
            if lr is not None:
                group['lr'] = lr
            optimizer_parameters.append(group)

    # 'momentum' is kept from the original group definition; Adam-family
    # optimizers do not read it.
    optimizer_parameters.append({'params': head, 'lr': head_lr, "momentum": 0.99})
    return optimizer_parameters

### Helper Function

This defines a default layout in which `title` is the only parameter that can be changed. We will use this layout for every graph.

In [4]:
def get_default_layout(title):
    """Return the Plotly layout shared by every learning-rate figure.

    Args:
        title: text shown as the figure title; the only configurable part.

    Returns:
        A ``go.Layout`` with fixed size, fonts, axes ("Epochs" on x,
        "Learning Rate" on y), centred title and horizontal legend.
    """
    font_style = 'Courier New'

    def black_font():
        # Title and legend use the same black 14pt font.
        return {'family': font_style, 'size': 14, 'color': 'black'}

    def axis(axis_title, **extra):
        # Settings common to both axes; `extra` adds axis-specific keys.
        spec = {
            'title': axis_title,
            'showgrid': True,
            'type': 'linear',
            'categoryarray': None,
            'gridwidth': 1,
            'ticks': 'outside',
            'showline': True,
            'showticklabels': True,
            'tickangle': 0,
            'tickmode': 'array',
        }
        spec.update(extra)
        return spec

    layout = {
        'height': 400,
        'width': 1200,
        'template': 'plotly_white',
        'dragmode': 'zoom',
        'hovermode': 'x',
        'hoverlabel': {'font_size': 14, 'font_family': font_style},
        'font': {'size': 14, 'family': font_style, 'color': 'rgb(128, 128, 128)'},
        'xaxis': axis('Epochs'),
        'yaxis': axis('Learning Rate', exponentformat='none'),
        'title': {
            'text': title,
            'x': 0.5,
            'y': 0.95,
            'xanchor': 'center',
            'yanchor': 'top',
            'font': black_font(),
        },
        'showlegend': True,
        'legend': {
            'x': 0.1,
            'y': 1.1,
            'orientation': 'h',
            'itemclick': 'toggleothers',
            'font': black_font(),
        },
    }
    return go.Layout(layout)

### Constant Schedule

> Create a schedule with a constant learning rate, using the learning rate set in optimizer.

In [5]:
# Strategy 's': one learning rate shared by every parameter.
model = Net('roberta-base')
parameters = get_optimizer_params(model, 's')
kwargs = dict(betas=(0.9, 0.999), eps=1e-08)
optimizer = AdamW(parameters, lr=5e-5, **kwargs)
scheduler = get_constant_schedule(optimizer)

# Step optimizer and scheduler once per epoch and log the resulting rate.
learning_rates = []
for _ in range(epochs):
    optimizer.step()
    scheduler.step()
    learning_rates.append(optimizer.param_groups[0]["lr"])

layout = get_default_layout('Constant Schedule')
trace = go.Scatter(
    name='LR',
    mode='markers+lines',
    x=np.arange(0, epochs, 1),
    y=learning_rates,
    texttemplate="%{y:.6f}",
    marker={'color': '#3498d5'},
)
go.Figure(data=[trace], layout=layout)

Downloading:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/501M [00:00<?, ?B/s]

In [6]:
# Strategy 'i': the encoder uses the optimizer's lr, the task head its own.
model = Net('roberta-base')
parameters = get_optimizer_params(model, 'i')
kwargs = dict(betas=(0.9, 0.999), eps=1e-08)
optimizer = AdamW(parameters, lr=5e-5, **kwargs)
scheduler = get_constant_schedule(optimizer)

# Group 0 is an encoder group, group 2 the task head.
learning_rates1, learning_rates2 = [], []
for _ in range(epochs):
    optimizer.step()
    scheduler.step()
    learning_rates1.append(optimizer.param_groups[0]["lr"])
    learning_rates2.append(optimizer.param_groups[2]["lr"])

steps = np.arange(0, epochs, 1)
trace1 = go.Scatter(
    name='Roberta',
    mode='markers+lines',
    x=steps,
    y=learning_rates1,
    texttemplate="%{y:.6f}",
    marker={'color': '#3498d5'},
)
trace2 = go.Scatter(
    name='Regressor',
    mode='markers+lines',
    x=np.arange(0, epochs, 1),
    y=learning_rates2,
    texttemplate="%{y:.6f}",
    marker={'color': '#f29191'},
)
layout = get_default_layout('Constant Schedule')
go.Figure(data=[trace1, trace2], layout=layout)

### Constant Schedule with Warmup

> Create a schedule with a constant learning rate preceded by a warmup period during which the learning rate increases linearly between 0 and the initial lr set in the optimizer.



In [7]:
# Strategy 's': one learning rate for the whole model.
model = Net('roberta-base')
parameters = get_optimizer_params(model, 's')
kwargs = {'betas': (0.9, 0.999), 'eps': 1e-08}
optimizer = AdamW(parameters, lr=5e-5, **kwargs)
scheduler = get_constant_schedule_with_warmup(optimizer, num_warmup_steps=3)

learning_rates = []
for _ in range(epochs):
    # Log before stepping: this is the rate optimizer.step() applies in this
    # epoch, so entry i lines up with x = i and the warmup start (step 0) is
    # visible. Logging after scheduler.step() would be one step ahead.
    learning_rates.append(optimizer.param_groups[0]["lr"])
    optimizer.step()
    scheduler.step()

trace = go.Scatter(
    x=np.arange(0, epochs, 1),
    y=learning_rates,
    texttemplate="%{y:.6f}",
    mode='markers+lines',
    name='LR',
    marker=dict(color='#3498d5'),
)
layout = get_default_layout('Constant Schedule with Warmup')
go.Figure(data=[trace], layout=layout)

In [8]:
# Strategy 'i': the encoder uses the optimizer's lr, the task head its own.
model = Net('roberta-base')
parameters = get_optimizer_params(model, 'i')
kwargs = {'betas': (0.9, 0.999), 'eps': 1e-08}
optimizer = AdamW(parameters, lr=5e-5, **kwargs)
scheduler = get_constant_schedule_with_warmup(optimizer, num_warmup_steps=3)

learning_rates1, learning_rates2 = histories = [[], []]
# (param-group index, legend label, colour); group 2 is the task head.
tracked = [(0, 'Roberta', '#3498d5'), (2, 'Regressor', '#f29191')]
for _ in range(epochs):
    # Log before stepping: this is the rate optimizer.step() applies in this
    # epoch, so entry i lines up with x = i and the warmup start is visible.
    for history, (group_idx, _label, _color) in zip(histories, tracked):
        history.append(optimizer.param_groups[group_idx]["lr"])
    optimizer.step()
    scheduler.step()

trace1, trace2 = [
    go.Scatter(
        x=np.arange(0, epochs, 1),
        y=history,
        texttemplate="%{y:.6f}",
        mode='markers+lines',
        name=label,
        marker=dict(color=color),
    )
    for history, (_idx, label, color) in zip(histories, tracked)
]
layout = get_default_layout('Constant Schedule with Warmup')
go.Figure(data=[trace1, trace2], layout=layout)

### Cosine with Warmup
> Create a schedule with a learning rate that decreases following the values of the cosine function between the initial lr set in the optimizer to 0, after a warmup period during which it increases linearly between 0 and the initial lr set in the optimizer.

In [9]:
# Strategy 's': one learning rate for the whole model.
model = Net('roberta-base')
parameters = get_optimizer_params(model, 's')
kwargs = {'betas': (0.9, 0.999), 'eps': 1e-08}
optimizer = AdamW(parameters, lr=5e-5, **kwargs)
# The schedule length follows `epochs` so it always matches the plotted range.
scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=3, num_training_steps=epochs)

learning_rates = []
for _ in range(epochs):
    # Log before stepping: this is the rate optimizer.step() applies in this
    # epoch, so entry i lines up with x = i and the warmup start (step 0) is
    # visible. Logging after scheduler.step() would be one step ahead.
    learning_rates.append(optimizer.param_groups[0]["lr"])
    optimizer.step()
    scheduler.step()

trace = go.Scatter(
    x=np.arange(0, epochs, 1),
    y=learning_rates,
    texttemplate="%{y:.6f}",
    mode='markers+lines',
    name='LR',
    marker=dict(color='#3498d5'),
)
layout = get_default_layout('Cosine Schedule with Warmup')
go.Figure(data=[trace], layout=layout)

In [10]:
# Strategy 'i': the encoder uses the optimizer's lr, the task head its own.
model = Net('roberta-base')
parameters = get_optimizer_params(model, 'i')
kwargs = {'betas': (0.9, 0.999), 'eps': 1e-08}
optimizer = AdamW(parameters, lr=5e-5, **kwargs)
# The schedule length follows `epochs` so it always matches the plotted range.
scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=3, num_training_steps=epochs)

learning_rates1, learning_rates2 = histories = [[], []]
# (param-group index, legend label, colour); group 2 is the task head.
tracked = [(0, 'Roberta', '#3498d5'), (2, 'Regressor', '#f29191')]
for _ in range(epochs):
    # Log before stepping: this is the rate optimizer.step() applies in this
    # epoch, so entry i lines up with x = i and the warmup start is visible.
    for history, (group_idx, _label, _color) in zip(histories, tracked):
        history.append(optimizer.param_groups[group_idx]["lr"])
    optimizer.step()
    scheduler.step()

trace1, trace2 = [
    go.Scatter(
        x=np.arange(0, epochs, 1),
        y=history,
        texttemplate="%{y:.6f}",
        mode='markers+lines',
        name=label,
        marker=dict(color=color),
    )
    for history, (_idx, label, color) in zip(histories, tracked)
]
layout = get_default_layout('Cosine Schedule with Warmup')
go.Figure(data=[trace1, trace2], layout=layout)

In [11]:
# Strategy 'a': separate rates per block of encoder layers plus the task head.
model = Net('roberta-base')
parameters = get_optimizer_params(model, 'a')
kwargs = {'betas': (0.9, 0.999), 'eps': 1e-08}
optimizer = AdamW(parameters, lr=5e-5, **kwargs)
# The schedule length follows `epochs` so it always matches the plotted range.
scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=3, num_training_steps=epochs)

learning_rates1, learning_rates2, learning_rates3, learning_rates4 = histories = [[] for _ in range(4)]
# (param-group index, legend label, colour); groups 1-3 are the three blocks of
# encoder layers and group 8 is the task head.
tracked = [
    (1, 'Roberta Layers 1-4', '#3498d5'),
    (2, 'Roberta Layers 5-8', '#a678de'),
    (3, 'Roberta Layers 9-12', '#6ad49b'),
    (8, 'Regressor', '#f29191'),
]
for _ in range(epochs):
    # Log before stepping: this is the rate optimizer.step() applies in this
    # epoch, so entry i lines up with x = i and the warmup start is visible.
    for history, (group_idx, _label, _color) in zip(histories, tracked):
        history.append(optimizer.param_groups[group_idx]["lr"])
    optimizer.step()
    scheduler.step()

trace1, trace2, trace3, trace4 = [
    go.Scatter(
        x=np.arange(0, epochs, 1),
        y=history,
        texttemplate="%{y:.6f}",
        mode='markers+lines',
        name=label,
        marker=dict(color=color),
    )
    for history, (_idx, label, color) in zip(histories, tracked)
]
layout = get_default_layout('Cosine Schedule with Warmup')
go.Figure(data=[trace1, trace2, trace3, trace4], layout=layout)

### Cosine With Hard Restarts
Create a schedule with a learning rate that decreases following the values of the cosine function between the initial lr set in the optimizer to 0, with several hard restarts, after a warmup period during which it increases linearly between 0 and the initial lr set in the optimizer.

In [12]:
# Strategy 's': one learning rate for the whole model.
model = Net('roberta-base')
parameters = get_optimizer_params(model, 's')
kwargs = {'betas': (0.9, 0.999), 'eps': 1e-08}
optimizer = AdamW(parameters, lr=5e-5, **kwargs)
# The schedule length follows `epochs` so it always matches the plotted range.
scheduler = get_cosine_with_hard_restarts_schedule_with_warmup(
    optimizer, num_warmup_steps=3, num_training_steps=epochs, num_cycles=5
)

learning_rates = []
for _ in range(epochs):
    # Log before stepping: this is the rate optimizer.step() applies in this
    # epoch, so entry i lines up with x = i and the warmup start (step 0) is
    # visible. Logging after scheduler.step() would be one step ahead.
    learning_rates.append(optimizer.param_groups[0]["lr"])
    optimizer.step()
    scheduler.step()

trace = go.Scatter(
    x=np.arange(0, epochs, 1),
    y=learning_rates,
    texttemplate="%{y:.6f}",
    mode='markers+lines',
    name='LR',
    marker=dict(color='#3498d5'),
)
layout = get_default_layout('Cosine Schedule with Hard Restarts with Warmup')
go.Figure(data=[trace], layout=layout)

In [13]:
# Strategy 'i': the encoder uses the optimizer's lr, the task head its own.
model = Net('roberta-base')
parameters = get_optimizer_params(model, 'i')
kwargs = {'betas': (0.9, 0.999), 'eps': 1e-08}
optimizer = AdamW(parameters, lr=5e-5, **kwargs)
# The schedule length follows `epochs` so it always matches the plotted range.
scheduler = get_cosine_with_hard_restarts_schedule_with_warmup(
    optimizer, num_warmup_steps=3, num_training_steps=epochs, num_cycles=5
)

learning_rates1, learning_rates2 = histories = [[], []]
# (param-group index, legend label, colour); group 2 is the task head.
tracked = [(0, 'Roberta', '#3498d5'), (2, 'Regressor', '#f29191')]
for _ in range(epochs):
    # Log before stepping: this is the rate optimizer.step() applies in this
    # epoch, so entry i lines up with x = i and the warmup start is visible.
    for history, (group_idx, _label, _color) in zip(histories, tracked):
        history.append(optimizer.param_groups[group_idx]["lr"])
    optimizer.step()
    scheduler.step()

trace1, trace2 = [
    go.Scatter(
        x=np.arange(0, epochs, 1),
        y=history,
        texttemplate="%{y:.6f}",
        mode='markers+lines',
        name=label,
        marker=dict(color=color),
    )
    for history, (_idx, label, color) in zip(histories, tracked)
]
layout = get_default_layout('Cosine Schedule with Hard Restarts with Warmup')
go.Figure(data=[trace1, trace2], layout=layout)

In [14]:
# Strategy 'a': separate rates per block of encoder layers plus the task head.
model = Net('roberta-base')
parameters = get_optimizer_params(model, 'a')
kwargs = {'betas': (0.9, 0.999), 'eps': 1e-08}
optimizer = AdamW(parameters, lr=5e-5, **kwargs)
# The schedule length follows `epochs` so it always matches the plotted range.
scheduler = get_cosine_with_hard_restarts_schedule_with_warmup(
    optimizer, num_warmup_steps=3, num_training_steps=epochs, num_cycles=5
)

learning_rates1, learning_rates2, learning_rates3, learning_rates4 = histories = [[] for _ in range(4)]
# (param-group index, legend label, colour); groups 1-3 are the three blocks of
# encoder layers and group 8 is the task head.
tracked = [
    (1, 'Roberta Layers 1-4', '#3498d5'),
    (2, 'Roberta Layers 5-8', '#a678de'),
    (3, 'Roberta Layers 9-12', '#6ad49b'),
    (8, 'Regressor', '#f29191'),
]
for _ in range(epochs):
    # Log before stepping: this is the rate optimizer.step() applies in this
    # epoch, so entry i lines up with x = i and the warmup start is visible.
    for history, (group_idx, _label, _color) in zip(histories, tracked):
        history.append(optimizer.param_groups[group_idx]["lr"])
    optimizer.step()
    scheduler.step()

trace1, trace2, trace3, trace4 = [
    go.Scatter(
        x=np.arange(0, epochs, 1),
        y=history,
        texttemplate="%{y:.6f}",
        mode='markers+lines',
        name=label,
        marker=dict(color=color),
    )
    for history, (_idx, label, color) in zip(histories, tracked)
]
layout = get_default_layout('Cosine Schedule with Hard Restarts with Warmup')
go.Figure(data=[trace1, trace2, trace3, trace4], layout=layout)

### Linear Schedule with Warmup
Create a schedule with a learning rate that decreases linearly from the initial lr set in the optimizer to 0, after a warmup period during which it increases linearly from 0 to the initial lr set in the optimizer.

In [15]:
# Strategy 's': one learning rate for the whole model.
model = Net('roberta-base')
parameters = get_optimizer_params(model, 's')
kwargs = {'betas': (0.9, 0.999), 'eps': 1e-08}
optimizer = AdamW(parameters, lr=5e-5, **kwargs)
# The schedule length follows `epochs` so it always matches the plotted range.
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=3, num_training_steps=epochs)

learning_rates = []
for _ in range(epochs):
    # Log before stepping: this is the rate optimizer.step() applies in this
    # epoch, so entry i lines up with x = i and the warmup start (step 0) is
    # visible. Logging after scheduler.step() would be one step ahead.
    learning_rates.append(optimizer.param_groups[0]["lr"])
    optimizer.step()
    scheduler.step()

trace = go.Scatter(
    x=np.arange(0, epochs, 1),
    y=learning_rates,
    texttemplate="%{y:.6f}",
    mode='markers+lines',
    name='LR',
    marker=dict(color='#3498d5'),
)
layout = get_default_layout('Linear Schedule with Warmup')
go.Figure(data=[trace], layout=layout)

In [16]:
# Strategy 'i': the encoder uses the optimizer's lr, the task head its own.
model = Net('roberta-base')
parameters = get_optimizer_params(model, 'i')
kwargs = {'betas': (0.9, 0.999), 'eps': 1e-08}
optimizer = AdamW(parameters, lr=5e-5, **kwargs)
# The schedule length follows `epochs` so it always matches the plotted range.
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=3, num_training_steps=epochs)

learning_rates1, learning_rates2 = histories = [[], []]
# (param-group index, legend label, colour); group 2 is the task head.
tracked = [(0, 'Roberta', '#3498d5'), (2, 'Regressor', '#f29191')]
for _ in range(epochs):
    # Log before stepping: this is the rate optimizer.step() applies in this
    # epoch, so entry i lines up with x = i and the warmup start is visible.
    for history, (group_idx, _label, _color) in zip(histories, tracked):
        history.append(optimizer.param_groups[group_idx]["lr"])
    optimizer.step()
    scheduler.step()

trace1, trace2 = [
    go.Scatter(
        x=np.arange(0, epochs, 1),
        y=history,
        texttemplate="%{y:.6f}",
        mode='markers+lines',
        name=label,
        marker=dict(color=color),
    )
    for history, (_idx, label, color) in zip(histories, tracked)
]
layout = get_default_layout('Linear Schedule with Warmup')
go.Figure(data=[trace1, trace2], layout=layout)

In [17]:
# Strategy 'a': separate rates per block of encoder layers plus the task head.
model = Net('roberta-base')
parameters = get_optimizer_params(model, 'a')
kwargs = {'betas': (0.9, 0.999), 'eps': 1e-08}
optimizer = AdamW(parameters, lr=5e-5, **kwargs)
# The schedule length follows `epochs` so it always matches the plotted range.
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=3, num_training_steps=epochs)

learning_rates1, learning_rates2, learning_rates3, learning_rates4 = histories = [[] for _ in range(4)]
# (param-group index, legend label, colour); groups 1-3 are the three blocks of
# encoder layers and group 8 is the task head.
tracked = [
    (1, 'Roberta Layers 1-4', '#3498d5'),
    (2, 'Roberta Layers 5-8', '#a678de'),
    (3, 'Roberta Layers 9-12', '#6ad49b'),
    (8, 'Regressor', '#f29191'),
]
for _ in range(epochs):
    # Log before stepping: this is the rate optimizer.step() applies in this
    # epoch, so entry i lines up with x = i and the warmup start is visible.
    for history, (group_idx, _label, _color) in zip(histories, tracked):
        history.append(optimizer.param_groups[group_idx]["lr"])
    optimizer.step()
    scheduler.step()

trace1, trace2, trace3, trace4 = [
    go.Scatter(
        x=np.arange(0, epochs, 1),
        y=history,
        texttemplate="%{y:.6f}",
        mode='markers+lines',
        name=label,
        marker=dict(color=color),
    )
    for history, (_idx, label, color) in zip(histories, tracked)
]
layout = get_default_layout('Linear Schedule with Warmup')
go.Figure(data=[trace1, trace2, trace3, trace4], layout=layout)

### Polynomial Decay with Warmup
Create a schedule with a learning rate that decreases as a polynomial decay from the initial lr set in the optimizer to end lr defined by lr_end, after a warmup period during which it increases linearly from 0 to the initial lr set in the optimizer.

In [18]:
# Strategy 's': one learning rate for the whole model.
model = Net('roberta-base')
parameters = get_optimizer_params(model, 's')
kwargs = {'betas': (0.9, 0.999), 'eps': 1e-08}
optimizer = AdamW(parameters, lr=5e-5, **kwargs)
# The schedule length follows `epochs` so it always matches the plotted range.
scheduler = get_polynomial_decay_schedule_with_warmup(
    optimizer, num_warmup_steps=3, num_training_steps=epochs, power=2
)

learning_rates = []
for _ in range(epochs):
    # Log before stepping: this is the rate optimizer.step() applies in this
    # epoch, so entry i lines up with x = i and the warmup start (step 0) is
    # visible. Logging after scheduler.step() would be one step ahead.
    learning_rates.append(optimizer.param_groups[0]["lr"])
    optimizer.step()
    scheduler.step()

trace = go.Scatter(
    x=np.arange(0, epochs, 1),
    y=learning_rates,
    texttemplate="%{y:.6f}",
    mode='markers+lines',
    name='LR',
    marker=dict(color='#3498d5'),
)
layout = get_default_layout('Polynomial Decay Schedule with Warmup')
go.Figure(data=[trace], layout=layout)

In [19]:
# Strategy 'i': the encoder uses the optimizer's lr, the task head its own.
model = Net('roberta-base')
parameters = get_optimizer_params(model, 'i')
kwargs = {'betas': (0.9, 0.999), 'eps': 1e-08}
optimizer = AdamW(parameters, lr=5e-5, **kwargs)
# The schedule length follows `epochs` so it always matches the plotted range.
scheduler = get_polynomial_decay_schedule_with_warmup(
    optimizer, num_warmup_steps=3, num_training_steps=epochs, power=2
)

learning_rates1, learning_rates2 = histories = [[], []]
# (param-group index, legend label, colour); group 2 is the task head.
tracked = [(0, 'Roberta', '#3498d5'), (2, 'Regressor', '#f29191')]
for _ in range(epochs):
    # Log before stepping: this is the rate optimizer.step() applies in this
    # epoch, so entry i lines up with x = i and the warmup start is visible.
    for history, (group_idx, _label, _color) in zip(histories, tracked):
        history.append(optimizer.param_groups[group_idx]["lr"])
    optimizer.step()
    scheduler.step()

trace1, trace2 = [
    go.Scatter(
        x=np.arange(0, epochs, 1),
        y=history,
        texttemplate="%{y:.6f}",
        mode='markers+lines',
        name=label,
        marker=dict(color=color),
    )
    for history, (_idx, label, color) in zip(histories, tracked)
]
layout = get_default_layout('Polynomial Decay Schedule with Warmup')
go.Figure(data=[trace1, trace2], layout=layout)

In [20]:
# Strategy 'a': separate rates per block of encoder layers plus the task head.
model = Net('roberta-base')
parameters = get_optimizer_params(model, 'a')
kwargs = {'betas': (0.9, 0.999), 'eps': 1e-08}
optimizer = AdamW(parameters, lr=5e-5, **kwargs)
# The schedule length follows `epochs` so it always matches the plotted range.
scheduler = get_polynomial_decay_schedule_with_warmup(
    optimizer, num_warmup_steps=3, num_training_steps=epochs, power=2
)

learning_rates1, learning_rates2, learning_rates3, learning_rates4 = histories = [[] for _ in range(4)]
# (param-group index, legend label, colour); groups 1-3 are the three blocks of
# encoder layers and group 8 is the task head.
tracked = [
    (1, 'Roberta Layers 1-4', '#3498d5'),
    (2, 'Roberta Layers 5-8', '#a678de'),
    (3, 'Roberta Layers 9-12', '#6ad49b'),
    (8, 'Regressor', '#f29191'),
]
for _ in range(epochs):
    # Log before stepping: this is the rate optimizer.step() applies in this
    # epoch, so entry i lines up with x = i and the warmup start is visible.
    for history, (group_idx, _label, _color) in zip(histories, tracked):
        history.append(optimizer.param_groups[group_idx]["lr"])
    optimizer.step()
    scheduler.step()

trace1, trace2, trace3, trace4 = [
    go.Scatter(
        x=np.arange(0, epochs, 1),
        y=history,
        texttemplate="%{y:.6f}",
        mode='markers+lines',
        name=label,
        marker=dict(color=color),
    )
    for history, (_idx, label, color) in zip(histories, tracked)
]
layout = get_default_layout('Polynomial Decay Schedule with Warmup')
go.Figure(data=[trace1, trace2, trace3, trace4], layout=layout)