# Save best model hyperparameters to a config file

In [1]:
import yaml
with open("config.yaml", "r") as yamlfile:
    # Start from the existing file: the final cell dumps the whole of
    # model_configs back, so top-level entries that are not reassigned in
    # this notebook are carried over unchanged.
    # An empty config.yaml parses to None, which would make the item
    # assignments in the following cells fail with TypeError, so fall back
    # to an empty mapping in that case.
    model_configs = yaml.load(yamlfile, Loader=yaml.FullLoader) or {}
    print("Read successful")

Read successful


In [2]:
############################################################################################### SST-2 ###################################################################

"""
ELECTRA: https://arxiv.org/pdf/2003.10555v1.pdf Table 7

Learning Rate	3e-4 for Small, 1e-4 for Base, 5e-5 for Large
Adam ϵ	1e-6
Adam β1	0.9
Adam β2	0.999
Layerwise LR decay	0.8 for Base/Small, 0.9 for Large
Learning rate decay	Linear
Warmup fraction	0.1
Attention Dropout	0.1
Dropout	0.1
Weight Decay	0
Batch Size	32
Train Epochs	10 for RTE and STS, 2 for SQuAD, 3 for other tasks
"""


# ELECTRA / SST-2. The values follow the recommendations quoted above:
# 1e-4 learning rate and 0.8 layerwise decay are the "Base" entries, and
# 3 epochs is the "other tasks" entry.
# This rebinds the whole 'electra' entry loaded from config.yaml.
model_configs['electra'] = {
    'sst-2': dict(
        learning_rate=1e-4,
        batch_size=32,
        architecture='electra',
        dataset='sst-2',
        epochs=3,
        random_seed=42,
        adam_eps=1e-6,
        adam_b1=0.9,
        adam_b2=0.999,
        llrd=0.8,
        decay_type='linear',
        warmup_frac=0.1,
        attn_dropout=0.1,
        dropout=0.1,
        weight_decay=0.0,
    ),
}


# https://huggingface.co/gchhablani/bert-base-cased-finetuned-sst2
"""The following hyperparameters were used during training:
- learning_rate: 2e-05
- train_batch_size: 16
- eval_batch_size: 8
- seed: 42
- optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
- lr_scheduler_type: linear
- num_epochs: 3.0"""

# BERT / SST-2. Learning rate, batch size, seed, epochs and scheduler type
# are the model-card values listed above; entries tagged DEFAULT were not
# changed from their defaults.
# This rebinds the whole 'bert' entry loaded from config.yaml.
model_configs['bert'] = {
    'sst-2': dict(
        learning_rate=2e-5,
        batch_size=16,
        architecture='bert',
        dataset='sst-2',
        epochs=3,
        random_seed=42,
        adam_eps=1e-8,     # DEFAULT
        adam_b1=0.9,       # DEFAULT
        adam_b2=0.999,     # DEFAULT
        llrd=None,
        decay_type='linear',
        warmup_frac=0,     # DEFAULT
        attn_dropout=0.1,  # DEFAULT
        dropout=0.1,       # DEFAULT
        weight_decay=0.0,  # DEFAULT
    ),
}


"""
No official recommendations for roberta-base for SST-2, therefore following what is seen on this model card
https://huggingface.co/Bhumika/roberta-base-finetuned-sst2
"""
# RoBERTa / SST-2, model-card based values.
# NOTE: the next cell rebinds model_configs['roberta'] with warmup_frac=0.06
# and weight_decay=0.1, so the two values marked "Change to" below never
# reach config.yaml when the notebook is run top to bottom.
model_configs['roberta'] = {
    'sst-2': dict(
        learning_rate=2e-5,
        batch_size=16,
        architecture='roberta',
        dataset='sst-2',
        epochs=5,
        random_seed=42,
        adam_eps=1e-8,     # DEFAULT
        adam_b1=0.9,       # DEFAULT
        adam_b2=0.999,     # DEFAULT
        llrd=None,
        decay_type='linear',
        warmup_frac=0,     # DEFAULT ## Change to 0.06 https://arxiv.org/pdf/1907.11692v1.pdf
        attn_dropout=0.1,  # DEFAULT
        dropout=0.1,       # DEFAULT
        weight_decay=0.0,  # DEFAULT ## Change to 0.1 after article? https://arxiv.org/pdf/1907.11692v1.pdf
    ),
}

"""
https://arxiv.org/pdf/1912.10165.pdf :
To train our model we follow a procedure largely based on the training procedures described in
Radford et al. (2019) with a few differences. All training is performed with a maximum sequence
length of 512 tokens. In the full dataset training setting we utilize a learning rate of 4 × 10^-5 and
a batch size of 128. When training with a quarter of the dataset we then used a learning rate of
3 × 10^-5 and a batch size of 32. Our learning rate has a warmup period over 1% of the total training
iterations before decaying according to a single cycle cosine decay schedule over 10 epochs. We
utilize an Adam optimizer (Kingma and Ba, 2014) with decoupled weight decay (Loshchilov and
Hutter, 2019) λ = 0.01. All our models are trained efficiently on V100 GPUs by utilizing mixed
precision training with dynamic loss scaling (Micikevicius et al., 2017). Additionally, we use global
gradient norm clipping of 1.0 to improve the stability of training large models. Lastly, we utilize
attention and hidden state dropout (Srivastava et al., 2014) values of 0.1.

"""

# GPT-2 medium / SST-2. Learning rate, batch size, epochs, cosine decay and
# 1% warmup are the full-dataset settings from the passage quoted above.
# This rebinds the whole 'gpt2-medium' entry loaded from config.yaml.
model_configs['gpt2-medium'] = {
    'sst-2': dict(
        learning_rate=4e-5,
        batch_size=128,
        architecture='gpt2-medium',
        dataset='sst-2',
        epochs=10,
        random_seed=42,
        adam_eps=1e-8,     # DEFAULT
        adam_b1=0.9,       # DEFAULT
        adam_b2=0.999,     # DEFAULT
        llrd=None,
        decay_type='cosine',
        warmup_frac=0.01,
        attn_dropout=0.1,  # DEFAULT
        dropout=0.1,       # DEFAULT
        # NOTE(review): the quoted passage gives a decoupled weight decay of
        # lambda = 0.01, but 0.1 is recorded here - confirm which value was
        # actually used for training before changing it.
        weight_decay=0.1,
    ),
}

# from https://huggingface.co/tianyisun/opt-350m-finetuned-sst2

# OPT / SST-2, based on the model card referenced above.
# This rebinds the whole 'opt' entry loaded from config.yaml.
model_configs['opt'] = {
    'sst-2': dict(
        learning_rate=2e-5,
        batch_size=16,
        architecture='opt',
        dataset='sst-2',
        epochs=5,
        random_seed=42,
        adam_eps=1e-8,     # DEFAULT
        adam_b1=0.9,       # DEFAULT
        adam_b2=0.999,     # DEFAULT
        llrd=None,
        decay_type='linear',
        warmup_frac=0.0,
        attn_dropout=0.1,  # DEFAULT
        dropout=0.1,       # DEFAULT
        weight_decay=0.0,
    ),
}

In [3]:
# RoBERTa / SST-2, final values: identical to the model-card based entry
# except that warmup and weight decay are taken from the RoBERTa article.
#
# Only the 'sst-2' entry is assigned here instead of rebinding the whole
# model_configs['roberta'] mapping. On a top-to-bottom run the result is the
# same, but re-running this cell after the later dataset cells no longer
# discards the other RoBERTa entries before the file is written.
model_configs.setdefault('roberta', {})['sst-2'] = {
                        "learning_rate": 2e-5,
                        "batch_size": 16,
                        "architecture": 'roberta',
                        "dataset": 'sst-2',
                        "epochs": 5,
                        "random_seed": 42,
                        "adam_eps" : 1e-8, # DEFAULT
                        "adam_b1" : 0.9, # DEFAULT
                        "adam_b2" : 0.999, # DEFAULT
                        "llrd" : None,
                        "decay_type": 'linear',
                        "warmup_frac" : 0.06, # article https://arxiv.org/pdf/1907.11692v1.pdf
                        "attn_dropout" : 0.1, # DEFAULT
                        "dropout" : 0.1, # DEFAULT
                        "weight_decay" : 0.1 # article? https://arxiv.org/pdf/1907.11692v1.pdf
}


In [4]:
############################################################################# SemEval ###################################################################################################

"""All values were determined by Ray Tune hyperparameter search on the SemEval dataset with the following search space:

        "learning_rate": tune.loguniform(1e-6, 1e-4),
        "num_train_epochs": tune.choice(range(1, 10)),
        "seed": tune.choice(range(1, 41)),
        "per_device_train_batch_size": tune.choice([4, 8, 16, 32, 64]),

"""


def _semeval_config(architecture, learning_rate, batch_size, epochs, random_seed,
                    decay_type='linear', warmup_frac=0, weight_decay=0.0):
    """Build one hyperparameter entry for the 'semeval' dataset.

    Only the arguments differ between architectures. The Adam settings, both
    dropout rates and ``llrd`` are the same fixed values for every SemEval
    entry. The keyword defaults are the values tagged DEFAULT in the original
    entries. Every call returns a new dict, so entries never share state.
    """
    return {
        "learning_rate": learning_rate,
        "batch_size": batch_size,
        "architecture": architecture,
        "dataset": 'semeval',
        "epochs": epochs,
        "random_seed": random_seed,
        "adam_eps": 1e-8,     # DEFAULT
        "adam_b1": 0.9,       # DEFAULT
        "adam_b2": 0.999,     # DEFAULT
        "llrd": None,
        "decay_type": decay_type,
        "warmup_frac": warmup_frac,
        "attn_dropout": 0.1,  # DEFAULT
        "dropout": 0.1,       # DEFAULT
        "weight_decay": weight_decay,
    }


# Slurm-212.out
# 'learning_rate': 3.4889766548903635e-06, 'num_train_epochs': 5, 'seed': 24, 'per_device_train_batch_size': 8
model_configs['electra']['semeval'] = _semeval_config(
    'electra', learning_rate=3e-6, batch_size=8, epochs=5, random_seed=24,
)

# Slurm-210.out
# 'learning_rate': 1.3258269776216493e-05, 'num_train_epochs': 3, 'seed': 37, 'per_device_train_batch_size': 16
model_configs['bert']['semeval'] = _semeval_config(
    'bert', learning_rate=1e-5, batch_size=16, epochs=3, random_seed=37,
)

# slurm-211.out
# {'learning_rate': 1.3258269776216493e-05, 'num_train_epochs': 3, 'seed': 37, 'per_device_train_batch_size': 16
# NOTE(review): this recorded result is identical to the BERT one above -
# confirm it was read from slurm-211.out and not copied over.
model_configs['roberta']['semeval'] = _semeval_config(
    'roberta', learning_rate=1e-5, batch_size=16, epochs=3, random_seed=37,
)

# No tuning output was recorded for this entry.
model_configs['gpt2-medium']['semeval'] = _semeval_config(
    'gpt2-medium', learning_rate=8e-5, batch_size=32, epochs=7, random_seed=42,
    decay_type='cosine', warmup_frac=0.01, weight_decay=0.1,
)

# Best trial config: {'learning_rate': 6.8759316495854425e-06, 'epochs': 1, 'batch_size': 32, 'data': 'SemEval'}
model_configs['opt']['semeval'] = _semeval_config(
    'opt', learning_rate=7e-6, batch_size=32, epochs=1, random_seed=42,
    decay_type='cosine', warmup_frac=0.01, weight_decay=0.1,
)

In [5]:
############################################################################# Hatexplain ###################################################################################################

"""All values were determined by Ray Tune hyperparameter search (25 trials) on the hatexplain dataset with the following search space:

        "learning_rate": tune.loguniform(1e-6, 1e-4),
        "num_train_epochs": tune.choice(range(1, 10)),
        "seed": tune.choice(range(1, 41)),
        "per_device_train_batch_size": tune.choice([4, 8, 16, 32, 64]),

"""


def _hatexplain_config(architecture, learning_rate, batch_size, epochs, random_seed,
                       decay_type='linear', warmup_frac=0, weight_decay=0.0):
    """Build one hyperparameter entry for the 'hatexplain' dataset.

    Only the arguments differ between architectures. The Adam settings, both
    dropout rates and ``llrd`` are the same fixed values for every hatexplain
    entry. The keyword defaults are the values tagged DEFAULT in the original
    entries. Every call returns a new dict, so entries never share state.
    """
    return {
        "learning_rate": learning_rate,
        "batch_size": batch_size,
        "architecture": architecture,
        "dataset": 'hatexplain',
        "epochs": epochs,
        "random_seed": random_seed,
        "adam_eps": 1e-8,     # DEFAULT
        "adam_b1": 0.9,       # DEFAULT
        "adam_b2": 0.999,     # DEFAULT
        "llrd": None,
        "decay_type": decay_type,
        "warmup_frac": warmup_frac,
        "attn_dropout": 0.1,  # DEFAULT
        "dropout": 0.1,       # DEFAULT
        "weight_decay": weight_decay,
    }


# BEST RUN CONFIGURATION:  {'learning_rate': 2.113705944064574e-05, 'num_train_epochs': 2, 'seed': 6, 'per_device_train_batch_size': 8}
model_configs['electra']['hatexplain'] = _hatexplain_config(
    'electra', learning_rate=2e-5, batch_size=8, epochs=2, random_seed=6,
)

# BEST RUN CONFIGURATION:  {'learning_rate': 2.00344279275607e-05, 'num_train_epochs': 5, 'seed': 2, 'per_device_train_batch_size': 32}
model_configs['bert']['hatexplain'] = _hatexplain_config(
    'bert', learning_rate=2e-5, batch_size=32, epochs=5, random_seed=2,
)

# BEST RUN CONFIGURATION:  {'learning_rate': 5.899741796710488e-06, 'num_train_epochs': 5, 'seed': 2, 'per_device_train_batch_size': 32}
model_configs['roberta']['hatexplain'] = _hatexplain_config(
    'roberta', learning_rate=6e-6, batch_size=32, epochs=5, random_seed=2,
)

# No tuning output was recorded for this entry.
model_configs['gpt2-medium']['hatexplain'] = _hatexplain_config(
    'gpt2-medium', learning_rate=5e-5, batch_size=32, epochs=6, random_seed=42,
    decay_type='cosine', warmup_frac=0.01, weight_decay=0.1,
)

# Best trial config: {'learning_rate': 9.316976782029757e-06, 'epochs': 1, 'batch_size': 8, 'data': 'hatexplain'}
# Best trial final validation accuracy: 0.6929460580912863
model_configs['opt']['hatexplain'] = _hatexplain_config(
    'opt', learning_rate=9e-6, batch_size=8, epochs=1, random_seed=42,
    decay_type='cosine', warmup_frac=0.01, weight_decay=0.1,
)

In [6]:
def _esnli_config(architecture, learning_rate, batch_size, epochs, random_seed,
                  decay_type='linear', warmup_frac=0, weight_decay=0.0):
    """Build one hyperparameter entry for the 'esnli' dataset.

    Only the arguments differ between architectures. The Adam settings, both
    dropout rates and ``llrd`` are the same fixed values for every esnli
    entry. The keyword defaults are the values tagged DEFAULT in the original
    entries. Every call returns a new dict, so entries never share state.
    """
    return {
        "learning_rate": learning_rate,
        "batch_size": batch_size,
        "architecture": architecture,
        "dataset": 'esnli',
        "epochs": epochs,
        "random_seed": random_seed,
        "adam_eps": 1e-8,     # DEFAULT
        "adam_b1": 0.9,       # DEFAULT
        "adam_b2": 0.999,     # DEFAULT
        "llrd": None,
        "decay_type": decay_type,
        "warmup_frac": warmup_frac,
        "attn_dropout": 0.1,  # DEFAULT
        "dropout": 0.1,       # DEFAULT
        "weight_decay": weight_decay,
    }


# BEST RUN CONFIGURATION:  {'learning_rate': 9.780337016659403e-06, 'num_train_epochs': 3, 'seed': 35, 'per_device_train_batch_size': 8}
model_configs['bert']['esnli'] = _esnli_config(
    'bert', learning_rate=1e-5, batch_size=8, epochs=3, random_seed=35,
)

# BEST RUN RESULT: 0.9157691526112579
# BEST RUN CONFIGURATION:  {'learning_rate': 1.1208547084229366e-05, 'num_train_epochs': 3, 'seed': 28, 'per_device_train_batch_size': 32}
model_configs['electra']['esnli'] = _esnli_config(
    'electra', learning_rate=1e-5, batch_size=32, epochs=3, random_seed=28,
)

# BEST RUN RESULT: 0.9185124974598659
# BEST RUN CONFIGURATION:  {'learning_rate': 4.6496174473363295e-06, 'num_train_epochs': 4, 'seed': 24, 'per_device_train_batch_size': 16}
model_configs['roberta']['esnli'] = _esnli_config(
    'roberta', learning_rate=5e-6, batch_size=16, epochs=4, random_seed=24,
)

# Best trial config: {'learning_rate': 6.11332959997169e-05, 'epochs': 3, 'batch_size': 32, 'data': 'esnli'}
# Best trial final validation loss: 0.34371566041916995
# Best trial final validation accuracy: 0.8742334054834056
model_configs['gpt2-medium']['esnli'] = _esnli_config(
    'gpt2-medium', learning_rate=6e-5, batch_size=32, epochs=3, random_seed=42,
    decay_type='cosine', warmup_frac=0.01, weight_decay=0.1,
)

# Current best trial: ea990_00003 with loss=0.27794001855458095 and params={'learning_rate': 2.0650169550742063e-06, 'epochs': 2, 'batch_size': 16, 'data': 'esnli'}
model_configs['opt']['esnli'] = _esnli_config(
    'opt', learning_rate=2e-6, batch_size=16, epochs=2, random_seed=42,
    decay_type='cosine', warmup_frac=0.01, weight_decay=0.1,
)

In [7]:
# Sanity check before writing: show which datasets now have an OPT entry.
model_configs['opt'].keys()

dict_keys(['sst-2', 'semeval', 'hatexplain', 'esnli'])

In [8]:

# Serialise before opening the file: opening with 'w' truncates config.yaml
# immediately, so dumping straight into the stream would leave an empty or
# partial file behind if yaml.dump raised. With the text built first, a
# serialisation error leaves the previous config.yaml untouched.
config_text = yaml.dump(model_configs)
with open("config.yaml", 'w') as yamlfile:
    yamlfile.write(config_text)
    print("Write successful")


Write successful


In [9]:
# Sanity check after writing: show which datasets have a GPT-2 medium entry.
model_configs['gpt2-medium'].keys()

dict_keys(['sst-2', 'semeval', 'hatexplain', 'esnli'])