Following some of the examples here: https://github.com/pytorch/fairseq/tree/master/examples/roberta

In [1]:
from fairseq.models.roberta import RobertaModel

In [2]:
roberta = RobertaModel.from_pretrained('/projects/deepgreen/pstjohn/roberta_base_checkpoint/',
                                       checkpoint_file='checkpoint_best.pt')
_ = roberta.eval()  # disable dropout (or leave in train mode to finetune)

In [3]:
# https://www.uniprot.org/uniprot/P14618.fasta
example_sequence = \
"""
MSKPHSEAGTAFIQTQQLHAAMADTFLEHMCRLDIDSPPITARNTGIICTIGPASRSVET
LKEMIKSGMNVARLNFSHGTHEYHAETIKNVRTATESFASDPILYRPVAVALDTKGPEIR
TGLIKGSGTAEVELKKGATLKITLDNAYMEKCDENILWLDYKNICKVVEVGSKIYVDDGL
ISLQVKQKGADFLVTEVENGGSLGSKKGVNLPGAAVDLPAVSEKDIQDLKFGVEQDVDMV
FASFIRKASDVHEVRKVLGEKGKNIKIISKIENHEGVRRFDEILEASDGIMVARGDLGIE
IPAEKVFLAQKMMIGRCNRAGKPVICATQMLESMIKKPRPTRAEGSDVANAVLDGADCIM
LSGETAKGDYPLEAVRMQHLIAREAEAAIYHLQLFEELRRLAPITSDPTEATAVGAVEAS
FKCCSGAIIVLTKSGRSAHQVARYRPRAPIIAVTRNPQTARQAHLYRGIFPVLCKDPVQE
AWAEDVDLRVNFAMNVGKARGFFKKGDVVIVLTGWRPGSGFTNTMRVVPVP
"""

Unlike the roberta example on fairseq, we're not using the GPT-2 byte-pair encoder, so the standard `roberta.encode` and `roberta.decode` methods won't work

In [4]:
def encode(sequence):
    input_sequence = ' '.join(sequence.replace('\n', ''))
    return roberta.task.source_dictionary.encode_line(input_sequence)

tokens = encode(example_sequence)
tokens

tensor([20,  8, 15, 14, 21,  8,  9,  5,  6, 11,  5, 17, 12, 16, 11, 16, 16,  4,
        21,  5,  5, 20,  5, 13, 11, 17,  4,  9, 21, 20, 23, 10,  4, 13, 12, 13,
         8, 14, 14, 12, 11,  5, 10, 18, 11,  6, 12, 12, 23, 11, 12,  6, 14,  5,
         8, 10,  8,  7,  9, 11,  4, 15,  9, 20, 12, 15,  8,  6, 20, 18,  7,  5,
        10,  4, 18, 17,  8, 21,  6, 11, 21,  9, 19, 21,  5,  9, 11, 12, 15, 18,
         7, 10, 11,  5, 11,  9,  8, 17,  5,  8, 13, 14, 12,  4, 19, 10, 14,  7,
         5,  7,  5,  4, 13, 11, 15,  6, 14,  9, 12, 10, 11,  6,  4, 12, 15,  6,
         8,  6, 11,  5,  9,  7,  9,  4, 15, 15,  6,  5, 11,  4, 15, 12, 11,  4,
        13, 18,  5, 19, 20,  9, 15, 23, 13,  9, 18, 12,  4, 22,  4, 13, 19, 15,
        18, 12, 23, 15,  7,  7,  9,  7,  6,  8, 15, 12, 19,  7, 13, 13,  6,  4,
        12,  8,  4, 16,  7, 15, 16, 15,  6,  5, 13, 17,  4,  7, 11,  9,  7,  9,
        18,  6,  6,  8,  4,  6,  8, 15, 15,  6,  7, 18,  4, 14,  6,  5,  5,  7,
        13,  4, 14,  5,  7,  8,  9, 15, 

In [5]:
import torch
if torch.cuda.is_available():
    print("Using the GPU")
    roberta.cuda()

with torch.no_grad():
    features = roberta.extract_features(tokens.to(torch.int64))

Using the GPU


In [20]:
import fairseq

In [25]:
args = {
    'user-dir': '/home/pstjohn/Research/20201119_fairseq/go_annotation/fairseq_layers',
    'restore-file': '/projects/deepgreen/pstjohn/roberta_base_checkpoint/checkpoint_best.pt',
    'max-positions': '512',
    'shorten-method': 'random_crop',
    'tokens-per-sample': '1024',
    'task': 'sentence_prediction',
    'reset-optimizer': True,
    'reset-dataloader': True,
    'reset-meters': True,
}

task = fairseq.tasks.setup_task(args)

AssertionError: Could not infer task type from {'user-dir': '/home/pstjohn/Research/20201119_fairseq/go_annotation/fairseq_layers', 'restore-file': '/projects/deepgreen/pstjohn/roberta_base_checkpoint/checkpoint_best.pt', 'max-positions': '512', 'shorten-method': 'random_crop', 'tokens-per-sample': '1024', 'task': 'sentence_prediction', 'reset-optimizer': True, 'reset-dataloader': True, 'reset-meters': True}

In [26]:
parser = fairseq.options.get_training_parser()

In [28]:
from fairseq.dataclass.utils import convert_namespace_to_omegaconf

In [40]:
convert_namespace_to_omegaconf(parser.parse_known_args())

{'_name': None, 'common': {'_name': None, 'no_progress_bar': False, 'log_interval': 100, 'log_format': None, 'tensorboard_logdir': None, 'wandb_project': None, 'seed': 1, 'cpu': False, 'tpu': False, 'bf16': False, 'memory_efficient_bf16': False, 'fp16': False, 'memory_efficient_fp16': False, 'fp16_no_flatten_grads': False, 'fp16_init_scale': 128, 'fp16_scale_window': None, 'fp16_scale_tolerance': 0.0, 'min_loss_scale': 0.0001, 'threshold_loss_scale': None, 'user_dir': None, 'empty_cache_freq': 0, 'all_gather_list_size': 16384, 'model_parallel_size': 1, 'quantization_config_path': None, 'profile': False, 'reset_logging': True}, 'common_eval': {'_name': None, 'path': None, 'post_process': None, 'quiet': False, 'model_overrides': '{}', 'results_path': None}, 'distributed_training': {'_name': None, 'distributed_world_size': 1, 'distributed_rank': 0, 'distributed_backend': 'nccl', 'distributed_init_method': None, 'distributed_port': -1, 'device_id': 0, 'distributed_no_spawn': False, 'ddp_ba