## Imports

In [2]:
import numpy as np
import pandas as pd
import jsonlines
import gc
import torch
from torch import nn

from IPython.display import display, clear_output

from transformers import AutoTokenizer, AutoModelForMaskedLM, AutoModel
from transformers.activations import GELUActivation
from transformers.modeling_outputs import MaskedLMOutput
from transformers import DataCollatorForWholeWordMask
from datasets import load_from_disk, load_dataset
from transformers import BertTokenizer, DistilBertTokenizer
import evaluate

import wandb
# wandb.init(project="kg-lm-integration", entity="tanny411")

from huggingface_hub import notebook_login
# notebook_login()

emb_tsv_file = "wikidata_translation_v1.tsv"

In [3]:
from transformers.data.data_collator import _torch_collate_batch

In [4]:
# torch.cuda.is_available(), torch.cuda.device_count(), torch.cuda.current_device(), torch.cuda.get_device_name(0)

In [5]:
bert_model_name = "distilbert-base-uncased" ##"bert-base-cased"

# Initialize Project

In [6]:
from huggingface_hub import notebook_login

notebook_login()

#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

Login successful
Your token has been saved to /home/xiangru/.huggingface/token
[1m[31mAuthenticated through git-credential store but this isn't the helper defined on your machine.
You might have to re-authenticate when pushing to the Hugging Face Hub. Run the following command in your terminal in case you want to set this credential helper as the default

git config --global credential.helper store[0m


Download:
- embeds_wktxt.csv
- [linked-wikitext-2 dataset](https://rloganiv.github.io/linked-wikitext-2/#/) and unzip

# Tokenization

- `tokens` are the given list of tokens from wikitext2
- `input_ids` are what come from tokenization, they divide certain words into multiple pieces, and each sentence has a CLS and a SEP
- `word_tokens` is the length of `tokens`. For each token in `token`, it mentions how many sub-words it was divided into due to word piece tokenization
- `cummulative_word_tokens` is a cummulative sum of `word_tokens`, with an extra 0 in the beginning

##### Process
index of a token in `token` can be found in `input_ids` by `cummulative_word_tokens`. if `ix` is the index of a word in `token`, its beginning index in `input_ids` is `cummulative_word_tokens[ix] + 1`, the +1 is because `input_ids` has a CLS in the beginning. `token[ix]` spans from `input_ids[cummulative_word_tokens[ix] + 1]` to `input_ids[cummulative_word_tokens[ix+1] + 1]`

In [7]:
embeds_wktxt = pd.read_csv("embeds_wktxt.csv")

linked_wikitext_2 = "linked-wikitext-2/"
train = linked_wikitext_2+"train.jsonl"
valid = linked_wikitext_2+"valid.jsonl"
test = linked_wikitext_2+"test.jsonl"

data_files = {"train": train, "valid": valid, "test": test}
wikitest2_dataset = load_dataset("json", data_files=data_files)

chunk_size = 128

class BertTokenizerModified(DistilBertTokenizer): #BertTokenizer
    def __init__(self,vocab_file,**kwargs):
        
        super().__init__(vocab_file, never_split=["@@START@@", "@@END@@", "@@start@@", "@@end@@"], **kwargs)
    
        self.tokenized_list = []

    def _tokenize(self, text):
        token_list = text.split()
        split_tokens = []
        tokenized_list = []
        
        if self.do_basic_tokenize:
            for token in token_list:

                # If the token is part of the never_split set
                if token in self.basic_tokenizer.never_split:
                    split_tokens.append(token)
                    tokenized_list.append(1)
                else:
                    word_tokenized = self.wordpiece_tokenizer.tokenize(token)
                    split_tokens += word_tokenized
                    tokenized_list.append(len(word_tokenized))

        self.tokenized_list.append(tokenized_list)
        return split_tokens
    
def get_cumm(vals):
    cumm = 0
    res = [0] ## len of res is 1 more than vals, with an initial 0
    for val in vals:
        cumm += val
        res.append(cumm)
    return res

def my_tokenize_function(data):
    
    ## tokenize
    my_tokenizer.tokenized_list = []
    result = my_tokenizer([" ".join(eg) for eg in data["tokens"]])
    if my_tokenizer.is_fast:
        result["word_ids"] = [result.word_ids(i) for i in range(len(result["input_ids"]))]
    
    ## save word to token mapping
    ## 3, 1, 1 means the first word got divided into 3 tokens, the next into 1, and the next into 1 again
    result["word_tokens"] = my_tokenizer.tokenized_list
    result["cummulative_word_tokens"] = [get_cumm(x) for x in result["word_tokens"]]
    
    return result

def get_kg_embedding_batched(data):
    
    ## store a masking array that says whether or not an item has kg embedding
    """
    When you specify batched=True the function receives a dictionary with the fields of the dataset, 
    but each value is now a list of values, and not just a single value. 
    """
    input_ids_list = data["input_ids"]
    annotations_list = data['annotations']
    cummulative_word_tokens_list = data["cummulative_word_tokens"]
    
    batch_size = len(input_ids_list)
    embed_list = [] ## len will be batch_size
    embed_mask = []
    
    for i in range(batch_size):
        input_ids = input_ids_list[i]
        annotations = annotations_list[i]
        
        ## Replace zeros with random numbers if required
        embeds = np.zeros((len(input_ids), 200)) ## CLS, SEP will have np.zeros, like unknown words
        mask = [0]*len(input_ids)
        
        for annot in annotations:
            start_ix, end_ix = annot['span']
            start = cummulative_word_tokens_list[i][start_ix] + 1
            end = cummulative_word_tokens_list[i][end_ix] + 1
            
            qid = annot['id']
            df = embeds_wktxt[embeds_wktxt['id']==qid]
            if len(df)>0:
                embeds[start:end] = np.tile(df.iloc[0,1:].values.reshape((1,200)),(end-start, 1))
                mask[start:end] = [1]*(end-start)
                
        embed_mask.append(mask)
        embed_list.append(embeds)

    return {"kg_embedding": embed_list, "kg_embedding_mask": embed_mask}

def filter_text_batched(data):
    input_ids_list = data["input_ids"]
    kg_embeds_list = data['kg_embedding']
    
    indices_list = [[i for i,input_id in enumerate(input_ids) if input_id!=100]
                        for input_ids in input_ids_list]
    new_input_ids_list = []
    new_kg_embedding_list = []
        
    for indices, input_ids, kg_embeds in zip(indices_list, input_ids_list, kg_embeds_list):
        
        new_input_ids = [input_ids[ind] for ind in indices]
        new_kg_embeds = [kg_embeds[ind] for ind in indices]
        
        new_input_ids_list.append(new_input_ids)
        new_kg_embedding_list.append(new_kg_embeds)
        
    return {"input_ids": new_input_ids_list, "kg_embedding": new_kg_embedding_list}

def truncate_data(data):
    maxlength = my_tokenizer.max_model_input_sizes[bert_model_name]

    ## truncate to maxlength
    data["input_ids"] = [x[:maxlength] for x in data["input_ids"]]
    data["attention_mask"] = [x[:maxlength] for x in data["attention_mask"]]
    data["kg_embedding"] = [x[:maxlength] for x in data["kg_embedding"]]
    data["kg_embedding_mask"] = [x[:maxlength] for x in data["kg_embedding_mask"]]
    
    return {
                "input_ids": data["input_ids"], 
                "kg_embedding":  data["kg_embedding"],
                "attention_mask": data["attention_mask"],
                "kg_embedding_mask": data["kg_embedding_mask"],
           }


def group_texts(examples):
    # Concatenate all texts
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    # Compute length of concatenated texts
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the last chunk if it's smaller than chunk_size
    total_length = (total_length // chunk_size) * chunk_size
    # Split by chunks of max_len
    result = {
        k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)]
        for k, t in concatenated_examples.items()
    }
    
    # Create a new labels column
#     result["labels"] = result["input_ids"].copy()
    return result

Using custom data configuration default-11b78ea2993030be
Found cached dataset json (/home/xiangru/.cache/huggingface/datasets/json/default-11b78ea2993030be/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab)


  0%|          | 0/3 [00:00<?, ?it/s]

In [8]:
my_tokenizer = BertTokenizerModified.from_pretrained(bert_model_name)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DistilBertTokenizer'. 
The class this function is called from is 'BertTokenizerModified'.


In [9]:
# filtered_tokenized_kg_wikitest2_dataset_file = "filtered_tokenized_kg_wikitest2_dataset_file"
# filtered_tokenized_kg_wikitest2_dataset = wikitest2_dataset.map(my_tokenize_function, batched=True)\
#                                                   .map(get_kg_embedding_batched, 
#                                                          batched=True, 
#                                                          batch_size=100, 
#                                                          keep_in_memory=False)\
#                                                   .map(filter_text_batched, 
#                                                          batched=True, 
#                                                          batch_size=100, 
#                                                          keep_in_memory=False)\
#                                                   .map(truncate_data,
#                                                         batched=True, 
#                                                         batch_size=100, 
#                                                         keep_in_memory=False)

# filtered_tokenized_kg_wikitest2_dataset.save_to_disk(filtered_tokenized_kg_wikitest2_dataset_file)

In [10]:
dataset_file = "concat_dataset"

In [11]:
final_dataset = wikitest2_dataset.map(my_tokenize_function, batched=True)\
                          .map(get_kg_embedding_batched, batched=True, batch_size=100, keep_in_memory=False)\
                          .map(filter_text_batched, batched=True, batch_size=100, keep_in_memory=False)\
                          .remove_columns(['title', 'tokens', 'annotations', 'word_tokens', 'cummulative_word_tokens'])\
                          .map(group_texts, batched=True, batch_size=100, keep_in_memory=False)\

final_dataset.save_to_disk(dataset_file)

  0%|          | 0/1 [00:00<?, ?ba/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (3327 > 512). Running this sequence through the model will result in indexing errors


  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/6 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/6 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/6 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [14]:
## Load the saved tokenized dataset
final_dataset = load_from_disk(dataset_file)

In [97]:
## Create Data Collator for Masking

from typing import Any, Callable, Dict, List, NewType, Optional, Tuple, Union
from collections.abc import Mapping

class CustomDataCollator(DataCollatorForWholeWordMask):
    def torch_call(self, examples: List[Union[List[int], Any, Dict[str, Any]]]) -> Dict[str, Any]:
        if isinstance(examples[0], Mapping):
            input_ids = [e["input_ids"] for e in examples]
            kg_embedding_mask = [e["kg_embedding_mask"] for e in examples]
            kg_embedding = [e["kg_embedding"] for e in examples]    #new
            
        else:
            raise Exception("Dataset needs to be in dictionary format")

        batch_input = _torch_collate_batch(input_ids, self.tokenizer, pad_to_multiple_of=self.pad_to_multiple_of)

        mask_labels = kg_embedding_mask
        batch_mask = _torch_collate_batch(mask_labels, self.tokenizer, pad_to_multiple_of=self.pad_to_multiple_of)
        inputs, labels = self.torch_mask_tokens(batch_input, batch_mask)
        
        #return {"input_ids": inputs, "labels": labels}
        return {"input_ids": inputs, "labels": labels, "kg_embedding":kg_embedding, "kg_embedding_mask":kg_embedding_mask}          #new
    

In [98]:
#final_dataset['train']['kg_embedding_mask'][0]

# Integrator

In [159]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class WordItgtor(nn.Module):
    
    # seems like the dimension of the language model embedding will be 768 while that for 
    # knowledge graph embedding is 200, the module is kind of built based on this fact that 
    # the former has larger dimension
    def __init__(self, embed_dim_lm, embed_dim_kg):
        super(WordItgtor, self).__init__()
        
        self.tt_embed_dim = embed_dim_lm, embed_dim_kg
        
        self.fc_kg = nn.Linear(embed_dim_kg,embed_dim_lm)
        self.fc_lm = nn.Linear(embed_dim_lm,embed_dim_lm)
        self.fc1 = nn.Linear(embed_dim_lm * 2, embed_dim_lm * 4)
        self.fc2 = nn.Linear(embed_dim_lm * 4, embed_dim_lm * 2)
        self.fc3 = nn.Linear(embed_dim_lm * 2, embed_dim_lm)
        

    
    
    def forward(self, x_lm, x_kg, kg_mask=None):        
        # x_lm_raw is the embedding of a single sentence from language model with size 
        # ((number of words) * embed_dim_lm)
        
        # x_kg is the embedding of a single sentence from knowledge graph with size 
        # ((number of words) * embed_dim_lm)
        
#         if kg_mask == 0:
#             return x_lm
        
        x_kg = self.fc_kg(x_kg)
        x_lm = self.fc_lm(x_lm)
        
        x = torch.cat((x_lm,x_kg),dim=-1)
        
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        
        return x

In [160]:
net = WordItgtor(768,200)

# Create Model

In [161]:
class BERTModified(nn.Module):
    def __init__(self, bert_model_name):
        super().__init__()

        self.base_model = AutoModel.from_pretrained(bert_model_name)
        self.config = self.base_model.config
        
        self.activation = GELUActivation() # for distilbert
        self.vocab_transform = nn.Linear(self.config.dim, self.config.dim)
        self.vocab_layer_norm = nn.LayerNorm(self.config.dim, eps=1e-12)
        self.vocab_projector = nn.Linear(self.config.dim, self.config.vocab_size)

        self.mlm_loss_fct = nn.CrossEntropyLoss()
        
        ## set to eval
        self.base_model.eval()
        
        ## freeze model
        for param in self.base_model.parameters():
            param.requires_grad = False
            
        ## initialization of integrator
        self.itgt = WordItgtor(self.config.dim,200)

    def forward(
        self,
        kg_embedding = None,           ## given
        kg_embedding_mask = None,      ## given
        input_ids = None,              ## given
        attention_mask = None,         ## given
        head_mask = None,
        inputs_embeds = None,
        labels = None,                 ## given
        output_attentions = None,
        output_hidden_states = None,
        return_dict= None,):
        
        base_model_output = self.base_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        
        ## Get LM embedding
        hidden_states = base_model_output[0]  # (bs, seq_length, dim)
        
        ## TODO: Use hidden_states and kg_embedding and perform INTEGRATION
        #prediction_logits = self.vocab_transform(hidden_states)  # (bs, seq_length, dim)
        
        #print(kg_embedding)
        kg_embedding = torch.tensor(kg_embedding).to(device='cuda:0')
        
        prediction_logits = self.itgt(hidden_states, kg_embedding)
        
        prediction_logits = self.activation(prediction_logits)  # (bs, seq_length, dim)
        prediction_logits = self.vocab_layer_norm(prediction_logits)  # (bs, seq_length, dim)
        
        
        
        
        prediction_logits = self.vocab_projector(prediction_logits)  # (bs, seq_length, vocab_size)

        mlm_loss = None
        if labels is not None:
            mlm_loss = self.mlm_loss_fct(prediction_logits.view(-1, prediction_logits.size(-1)), labels.view(-1))

        return MaskedLMOutput(
            loss=mlm_loss,
            logits=prediction_logits,
            hidden_states=base_model_output.hidden_states,
            attentions=base_model_output.attentions,
        )

In [162]:
BERTModified_model = BERTModified(bert_model_name)

loading configuration file config.json from cache at /home/xiangru/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/043235d6088ecd3dd5fb5ca3592b6913fd516027/config.json
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.24.0",
  "vocab_size": 30522
}

loading weights file pytorch_model.bin from cache at /home/xiangru/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/043235d6088ecd3dd5fb5ca3592b6913fd516027/pytorch_model.bin
Some weights of the model checkpoint at distilbert-base-uncased were 

In [188]:
data_collator = CustomDataCollator(tokenizer=my_tokenizer, mlm=True, mlm_probability=0.15)
# DataCollatorForWholeWordMask(tokenizer=my_tokenizer, mlm=True, mlm_probability=0.15)

In [189]:
# # To test model with smaller sample dataset

# train_size = 1000
# test_size = 100

# downsampled_dataset = final_dataset["train"].train_test_split(train_size=train_size, test_size=test_size, seed=42)
# downsampled_dataset

In [190]:
metric_name = evaluate.load("perplexity") # accuracy

In [194]:
from transformers import TrainingArguments

batch_size = 16

# Show the training loss with every epoch
logging_steps = len(final_dataset['train']) // batch_size
model_name = "BERTModified"
output_dir = f"{model_name}-finetuned-wikitext-test"

training_args = TrainingArguments(
    output_dir=output_dir,
    overwrite_output_dir=True,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    push_to_hub=True,
#     fp16=True,
    logging_steps=logging_steps,
    num_train_epochs=5,
#     load_best_model_at_end=True,
#     metric_for_best_model="loss",#metric_name,
#     greater_is_better = False,
    logging_dir='logs',
    #report_to="wandb",
#     no_cuda=True,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


metric_for_best_model (str, optional) — Use in conjunction with load_best_model_at_end to specify the metric to use to compare two different models. Must be the name of a metric returned by the evaluation with or without the prefix "eval_". Will default to "loss" if unspecified and load_best_model_at_end=True (to use the evaluation loss).

If you set this value, greater_is_better will default to True. Don’t forget to set it to False if your metric is better when lower.

In [195]:
from transformers import Trainer

trainer = Trainer(
    model=BERTModified_model,
    args=training_args,
    train_dataset=final_dataset["train"],
    eval_dataset=final_dataset["valid"],
    data_collator=data_collator,
)

/home/xiangru/KGLM/KG-LM-Integration/BERTModified-finetuned-wikitext-test is already a clone of https://huggingface.co/HideOnBush/BERTModified-finetuned-wikitext-test. Make sure you pull the latest changes with `repo.git_pull()`.


In [196]:
# samples = [trainer.eval_dataset[i] for i in range(2)]
# dc = data_collator(samples)
# kg_embedding_mask = [sample['kg_embedding_mask'] for sample in samples]

# for id_list, label_list, mask_list in zip(dc["input_ids"], dc["labels"], kg_embedding_mask):
#     tokens = my_tokenizer.convert_ids_to_tokens(id_list)
#     labels = my_tokenizer.convert_ids_to_tokens(label_list)
#     for token, label, mask in zip(tokens, labels, mask_list):
#         print(token, '\t', label, '\t', mask)
#     break

In [197]:
import math

eval_results = trainer.evaluate()
print(f">>> Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

***** Running Evaluation *****
  Num examples = 1798
  Batch size = 16


Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


>>> Perplexity: 725.76


In [198]:
# wandb key:
#50852f0a580f4b3a3fcc15b5e605a0eac243351d

In [199]:
trainer.train()
# trainer.save_model("output/models/BERTModified")

***** Running training *****
  Num examples = 17528
  Num Epochs = 5
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 5480
  Number of trainable parameters = 35430714


Epoch,Training Loss,Validation Loss
1,6.1831,6.1345
2,5.7259,5.920311
3,5.4691,5.800306
4,5.3219,5.745775
5,5.2342,5.707208


***** Running Evaluation *****
  Num examples = 1798
  Batch size = 16
Saving model checkpoint to BERTModified-finetuned-wikitext-test/checkpoint-1096
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
***** Running Evaluation *****
  Num examples = 1798
  Batch size = 16
Saving model checkpoint to BERTModified-finetuned-wikitext-test/checkpoint-2192
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
***** Running Evaluation *****
  Num examples = 1798
  Batch size = 16
Saving model checkpoint to BERTModified-finetuned-wikitext-test/checkpoint-3288
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
***** Running Evaluation *****
  Num examples = 1798
  Batch size = 16
Saving model checkpoint to BERTModified-finetuned-wikitext-test/checkpoint-4384
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
***** Running Evaluation *****
  Num examples = 1798
  Batch size = 16
Saving model checkpoint to BERTModified-finet

TrainOutput(global_step=5480, training_loss=5.5864850743843695, metrics={'train_runtime': 4062.3415, 'train_samples_per_second': 21.574, 'train_steps_per_second': 1.349, 'total_flos': 0.0, 'train_loss': 5.5864850743843695, 'epoch': 5.0})

In [201]:
import math

eval_results = trainer.evaluate()
print(f">>> Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

***** Running Evaluation *****
  Num examples = 1798
  Batch size = 16


>>> Perplexity: 303.41


In [200]:
## Need to login to huggingface to push to hub

trainer.push_to_hub()

Saving model checkpoint to BERTModified-finetuned-wikitext-test
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Several commits (2) will be pushed upstream.
The progress bars may be unreliable.


Upload file pytorch_model.bin:   0%|          | 32.0k/388M [00:00<?, ?B/s]

remote: Scanning LFS files for validity, may be slow...        
remote: LFS file scan complete.        
To https://huggingface.co/HideOnBush/BERTModified-finetuned-wikitext-test
   e8247fd..85338e3  main -> main



In [186]:
from transformers import pipeline

# Initialize MLM pipeline
mlm = pipeline('fill-mask', model=BERTModified_model.to(device='cuda:0'), tokenizer=my_tokenizer)

# Get mask token
mask = mlm.tokenizer.mask_token

# Get result for particular masked phrase
phrase = f'Wikipedia is a free online {mask}, created and edited by volunteers around the world'


result = mlm(phrase)

# Print result
print(result)

RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu! (when checking argument for argument index in method wrapper__index_select)

In [35]:
mlm

<transformers.pipelines.fill_mask.FillMaskPipeline at 0x7fdea5a63128>

In [None]:
for x in result:
    print(f">>> {x['sequence']}")