## Imports

In [1]:
import numpy as np
import pandas as pd
import jsonlines
import gc
import torch
from torch import nn

from IPython.display import display, clear_output

from transformers import AutoTokenizer, AutoModelForMaskedLM, AutoModel
from transformers.activations import GELUActivation
from transformers.modeling_outputs import MaskedLMOutput
from transformers import DataCollatorForWholeWordMask
from datasets import load_from_disk, load_dataset
from transformers import BertTokenizer, DistilBertTokenizer
from transformers.data.data_collator import _torch_collate_batch
import evaluate

import wandb
# wandb.init(project="kg-lm-integration", entity="tanny411")

from huggingface_hub import notebook_login

# Authenticate interactively. Never paste an access token into the notebook,
# not even inside a comment: notebooks get shared and committed. Any token that
# was previously written in this cell must be treated as leaked and revoked in
# the Hugging Face account settings.
notebook_login()

# File name of the Wikidata embedding dump (not referenced by the cells shown below).
emb_tsv_file = "wikidata_translation_v1.tsv"

Login successful
Your token has been saved to /home/xiangru/.huggingface/token
[1m[31mAuthenticated through git-credential store but this isn't the helper defined on your machine.
You might have to re-authenticate when pushing to the Hugging Face Hub. Run the following command in your terminal in case you want to set this credential helper as the default

git config --global credential.helper store[0m


In [2]:
import sys
# Show the interpreter's current recursion limit before changing it.
print(sys.getrecursionlimit())

# Raise the limit for the whole kernel session (process-wide side effect).
# NOTE(review): the reason this is needed is not visible in this notebook --
# presumably a deeply recursive hashing/pickling step; confirm it is still required.
sys.setrecursionlimit(100000)
# but doing so is dangerous -- the standard limit is a little conservative, but Python stackframes can be quite big.

3000


In [3]:
# torch.cuda.is_available(), torch.cuda.device_count(), torch.cuda.current_device(), torch.cuda.get_device_name(0)

In [4]:
bert_model_name = "distilbert-base-uncased" ##"bert-base-cased"

# Initialize Project

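Download into the notebook's working directory:
- `embeds_wktxt.csv` and `qids_wktxt2.csv` (both are read by the data-loading cell below)
- [linked-wikitext-2 dataset](https://rloganiv.github.io/linked-wikitext-2/#/), unzipped to `linked-wikitext-2/`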

# Tokenization

- `tokens` is the given list of word-level tokens from wikitext2
- `input_ids` are what comes out of tokenization: some words are divided into multiple word pieces, and each sentence gets a CLS at the start and a SEP at the end
- `word_tokens` has the same length as `tokens`. For each token in `tokens`, it records how many sub-words that token was divided into by word-piece tokenization
- `cummulative_word_tokens` is the cumulative sum of `word_tokens`, with an extra 0 at the beginning (so it is one element longer than `word_tokens`)

##### Process
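The position of a token from `tokens` inside `input_ids` is found through `cummulative_word_tokens`. If `ix` is the index of a word in `tokens`, its first sub-word sits at `input_ids[cummulative_word_tokens[ix] + 1]`; the +1 accounts for the CLS at the beginning of `input_ids`. `tokens[ix]` therefore spans `input_ids[cummulative_word_tokens[ix] + 1 : cummulative_word_tokens[ix+1] + 1]` (end index exclusive). For example, with `word_tokens = [3, 1, 1]` we get `cummulative_word_tokens = [0, 3, 4, 5]`, so word 0 occupies `input_ids[1:4]` and word 1 occupies `input_ids[4:5]`.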

In [5]:
# KG lookup tables used by get_kg_embedding_batched: both are searched by their "id" column.
embeds_wktxt = pd.read_csv("embeds_wktxt.csv")
qids_wktxt = pd.read_csv("qids_wktxt2.csv")

# Locations of the three Linked WikiText-2 splits (JSON-lines files).
linked_wikitext_2 = "linked-wikitext-2/"
train = f"{linked_wikitext_2}train.jsonl"
valid = f"{linked_wikitext_2}valid.jsonl"
test = f"{linked_wikitext_2}test.jsonl"

data_files = dict(train=train, valid=valid, test=test)
wikitest2_dataset = load_dataset("json", data_files=data_files)

# Number of sub-word positions per training example (consumed by group_texts).
chunk_size = 128

class BertTokenizerModified(DistilBertTokenizer): #BertTokenizer
    """DistilBERT tokenizer that records how many word pieces each word produced.

    Input text is split on whitespace only, and every resulting word is handed
    straight to the word-piece tokenizer. After each `_tokenize` call one list
    of per-word piece counts is appended to `self.tokenized_list`, so callers
    can map word indices to sub-word positions.

    NOTE(review): no lower-casing or punctuation splitting happens in
    `_tokenize`; confirm this is intended when loading an uncased checkpoint.
    """

    def __init__(self, vocab_file, **kwargs):
        # The entity boundary markers must survive tokenization as single tokens.
        protected_markers = ["@@START@@", "@@END@@", "@@start@@", "@@end@@"]
        super().__init__(vocab_file, never_split=protected_markers, **kwargs)

        # One entry (a list of piece counts) is appended per `_tokenize` call.
        self.tokenized_list = []

    def _tokenize(self, text):
        """Return the word pieces of `text` and log the per-word piece counts.

        When `do_basic_tokenize` is false nothing is tokenized: an empty token
        list is returned and an empty count list is logged.
        """
        pieces = []
        pieces_per_word = []

        words = text.split() if self.do_basic_tokenize else []
        for word in words:
            if word in self.basic_tokenizer.never_split:
                # Protected markers are kept whole and count as one piece.
                sub_tokens = [word]
            else:
                sub_tokens = self.wordpiece_tokenizer.tokenize(word)
            pieces.extend(sub_tokens)
            pieces_per_word.append(len(sub_tokens))

        self.tokenized_list.append(pieces_per_word)
        return pieces
    
def get_cumm(vals):
    """Return the running totals of `vals`, preceded by a leading 0.

    The result is one element longer than `vals`; e.g. [3, 1, 1] -> [0, 3, 4, 5].
    """
    running_totals = [0]
    for count in vals:
        # Each new entry is the previous total plus the current value.
        running_totals.append(running_totals[-1] + count)
    return running_totals

def my_tokenize_function(data):
    """Tokenize a batch of pre-split sentences and attach word-to-piece bookkeeping.

    `data["tokens"]` holds one list of words per example. The words are joined
    with single spaces and run through the module-level `my_tokenizer`. Two
    extra columns are added to the tokenizer output:

    - "word_tokens": per example, how many word pieces each word became
      (3, 1, 1 means the first word was split into 3 pieces, the next two into 1 each)
    - "cummulative_word_tokens": `get_cumm` of the above (leading 0 included)
    """
    # Start from an empty log so it only contains entries for this batch.
    my_tokenizer.tokenized_list = []

    sentences = [" ".join(words) for words in data["tokens"]]
    result = my_tokenizer(sentences)

    if my_tokenizer.is_fast:
        n_rows = len(result["input_ids"])
        result["word_ids"] = [result.word_ids(row) for row in range(n_rows)]

    pieces_per_word = my_tokenizer.tokenized_list
    result["word_tokens"] = pieces_per_word
    result["cummulative_word_tokens"] = [get_cumm(counts) for counts in pieces_per_word]

    return result

def get_kg_embedding_batched(data, embeds_df=None, qids_df=None, embed_dim=200):
    """Attach knowledge-graph (KG) features to every sub-word position of a batch.

    Written for `Dataset.map(..., batched=True)`: every value in `data` is a
    list with one entry per example.

    Args:
        data: batch with the columns
            "input_ids" (sub-word ids per example),
            "annotations" (per example, a list of dicts with a 'span'
            [start_word, end_word) pair and an entity 'id'), and
            "cummulative_word_tokens" (see `get_cumm`).
        embeds_df: DataFrame with an "id" column; the columns after the first
            one hold the embedding values. Defaults to the module-level
            `embeds_wktxt`.
        qids_df: DataFrame with an "id" column whose index label is used as the
            entity class. Defaults to the module-level `qids_wktxt`.
        embed_dim: width of a KG embedding vector.

    Returns:
        dict of four per-example columns, each aligned with "input_ids":
        "kg_embedding" (array of shape (len(input_ids), embed_dim), zeros where
        no entity embedding applies, including CLS/SEP),
        "kg_embedding_mask" (1 where an embedding was written, else 0),
        "kg_embedding_mask_qid" (entity id string, '0' elsewhere) and
        "kg_embedding_mask_index" (index label of the entity in `qids_df`,
        -100 elsewhere or when the entity is missing from `qids_df`).

    Side effects:
        Prints the fraction of annotations whose id was found in `qids_df`
        (skipped when the batch has no annotations, which used to raise
        ZeroDivisionError).
    """
    if embeds_df is None:
        embeds_df = embeds_wktxt
    if qids_df is None:
        qids_df = qids_wktxt

    # Build the id lookups once per batch instead of scanning both DataFrames
    # for every single annotation. `setdefault` keeps the FIRST match, like the
    # original `df[df["id"] == qid].iloc[0]` / `.index.tolist()[0]`.
    embed_row_of = {}
    for position, key in enumerate(embeds_df["id"].tolist()):
        embed_row_of.setdefault(key, position)
    qid_index_of = {}
    for label, key in zip(qids_df.index.tolist(), qids_df["id"].tolist()):
        qid_index_of.setdefault(key, label)

    vector_cache = {}  # qid -> embedding vector, so repeated entities are read once

    embed_list = []
    embed_mask = []
    embed_mask_qid = []
    embed_mask_index = []

    annotated = 0  # annotations seen in this batch
    resolved = 0   # annotations whose id exists in qids_df

    batch = zip(data["input_ids"], data["annotations"], data["cummulative_word_tokens"])
    for input_ids, annotations, offsets in batch:
        seq_len = len(input_ids)

        ## Replace zeros with random numbers if required
        embeds = np.zeros((seq_len, embed_dim))  # CLS, SEP keep zeros, like unknown words
        mask = [0] * seq_len
        mask_qid = ['0'] * seq_len
        mask_index = [-100] * seq_len

        for annot in annotations:
            start_ix, end_ix = annot['span']
            # +1 skips the CLS token that precedes the first word piece.
            start = offsets[start_ix] + 1
            end = offsets[end_ix] + 1
            span = end - start

            qid = annot['id']

            annotated += 1
            if qid in qid_index_of:
                qid_index = qid_index_of[qid]
                resolved += 1
            else:
                qid_index = -100

            row = embed_row_of.get(qid)
            if row is None:
                # No embedding for this entity: leave zeros / default markers.
                continue

            if qid not in vector_cache:
                values = embeds_df.iloc[row, 1:].values
                vector_cache[qid] = np.asarray(values, dtype=float).reshape(embed_dim)

            embeds[start:end] = vector_cache[qid]  # broadcast over the span
            mask[start:end] = [1] * span
            mask_qid[start:end] = [qid] * span
            mask_index[start:end] = [qid_index] * span

        embed_list.append(embeds)
        embed_mask.append(mask)
        embed_mask_qid.append(mask_qid)
        embed_mask_index.append(mask_index)

    if annotated:
        print(resolved / annotated)

    return {
        "kg_embedding": embed_list,
        "kg_embedding_mask": embed_mask,
        "kg_embedding_mask_qid": embed_mask_qid,
        "kg_embedding_mask_index": embed_mask_index
    }

def filter_text_batched(data):
    """Drop every position whose input id is 100 ([UNK]) from all columns of a batch.

    The positions to keep are computed per example from `data["input_ids"]`
    and then applied to every column, so all columns stay aligned.
    Returns a new dict with the same keys; `data` itself is not modified.
    """
    ## remove [UNK] == 100
    keep_positions = [
        [pos for pos, token_id in enumerate(row) if token_id != 100]
        for row in data["input_ids"]
    ]

    return {
        column: [
            [values[pos] for pos in kept]
            for kept, values in zip(keep_positions, data[column])
        ]
        for column in data
    }

def truncate_data(data):
    """Clip every sequence of every column to the tokenizer's maximum input size.

    The limit is looked up in `my_tokenizer.max_model_input_sizes` under the
    module-level `bert_model_name`. `data` is updated in place and returned.
    """
    limit = my_tokenizer.max_model_input_sizes[bert_model_name]

    ## truncate to the limit, column by column
    clipped = {
        column: [sequence[:limit] for sequence in rows]
        for column, rows in data.items()
    }
    data.update(clipped)

    return data


def group_texts(examples, block_size=None):
    """Concatenate all examples of a batch and re-cut them into fixed-size chunks.

    Args:
        examples: batch dict mapping each column name to a list of sequences
            (all columns are expected to be aligned position by position).
        block_size: chunk length; defaults to the module-level `chunk_size`.

    Returns:
        dict with the same columns, where every value is a list of chunks of
        exactly `block_size` items. The trailing remainder that does not fill
        a whole chunk is dropped. An empty batch yields an empty dict.
    """
    if block_size is None:
        block_size = chunk_size

    columns = list(examples.keys())
    if not columns:
        # Nothing to group; the original indexed the first key and raised IndexError.
        return {}

    # Flatten in linear time; `sum(list_of_lists, [])` re-copies the growing
    # list on every addition and is quadratic in the batch length.
    concatenated_examples = {
        k: [item for sequence in examples[k] for item in sequence]
        for k in columns
    }

    # Length is taken from the first column and rounded down to whole chunks.
    total_length = len(concatenated_examples[columns[0]])
    total_length = (total_length // block_size) * block_size

    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }

    # No "labels" column is created here; labels are produced by the data collator.
    return result

Using custom data configuration default-0facda0eb7ba06ee
Found cached dataset json (/home/xiangru/.cache/huggingface/datasets/json/default-0facda0eb7ba06ee/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab)


  0%|          | 0/3 [00:00<?, ?it/s]

In [6]:
my_tokenizer = BertTokenizerModified.from_pretrained(bert_model_name)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DistilBertTokenizer'. 
The class this function is called from is 'BertTokenizerModified'.


In [7]:
dataset_file = "concat_dataset_v2"


In [10]:
# Full preprocessing pipeline: tokenize -> attach KG features -> drop the raw
# columns -> remove [UNK] positions -> cut into fixed-size chunks.
# Parentheses are used for the chain instead of trailing backslashes.
final_dataset = (
    wikitest2_dataset
    .map(my_tokenize_function, batched=True)
    .map(get_kg_embedding_batched, batched=True, batch_size=100, keep_in_memory=False)
    .remove_columns(['title', 'tokens', 'annotations', 'word_tokens', 'cummulative_word_tokens'])
    .map(filter_text_batched, batched=True, batch_size=100, keep_in_memory=False)
    .map(group_texts, batched=True, batch_size=100, keep_in_memory=False)
)

# Persist the processed splits so later sessions can reload them from disk.
final_dataset.save_to_disk(dataset_file)

In [11]:
## Load the saved tokenized dataset
final_dataset = load_from_disk(dataset_file)

In [12]:
final_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'kg_embedding', 'kg_embedding_mask', 'kg_embedding_mask_qid', 'kg_embedding_mask_index'],
        num_rows: 17528
    })
    valid: Dataset({
        features: ['input_ids', 'attention_mask', 'kg_embedding', 'kg_embedding_mask', 'kg_embedding_mask_qid', 'kg_embedding_mask_index'],
        num_rows: 1798
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'kg_embedding', 'kg_embedding_mask', 'kg_embedding_mask_qid', 'kg_embedding_mask_index'],
        num_rows: 2069
    })
})

In [13]:
## Create Data Collator for Masking

from typing import Any, Callable, Dict, List, NewType, Optional, Tuple, Union
from collections.abc import Mapping

class CustomDataCollator(DataCollatorForWholeWordMask):
    """Collator that masks tokens according to the KG entity mask.

    Instead of sampling whole words, the per-example "kg_embedding_mask" column
    is passed to `torch_mask_tokens` as the mask labels.
    NOTE(review): presumably this masks every entity position, independent of
    `mlm_probability` -- confirm against the installed transformers version.

    The returned batch contains tensors for "input_ids" and "labels"; the four
    KG columns are passed through as plain Python lists. No "attention_mask"
    is returned.
    """

    def torch_call(self, examples: List[Union[List[int], Any, Dict[str, Any]]]) -> Dict[str, Any]:
        """Build one batch from a list of dict-style examples.

        Raises:
            Exception: if the examples are not mappings.
        """
        if not isinstance(examples[0], Mapping):
            raise Exception("Dataset needs to be in dictionary format")

        def column(name):
            # Gather one field across all examples of the batch.
            return [example[name] for example in examples]

        input_ids = column("input_ids")
        kg_embedding_mask = column("kg_embedding_mask")
        kg_embedding = column("kg_embedding")
        kg_embedding_mask_qid = column("kg_embedding_mask_qid")
        kg_embedding_mask_index = column("kg_embedding_mask_index")

        pad_multiple = self.pad_to_multiple_of
        batch_input = _torch_collate_batch(input_ids, self.tokenizer, pad_to_multiple_of=pad_multiple)
        # The entity mask plays the role of the whole-word mask labels.
        batch_mask = _torch_collate_batch(kg_embedding_mask, self.tokenizer, pad_to_multiple_of=pad_multiple)
        inputs, labels = self.torch_mask_tokens(batch_input, batch_mask)

        collated = {"input_ids": inputs, "labels": labels}
        collated["kg_embedding"] = kg_embedding
        collated["kg_embedding_mask"] = kg_embedding_mask
        collated["kg_embedding_mask_qid"] = kg_embedding_mask_qid
        collated["kg_embedding_mask_index"] = kg_embedding_mask_index
        return collated
    

# Integrator


In [16]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class WordItgtor(nn.Module):
    """Fuse language-model and knowledge-graph embeddings position by position.

    Both inputs are first projected to the LM width, concatenated along the
    last axis and passed through a three-layer MLP that maps back to the LM
    width. The module is built on the expectation that the LM embedding is the
    wider of the two (e.g. 768 vs. 200).

    Args:
        embed_dim_lm: size of the last axis of the LM embeddings.
        embed_dim_kg: size of the last axis of the KG embeddings.
    """

    def __init__(self, embed_dim_lm, embed_dim_kg):
        super().__init__()

        self.tt_embed_dim = (embed_dim_lm, embed_dim_kg)

        # Input projections: bring both sources to the LM width.
        self.fc_kg = nn.Linear(embed_dim_kg, embed_dim_lm)
        self.fc_lm = nn.Linear(embed_dim_lm, embed_dim_lm)

        # Fusion MLP: 2*lm -> 4*lm -> 2*lm -> lm.
        fused_dim = 2 * embed_dim_lm
        self.fc1 = nn.Linear(fused_dim, 2 * fused_dim)
        self.fc2 = nn.Linear(2 * fused_dim, fused_dim)
        self.fc3 = nn.Linear(fused_dim, embed_dim_lm)

    def forward(self, x_lm, x_kg, kg_mask=None):
        """Return the fused embedding.

        Args:
            x_lm: LM embeddings whose last axis has size `embed_dim_lm`.
            x_kg: KG embeddings whose last axis has size `embed_dim_kg`; all
                leading axes must match those of `x_lm`.
            kg_mask: accepted for interface compatibility; currently unused.

        Returns:
            Tensor with the leading axes of the inputs and a last axis of size
            `embed_dim_lm`.
        """
        projected_kg = self.fc_kg(x_kg)
        projected_lm = self.fc_lm(x_lm)

        fused = torch.cat((projected_lm, projected_kg), dim=-1)
        for hidden_layer in (self.fc1, self.fc2):
            fused = F.relu(hidden_layer(fused))

        # Final layer is linear (no activation).
        return self.fc3(fused)

# Create Model

In [40]:
from transformers import PreTrainedModel

class BERTModified(nn.Module): #PreTrainedModel
    """Frozen DistilBERT encoder with a trainable KG integrator and two heads.

    Pipeline of `forward`: frozen base model -> `WordItgtor` fusion of the
    hidden states with the KG embeddings -> GELU -> LayerNorm -> two linear
    heads (vocabulary logits and KG-entity logits). The loss is the sum of the
    masked-LM cross-entropy and the KG-entity cross-entropy, each included only
    when its targets are supplied.

    Args:
        bert_model_name: checkpoint name (kept for interface compatibility; unused).
        base_model: encoder whose first output is the hidden states.
        config: the encoder's config; `dim` and `vocab_size` are read from it.

    The number of KG classes is the row count of the module-level `qids_wktxt`.
    """

    def __init__(self, bert_model_name, base_model, config):

        # Plain nn.Module initialisation; a PreTrainedModel subclass would need
        # super().__init__(config) instead.
        super().__init__()

        self.base_model = base_model
        self.config = config
        self.kg_size = qids_wktxt.shape[0]

        self.activation = GELUActivation() # for distilbert
        # NOTE: vocab_transform is not used by forward(); it is kept so that
        # previously saved state dicts still load.
        self.vocab_transform = nn.Linear(self.config.dim, self.config.dim)
        self.vocab_layer_norm = nn.LayerNorm(self.config.dim, eps=1e-12)
        self.vocab_projector = nn.Linear(self.config.dim, self.config.vocab_size)

        self.kg_projector = nn.Linear(self.config.dim, self.kg_size)

        # CrossEntropyLoss ignores targets equal to -100 by default, which is
        # the padding value used for both the MLM labels and the KG indices.
        self.mlm_loss_fct = nn.CrossEntropyLoss()

        ## set to eval
        self.base_model.eval()

        ## freeze model
        for param in self.base_model.parameters():
            param.requires_grad = False

        ## initialization of integrator (KG embeddings are 200-dimensional)
        self.itgt = WordItgtor(self.config.dim, 200)

    def train(self, mode=True):
        """Switch train/eval mode while keeping the frozen base model in eval mode.

        `nn.Module.train()` recurses into all sub-modules, so without this
        override a training loop's `model.train()` call would undo the
        `base_model.eval()` from `__init__` and re-enable dropout in the
        frozen encoder.
        """
        super().train(mode)
        self.base_model.eval()
        return self

    def forward(
        self,
        kg_embedding = None,           ## given
        kg_embedding_mask = None,      ## given
        kg_embedding_mask_qid = None,      ## given
        kg_embedding_mask_index = None,
        input_ids = None,              ## given
        attention_mask = None,         ## given
        head_mask = None,
        inputs_embeds = None,
        labels = None,                 ## given
        output_attentions = None,
        output_hidden_states = None,
        return_dict= None,):
        """Run the model and compute the combined loss.

        Args:
            kg_embedding: per-position KG vectors (nested lists, arrays or a
                tensor) aligned with `input_ids`. When omitted, zeros are used,
                the same value the preprocessing assigns to positions without
                an entity.
            kg_embedding_mask, kg_embedding_mask_qid: accepted because the
                collator emits them; not used here.
            kg_embedding_mask_index: per-position KG class targets (-100 =
                ignore). When omitted, no KG loss is computed.
            labels: masked-LM targets (-100 = ignore). When omitted, no MLM
                loss is computed.
            Remaining arguments are forwarded to the base model unchanged.

        Returns:
            MaskedLMOutput with `loss` (sum of the available loss terms, or
            None when no targets were given), `logits` (vocabulary logits) and
            the base model's `hidden_states` / `attentions` when available.
        """
        base_model_output = self.base_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        ## Get LM embedding
        hidden_states = base_model_output[0]  # (bs, seq_length, dim)

        # Follow the device/dtype of the encoder output instead of assuming
        # 'cuda:0', so the model also runs on CPU or on another GPU.
        device = hidden_states.device

        if kg_embedding is None:
            kg_dim = self.itgt.fc_kg.in_features
            kg_embedding = hidden_states.new_zeros((*hidden_states.shape[:-1], kg_dim))
        else:
            kg_embedding = torch.as_tensor(kg_embedding, dtype=hidden_states.dtype, device=device)

        if kg_embedding_mask_index is not None:
            kg_embedding_mask_index = torch.as_tensor(
                kg_embedding_mask_index, dtype=torch.long, device=device
            )

        # NOTE(review): the KG target of a position is the entity whose
        # embedding is also fed in at that position -- confirm that this is
        # the intended training signal.
        prediction_logits = self.itgt(hidden_states, kg_embedding)
        prediction_logits = self.activation(prediction_logits)  # (bs, seq_length, dim)
        prediction_logits = self.vocab_layer_norm(prediction_logits)  # (bs, seq_length, dim)

        lm_prediction_logits = self.vocab_projector(prediction_logits)  # (bs, seq_length, vocab_size)
        kg_prediction_logits = self.kg_projector(prediction_logits)  # (bs, seq_length, kg_size)

        # Sum whichever loss terms have targets; the original added the two
        # unconditionally and failed when either one was None.
        total_loss = None
        if labels is not None:
            total_loss = self.mlm_loss_fct(
                lm_prediction_logits.reshape(-1, lm_prediction_logits.size(-1)),
                labels.reshape(-1).to(device),
            )
        if kg_embedding_mask_index is not None:
            kg_loss = self.mlm_loss_fct(
                kg_prediction_logits.reshape(-1, kg_prediction_logits.size(-1)),
                kg_embedding_mask_index.reshape(-1),
            )
            total_loss = kg_loss if total_loss is None else total_loss + kg_loss

        return MaskedLMOutput(
            loss=total_loss,
            logits=lm_prediction_logits,
            # The base output is a plain tuple when return_dict=False.
            hidden_states=getattr(base_model_output, "hidden_states", None),
            attentions=getattr(base_model_output, "attentions", None),
        )

In [41]:
base_model = AutoModel.from_pretrained(bert_model_name)
BERTModified_model = BERTModified(bert_model_name = bert_model_name,
                                  base_model = base_model,
                                  config = base_model.config)

loading configuration file config.json from cache at /home/xiangru/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/1c4513b2eedbda136f57676a34eea67aba266e5c/config.json
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.24.0",
  "vocab_size": 30522
}

loading weights file pytorch_model.bin from cache at /home/xiangru/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/1c4513b2eedbda136f57676a34eea67aba266e5c/pytorch_model.bin
Some weights of the model checkpoint at distilbert-base-uncased were 

In [42]:
# BERTModified_model = AutoModel.from_pretrained("Aisha/BERTModified-finetuned-wikitext-test")

In [43]:
data_collator = CustomDataCollator(tokenizer=my_tokenizer, mlm=True, mlm_probability=0.15)
# DataCollatorForWholeWordMask(tokenizer=my_tokenizer, mlm=True, mlm_probability=0.15)

In [44]:
# To test model with smaller sample dataset

train_size = 1000
test_size = 10

downsampled_dataset = final_dataset["train"].train_test_split(train_size=train_size, test_size=test_size, seed=42)
downsampled_dataset

Loading cached split indices for dataset at concat_dataset_v4/train/cache-f6c579ef53c2b0e6.arrow and concat_dataset_v4/train/cache-f218f6d62d6d3c47.arrow


DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'kg_embedding', 'kg_embedding_mask', 'kg_embedding_mask_qid', 'kg_embedding_mask_index'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'kg_embedding', 'kg_embedding_mask', 'kg_embedding_mask_qid', 'kg_embedding_mask_index'],
        num_rows: 10
    })
})

In [45]:
import evaluate

# metrics = evaluate.combine(["accuracy", "precision", "recall", "f1"])
accuracy_metric = evaluate.load("accuracy")
precision_metric = evaluate.load("precision")
recall_metric = evaluate.load("recall")
f1_metric = evaluate.load("f1")

def compute_metrics(eval_preds=None, logits=None, labels=None):
    """Token-level accuracy / micro precision, recall and F1 over the labelled positions.

    Call either with `eval_preds` (a `(logits, labels)` pair, as passed by the
    Trainer) or with `logits` and `labels` given explicitly. Positions whose
    label is -100 are ignored. Predictions are the argmax over the last axis
    of `logits`.

    Returns:
        dict with the keys "precision", "recall", "f1" and "accuracy".
    """
    # We should have either `eval_preds` or both `logits` and `labels`
    if eval_preds:
        logits, labels = eval_preds

    predictions = np.argmax(logits, axis=-1)

    # Collect, already flattened, the (prediction, label) pairs at positions
    # that carry a real label.
    kept_labels = []
    kept_predictions = []
    for predicted_row, label_row in zip(predictions, labels):
        for predicted, gold in zip(predicted_row, label_row):
            if gold != -100:
                kept_labels.append(gold)
                kept_predictions.append(predicted)

    accuracy = accuracy_metric.compute(predictions=kept_predictions, references=kept_labels)
    micro_averaged = {
        name: metric.compute(
            predictions=kept_predictions, references=kept_labels, average="micro"
        )[name]
        for name, metric in (
            ("precision", precision_metric),
            ("recall", recall_metric),
            ("f1", f1_metric),
        )
    }

    return {**micro_averaged, "accuracy": accuracy["accuracy"]}

In [46]:
from transformers import TrainingArguments

# Per-device batch size, used for both training and evaluation below.
batch_size = 4

# Show the training loss with every epoch
# (one epoch = number of training examples // batch size optimisation steps;
# switch to the commented expression when training on the full dataset)
logging_steps = len(downsampled_dataset['train']) // batch_size #len(final_dataset['train']) // batch_size
model_name = "BERTModified-rawbert"
# Local checkpoint directory; with push_to_hub=True it is also the Hub repository name.
output_dir = f"{model_name}-finetuned-wikitext-test"

training_args = TrainingArguments(
    output_dir=output_dir,
    overwrite_output_dir=True,
    # Evaluate and save a checkpoint at the end of every epoch.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-5,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # Requires a prior Hugging Face login (see the imports cell).
    push_to_hub=True,
#     fp16=True,
    logging_steps=logging_steps,
    num_train_epochs=50,
#     load_best_model_at_end=True,
#     metric_for_best_model="loss",#metric_name,
#     greater_is_better = False,
    logging_dir='logs',
    # Metrics are sent to Weights & Biases.
    report_to="wandb",
#     no_cuda=True,
)

PyTorch: setting up devices


metric_for_best_model (str, optional) — Use in conjunction with load_best_model_at_end to specify the metric to use to compare two different models. Must be the name of a metric returned by the evaluation with or without the prefix "eval_". Will default to "loss" if unspecified and load_best_model_at_end=True (to use the evaluation loss).

If you set this value, greater_is_better will default to True. Don’t forget to set it to False if your metric is better when lower.

In [47]:
from transformers import Trainer

trainer = Trainer(
    model=BERTModified_model,
    args=training_args,
    train_dataset=downsampled_dataset["train"], #final_dataset["train"],
    eval_dataset=downsampled_dataset["test"], #final_dataset["valid"],
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

Cloning https://huggingface.co/HideOnBush/BERTModified-rawbert-finetuned-wikitext-test into local empty directory.


In [48]:
# samples = [trainer.eval_dataset[i] for i in range(2)]
# dc = data_collator(samples)
# kg_embedding_mask = [sample['kg_embedding_mask'] for sample in samples]

# for id_list, label_list, mask_list in zip(dc["input_ids"], dc["labels"], kg_embedding_mask):
#     tokens = my_tokenizer.convert_ids_to_tokens(id_list)
#     labels = my_tokenizer.convert_ids_to_tokens(label_list)
#     for token, label, mask in zip(tokens, labels, mask_list):
#         print(token, '\t', label, '\t', mask)
#     break

## Train Model

In [49]:
import math

eval_results = trainer.evaluate()
print(f">>> Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

***** Running Evaluation *****
  Num examples = 10
  Batch size = 4


Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


>>> Perplexity: 30289.83


In [50]:
eval_results

{'eval_loss': 10.318567276000977,
 'eval_precision': 0.0,
 'eval_recall': 0.0,
 'eval_f1': 0.0,
 'eval_accuracy': 0.0,
 'eval_runtime': 0.5117,
 'eval_samples_per_second': 19.541,
 'eval_steps_per_second': 5.862}

# TODO: Trainer.model is not a `PreTrainedModel`, only saving its state dict.

In [51]:
trainer.train()
#trainer.save_model("output/models/BERTModified")

***** Running training *****
  Num examples = 1000
  Num Epochs = 50
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 12500
  Number of trainable parameters = 71331479


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,10.1906,10.064298,0.019231,0.019231,0.019231,0.019231
2,9.6757,9.844508,0.025641,0.025641,0.025641,0.025641
3,9.2617,9.6789,0.038462,0.038462,0.038462,0.038462
4,8.9324,9.583185,0.032051,0.032051,0.032051,0.032051
5,8.6792,9.510508,0.032051,0.032051,0.032051,0.032051
6,8.4673,9.445129,0.032051,0.032051,0.032051,0.032051
7,8.2934,9.430649,0.032051,0.032051,0.032051,0.032051
8,8.1511,9.421928,0.025641,0.025641,0.025641,0.025641
9,8.0243,9.385569,0.032051,0.032051,0.032051,0.032051
10,7.9169,9.412608,0.032051,0.032051,0.032051,0.032051


***** Running Evaluation *****
  Num examples = 10
  Batch size = 4
Saving model checkpoint to BERTModified-rawbert-finetuned-wikitext-test/checkpoint-250
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
***** Running Evaluation *****
  Num examples = 10
  Batch size = 4
Saving model checkpoint to BERTModified-rawbert-finetuned-wikitext-test/checkpoint-500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
***** Running Evaluation *****
  Num examples = 10
  Batch size = 4
Saving model checkpoint to BERTModified-rawbert-finetuned-wikitext-test/checkpoint-750
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
***** Running Evaluation *****
  Num examples = 10
  Batch size = 4
Saving model checkpoint to BERTModified-rawbert-finetuned-wikitext-test/checkpoint-1000
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
***** Running Evaluation *****
  Num examples = 10
  Batch size = 4
Saving model checkpoint to BERT

Saving model checkpoint to BERTModified-rawbert-finetuned-wikitext-test/checkpoint-9250
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
***** Running Evaluation *****
  Num examples = 10
  Batch size = 4
Saving model checkpoint to BERTModified-rawbert-finetuned-wikitext-test/checkpoint-9500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
***** Running Evaluation *****
  Num examples = 10
  Batch size = 4
Saving model checkpoint to BERTModified-rawbert-finetuned-wikitext-test/checkpoint-9750
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
***** Running Evaluation *****
  Num examples = 10
  Batch size = 4
Saving model checkpoint to BERTModified-rawbert-finetuned-wikitext-test/checkpoint-10000
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
***** Running Evaluation *****
  Num examples = 10
  Batch size = 4
Saving model checkpoint to BERTModified-rawbert-finetuned-wikitext-test/checkpoint-10250
Traine

TrainOutput(global_step=12500, training_loss=7.328463642578125, metrics={'train_runtime': 2977.3358, 'train_samples_per_second': 16.794, 'train_steps_per_second': 4.198, 'total_flos': 0.0, 'train_loss': 7.328463642578125, 'epoch': 50.0})

In [32]:
import math

eval_results = trainer.evaluate()
print(f">>> Perplexity: {math.exp(eval_results['eval_loss']):.2f}") #21596 for 1 epoch

eval_results

***** Running Evaluation *****
  Num examples = 10
  Batch size = 4


>>> Perplexity: 161407174.64


{'eval_loss': 18.89944076538086,
 'eval_precision': 0.25,
 'eval_recall': 0.25,
 'eval_f1': 0.25,
 'eval_accuracy': 0.25,
 'eval_runtime': 0.5253,
 'eval_samples_per_second': 19.036,
 'eval_steps_per_second': 5.711,
 'epoch': 50.0}

In [None]:
## Need to login to huggingface to push to hub

trainer.push_to_hub()

## Test model

In [33]:
## errors in GPU (then pc cant find GPU anymore)
predictions = trainer.predict(downsampled_dataset["test"]) #final_dataset["valid"] cpu crashes
print(predictions.predictions.shape, predictions.label_ids.shape)

***** Running Prediction *****
  Num examples = 10
  Batch size = 4


(10, 128, 30522) (10, 128)


In [34]:
preds = np.argmax(predictions.predictions, axis=-1)
preds

array([[ 1011,  2009,  1005, ...,  2139,  2006,  2015],
       [ 2015,  1998,  2015, ...,  1997,  2455,  1998],
       [ 1998,  2009, 29625, ...,  1005,  1038,  2243],
       ...,
       [ 2189,  1997,  2307, ...,  1997,  2032,  1997],
       [ 2399,  2009,  1998, ...,  2032,  1999,  2032],
       [ 2051,  1996,  2274, ...,  1996,  2923,  1998]])

In [35]:
predictions.label_ids

array([[ -100,  -100,  -100, ...,  -100,  -100,  -100],
       [ -100,  -100,  -100, ...,  -100,  -100,  -100],
       [ -100,  -100,  -100, ...,  8540, 12256, 26775],
       ...,
       [ -100,  -100,  -100, ...,  -100,  -100,  -100],
       [15756,  6125,  -100, ...,  -100,  -100,  -100],
       [ -100,  -100,  -100, ...,  -100,  -100,  -100]])

[About micro, macro, weighted precision, recall, f1 for multiclass labels](https://towardsdatascience.com/multi-class-metrics-made-simple-part-ii-the-f1-score-ebe8b2c2ca1)

The following always holds true for the micro-F1 case:

`micro-F1 = micro-precision = micro-recall = accuracy`

In [36]:
compute_metrics(logits = predictions.predictions, labels = predictions.label_ids)
# metric.compute(predictions=preds, references=predictions.label_ids)

{'precision': 0.22435897435897437,
 'recall': 0.22435897435897437,
 'f1': 0.22435897435897437,
 'accuracy': 0.22435897435897437}

## Try from API

In [56]:
from transformers import pipeline

# Initialize MLM pipeline
mlm = pipeline('fill-mask', model=BERTModified_model, tokenizer=my_tokenizer)

# Get mask token
mask = mlm.tokenizer.mask_token

# Get result for particular masked phrase
phrase = f'Wikipedia is a free online {mask}, created and edited by volunteers around the world'

result = mlm(phrase)

# Print result
print(result)

RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu! (when checking argument for argument index in method wrapper__index_select)

In [None]:
for x in result:
    print(f">>> {x['sequence']}")

In [None]:
- Add a layer for KG embedding, add the losses
- new tokenizer for test data