In [1]:
from transformers import AutoModelForMaskedLM, AutoTokenizer
# Load the masked-LM checkpoint and its tokenizer from the local 'data/bert'
# directory. Each is loaded exactly once.
model = AutoModelForMaskedLM.from_pretrained('data/bert')
tokenizer = AutoTokenizer.from_pretrained('data/bert')

BertForMaskedLM has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.


In [2]:
from datasets import Dataset
# Read the examples from the local JSON file into a Dataset.
ds = Dataset.from_json('data/data.json')
ds  # display the dataset summary

Generating train split: 0 examples [00:00, ? examples/s]

Dataset({
    features: ['language', 'fileName', 'prefix', 'suffix', 'completion', 'expected', 'parameters'],
    num_rows: 15472
})

In [18]:
# sort by characters, to make batching more efficient
def count_chars(example):
    """Return the combined length of an example's prefix and suffix.

    Args:
        example: Mapping with 'prefix' and 'suffix' entries that support ``len``.

    Returns:
        A one-entry dict ``{'chars': <len(prefix) + len(suffix)>}``.
    """
    prefix_len = len(example['prefix'])
    suffix_len = len(example['suffix'])
    return {'chars': prefix_len + suffix_len}
# Add the 'chars' column (prefix + suffix length) to every example using
# 8 worker processes, then order the dataset by that column.
dsSorted = (
    ds
    .map(count_chars, batched=False, num_proc=8)
    .sort('chars')
)
dsSorted

Dataset({
    features: ['language', 'fileName', 'prefix', 'suffix', 'completion', 'expected', 'parameters', 'chars'],
    num_rows: 15472
})

In [19]:
import torch
import time
import torch.nn.functional as F
import math

# Look up the vocabulary ids of the mask and padding tokens in a single call.
mask_id, pad_id = tokenizer.convert_tokens_to_ids(["<|mask|>", "<|pad|>"])

def padToLength(list, length, padding):
    """Return a copy of ``list`` that is cut or right-padded to ``length`` items.

    Args:
        list: Input list (the name shadows the builtin; kept so existing
            callers are unaffected).
        length: Target number of items.
        padding: Value appended until the result has ``length`` items.

    Returns:
        A new list; the input is not modified.
    """
    head = list[:length]
    missing = length - len(head)
    return head + [padding] * missing

# Integer id assigned to each language tag.
languageIds = {'cs': 0, 'ts': 1, 'css': 2}

def generate_response(examples, num_masks=5):
    """Predict the tokens between each prefix and suffix with the masked LM.

    Written to be used with ``Dataset.map(..., batched=True)``. For every
    example, ``num_masks`` mask tokens are placed between the tokenized prefix
    and suffix, the model is run once on the padded batch, and the highest
    scoring token at each masked position is decoded.

    Args:
        examples: Batch as a dict of columns. The columns 'prefix', 'suffix',
            'language' and 'expected' are read.
        num_masks: Number of mask tokens inserted per example, which is also
            the number of predicted tokens decoded. Defaults to 5.

    Returns:
        A dict with two lists, one entry per example:
        'response' is the decoded prediction, and 'match' is True when the
        response starts with the example's 'expected' string.

    Raises:
        KeyError: If an example's language is not a key of ``languageIds``.
    """
    # An empty batch has nothing to predict, and max() below would fail on it.
    if len(examples["prefix"]) == 0:
        return {'response': [], 'match': []}

    # Tokenize all prefixes and suffixes of the batch. No special tokens are
    # added, and split_special_tokens=True is passed so special-token text in
    # the input is tokenized like ordinary text.
    prefix_ids = tokenizer(examples["prefix"], add_special_tokens=False, split_special_tokens=True)["input_ids"]
    suffix_ids = tokenizer(examples["suffix"], add_special_tokens=False, split_special_tokens=True)["input_ids"]

    # prefix + masks + suffix for each example in the batch
    prompt_ids = [
        prefix + [mask_id] * num_masks + suffix
        for prefix, suffix in zip(prefix_ids, suffix_ids)
    ]
    attention_mask = [[1] * len(ids) for ids in prompt_ids]

    # Pad every row to the longest prompt, rounded up to a multiple of 8.
    max_length = 8 * math.ceil(max(len(ids) for ids in prompt_ids) / 8)
    p_prompt_ids = [padToLength(ids, max_length, pad_id) for ids in prompt_ids]
    p_attention_mask = [padToLength(mask, max_length, 0) for mask in attention_mask]
    # The language id is passed to the model through token_type_ids.
    p_token_type = [[languageIds[lang]] * max_length for lang in examples["language"]]

    # Build the inputs on the same device as the model's weights.
    device = next(model.parameters()).device
    # Inference only: no_grad avoids keeping autograd state for the forward pass.
    with torch.no_grad():
        logits = model(
            input_ids=torch.tensor(p_prompt_ids, dtype=torch.int64, device=device),
            attention_mask=torch.tensor(p_attention_mask, dtype=torch.int64, device=device),
            token_type_ids=torch.tensor(p_token_type, dtype=torch.int64, device=device),
        ).logits
    # softmax is monotonic, so argmax over the raw logits selects the same token.
    predicted = logits.argmax(-1).tolist()

    # Keep only the predictions at the masked positions, which start right
    # after the prefix tokens.
    outputs = [
        row[len(prefix):len(prefix) + num_masks]
        for row, prefix in zip(predicted, prefix_ids)
    ]
    response = [
        tokenizer.decode(ids, clean_up_tokenization_spaces=True, skip_special_tokens=True)
        for ids in outputs
    ]
    return {
        'response': response,
        'match': [r.startswith(exp) for r, exp in zip(response, examples['expected'])]
    }

# Run generate_response over the sorted dataset in batches of 32 examples.
ds2 = dsSorted.map(
    generate_response,
    batched=True,
    batch_size=32,
)

Map:   0%|          | 0/15472 [00:00<?, ? examples/s]

In [25]:
import json
# Print a small random sample of predictions for manual inspection and save
# each sampled example's parameters together with its match flag.
# NOTE(review): only the sampled examples are written to the results file,
# not every row of ds2 - confirm this is intended.
SAMPLE_SIZE = 20
SAMPLE_SEED = 0  # fixed seed so the printed sample and the saved file are reproducible

rows = []
for example in ds2.shuffle(seed=SAMPLE_SEED).take(SAMPLE_SIZE):
    print(f"PREFIX: {example['prefix']}")
    print(f"EXPECTED: {example['expected']}")
    print(f"RESPONSE: {example['response']}")
    print(f"MATCH: {example['match']}")
    print()
    # Copy so that adding 'match' does not modify the example's own dict.
    row = example['parameters'].copy()
    row['match'] = example['match']
    rows.append(row)

with open('data/results2.json', 'w') as file:
    json.dump(rows, file, indent=2)

PREFIX: 1) / Time.Hour.ToBase(1));

    public double BaseUnitValue { get; }

    public VolumeFlow(double baseUnitValue)
    {
        this.BaseUnitValue = baseUnitValue;
    }

    public static VolumeFlow 
EXPECTED: FromBaseUn
RESPOSE: Get get
MATCH: False

PREFIX: ForMassFlowAndTemperature(ocl.Id, ocl.DisplayName(args.NumberAssignment), calculation, null, state => massFlow * (ocl.GetDirection() == Direction.Out ? 1 : -1), state => temperature.Value, state => ma
EXPECTED: ssFractions, 
RESPOSE: x..
MATCH: False

PREFIX: eInfo };

        // add starting element
        if (endpoint.Port != null)
        {
            this.AddMainFlowElement(endpoint.Port.Element, endpoint.Port, flow);
        }

        if (endpoint.
EXPECTED: Ocl != 
RESPOSE: start.
MATCH: False

PREFIX: is.SecondaryInPorts.Concat(this.SecondaryOutPorts))
        {
            port.Delete();
        }

        this.SecondaryInPorts.Clear();
        this.SecondaryOutPorts.Clear();
        this.Initiali
EXPECTED: zeP