Install the necessary libraries - Tested with Python 3.10

In [1]:
!pip install numpy==1.24.2 torch==2.0.0 anytree transformers==4.27.4 safetensors sentencepiece

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Collecting numpy==1.24.2
  Downloading numpy-1.24.2-cp310-cp310-macosx_10_9_x86_64.whl.metadata (5.6 kB)
Collecting torch==2.0.0
  Downloading torch-2.0.0-cp310-none-macosx_10_9_x86_64.whl.metadata (23 kB)
Collecting anytree
  Downloading anytree-2.12.1-py3-none-any.whl.metadata (8.1 kB)
Collecting transformers==4.27.4
  Downloading transformers-4.27.4-py3-none-any.whl.metadata (106 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m106.7/106.7 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting safetensors
  Downloading safetensors-0.4.3-cp310-cp310-macosx_10_12_x86_64.whl.metadata (3.8 kB)
Collecting filelock (from torch==2.0.0)
  Downloading filelock-3.15.4-py3-none-any.whl.metadata (2.9 kB)
Collecting sympy (from torch==2.0.0)
  Downloading sympy-1.13.0-py3-none-any.whl.metadata (12 kB)
Collecting networkx (from torch==2.0.0)
  Downloading networkx-3.3-py3-none-any.

Import the necessary libraries

In [1]:
from transformers import T5ForConditionalGeneration, T5Tokenizer
import torch
from lrml import *

  from .autonotebook import tqdm as notebook_tqdm


Load the model from Hugging Face

In [2]:
# Hugging Face Hub id of the checkpoint to load
model_path = 'sffc348/t5-base-lrml-autocomplete'
# Pass the variable instead of repeating the literal so the id is defined in one place
model = T5ForConditionalGeneration.from_pretrained(model_path, use_safetensors=True)
# Switch to evaluation mode; as the cell's last expression this also displays the module tree
model.eval()



T5ForConditionalGeneration(
  (shared): Embedding(32128, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=768, out_features=3072, bias=False)
              (wo): Linear(in_features=3072, out_features=768, bias=False)
              (dropout): Dro

In [3]:
# Checkpoint name handed to T5Tokenizer.from_pretrained inside load_tokenizer()
tokenizer_name = 't5-base'

def load_tokenizer():
    """Build the tokenizer named by ``tokenizer_name`` with a ``<sep>`` token.

    ``<sep>`` is added as a special token and then registered as the
    tokenizer's separator, both by token string and by id.

    Returns:
        The configured ``T5Tokenizer`` instance.
    """
    separator = '<sep>'
    tok = T5Tokenizer.from_pretrained(tokenizer_name)
    tok.add_tokens([separator], special_tokens=True)
    tok.sep_token = separator
    # Look the id up only after the token has been added to the vocabulary
    tok.sep_token_id = tok.convert_tokens_to_ids(tok.sep_token)
    return tok

# Shared tokenizer instance; predict() reads this module-level name
tokenizer = load_tokenizer()

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [4]:
# Preprocessing of text
def normalise_text(text):
    """Strip surrounding whitespace and make sure the text ends with a full stop.

    Args:
        text: Input string.

    Returns:
        The stripped string, with ``'.'`` appended unless it is empty or
        already ends with ``'.'``.
    """
    stripped = text.strip()
    # Nothing to append for empty input or text that is already terminated
    if not stripped or stripped.endswith('.'):
        return stripped
    return stripped + '.'

# Generate a prediction based on the input text and the LRML
def predict(text, lrml, num_beams=5, num_return_sequences=5,
            no_repeat_ngram_size=8, max_length=256, early_stopping=True):
    """Generate candidate LRML completions for ``text`` with the loaded model.

    The prompt is ``'translate English to LegalRuleML: '`` followed by the
    normalised ``text``. When ``lrml`` is not blank it is appended after a
    ``'<sep>'`` marker; otherwise nothing is appended.

    Args:
        text: Sentence to translate; passed through ``normalise_text``.
        lrml: Partial LRML to append to the prompt. An empty or
            whitespace-only string means no LRML is appended.
        num_beams: Forwarded to ``model.generate``.
        num_return_sequences: Forwarded to ``model.generate``; sets how many
            sequences are returned.
        no_repeat_ngram_size: Forwarded to ``model.generate``.
        max_length: Forwarded to ``model.generate``.
        early_stopping: Forwarded to ``model.generate``.

    Returns:
        A list of decoded sequences (special tokens skipped), each passed
        through ``post_process``.
    """
    # Echo the inputs so the cell output records what was requested
    print(text, lrml)
    # Only add the separator when there is some LRML to complete
    suffix = '<sep>' + lrml if lrml.strip() != '' else ''
    tokens = tokenizer('translate English to LegalRuleML: ' +
                       normalise_text(text) + suffix, return_tensors='pt')
    # Inference only: no gradients are needed
    with torch.no_grad():
        generation = model.generate(inputs=tokens.input_ids, max_length=max_length, num_beams=num_beams,
                                    num_return_sequences=num_return_sequences, early_stopping=early_stopping,
                                    no_repeat_ngram_size=no_repeat_ngram_size)

    return [post_process(i) for i in tokenizer.batch_decode(generation, skip_special_tokens=True)]

# Functions for postprocessing
def post_process(lrml):
    """Tidy one decoded model output string.

    Steps, in order: strip surrounding whitespace, turn square and curly
    brackets into round ones, remove a full stop that directly follows a
    closing parenthesis, then apply ``fix_then``,
    ``revert_tree_based_spacing`` and ``add_space_after_comma``.

    Args:
        lrml: Decoded prediction string.

    Returns:
        The cleaned-up string.
    """
    round_brackets = str.maketrans('[]{}', '()()')
    cleaned = lrml.strip().translate(round_brackets)
    # Must run after the bracket mapping so '].' and '}.' are caught as well
    cleaned = cleaned.replace(').', ')')
    cleaned = fix_then(cleaned, ' ')
    cleaned = revert_tree_based_spacing(cleaned)
    return add_space_after_comma(cleaned)

def clean_pred(lrml):
    """Run a prediction through the ``reverse_*`` helpers in a fixed order.

    The space after each comma is removed first (``', '`` becomes ``','``);
    the result is then passed through each helper in turn, with an empty
    ``prefix`` wherever a helper takes one.

    Args:
        lrml: Prediction string, e.g. one element of ``predict``'s result.

    Returns:
        The output of the final helper, ``reverse_units``.
    """
    prefix = ''
    # (helper, keyword arguments) pairs, applied top to bottom
    pipeline = (
        (reverse_loop, {'prefix': prefix}),
        (reverse_resolve_expressions, {'fix_errors': True, 'prefix': prefix}),
        (reverse_combine_rel_and_var, {'prefix': prefix}),
        (reverse_move_and_or_to_data_node, {}),
        (reverse_units, {'prefix': prefix}),
    )

    result = lrml.replace(', ', ',')
    for step, kwargs in pipeline:
        result = step(result, **kwargs)
    return result

Run the prediction. The LRML input decides the scope of the prediction.

In [5]:
# Sentence to translate
text = 'The floor waste shall have a minimum diameter of 40 mm.'
# Partial LRML handed to predict(); swap in one of the commented-out alternatives to try another
# lrml = ''
lrml = 'if('
# lrml = 'if(exist(floorWaste)), then(obligation('
predictions = predict(text, lrml)
# Bare last expression so the notebook displays the list of candidates
predictions

The floor waste shall have a minimum diameter of 40 mm. if(


['exist(floorWaste)',
 'and(has(floorWaste, diameter), greaterThanEqual(diameter, 40 mm))',
 'and(greaterThanEqual(floorWaste.diameter, 40 mm))',
 'and(has(floorWaste, diameter), greaterThanEqual(diameter, 40 mm))',
 'and(has(floorWaste, diameter), greaterThanEqual(diameter, 40 mm))']

Post-process the output

In [6]:
# Run the first candidate through clean_pred and display the result
clean_pred(predictions[0])

'expression(function(exist),atom(variable(floorWaste)))'