In [None]:
!pip install rouge_score
!pip install evaluate
!pip install datasets
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24954 sha256=3b8d89599f1cc575b3390504973456e936e022b4d24dfec325e7d3f2b37e578c
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting evaluate
  Downloading evaluate-0.4.0-py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 kB[0m [31m4.7 MB/s[0m eta [3

### Fine-tuning code

In [None]:
import json
import pandas as pd
import warnings
warnings.filterwarnings("ignore")
from sklearn.preprocessing import MultiLabelBinarizer
from datasets import Dataset

# Load the fine-tuning examples. The file is parsed as JSON and iterated as a
# sequence of records, each providing 'id', 'title', 'context' and 'triplets'.
with open('rebel_data2.txt', 'r') as f:
    json_data = json.load(f)

# Build the DataFrame in one pass from a list of records. Growing it with
# DataFrame.append inside a loop copies the whole frame for every row
# (quadratic), and DataFrame.append no longer exists in pandas >= 2.0.
columns = ['id', 'title', 'context', 'triplets']
records = [{column: item[column] for column in columns} for item in json_data]
df = pd.DataFrame(records, columns=columns)

# Wrap the frame as a Hugging Face Dataset for tokenization and training.
dataset = Dataset.from_pandas(df)

In [None]:
# Display the dataset summary (feature names and number of rows).
dataset

Dataset({
    features: ['id', 'title', 'context', 'triplets'],
    num_rows: 125
})

In [None]:
def freeze_model(model, num_layers=1):
  """
  Freeze all parameters of ``model`` except the last ``num_layers`` ones.

  "Layer" here means one entry of ``model.parameters()`` (a single parameter
  tensor), not a whole module. The intent is to limit catastrophic forgetting
  by training only the tail of the network.

  Note from the original author: this did not seem to work well; fine-tuning
  the entire network gave better results.

  Args:
    model: any object exposing ``parameters()`` that yields objects with a
      ``requires_grad`` attribute.
    num_layers: number of trailing parameter tensors to leave trainable.
      Values <= 0 freeze everything; values larger than the number of
      parameters leave everything trainable.

  Returns:
    The same ``model`` object, modified in place.
  """
  # Materialise once: the original re-listed every parameter on each iteration.
  params = list(model.parameters())
  first_trainable = len(params) - num_layers

  any_trainable = False
  for index, param in enumerate(params):
    # `>=` (not `==`) so that *all* of the last num_layers tensors stay
    # trainable, rather than only the one at index len(params) - num_layers.
    trainable = index >= first_trainable
    param.requires_grad = trainable
    any_trainable = any_trainable or trainable

  if any_trainable:
    print("last layer unfreezed")
  return model

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

# Hold out 10% of the examples for evaluation. A fixed seed makes the split
# reproducible across kernel restarts.
dataset = dataset.train_test_split(test_size=0.1, seed=42)

# Start from the pretrained REBEL checkpoint.
tokenizer = AutoTokenizer.from_pretrained("Babelscape/rebel-large")
model = AutoModelForSeq2SeqLM.from_pretrained("Babelscape/rebel-large")

# Define the training arguments.
# The run is very short: ~112 training rows at batch size 8 is about 14
# optimizer steps per epoch, ~42 in total (assuming a single device and no
# gradient accumulation). Step-based schedules of 100/500 steps therefore never
# fire: no loss is logged, no evaluation runs, and a 500-step warmup keeps the
# learning rate far below the configured value. Evaluation and logging are
# scheduled per epoch instead, and warmup is expressed as a fraction of the run.
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_steps=500,
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=1e-4,
    warmup_ratio=0.1,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_strategy="epoch",
)

def preprocess_function(example):
    """Tokenize contexts and target triplet strings for seq2seq training.

    Reads ``example["context"]`` as the encoder input and
    ``example["triplets"]`` as the target text, using the module-level
    ``tokenizer``. Works for a single example or for a batch (as passed by
    ``Dataset.map(..., batched=True)``).

    Returns a dict with ``input_ids``, ``attention_mask`` and ``labels``.

    Two deliberate differences from a naive encoding:

    * ``decoder_input_ids`` / ``decoder_attention_mask`` are NOT returned.
      Feeding the unshifted labels as decoder inputs lets the decoder see the
      token it is asked to predict at every position. When only ``labels`` are
      given, the BART model builds the decoder inputs itself by shifting the
      labels one position to the right.
    * Padding positions in ``labels`` are replaced by -100 so the loss ignores
      them.
    """
    model_inputs = tokenizer(example["context"], padding=True, truncation=True)
    model_targets = tokenizer(example["triplets"], padding=True, truncation=True)

    pad_id = tokenizer.pad_token_id

    def _mask_padding(token_ids):
        # -100 is the index ignored by the cross-entropy loss.
        return [tok if tok != pad_id else -100 for tok in token_ids]

    target_ids = model_targets.input_ids
    if target_ids and isinstance(target_ids[0], (list, tuple)):
        # Batched call: one list of token ids per example.
        labels = [_mask_padding(sequence) for sequence in target_ids]
    else:
        # Single example: a flat list of token ids.
        labels = _mask_padding(target_ids)

    return {
        "input_ids": model_inputs.input_ids,
        "attention_mask": model_inputs.attention_mask,
        "labels": labels,
    }

# Tokenize every split in batches with preprocess_function.
encoded_dataset = dataset.map(preprocess_function, batched=True)

# Pull the training and held-out splits out of the encoded DatasetDict.
train_dataset, eval_dataset = encoded_dataset["train"], encoded_dataset["test"]

# Bundle model, arguments, data and tokenizer into a Seq2SeqTrainer.
trainer = Seq2SeqTrainer(
    tokenizer=tokenizer,
    eval_dataset=eval_dataset,
    train_dataset=train_dataset,
    args=training_args,
    model=model,
)

# Run fine-tuning.
trainer.train()

# Persist the model and the tokenizer files in the same directory.
# The tokenizer call stays last so the cell still displays the saved file paths.
model.save_pretrained("saved_model")
tokenizer.save_pretrained("saved_model")

Map:   0%|          | 0/112 [00:00<?, ? examples/s]

Map:   0%|          | 0/13 [00:00<?, ? examples/s]

You're using a BartTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss


('saved_model/tokenizer_config.json',
 'saved_model/special_tokens_map.json',
 'saved_model/vocab.json',
 'saved_model/merges.txt',
 'saved_model/added_tokens.json',
 'saved_model/tokenizer.json')

In [None]:
def extract_triplets(text):
    """Parse a linearised REBEL output string into a list of triplet dicts.

    The text is expected to look like
    ``<triplet> HEAD <subj> TAIL <obj> RELATION ...``. The markers ``<s>``,
    ``<pad>`` and ``</s>`` are removed first, then the remainder is split on
    whitespace and scanned token by token.

    Returns a list of ``{'head': ..., 'type': ..., 'tail': ...}`` dicts with
    surrounding whitespace stripped from each value.
    """
    def _triplet(head_text, relation_text, tail_text):
        return {'head': head_text.strip(), 'type': relation_text.strip(),'tail': tail_text.strip()}

    found = []
    head = rel = tail = ''
    mode = 'x'  # which field plain tokens are currently appended to

    cleaned = text.strip()
    for marker in ("<s>", "<pad>", "</s>"):
        cleaned = cleaned.replace(marker, "")

    for piece in cleaned.split():
        if piece == "<triplet>":
            # A new head starts; flush the triplet in progress, if any.
            mode = 't'
            if rel:
                found.append(_triplet(head, rel, tail))
                rel = ''
            head = ''
        elif piece == "<subj>":
            # A new tail for the current head; flush the previous pair, if any.
            mode = 's'
            if rel:
                found.append(_triplet(head, rel, tail))
            tail = ''
        elif piece == "<obj>":
            # The relation label follows.
            mode = 'o'
            rel = ''
        elif mode == 't':
            head += ' ' + piece
        elif mode == 's':
            tail += ' ' + piece
        elif mode == 'o':
            rel += ' ' + piece

    # Emit the final triplet only when all three parts were collected.
    if head and rel and tail:
        found.append(_triplet(head, rel, tail))
    return found

In [None]:
# Text to extract triplets from
text = "Tesla, the electric vehicle company, has announced plans to reduce its carbon footprint by implementing sustainable practices in its operations."

# The model may still be in training mode after trainer.train(); switch to
# eval mode so dropout is disabled and generation is deterministic.
model.eval()

# Tokenize the text
model_inputs = tokenizer(text, max_length=256, padding=True, truncation=True, return_tensors = 'pt')

# Beam search settings: return all 10 beams so alternative extractions can be compared.
gen_kwargs = {
    "max_length": 256,
    "length_penalty": 0,
    "num_beams": 10,
    "num_return_sequences": 10,
}

# Generate
generated_tokens = model.generate(
    model_inputs["input_ids"].to(model.device),
    attention_mask=model_inputs["attention_mask"].to(model.device),
    **gen_kwargs,
)

# Decode to text, keeping special tokens: extract_triplets relies on the
# <triplet>/<subj>/<obj> markers.
decoded_preds = tokenizer.batch_decode(generated_tokens, skip_special_tokens=False)

# Extract triplets
for idx, sentence in enumerate(decoded_preds):
    print(f'Prediction triplets sentence {idx}')
    print(extract_triplets(sentence))

Prediction triplets sentence 0
[{'head': 'Tesla', 'type': 'product or material produced', 'tail': 'electric vehicle'}]
Prediction triplets sentence 1
[{'head': 'Tesla', 'type': 'industry', 'tail': 'electric vehicle company'}]
Prediction triplets sentence 2
[{'head': 'Tesla', 'type': 'product or material produced', 'tail': 'electric vehicle company'}]
Prediction triplets sentence 3
[{'head': 'Tesla', 'type': 'product or material produced', 'tail': 'electric'}]
Prediction triplets sentence 4
[{'head': 'Tesla', 'type': 'instance of', 'tail': 'electric vehicle company'}]
Prediction triplets sentence 5
[{'head': 'Elon Musk', 'type': 'employer', 'tail': 'Tesla'}]
Prediction triplets sentence 6
[{'head': 'Tesla', 'type': 'product or material produced', 'tail': 'electric vehicles'}]
Prediction triplets sentence 7
[{'head': 'Tesla', 'type': 'product or material produced', 'tail': 'electric vehicle'}, {'head': 'electric vehicle', 'type': 'manufacturer', 'tail': 'Tesla'}]
Prediction triplets sent

In [None]:
# Show the raw decoded sequences, special tokens included, as fed to extract_triplets.
decoded_preds

['<s><triplet> Tesla <subj> electric vehicle <obj> product or material produced</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>',
 '<s><triplet> Tesla <subj> electric vehicle company <obj> industry</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>',
 '<s><triplet> Tesla <subj> electric vehicle company <obj> product or material produced</s><pad><pad><pad><pad><pad><pad><pad><pad><pad>',
 '<s><triplet> Tesla <subj> electric <obj> product or material produced</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>',
 '<s><triplet> Tesla <subj> electric vehicle company <obj> instance of</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>',
 '<s><triplet> Elon Musk <subj> Tesla <obj> employer</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>',
 '<s><triplet> Tesla <subj> electric vehicles <obj> product or material produced</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>',
 '<s><triplet> Tesla <subj> electric vehicle <obj> product or mat