<a href="https://colab.research.google.com/github/protocol-streams/querent-experimental/blob/main/BERT_Relationship_Extraction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m14.7 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m21.9 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m36.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m40.4 MB/s[0m eta [36m0:00:0

In [4]:
import pandas as pd
import torch
from transformers import BertTokenizerFast, BertForSequenceClassification
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
from torch.optim import SGD

# Mock dataset
data = [
    {"sentence": "Bill Gates is the founder of Microsoft", "entity1": "Bill Gates", "entity2": "Microsoft", "relationship": "founder of"},
    {"sentence": "Steve Jobs co-founded Apple", "entity1": "Steve Jobs", "entity2": "Apple", "relationship": "co-founded"},
    # ... add more data points as needed
]

df = pd.DataFrame(data)


In [5]:
tokenizer = BertTokenizerFast.from_pretrained('bert-base-cased')

def encode_sentences(sentence, entity1, entity2):
    return tokenizer(f"[CLS] {entity1} [E1] {sentence} [E2] {entity2} [SEP]", padding='max_length', max_length=512, truncation=True, return_tensors="pt")

df['encoded'] = df.apply(lambda row: encode_sentences(row['sentence'], row['entity1'], row['entity2']), axis=1)


Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [6]:
class BertRelationModel(torch.nn.Module):
    def __init__(self, num_labels):
        super(BertRelationModel, self).__init__()
        self.bert = BertForSequenceClassification.from_pretrained('bert-base-cased', num_labels=num_labels)

    def forward(self, input_id, mask, label):
        output = self.bert(input_ids=input_id, attention_mask=mask, labels=label, return_dict=False)
        return output


In [7]:
class RelationDataset(Dataset):
    def __init__(self, dataframe, label2id):
        self.data = dataframe
        self.label2id = label2id

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data.iloc[idx]
        input_ids = item['encoded']['input_ids'].squeeze(0)
        attention_mask = item['encoded']['attention_mask'].squeeze(0)
        label = torch.tensor(self.label2id[item['relationship']])
        return input_ids, attention_mask, label

label2id = {label: idx for idx, label in enumerate(df['relationship'].unique())}
id2label = {v: k for k, v in label2id.items()}
dataset = RelationDataset(df, label2id)
dataloader = DataLoader(dataset, batch_size=2, shuffle=True)


In [8]:
def train_loop(model, dataloader):
    optimizer = SGD(model.parameters(), lr=5e-3)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    for epoch in range(5):  # 5 epochs for demonstration
        total_loss = 0
        for input_ids, attention_mask, labels in tqdm(dataloader):
            input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

            optimizer.zero_grad()
            loss, logits = model(input_ids, attention_mask, labels)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        print(f"Epoch {epoch + 1}, Loss: {total_loss / len(dataloader)}")

model = BertRelationModel(len(label2id))
train_loop(model, dataloader)


Downloading model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 1/1 [00:12<00:00, 12.01s/it]


Epoch 1, Loss: 0.7688916325569153


100%|██████████| 1/1 [00:12<00:00, 12.02s/it]


Epoch 2, Loss: 0.644312858581543


100%|██████████| 1/1 [00:12<00:00, 12.49s/it]


Epoch 3, Loss: 0.48022162914276123


100%|██████████| 1/1 [00:11<00:00, 11.98s/it]


Epoch 4, Loss: 0.2663092017173767


100%|██████████| 1/1 [00:12<00:00, 12.64s/it]

Epoch 5, Loss: 0.5333778858184814





In [13]:
def evaluate(model, sentence, entity1, entity2):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    encoded = encode_sentences(sentence, entity1, entity2)
    input_ids = encoded['input_ids'].to(device)
    attention_mask = encoded['attention_mask'].to(device)

    with torch.no_grad():
        logits = model(input_ids, attention_mask, None)
        if isinstance(logits, tuple):  # Check if the output is a tuple
            logits = logits[0]
    prediction = logits.argmax(dim=1).item()
    return id2label[prediction]





# Test
print(evaluate(model, "Elon Musk is the CEO of Tesla", "Elon Musk", "Tesla"))


co-founded
