# LLM for information extraction using semantics

In [1]:
! pip install ollama
! pip install langchain
! pip install langchain-community



In [4]:
# LangChain supports many other chat models. Here, we're using Ollama
from langchain_community.chat_models import ChatOllama
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate

# supports many more optional parameters. Hover on your `ChatOllama(...)`
# class to view the latest available supported parameters
llm = ChatOllama(model="llama3:8b",temperature=0)

prompt = ChatPromptTemplate.from_template( """Given a text as an input, you only have to return the different entities mentioned with its  
    Wikidata code and the relationships that you are able to detect between the entities mentioned. You only have to act as an output of a system, you cannot explain
    analyze and introduce the results that you are going to generate.
                                          
    Example :
        Text: 'Apple is a technology company. It was founded by Steve Jobs and Steve Wozniak. The headquarters is in Cupertino, California.'                               

        Output:'
            Entities:                                   
            Apple(Q312)
            Steve Jobs(Q19837) 
            Steve Wozniak(Q483382)
            Cupertino,California(Q189471)

            Relationships:
            Apple(Q312)-founded by->Steve Jobs(Q19837)
            Apple(Q312)-founded by->Steve Wozniak(Q483382)
            Apple(Q312)-headquarters location->Cupertino,California(Q189471)'
                                                                                                                                                                                                                           
    The text that you have to analize is the following:                                      
 {text}""")

input_text = """
Marie Curie was a pioneering physicist and chemist. She discovered radioactivity and won Nobel Prizes in both Physics and Chemistry.
"""
chain = prompt | llm | StrOutputParser()
response = chain.invoke({"text": input_text})
print(response)

In [2]:
! pip install transformers

Collecting transformers
  Downloading transformers-4.42.4-py3-none-any.whl.metadata (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.23.2 (from transformers)
  Downloading huggingface_hub-0.23.5-py3-none-any.whl.metadata (12 kB)
Collecting regex!=2019.12.17 (from transformers)
  Downloading regex-2024.5.15-cp311-cp311-macosx_11_0_arm64.whl.metadata (40 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.9/40.9 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
Collecting safetensors>=0.4.1 (from transformers)
  Downloading safetensors-0.4.3-cp311-cp311-macosx_11_0_arm64.whl.metadata (3.8 kB)
Collecting tokenizers<0.20,>=0.19 (from transformers)
  Downloading tokenizers-0.19.1-cp311-cp311-macosx_11_0_arm64.whl.metadata (6.7 kB)
Downloading transformers-4.42.4-py3-none-any.whl (9.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.3/9.3 MB[0m 

In [19]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("Babelscape/rebel-large")
model = AutoModelForSeq2SeqLM.from_pretrained("Babelscape/rebel-large")

text = "Apple is a technology company. It was founded by Steve Jobs and Steve Wozniak. The headquarters is in Cupertino, California."

In [21]:
model.generation_config

GenerationConfig {
  "bos_token_id": 0,
  "decoder_start_token_id": 0,
  "eos_token_id": 2,
  "forced_eos_token_id": 2,
  "max_length": 200,
  "num_beams": 4,
  "pad_token_id": 1
}

In [18]:

gen_kwargs = {
    "max_length": 1024,
    "length_penalty": 1,
    "num_beams": 3,
}

model_inputs = tokenizer(text, max_length=1024, padding=True, truncation=True, return_tensors = 'pt', add_special_tokens=False)
output = f"""<s><triplet>"""
model_outputs = tokenizer(output, max_length=1024, padding=True, truncation=True, return_tensors = 'pt', add_special_tokens=False)
generated_tokens = model.generate(
                    model_inputs["input_ids"].to(model.device),
                    #decoder_input_ids=model_outputs["input_ids"].to(model.device),
                    attention_mask=model_inputs["attention_mask"].to(model.device),
                    bad_words_ids=tokenizer(["<triplet>"], add_special_tokens=False).input_ids, # don't generate <triplet>
                    **gen_kwargs,
                )

decoded_pred = tokenizer.batch_decode(generated_tokens, skip_special_tokens=False)
print(decoded_pred)


['<s><subj> Steve Jobs <subj> Cupertino, California <obj> residence <subj> Steve Wozniak <obj> influenced by</s>']


In [23]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

def extract_triplets(text):
    triplets = []
    relation, subject, relation, object_ = '', '', '', ''
    text = text.strip()
    current = 'x'
    for token in text.replace("<s>", "").replace("<pad>", "").replace("</s>", "").split():
        if token == "<triplet>":
            current = 't'
            if relation != '':
                triplets.append({'head': subject.strip(), 'type': relation.strip(),'tail': object_.strip()})
                relation = ''
            subject = ''
        elif token == "<subj>":
            current = 's'
            if relation != '':
                triplets.append({'head': subject.strip(), 'type': relation.strip(),'tail': object_.strip()})
            object_ = ''
        elif token == "<obj>":
            current = 'o'
            relation = ''
        else:
            if current == 't':
                subject += ' ' + token
            elif current == 's':
                object_ += ' ' + token
            elif current == 'o':
                relation += ' ' + token
    if subject != '' and relation != '' and object_ != '':
        triplets.append({'head': subject.strip(), 'type': relation.strip(),'tail': object_.strip()})
    return triplets


In [25]:
from transformers import pipeline
triplet_extractor = pipeline('text2text-generation', model='Babelscape/rebel-large', tokenizer='Babelscape/rebel-large')
text = "Apple is a technology company. It was founded by Steve Jobs and Steve Wozniak. The headquarters is in Cupertino, California."
extracted_text = triplet_extractor.tokenizer.batch_decode([triplet_extractor(text, return_tensors=True, return_text=False)[0]["generated_token_ids"]])
print(extracted_text)

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


['<s><triplet> Apple <subj> Steve Jobs <obj> founded by <subj> Cupertino, California <obj> headquarters location <triplet> Steve Jobs <subj> Apple <obj> employer <triplet> Steve Wozniak <subj> Apple <obj> employer</s>']


In [28]:
print(extract_triplets(extracted_text[0]))

[{'head': 'Apple', 'type': 'founded by', 'tail': 'Steve Jobs'}, {'head': 'Apple', 'type': 'headquarters location', 'tail': 'Cupertino, California'}, {'head': 'Steve Jobs', 'type': 'employer', 'tail': 'Apple'}, {'head': 'Steve Wozniak', 'type': 'employer', 'tail': 'Apple'}]
