In [2]:
from transformers import pipeline

triplet_extractor = pipeline('text2text-generation', model='Babelscape/rebel-large', tokenizer='Babelscape/rebel-large', device='cpu')

Downloading model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

'(ReadTimeoutError("HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: 113632d0-c238-4c0e-932e-16075689ecc0)')' thrown while requesting HEAD https://huggingface.co/Babelscape/rebel-large/resolve/main/config.json


Downloading model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/1.23k [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading (…)in/added_tokens.json:   0%|          | 0.00/123 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/344 [00:00<?, ?B/s]

In [3]:
#function to extract kg triplets from data
#code from Babelscape/rebel-large model card
#https://huggingface.co/Babelscape/rebel-large

def extract_triplets(input_text):
    text = triplet_extractor.tokenizer.batch_decode([triplet_extractor(input_text, return_tensors=True, return_text=False)[0]["generated_token_ids"]])[0]

    triplets = []
    relation, subject, relation, object_ = '', '', '', ''
    text = text.strip()
    current = 'x'
    for token in text.replace("<s>", "").replace("<pad>", "").replace("</s>", "").split():
        if token == "<triplet>":
            current = 't'
            if relation != '':
                triplets.append({'head': subject.strip(), 'type': relation.strip(),'tail': object_.strip()})
                relation = ''
            subject = ''
        elif token == "<subj>":
            current = 's'
            if relation != '':
                triplets.append({'head': subject.strip(), 'type': relation.strip(),'tail': object_.strip()})
            object_ = ''
        elif token == "<obj>":
            current = 'o'
            relation = ''
        else:
            if current == 't':
                subject += ' ' + token
            elif current == 's':
                object_ += ' ' + token
            elif current == 'o':
                relation += ' ' + token
    if subject != '' and relation != '' and object_ != '':
        triplets.append((subject.strip(), relation.strip(), object_.strip()))

    return triplets

In [4]:
from llama_index import Document
import os
import json
def build_documents(directory):
  documents = []
  for article in os.listdir(directory):
      article_path = os.path.join(directory,article)
      with open(article_path, "r",encoding = 'utf-8') as f:
          article = json.loads(f.read())
      content = article['text']
      article.pop('text')
      article.pop('url')

      #this is a very rare occurance (happens once as of 8/11) so this quick fix is fine
      description = article['description']
      if len(description)>350:
          article['description'] = description[:350]


      doc = Document(text=content,metadata=article)
      doc.id_ = article_path
      documents.append(doc)
  return documents

In [5]:
from llama_index.node_parser import SimpleNodeParser
def build_nodes(documents, chunk_size, chunk_overlap):
    node_parser = SimpleNodeParser.from_defaults(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    nodes = node_parser.get_nodes_from_documents(documents=documents, show_progress=True)
    return nodes

In [6]:
from llama_index.prompts.prompts import SimpleInputPrompt
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from llama_index import LangchainEmbedding
from llama_index import ServiceContext
from llama_index.llms import HuggingFaceLLM

#build service context
system_prompt = """<|SYSTEM|># StableLM Tuned (Alpha version)
- StableLM is a helpful and harmless open-source AI language model developed by StabilityAI.
- StableLM is excited to be able to help the user, but will refuse to do anything that could be considered harmful to the user.
- StableLM is more than just an information source, StableLM is also able to write poetry, short stories, and make jokes.
- StableLM will refuse to participate in anything that could harm a human.
"""

# This will wrap the default prompts that are internal to llama-index
query_wrapper_prompt = SimpleInputPrompt("<|USER|>{query_str}<|ASSISTANT|>")

#llm needs to be passed in for the embedding because of how the faiss vector store is implemented on llamaindex
llm = HuggingFaceLLM(
    context_window=4096,
    max_new_tokens=256,
    generate_kwargs={"temperature": 0.7, "do_sample": False},
    system_prompt=system_prompt,
    query_wrapper_prompt=query_wrapper_prompt,
    tokenizer_name="StabilityAI/stablelm-tuned-alpha-3b",
    model_name="StabilityAI/stablelm-tuned-alpha-3b",
    device_map="auto",
    stopping_ids=[50278, 50279, 50277, 1, 0],
    tokenizer_kwargs={"max_length": 4096},
    offload_folder = 'offload',
    # uncomment this if using CUDA to reduce memory usage
    #model_kwargs={"torch_dtype": torch.float16}
)


embed_model = LangchainEmbedding(
  HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2", model_kwargs = {'device': 'cpu'})
)

service_context = ServiceContext.from_defaults(chunk_size=512,llm=llm,embed_model=embed_model)

ValueError: The current `device_map` had weights offloaded to the disk. Please provide an `offload_folder` for them. Alternatively, make sure you have `safetensors` installed if the model you are using offers the weights in this format.

In [None]:
#build knowledge graph index
from llama_index import KnowledgeGraphIndex

documents = build_documents('all_uva_news_articles_individual')
nodes = build_nodes(documents,512,128)


index = KnowledgeGraphIndex(nodes=nodes, kg_triplet_extract_fn=extract_triplets, service_context=service_context)