# Generate knowledge graph triplets using llama-index

In [None]:
# %%capture
# !pip install llama-index-embeddings-azure-openai
# !pip install llama-index-llms-azure-openai
# !pip install llama-index

In [1]:
import os
import logging
import sys

# Send log records of level INFO and above to stdout so they appear in the cell output.
logging.basicConfig(stream=sys.stdout, level=logging.INFO)

In [2]:
from llama_index.core import (
    SimpleDirectoryReader,
    ServiceContext,
    KnowledgeGraphIndex,
)
from llama_index.core.graph_stores import SimpleGraphStore

# from llama_index.llms.openai import OpenAI
from llama_index.llms.azure_openai import AzureOpenAI
# from IPython.display import Markdown, display

## Read reference and summary text

In [3]:
# Load the reference (source/context) files found in the reference directory.
documents = SimpleDirectoryReader(
    "../../data/reference/"
).load_data()

# Load the summary files from their own directory in the same way.
summary_docs = SimpleDirectoryReader(
    "../../data/summary/"
).load_data()

# Print the text of the first reference document as a quick sanity check.
print(documents[0].text)

Title: City Introduces New Green Initiative to Combat Air Pollution

Content:
The city council has launched a new green initiative aimed at reducing air pollution and promoting environmental sustainability. The initiative includes planting 10,000 trees over the next two years, creating more bike lanes, and increasing the number of electric vehicle charging stations. Mayor Jane Smith emphasized the importance of community involvement in making the city a cleaner and healthier place to live. "We are committed to improving air quality and reducing our carbon footprint," said Mayor Smith. The project is set to begin next month, with the first phase focusing on urban areas with the highest levels of pollution.


In [4]:
# Print the text of the first summary document so it can be compared with the reference.
print(summary_docs[0].text)

The city council has announced a comprehensive green initiative designed to tackle air pollution and enhance environmental sustainability. Key components of the plan involve planting 10,000 trees, expanding bike lanes, and boosting the availability of electric vehicle charging stations. Mayor Jane Smith highlighted the critical role of community participation in ensuring the initiative's success. Interestingly, the initiative also includes a controversial plan to reduce public transportation services, which has raised concerns among residents. The project will commence next month, starting with the most polluted urban areas.


## Define model params

In [29]:
# Fill in these fields only when calling OpenAI directly.
# NOTE: the Azure configuration cell also assigns `api_key` and `model_name`,
# so whichever of the two cells is executed last decides the values used later.
api_key = "<OPEN AI KEY>"
model_name = "gpt-3.5-turbo"
# model_name = "gpt-3.5-turbo-instruct"

In [5]:
# Settings for llama-index's Azure OpenAI client.
# Secrets and endpoint details are taken from environment variables when they
# are set, so real credentials never have to be typed into (or saved with) the
# notebook. The original placeholder strings remain as the fallback values.
api_type = 'azure'
api_key = os.environ.get("AZURE_OPENAI_API_KEY", "<AZURE OPEN AI KEY>")
api_base = os.environ.get("AZURE_OPENAI_ENDPOINT", "<ENTER AZURE DEPLOYMENT END POINT HERE>")
api_version = os.environ.get("OPENAI_API_VERSION", "<API VERSION>")
deployment_name = os.environ.get("AZURE_OPENAI_DEPLOYMENT_NAME", "<DEPLOYMENT NAME>")
# Model family served by the deployment above.
model_name = "gpt-35-turbo"

In [16]:
# Only needed when using the `openai` package directly instead of llama-index's OpenAI integration
# import openai
# openai.api_key = api_key
# openai.api_base = api_base
# openai.api_type = api_type
# openai.api_version = api_version

In [6]:
# Publish the key through the OPENAI_API_KEY environment variable as well.
# `api_key` holds whatever the most recently executed configuration cell assigned.
os.environ["OPENAI_API_KEY"] = api_key

In [29]:
# Create the Azure OpenAI LLM client from the values set in the configuration cell.
llm = AzureOpenAI(
    model=model_name,
    deployment_name=deployment_name,
    api_key=api_key,
    azure_endpoint=api_base,
    api_version=api_version,
)

# Alternative for plain OpenAI (needs the commented-out `OpenAI` import to be enabled):
# llm = OpenAI(temperature=0.1, model=model_name)

  service_context = ServiceContext.from_defaults(llm=llm, chunk_size=64, chunk_overlap=30)


#### Effect of chunk size and max_triplets_per_chunk 
When the chunk size is kept high (say 512), the number of triplets per chunk is still upper-bounded by the `max_triplets_per_chunk` value,
so we may not get all the desired triplets.

When we reduce the chunk size, the LLM receives more chunks from which to return triplets. Even though the triplets are limited per chunk,
each chunk contributes its own triplets, so the total number of triplets should increase.

Hence one should keep the chunk size lower and `max_triplets_per_chunk` higher to get an exhaustive set of triplets.

NOTE: the values should not be pushed so far that one starts getting noise in the graph.

In [None]:
# Small chunks give the LLM more chunks to extract triplets from; this is the
# configuration used by the rest of the notebook.
# To compare against large chunks, use chunk_size=512, chunk_overlap=200 instead.
# Build only one ServiceContext: a second assignment would simply replace the
# first, leaving the earlier object unused.
service_context = ServiceContext.from_defaults(llm=llm, chunk_size=64, chunk_overlap=30)

In [30]:
from llama_index.core.storage.storage_context import StorageContext

# Graph store for the reference knowledge graph.
graph_store = SimpleGraphStore()
# Storage context backed by that graph store; handed to the reference index builder.
storage_context = StorageContext.from_defaults(graph_store=graph_store)

## Create Knowledge graph for the reference

In [31]:
# NOTE: this can take a few seconds; the logged HTTP requests show it calling the LLM endpoint.
# Build the knowledge graph index for the reference documents, asking for at
# most 15 triplets per chunk and without embeddings.
reference_index = KnowledgeGraphIndex.from_documents(
    documents,
    max_triplets_per_chunk=15,
    storage_context=storage_context,
    service_context=service_context,
    include_embeddings=False,
)


Metadata length (21) is close to chunk size (64). Resulting chunks are less than 50 tokens. Consider increasing the chunk size or decreasing the size of your metadata to avoid this.
INFO:httpx:HTTP Request: POST https://test-atg-openai-two.openai.azure.com//openai/deployments/gpt35-feb/chat/completions?api-version=2024-02-15-preview "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://test-atg-openai-two.openai.azure.com//openai/deployments/gpt35-feb/chat/completions?api-version=2024-02-15-preview "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://test-atg-openai-two.openai.azure.com//openai/deployments/gpt35-feb/chat/completions?api-version=2024-02-15-preview "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://test-atg-openai-two.openai.azure.com//openai/deployments/gpt35-feb/chat/completions?api-version=2024-02-15-preview "HTTP/1.1 200 OK"


In [32]:
# Query engine over the reference graph, created with include_text=False and the
# "tree_summarize" response mode. It is not used by the other cells shown here.
ref_query_engine = reference_index.as_query_engine(
    include_text=False, response_mode="tree_summarize"
)


In [38]:
# !pip install pyvis

## Create graph

In [33]:
from pyvis.network import Network

# Export the index as a networkx graph and load it into a directed pyvis network.
g = reference_index.get_networkx_graph()
net = Network(notebook=True, cdn_resources="in_line", directed=True)
net.from_nx(g)
# Render the network to reference.html (path is relative to the working directory).
net.show("reference.html")

reference.html


In [34]:
# Save a copy of the rendered reference graph under the reference output directory.
net.write_html("../../data/reference_output/reference_graph.html",notebook=True)

In [35]:
# Inspect the edges: each one is a dict whose keys include 'label', 'from' and 'to'.
net.get_edges()

[{'label': 'Launched',
  'title': 'Launched',
  'width': 1,
  'from': 'City council',
  'to': 'Green initiative',
  'arrows': 'to'},
 {'label': 'Aimed at',
  'title': 'Aimed at',
  'width': 1,
  'from': 'Green initiative',
  'to': 'Reducing air pollution',
  'arrows': 'to'},
 {'label': 'Aimed at',
  'title': 'Aimed at',
  'width': 1,
  'from': 'Green initiative',
  'to': 'Promoting environmental sustainability',
  'arrows': 'to'},
 {'label': 'Includes',
  'title': 'Includes',
  'width': 1,
  'from': 'Initiative',
  'to': 'Planting trees',
  'arrows': 'to'},
 {'label': 'Includes',
  'title': 'Includes',
  'width': 1,
  'from': 'Initiative',
  'to': 'Creating bike lanes',
  'arrows': 'to'},
 {'label': 'Includes',
  'title': 'Includes',
  'width': 1,
  'from': 'Initiative',
  'to': 'Increasing charging stations',
  'arrows': 'to'},
 {'label': 'Emphasized',
  'title': 'Emphasized',
  'width': 1,
  'from': 'Jane smith',
  'to': 'Importance of community involvement',
  'arrows': 'to'},
 {'la

In [36]:
def extract_triplets(graph):
    """Return the edges of ``graph`` reduced to label/from/to triplets.

    ``graph`` must offer ``get_edges()`` returning an iterable of mappings that
    each contain the keys ``'label'``, ``'from'`` and ``'to'``. Any other keys
    on an edge are dropped. The result is a list with one new dict per edge,
    in the same order as the edges; a missing key raises ``KeyError``.
    """
    wanted_keys = ('label', 'from', 'to')
    return [
        {key: edge[key] for key in wanted_keys}
        for edge in graph.get_edges()
    ]

# Triplets of the reference graph, read from the pyvis network built for it.
ref_triplets = extract_triplets(net)

In [37]:
# Number of triplets extracted from the reference graph.
len(ref_triplets)

25

In [38]:
import json
# Write the reference triplets as JSON (note the target file has a .txt extension).
with open("../../data/reference_output/context_triplets.txt", "w") as fp:    
    json.dump(ref_triplets, fp)

## Create knowledge graph for the summary

In [63]:
# A new graph store instance for the summary graph, distinct from the reference one.
graph_store2 = SimpleGraphStore()

# Same LLM and chunking values (64 / 30) as used for the reference graph.
service_context_2 = ServiceContext.from_defaults(llm=llm, chunk_size=64, chunk_overlap=30)
# Storage context backed by the summary graph store.
storage_context_2 = StorageContext.from_defaults(graph_store=graph_store2)

  service_context_2 = ServiceContext.from_defaults(llm=llm, chunk_size=64, chunk_overlap=30)


In [64]:
# Build the knowledge graph index for the summary documents.
# NOTE(review): the cap here is 10 triplets per chunk while the reference index
# uses 15 — confirm the difference is intentional.
summary_index = KnowledgeGraphIndex.from_documents(
    summary_docs,
    max_triplets_per_chunk=10,
    storage_context=storage_context_2,
    service_context=service_context_2,
)

# Query engine over the summary graph, configured like the reference one.
summary_query_engine = summary_index.as_query_engine(
    include_text=False, response_mode="tree_summarize"
)


Metadata length (22) is close to chunk size (64). Resulting chunks are less than 50 tokens. Consider increasing the chunk size or decreasing the size of your metadata to avoid this.
INFO:httpx:HTTP Request: POST https://test-atg-openai-two.openai.azure.com//openai/deployments/gpt35-feb/chat/completions?api-version=2024-02-15-preview "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://test-atg-openai-two.openai.azure.com//openai/deployments/gpt35-feb/chat/completions?api-version=2024-02-15-preview "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://test-atg-openai-two.openai.azure.com//openai/deployments/gpt35-feb/chat/completions?api-version=2024-02-15-preview "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://test-atg-openai-two.openai.azure.com//openai/deployments/gpt35-feb/chat/completions?api-version=2024-02-15-preview "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://test-atg-openai-two.openai.azure.com//openai/deployments/gpt35-feb/chat/completions?api-version=

In [65]:
# Export the summary index as a networkx graph and load it into a directed pyvis network.
summary_g = summary_index.get_networkx_graph()
summary_net = Network(notebook=True, cdn_resources="in_line", directed=True)
summary_net.from_nx(summary_g)
# Render the network to summary.html (path is relative to the working directory).
summary_net.show("summary.html")

summary.html


In [66]:
# Save a copy of the rendered summary graph under the summary output directory.
# Unlike the reference save, this call does not pass notebook=True.
summary_net.write_html("../../data/summary_output/summary_graph2.html")

In [67]:
# Triplets of the summary graph, read from the pyvis network built for it.
summary_triplets = extract_triplets(summary_net)

In [68]:
# Display the extracted summary triplets.
summary_triplets

[{'label': 'Announced', 'from': 'City council', 'to': 'Green initiative'},
 {'label': 'Designed to tackle',
  'from': 'Green initiative',
  'to': 'Air pollution'},
 {'label': 'Designed to enhance',
  'from': 'Green initiative',
  'to': 'Environmental sustainability'},
 {'label': 'Involves', 'from': 'Plan', 'to': 'Expanding bike lanes'},
 {'label': 'Involves',
  'from': 'Plan',
  'to': 'Boosting availability of electric vehicle charging stations'},
 {'label': 'Reduce', 'from': 'Plan', 'to': 'Public transportation services'},
 {'label': 'Highlighted', 'from': 'Mayor jane smith', 'to': 'Critical role'},
 {'label': 'Highlighted',
  'from': 'Mayor jane smith',
  'to': 'Critical role of community participation'},
 {'label': 'Ensures',
  'from': 'Community participation',
  'to': "Initiative's success"},
 {'label': 'Includes',
  'from': 'Initiative',
  'to': 'Controversial plan to reduce public transportation services'},
 {'label': 'Includes', 'from': 'Initiative', 'to': 'Controversial plan'}

In [69]:
# Write the summary triplets as JSON (note the target file has a .txt extension).
# Relies on `json` having been imported by the earlier reference-output cell.
with open("../../data/summary_output/summary_triplets2.txt", "w") as fp:    
    json.dump(summary_triplets, fp)

#### Observations:
1. The summary may be abstractive rather than purely extractive, in which case subject-verb-object (S-V-O) relations will never match exactly.
2. If it is abstractive, use generative AI to decide whether two nodes (node 1 and node n) refer to the same thing.
3. The summary may omit less important facts, such as times and dates, that are present in the context (and its graph).
4. Relations/edges may be worded differently, in which case extra effort is needed to find the best-matching relation.