In [1]:
# Copyright (c) 2024 Microsoft Corporation.
# Licensed under the MIT License.

In [2]:
import os

import pandas as pd
import tiktoken
from dotenv import find_dotenv, load_dotenv

from graphrag.query.indexer_adapters import read_indexer_entities, read_indexer_reports
from graphrag.query.llm.oai.chat_openai import ChatOpenAI
from graphrag.query.llm.oai.typing import OpenaiApiType
from graphrag.query.structured_search.global_search.community_context import (
    GlobalCommunityContext,
)
from graphrag.query.structured_search.global_search.search import GlobalSearch

## Global Search example

The global search method generates answers by searching over all AI-generated community reports in a map-reduce fashion. This is a resource-intensive method, but it often gives good responses for questions that require an understanding of the dataset as a whole (e.g., "What are the most significant values of the herbs mentioned in this notebook?").

In [3]:
# Find a .env file and load its variables into the process environment so the
# os.getenv(...) lookups in the LLM setup cell can see them. The call is left as
# the last expression so the cell still displays load_dotenv's boolean result.
# (find_dotenv / load_dotenv are imported in the notebook's top import cell.)
load_dotenv(find_dotenv())

True

### LLM setup

In [4]:
# Connection settings are read from environment variables; any variable that
# is not set comes back as None.
api_key = os.environ.get("GRAPHRAG_API_KEY")
api_base = os.environ.get("GRAPHRAG_API_BASE")
api_version = os.environ.get("GRAPHRAG_API_VERSION")

# The deployment names are reused as the model names in this notebook.
# NOTE(review): GRAPHRAG_LLM_MODEL / GRAPHRAG_EMBEDDING_MODEL look like the
# alternative variables to read when not going through a deployment - confirm.
llm_deployment = os.environ.get("GRAPHRAG_LLM_DEPLOYMENT")
llm_model = llm_deployment
embedding_deployment = os.environ.get("GRAPHRAG_EMBEDDING_DEPLOYMENT")
embedding_model = embedding_deployment

# Chat client used for both the map and the reduce steps of global search.
llm = ChatOpenAI(
    # Either OpenaiApiType.OpenAI or OpenaiApiType.AzureOpenAI.
    api_type=OpenaiApiType.AzureOpenAI,
    api_base=api_base,
    api_version=api_version,
    api_key=api_key,
    model=llm_model,
    max_retries=20,
)

# Tokenizer handed to the context builder and the search engine.
token_encoder = tiktoken.get_encoding("cl100k_base")

### Load community reports as context for global search

- Load all community reports in the `create_final_community_reports` table from the GraphRAG indexing engine, to be used as context data for global search.
- Load entities from the `create_final_nodes` and `create_final_entities` tables from the GraphRAG indexing engine, to be used for calculating community weights for context ranking. Note that this is optional: if no entities are provided, community weights are not calculated and only the `rank` attribute in the community reports table is used for context ranking.

In [5]:
# Base names (without the .parquet extension) of the tables this notebook reads.
ENTITY_EMBEDDING_TABLE = "create_final_entities"
ENTITY_TABLE = "create_final_nodes"
COMMUNITY_REPORT_TABLE = "create_final_community_reports"

# Directory containing the parquet files generated by the indexing pipeline.
INPUT_DIR = "./../sample-output/output/20240812-215728/artifacts"

# Level in the Leiden community hierarchy from which community reports are loaded.
# A higher value selects reports from more fine-grained communities, at the
# price of a higher computation cost.
COMMUNITY_LEVEL = 2

In [6]:
# Read the three indexer tables from INPUT_DIR, in the same order as before:
# entities (nodes), community reports, entity embeddings.
entity_df, report_df, entity_embedding_df = (
    pd.read_parquet(f"{INPUT_DIR}/{table_name}.parquet")
    for table_name in (ENTITY_TABLE, COMMUNITY_REPORT_TABLE, ENTITY_EMBEDDING_TABLE)
)

# Turn the raw tables into the report / entity collections consumed by the
# context builder; both calls receive COMMUNITY_LEVEL.
reports = read_indexer_reports(report_df, entity_df, COMMUNITY_LEVEL)
entities = read_indexer_entities(entity_df, entity_embedding_df, COMMUNITY_LEVEL)

# Compare the raw row count with what is left after the community-level filter.
total_report_count = len(report_df)
filtered_report_count = len(reports)
print(f"Total report count: {total_report_count}")
print(
    f"Report count after filtering by community level {COMMUNITY_LEVEL}: {filtered_report_count}"
)

# Preview the first rows of the raw community report table.
report_df.head()

Total report count: 40
Report count after filtering by community level 2: 30


Unnamed: 0,community,full_content,level,rank,title,rank_explanation,summary,findings,full_content_json,id
0,31,# Scrooge and the Spirits\n\nThe community cen...,2,8.5,Scrooge and the Spirits,The impact severity rating is high due to the ...,"The community centers around Ebenezer Scrooge,...",[{'explanation': 'Ebenezer Scrooge is initiall...,"{\n ""title"": ""Scrooge and the Spirits"",\n ...",df177946-0c93-4758-9204-eb763d47d5ac
1,32,# Scrooge and His Nephew\n\nThe community revo...,2,4.0,Scrooge and His Nephew,The impact severity rating is moderate due to ...,The community revolves around Scrooge and his ...,"[{'explanation': 'Scrooge's nephew, likely nam...","{\n ""title"": ""Scrooge and His Nephew"",\n ...",08fcc8c6-4a5c-4022-a938-81f6cab05d7c
2,33,# Scrooge and Marley's Supernatural Encounter\...,2,7.5,Scrooge and Marley's Supernatural Encounter,The impact severity rating is high due to the ...,The community centers around the relationship ...,[{'explanation': 'Marley's death is a well-doc...,"{\n ""title"": ""Scrooge and Marley's Supernat...",8028d7ea-8dd3-48e9-9e83-b66b355c6847
3,34,# Scrooge and the Spirit\n\nThe community revo...,2,8.5,Scrooge and the Spirit,The impact severity rating is high due to the ...,The community revolves around Scrooge and the ...,[{'explanation': 'The Spirit is a supernatural...,"{\n ""title"": ""Scrooge and the Spirit"",\n ...",31eab87c-2c7a-4214-abc1-98f1229d1e44
4,35,# Scrooge and the Ghost's Journey\n\nThe commu...,2,8.5,Scrooge and the Ghost's Journey,The impact severity rating is high due to the ...,The community revolves around Ebenezer Scrooge...,"[{'explanation': 'The Ghost, representing Jaco...","{\n ""title"": ""Scrooge and the Ghost's Journ...",bb7d4bb8-8f48-4da9-8fe3-53c105222283


#### Build global context based on community reports

In [7]:
# Context builder that feeds the community reports to global search.
context_builder = GlobalCommunityContext(
    token_encoder=token_encoder,
    community_reports=reports,
    # Optional: leave this at its None default to skip community weights in ranking.
    entities=entities,
)

#### Perform global search

In [8]:
# Options forwarded to the context builder when the report context is assembled.
context_builder_params = dict(
    # False: use the full community reports. True: use the short community summaries.
    use_community_summary=False,
    shuffle_data=True,
    include_community_rank=True,
    min_community_rank=0,
    community_rank_name="rank",
    include_community_weight=True,
    community_weight_name="occurrence weight",
    normalize_community_weight=True,
    # Tune to your model's token limit (for a model with an 8k limit, 5000 is a
    # reasonable setting).
    max_tokens=12_000,
    context_name="Reports",
)

# LLM parameters for the map step.
map_llm_params = dict(
    max_tokens=1000,
    temperature=0.0,
    response_format=dict(type="json_object"),
)

# LLM parameters for the reduce step.
reduce_llm_params = dict(
    # Tune to your model's token limit (for a model with an 8k limit, 1000-1500
    # is a reasonable setting).
    max_tokens=2000,
    temperature=0.0,
)

In [9]:
# Global search engine restricted to the indexed data (no general knowledge).
search_engine = GlobalSearch(
    llm=llm,
    token_encoder=token_encoder,
    context_builder=context_builder,
    context_builder_params=context_builder_params,
    map_llm_params=map_llm_params,
    reduce_llm_params=reduce_llm_params,
    # Tune to your model's token limit (for a model with an 8k limit, 5000 is a
    # reasonable setting).
    max_data_tokens=12_000,
    # True adds an instruction encouraging the LLM to bring in general knowledge;
    # that may increase hallucinations but can be useful in some use cases.
    allow_general_knowledge=False,
    # Set to False if your LLM does not support JSON mode.
    json_mode=True,
    concurrent_coroutines=32,
    # Free-form description of the desired response type and format, e.g.
    # prioritized list, single paragraph, multiple paragraphs, multiple-page report.
    response_type="multiple paragraphs",
)

In [10]:
result = await search_engine.asearch("what is the data you have")
print(result.response)

### Overview of the Data

The data consists of various reports related to the novella *A Christmas Carol* by Charles Dickens. These reports cover multiple aspects of the story, including character interactions, significant events, and thematic elements. Each report is identified by unique IDs and contains detailed content about specific topics within the novella.

### Key Characters and Their Interactions

The reports provide detailed descriptions of key characters such as Ebenezer Scrooge, the Cratchit family, and the Ghosts of Christmas Past, Present, and Future. They delve into Scrooge's transformation, his relationships with other characters, and the impact of his change on his community [Data: Reports (4, 13, 21, 22, 25, 28, 29, 30, 31, 33, 34, +more)].

### Significant Events and Locations

The data includes descriptions of significant events like Scrooge's supernatural encounters and Christmas celebrations. Important locations such as Scrooge's Counting-House, the Cratchit house

In [11]:
result = await search_engine.asearch("who is the main character of this story?")
print(result.response)

### Main Character of the Story

The main character of the story is **Ebenezer Scrooge**. He is central to the narrative and undergoes a significant transformation throughout the plot. Initially, Scrooge is depicted as a miserly and covetous individual, known for his cynical and unkind nature [Data: Reports (31, 4, 13, 34, 33)].

### Transformation Journey

Scrooge's journey is marked by profound changes, guided by various supernatural encounters. These encounters include interactions with the Ghost of Christmas Present and the ghost representing Jacob Marley, among others [Data: Reports (35, 38, 37, 1, 17, +more)]. Through these experiences, Scrooge evolves from a miserly, cynical person to one who is generous and compassionate.

### Conclusion

Ebenezer Scrooge's transformation is a central theme of the story, illustrating the potential for personal growth and redemption. His character arc serves as a powerful narrative device, emphasizing the impact of self-reflection and the influe

In [12]:
result = await search_engine.asearch("How is the main character connected to other characters in the story?")
print(result.response)

### Ebenezer Scrooge's Connections to Other Characters

Ebenezer Scrooge, the main character, is intricately connected to various other characters in the story, each playing a significant role in his transformation from a miserly, solitary figure to a generous and caring individual.

#### Bob Cratchit and Tiny Tim

Scrooge's relationship with his clerk, Bob Cratchit, and Bob's son, Tiny Tim, is central to the narrative. Initially, Scrooge keeps Bob in poor working conditions, reflecting his miserly nature and lack of empathy [Data: Reports (13, 31)]. However, through his supernatural journey, Scrooge witnesses the Cratchit family's struggles and Tiny Tim's frail condition, which deeply affects him. This revelation is pivotal in Scrooge's decision to change, leading him to become generous towards Bob and his family, ultimately becoming a second father to Tiny Tim [Data: Reports (31, 37)].

#### The Ghosts of Christmas

Scrooge's transformation is guided by the Ghosts of Christmas Past, 

In [19]:
result = await search_engine.asearch("Did someone become richer by someone becoming poorer?")
print(result.response)

### Analysis of Wealth Redistribution

The dataset provides a nuanced view of wealth redistribution, particularly through the actions of specific characters. The most prominent example involves Old Joe's shop, which serves as a central hub for the appraisal and sale of items stolen from a deceased man known as the Unhappy Man. Mrs. Dilber and Joe engage in these transactions, indicating that they may have become richer through the theft of the Unhappy Man's belongings [Data: Reports (24, 23, 27)]. This clearly illustrates a scenario where individuals gained wealth at the expense of another, albeit posthumously.

### The Case of Ebenezer Scrooge

Ebenezer Scrooge's initial wealth and miserly behavior are also highlighted in the dataset. Scrooge's wealth is attributed to his business acumen and frugality, but there is no direct evidence that his wealth was a result of someone else becoming poorer [Data: Reports (31, 4)]. However, his harsh treatment of his clerk, Bob Cratchit, suggests t

### Effect of `allow_general_knowledge`

In [14]:
question = "Who are Elon's Kids"
result = await search_engine.asearch(question)
print(result.response)



I am sorry but I am unable to answer this question given the provided data.


In [15]:
search_engine = GlobalSearch(
    llm=llm,
    context_builder=context_builder,
    token_encoder=token_encoder,
    max_data_tokens=12_000,  # change this based on the token limit you have on your model (if you are using a model with 8k limit, a good setting could be 5000)
    map_llm_params=map_llm_params,
    reduce_llm_params=reduce_llm_params,
    allow_general_knowledge=True,  # set this to True will add instruction to encourage the LLM to incorporate general knowledge in the response, which may increase hallucinations, but could be useful in some use cases.
    json_mode=True,  # set this to False if your LLM model does not support JSON mode.
    context_builder_params=context_builder_params,
    concurrent_coroutines=32,
    response_type="multiple paragraphs",  # free form text describing the response type and format, can be anything, e.g. prioritized list, single paragraph, multiple paragraphs, multiple-page report
)

question = "Who are Elon's Kids"
result = await search_engine.asearch(question)
print(result.response)

Elon Musk, the CEO of SpaceX and Tesla, has a total of ten children. Here is a detailed overview of his children:

### Children with Justine Wilson
Elon Musk and his first wife, Justine Wilson, have five children together:
1. **Nevada Alexander Musk**: Born in 2002, Nevada tragically passed away at 10 weeks old due to sudden infant death syndrome (SIDS) [Data: Reports (1, 2, 3, 4, 5)].
2. **Griffin Musk**: Born in 2004, Griffin is one of the twins conceived through IVF [Data: Reports (1, 2, 3, 4, 5)].
3. **Xavier Musk**: Also born in 2004, Xavier is Griffin's twin [Data: Reports (1, 2, 3, 4, 5)].
4. **Kai Musk**: Born in 2006, Kai is one of the triplets conceived through IVF [Data: Reports (1, 2, 3, 4, 5)].
5. **Saxon Musk**: Also born in 2006, Saxon is one of the triplets [Data: Reports (1, 2, 3, 4, 5)].
6. **Damian Musk**: The third of the triplets born in 2006 [Data: Reports (1, 2, 3, 4, 5)].

### Children with Claire Boucher (Grimes)
Elon Musk and the musician Claire Boucher, known

In [16]:
# Show the "reports" entry of the context data attached to the latest result,
# i.e. the report context that was built for the LLM responses.
reports_context = result.context_data["reports"]
reports_context

Unnamed: 0,id,title,occurrence weight,content,rank
0,35,Scrooge and the Ghost's Journey,0.416667,# Scrooge and the Ghost's Journey\n\nThe commu...,8.5
1,38,Christmas Celebrations and Scrooge's Transform...,0.361111,# Christmas Celebrations and Scrooge's Transfo...,8.5
2,37,The Cratchit Family and Scrooge,0.333333,# The Cratchit Family and Scrooge\n\nThe commu...,7.5
3,1,Ghost of Christmas Present and Scrooge,0.25,# Ghost of Christmas Present and Scrooge\n\nTh...,7.5
4,17,Tiny Tim and the Cratchit Family,0.222222,# Tiny Tim and the Cratchit Family\n\nThe comm...,7.5
5,26,Fezziwig and Christmas Eve Community,0.222222,# Fezziwig and Christmas Eve Community\n\nThe ...,7.5
6,20,Fred's Dinner Party and Scrooge's Relations,0.166667,# Fred's Dinner Party and Scrooge's Relations\...,4.5
7,24,Old Joe's Shop and the Theft of the Unhappy Ma...,0.111111,# Old Joe's Shop and the Theft of the Unhappy ...,7.5
8,14,"Scrooge, Gentleman, and the Poor",0.083333,"# Scrooge, Gentleman, and the Poor\n\nThe comm...",7.5
9,5,Caroline and the Merciless Creditor,0.083333,# Caroline and the Merciless Creditor\n\nThe c...,6.5


In [17]:
# Report how many LLM calls the latest search made and its prompt token count.
llm_call_count = result.llm_calls
prompt_token_count = result.prompt_tokens
print(f"LLM calls: {llm_call_count}. LLM tokens: {prompt_token_count}")

LLM calls: 3. LLM tokens: 23506
