In [1]:
# Copyright (c) 2024 Microsoft Corporation.
# Licensed under the MIT License.

In [5]:
import os

import pandas as pd
import tiktoken
import graphrag
from graphrag.query.context_builder.entity_extraction import EntityVectorStoreKey
from graphrag.query.indexer_adapters import (
    read_indexer_covariates,
    read_indexer_entities,
    read_indexer_relationships,
    read_indexer_reports,
    read_indexer_text_units,
)
from graphrag.query.input.loaders.dfs import (
    store_entity_semantic_embeddings,
)
from graphrag.query.llm.oai.chat_openai import ChatOpenAI
from graphrag.query.llm.oai.embedding import OpenAIEmbedding
from graphrag.query.llm.oai.typing import OpenaiApiType
from graphrag.query.question_gen.local_gen import LocalQuestionGen
from graphrag.query.structured_search.local_search.mixed_context import (
    LocalSearchMixedContext,
)
from graphrag.query.structured_search.local_search.search import LocalSearch
from graphrag.vector_stores.lancedb import LanceDBVectorStore
from graphrag.query.structured_search.local_search.system_prompt import (
    EXPLORE_MULTIHOP_LOCAL_SEARCH_SYS_PROMPT,
)

print(graphrag.__file__)

C:\Users\YantingZhoufromMotio\Desktop\ragtest\graphrag\graphrag\__init__.py


## Local Search Example

Local search method generates answers by combining relevant data from the AI-extracted knowledge-graph with text chunks of the raw documents. This method is suitable for questions that require an understanding of specific entities mentioned in the documents (e.g. What are the healing properties of chamomile?).

### Load text units and graph data tables as context for local search

- In this test we first load indexing outputs from parquet files to dataframes, then convert these dataframes into collections of data objects aligning with the knowledge model.

### Load tables to dataframes

In [2]:
INPUT_DIR = ".."
LANCEDB_URI = f"{INPUT_DIR}/lancedb"

COMMUNITY_REPORT_TABLE = "create_final_community_reports"
ENTITY_TABLE = "create_final_nodes"
ENTITY_EMBEDDING_TABLE = "create_final_entities"
RELATIONSHIP_TABLE = "create_final_relationships"
COVARIATE_TABLE = "create_final_covariates"
TEXT_UNIT_TABLE = "create_final_text_units"
COMMUNITY_LEVEL = 2

#### Read entities

In [3]:
# read nodes table to get community and degree data
# file_name = '20240724-135713 graphrag'
# file_name = '20240717-143037_stat'
# file_name = '20240806-175239_multi_hop'
file_name = '20240823-142207_multi_hop_with_source'


INPUT_DIR = '../output/' + file_name + '/artifacts'
entity_df = pd.read_parquet(f"{INPUT_DIR}/{ENTITY_TABLE}.parquet")
entity_embedding_df = pd.read_parquet(f"{INPUT_DIR}/{ENTITY_EMBEDDING_TABLE}.parquet")

entities = read_indexer_entities(entity_df, entity_embedding_df, COMMUNITY_LEVEL)

# load description embeddings to an in-memory lancedb vectorstore
# to connect to a remote db, specify url and port values.
description_embedding_store = LanceDBVectorStore(
    collection_name="entity_description_embeddings",
)
description_embedding_store.connect(db_uri=LANCEDB_URI)
entity_description_embeddings = store_entity_semantic_embeddings(
    entities=entities, vectorstore=description_embedding_store
)

print(f"Entity count: {len(entity_df)}")
print(f"Entity embedding count: {len(entity_embedding_df)}")

print(len(entities))

Entity count: 28588
Entity embedding count: 7147
7147


#### Read relationships

In [5]:
relationship_df = pd.read_parquet(f"{INPUT_DIR}/{RELATIONSHIP_TABLE}.parquet")
relationships = read_indexer_relationships(relationship_df)

print(f"Relationship count: {len(relationship_df)}")
relationship_df.head()

Relationship count: 7387


Unnamed: 0,source,target,weight,description,text_unit_ids,id,human_readable_id,source_degree,target_degree,rank
0,YANKEES,YOSHINOBU YAMAMOTO,1.0,The Yankees are reportedly going to meet with ...,[91463b5c2ee8f09bd80669546924f65b],56397382f94943eba476308c16c456a2,0,7,9,16
1,YANKEES,ALEX VERDUGO,1.0,Alex Verdugo was acquired by the Yankees from ...,[91463b5c2ee8f09bd80669546924f65b],12faf9ade3f44938bf7a1c01ca9710bb,1,7,1,8
2,YANKEES,JUAN SOTO,1.0,The Yankees are reportedly intensifying effort...,[91463b5c2ee8f09bd80669546924f65b],5f87186a7ff743eab62b8ac93e1e8593,2,7,7,14
3,YANKEES,GRISHAM,1.0,Trading Grisham would allow the Yankees to mov...,[ddf013dc0ee0f038d5a08a73a547ffd1],b43483bbf569425a8f781213b876aaef,3,7,1,8
4,YANKEES,KIM,1.0,Trading Kim would allow Jake Cronenworth to mo...,[ddf013dc0ee0f038d5a08a73a547ffd1],a0aba96de6d54a5292bf50d110b06145,4,7,1,8


In [6]:
# covariate_df = pd.read_parquet(f"{INPUT_DIR}/{COVARIATE_TABLE}.parquet")

# claims = read_indexer_covariates(covariate_df)

# print(f"Claim records: {len(claims)}")
# covariates = {"claims": claims}

#### Read community reports

In [7]:
report_df = pd.read_parquet(f"{INPUT_DIR}/{COMMUNITY_REPORT_TABLE}.parquet")
reports = read_indexer_reports(report_df, entity_df, COMMUNITY_LEVEL)

print(f"Report records: {len(report_df)}")
report_df.head()

Report records: 91


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  entity_df["community"] = entity_df["community"].fillna(-1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  entity_df["community"] = entity_df["community"].astype(int)


Unnamed: 0,community,full_content,level,rank,title,rank_explanation,summary,findings,full_content_json,id
0,1019,"# Elon Musk, X, and the Tech Community's Conte...",3,8.5,"Elon Musk, X, and the Tech Community's Content...",The high impact severity rating is due to the ...,"The community is centered around Elon Musk, th...",[{'explanation': 'Elon Musk is a key figure in...,"{\n ""title"": ""Elon Musk, X, and the Tech Co...",f124b885-6a7c-40a6-b7c1-f972a4e66f41
1,1026,# Amazon Echo Buds and Nothing Earbuds Communi...,3,3.5,Amazon Echo Buds and Nothing Earbuds Community,The impact severity rating is low due to the c...,The community is centered around the Amazon Ec...,[{'explanation': 'Amazon is the central entity...,"{\n ""title"": ""Amazon Echo Buds and Nothing ...",493602c6-299c-455f-9a11-5a1d41a72e42
2,1027,# Australia and India: Rivals in Cricket and L...,3,7.5,Australia and India: Rivals in Cricket and Lea...,The impact severity rating is high due to the ...,The community is centered around the competiti...,[{'explanation': 'Australia's women's cricket ...,"{\n ""title"": ""Australia and India: Rivals i...",2ee01646-5b62-4030-a28b-9e6eee7342e5
3,1028,# Alyssa Healy and the Australian Women's Cric...,3,3.5,Alyssa Healy and the Australian Women's Cricke...,The impact severity rating is low due to the c...,"The community is centered around Alyssa Healy,...",[{'explanation': 'Alyssa Healy is the captain ...,"{\n ""title"": ""Alyssa Healy and the Australi...",5a2b82ef-05f3-443d-a12a-16d1effbd1a7
4,1034,# Gary Wang and the FTX Collapse\n\nThe commun...,3,8.5,Gary Wang and the FTX Collapse,The impact severity rating is high due to the ...,"The community is centered around Gary Wang, co...",[{'explanation': 'Gary Wang is a central figur...,"{\n ""title"": ""Gary Wang and the FTX Collaps...",a52016ae-19a6-4e5e-a048-425c76d968d8


#### Read text units

In [8]:
text_unit_df = pd.read_parquet(f"{INPUT_DIR}/{TEXT_UNIT_TABLE}.parquet")
text_units = read_indexer_text_units(text_unit_df)

print(f"Text unit records: {len(text_unit_df)}")
text_unit_df.head()

Text unit records: 3143


Unnamed: 0,id,text,n_tokens,document_ids,entity_ids,relationship_ids
0,91463b5c2ee8f09bd80669546924f65b,"already met with Mets Date: Wednesday, Dec. ...",600,[00ab35021200ff5c0bf9fd89114d9219],"[b45241d70f0e43fca764df95b2b81f77, 4119fd06010...","[56397382f94943eba476308c16c456a2, 12faf9ade3f..."
1,5380dc0fda09ef750b04aa4f65e16455,. Pair of pitchers could be signed after Ohta...,600,[00ab35021200ff5c0bf9fd89114d9219],"[4119fd06010c494caa07f439b333f4c5, f7e11b0e297...","[0cd31d6520d1420ca48c7e7d81cf35fe, 2c2c067058b..."
2,123f6b4bf506c6de1fa09d4bb1a40695,"the team's complex in Florida on Monday, far ...",600,[00ab35021200ff5c0bf9fd89114d9219],"[f7e11b0e297a44a896dc67928368f600, 1fd3fa8bb5a...","[d31fd009f3b6489b85314b9e1596e17c, a5c6725e60a..."
3,81861f76bdd8fe026392be2adac3aa88,ace Dylan Cease in trade talks. MLB Network's...,600,[00ab35021200ff5c0bf9fd89114d9219],"[f7e11b0e297a44a896dc67928368f600, 1fd3fa8bb5a...","[2a2bbdbafafa4081a819ed90fca77ec1, d7a547c277f..."
4,0ad0a45749e00c2c3aec8822d0b21a56,never saw anyone under distress and “would ne...,600,[010db9ec0dd1907e0e5515cae7f6c77c],"[e2bf260115514fb3b252fd879fb3e7be, b462b94ce47...","[4fe8d551ed224625b0ac2b257da706b9, ebf2bf6eaea..."


In [9]:
print(len(entities))
filtered_entities = list(filter(lambda x: x.description != '', entities))
print(len(filtered_entities))

7147
6969


### Convert .parquet file to .csv
This is for viewing the data files

In [10]:
def convert_parquet_to_csv(source_dir, target_dir):
    # 确保目标目录存在
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)

    # 遍历源目录中的所有文件
    for file_name in os.listdir(source_dir):
        if file_name.endswith('.parquet'):
            # 构建完整的文件路径
            file_path = os.path.join(source_dir, file_name)
            # 读取 Parquet 文件
            df = pd.read_parquet(file_path)
            
            # 构建目标文件路径
            target_file_path = os.path.join(target_dir, file_name.replace('.parquet', '.csv'))
            # 保存为 CSV 文件
            df.to_csv(target_file_path, index=False)
            print(f'Converted {file_path} to {target_file_path}')


source_directory = INPUT_DIR 
target_directory = f'{INPUT_DIR}/convert_csv'
convert_parquet_to_csv(source_directory, target_directory)

Converted ../output/20240823-142207/artifacts\create_base_documents.parquet to ../output/20240823-142207/artifacts/convert_csv\create_base_documents.csv
Converted ../output/20240823-142207/artifacts\create_base_entity_graph.parquet to ../output/20240823-142207/artifacts/convert_csv\create_base_entity_graph.csv
Converted ../output/20240823-142207/artifacts\create_base_extracted_entities.parquet to ../output/20240823-142207/artifacts/convert_csv\create_base_extracted_entities.csv
Converted ../output/20240823-142207/artifacts\create_base_text_units.parquet to ../output/20240823-142207/artifacts/convert_csv\create_base_text_units.csv
Converted ../output/20240823-142207/artifacts\create_final_communities.parquet to ../output/20240823-142207/artifacts/convert_csv\create_final_communities.csv
Converted ../output/20240823-142207/artifacts\create_final_community_reports.parquet to ../output/20240823-142207/artifacts/convert_csv\create_final_community_reports.csv
Converted ../output/20240823-142

### Initialize LLM and embedding setting

In [13]:
api_key = os.environ["GRAPHRAG_API_KEY"]
# llm_model = 'qwen2-instruct'
llm_model = 'llama-3.1-instruct'
embedding_model ='bge-m3'
model_base_url = 'http://10.1.3.6:9997/v1'
# model_base_url = 'http://api-gw.motiong.net:5000/api/openai/ve/v1'
embed_base_url = 'http://10.4.32.2:9997/v1'

llm = ChatOpenAI(
    api_key=api_key,
    model=llm_model,
    api_type=OpenaiApiType.OpenAI,  # OpenaiApiType.OpenAI or OpenaiApiType.AzureOpenAI
    max_retries=20,
    api_base=model_base_url
)

token_encoder = tiktoken.get_encoding("cl100k_base")

text_embedder = OpenAIEmbedding(
    api_key=api_key,
    api_base=embed_base_url,
    api_type=OpenaiApiType.OpenAI,
    model=embedding_model,
    # deployment_name=embedding_model,
    max_retries=20,
)

In [14]:
llm.generate(messages=[{'role': 'user', 'content': 'what is your model name?'}])

'I\'m an AI designed to assist and communicate with users in a helpful and informative way. My model name is Llama. Llama stands for "Large Language Model Meta AI."'

In [15]:
text_embedder.embed('hi')

[-0.05364641547203064,
 0.005159882362931967,
 -0.03156302496790886,
 -0.002288772724568844,
 -0.01872427575290203,
 -0.03881914168596268,
 -0.019673757255077362,
 0.014943007379770279,
 0.0025652172043919563,
 0.008873015642166138,
 -0.010606706142425537,
 -0.0006738868542015553,
 0.0034524051006883383,
 -0.000930049573071301,
 -0.025882208719849586,
 -0.03816811367869377,
 0.018098585307598114,
 -0.031044377014040947,
 -0.005289658438414335,
 -0.010522020980715752,
 -0.04967357963323593,
 0.003503941697999835,
 -0.018531840294599533,
 0.0048715053126215935,
 0.0231167059391737,
 0.031191039830446243,
 -0.03719362989068031,
 0.014309031888842583,
 -0.026341594755649567,
 -0.02967548556625843,
 0.01192169077694416,
 0.038913141936063766,
 0.008396611548960209,
 -0.06410181522369385,
 -0.020161662250757217,
 -0.024783188477158546,
 -0.0283959973603487,
 -0.054738808423280716,
 -0.055360037833452225,
 0.02129525877535343,
 -0.0064514512196183205,
 0.025193210691213608,
 0.005739246029406

### Create local search context builder

In [17]:
context_builder = LocalSearchMixedContext(
    community_reports=reports,
    text_units=text_units,
    entities=entities,
    relationships=relationships,
    # covariates=covariates,
    entity_text_embeddings=description_embedding_store,
    embedding_vectorstore_key=EntityVectorStoreKey.ID,  # if the vectorstore uses entity title as ids, set this to EntityVectorStoreKey.TITLE
    text_embedder=text_embedder,
    token_encoder=token_encoder,
)

### Create local search engine

In [18]:
# text_unit_prop: proportion of context window dedicated to related text units
# community_prop: proportion of context window dedicated to community reports.
# The remaining proportion is dedicated to entities and relationships. Sum of text_unit_prop and community_prop should be <= 1
# conversation_history_max_turns: maximum number of turns to include in the conversation history.
# conversation_history_user_turns_only: if True, only include user queries in the conversation history.
# top_k_mapped_entities: number of related entities to retrieve from the entity description embedding store.
# top_k_relationships: control the number of out-of-network relationships to pull into the context window.
# include_entity_rank: if True, include the entity rank in the entity table in the context window. Default entity rank = node degree.
# include_relationship_weight: if True, include the relationship weight in the context window.
# include_community_rank: if True, include the community rank in the context window.
# return_candidate_context: if True, return a set of dataframes containing all candidate entity/relationship/covariate records that
# could be relevant. Note that not all of these records will be included in the context window. The "in_context" column in these
# dataframes indicates whether the record is included in the context window.
# max_tokens: maximum number of tokens to use for the context window.


local_context_params = {
    "text_unit_prop": 0.5,
    "community_prop": 0,
    "conversation_history_max_turns": 5,
    "conversation_history_user_turns_only": True,
    "top_k_mapped_entities": 15,
    "top_k_relationships": 10,
    "include_entity_rank": True,
    "include_relationship_weight": True,
    "include_community_rank": False,
    "return_candidate_context": False,
    "embedding_vectorstore_key": EntityVectorStoreKey.ID,  # set this to EntityVectorStoreKey.TITLE if the vectorstore uses entity title as ids
    "max_tokens": 18_000,  # change this based on the token limit you have on your model (if you are using a model with 8k limit, a good setting could be 5000)
}

llm_params = {
    "max_tokens": 2_000,  # change this based on the token limit you have on your model (if you are using a model with 8k limit, a good setting could be 1000=1500)
    "temperature": 0.0,
}

search_engine = LocalSearch(
    llm=llm,
    context_builder=context_builder,
    token_encoder=token_encoder,
    llm_params=llm_params,
    context_builder_params=local_context_params,
    system_prompt=EXPLORE_MULTIHOP_LOCAL_SEARCH_SYS_PROMPT,
    # response_type="multiple paragraphs",  # free form text describing the response type and format, can be anything, e.g. prioritized list, single paragraph, multiple paragraphs, multiple-page report
)

### create local explore engine

In [19]:
import os
import sys
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))
from explore_graph.explore import LocalExplore


explore_context_params = local_context_params

local_explorer = LocalExplore(
    search_engine=search_engine,
    llm=llm,
    context_builder=context_builder,
    explore_context_params= explore_context_params
)

### Batch run Graph RAG local search on set of questions

In [26]:
from utils import read_questions, create_empty_csv_file, save_answer_data
import time
import json

# user inputs
topic = 'multi_hop' # dataset name
sub_dir = '/without_relationships' 
QUESTION_FILE = '../test/questions/multi_hop_questions.txt'
resume = True # set true if resume from previous test

result_file = f'../test_results/{topic}{sub_dir}/GraphRAG_results.csv'
columns = ['Question', 'Graph RAG answer']
with_reason = bool(sub_dir == '/with_reason')

questions = read_questions(QUESTION_FILE)

# read previous process if resume
start_index = 0
if not resume:
    create_empty_csv_file(result_file, columns)
else:
    df = pd.read_csv(result_file)
    start_index = len(df)
    print(f'resume from question {start_index + 1}')


start_time = time.time()
print(f'using LLM model: {llm_model}')
# query each question and record the answer to csv file
for i, question in enumerate(questions[start_index:]):
    result = await search_engine.asearch(question)
    answer = result.response
    print(f'{i+1+start_index}/{len(questions)} ### A: {answer}')
    if with_reason:
        parsed_answer = json.loads(answer)
        answer = parsed_answer['answer']

    save_answer_data(data=[question, answer], column_names=columns, filename=result_file)

end_time = time.time()
duration = end_time - start_time
avg_time = duration / len(questions[start_index:])
print(f'total time: {duration}s')
print(f'avg time: {avg_time}s')

resume from question 101
using LLM model: llama-3.1-instruct


ZeroDivisionError: float division by zero

### Explore Graph on sample question

In [23]:
question = questions[14]
print(f'Q: {question}')
response = await local_explorer.explore(query=question)
print('A:', response.response)


Q: Does 'The Independent - Life and Style' article suggesting Prince William's emotional state regarding Princess Diana's death align with the same publication's depiction of the events leading up to her death in 'The Crown season six'?
iteration: 1
answer: Insufficient information
entities to explore: ['BEATS FIT PRO', 'APPLE', 'X (FORMERLY TWITTER)', 'SAFARI', 'THE CROWN', 'ONLINE SAFETY ACT']
summary: There is no information in the provided data that suggests 'The Independent - Life and Style' has published an article about Prince William's emotional state regarding Princess Diana's death. Additionally, the data does not provide any information about the events leading up to Princess Diana's death in 'The Crown season six'. Therefore, it is not possible to determine if the two align.

iteration: 2
answer: Insufficient information
entities to explore: ['THE INDEPENDENT - LIFE AND STYLE', 'WILLIAM', 'DIANA', 'THE CROWN', 'ONLINE SAFETY ACT', 'X (FORMERLY TWITTER)', 'MONSTER HUNTER NOW

In [25]:
print(response.context_data)

-----Entities-----
id|entity|description|number of relationships
1790|THE INDEPENDENT - LIFE AND STYLE|The Independent - Life and Style is a comprehensive media outlet that covers a wide range of topics related to lifestyle, relationships, and entertainment. In 2023, the publication has been particularly active in providing insights into celebrity relationships, featuring a detailed timeline of Will Smith and Jada Pinkett Smith's relationship on October 12th and reporting on Will Smith's comments about his wife's memoir and their marriage dynamics on October 16th. The outlet also delves into practical relationship advice, as evidenced by an article published on December 12th, which offers guidance on introducing a partner to family during the holiday season, drawing from the expertise of relationship specialists. On the entertainment front, The Independent - Life and Style covered a unique story on October 10th, about Martin Scorsese engaging in an internet slang quiz with his daughter

### Run batch questions on explore graph

In [22]:
from utils import read_questions, create_empty_csv_file, save_answer_data
import time

# user inputs
topic = 'multi_hop'
sub_dir = '/graph_explore'
QUESTION_FILE = '../test/questions/multi_hop_questions.txt'
resume = True

result_file = f'../test_results/{topic}{sub_dir}/GraphRAG_results.csv'
columns = ['Question', 'Graph RAG answer', 'Iteration', 'time']

questions = read_questions(QUESTION_FILE)

start_index = 0
if not resume:
    create_empty_csv_file(result_file, columns)
else:
    df = pd.read_csv(result_file)
    start_index = len(df)
    print(f'resume from question {start_index + 1}')


start_time = time.time()
print(f'using LLM model: {llm_model}')

for i, question in enumerate(questions[start_index:]):
    result = await local_explorer.explore(question)
    answer = result.response
    print(f'{i+1+start_index}/{len(questions)} ### A: {answer} ### {result.num_iter}')

    save_answer_data(
        data=[question, answer, result.num_iter, result.completion_time], 
        column_names=columns,
        filename=result_file)

end_time = time.time()
duration = end_time - start_time
avg_time = duration / len(questions[start_index:])
print(f'total time: {duration}s')
print(f'avg time: {avg_time}s')

resume from question 63
using LLM model: llama-3.1-instruct
63/100 ### A: No ### 0
64/100 ### A: Meta ### 0
65/100 ### A: Sam Bankman-Fried ### 0
66/100 ### A: Google ### 0
67/100 ### A: Apple ### 0
68/100 ### A: bettors ### 0
69/100 ### A: Spotify ### 0
70/100 ### A: Yes ### 0
71/100 ### A: Amazon ### 0
72/100 ### A: Yes ### 0
iteration: 1
answer: Insufficient information
entities to explore: []
summary: No, the Yardbarker article did not describe Alex Verdugo's performance, and the Sporting News article did not mention Christian McCaffrey not scoring in the context of the San Francisco 49ers' offensive performance.

73/100 ### A: Insufficient information ### 1
iteration: 1
answer: Insufficient information
entities to explore: []
summary: No, the provided information does not contain the specific details about the Minnesota Vikings' passing play percentage in Week 4 from CBSSports.com, nor does it provide a direct comparison between Josh Dobbs' and Kirk Cousins' leadership from the Sp

### run on one sample question with Graph RAG local search

In [33]:
question = questions[10]
print("Q:", question)
result = await search_engine.asearch(question)
print('A:', result.response)

Q: Considering the information from a BBC article detailing Sridevi's achievements in the Indian film industry and a Times of India report on her posthumous honors, which single character from a film portrayed by Sridevi has been recognized for its cultural impact and has also been commemorated with a special award after her passing?
A: 


In [18]:
print(result.context_text)

-----Entities-----
id|entity|description|number of relationships
5974|BAFTA|BAFTA, or the British Academy of Film and Television Arts, recently honored Owen Teale for his outstanding performance in the film Dream Horse, as reported by The Sydney Morning Herald on 2023-10-20. Despite the recognition, Teale expressed his belief that co-star Toni Collette was equally deserving of the award, showcasing the humility and respect for fellow actors that often characterizes the BAFTA community. This acknowledgment highlights the collaborative and appreciative nature of the entertainment industry, where talent and hard work are celebrated, even when accolades are bestowed upon individual performances.|1
6110|IRANIAN WOMEN|Iranian women, a group whose resilience and strength have been prominently showcased in the film Shayda, directed by Noora Niasari, are at the heart of renewed international attention as of October 1, 2023. Niasari, through her cinematic work, aims to catalyze a global conversa

#### Inspecting the context data used to generate the response

In [19]:
print(f'number of records: {len(result.context_data["entities"])}')
result.context_data["entities"][:50]

number of records: 40


Unnamed: 0,id,entity,description,number of relationships,in_context
0,5974,BAFTA,"BAFTA, or the British Academy of Film and Tele...",1,True
1,6110,IRANIAN WOMEN,"Iranian women, a group whose resilience and st...",2,True
2,901,MARIA SHARAPOVA,"Maria Sharapova, a name synonymous with grace ...",2,True
3,4156,RITCHIE VALENS,"Ritchie Valens, a renowned artist in the music...",1,True
4,1328,SPIDER-MAN,"Spider-Man, a fictional superhero and a key Ma...",7,True
5,5608,RAVINDRA JADEJA,Ravindra Jadeja is an Indian cricketer who has...,0,True
6,6111,PERSIAN NEW YEAR,Persian New Year is a cultural event that is f...,1,True
7,237,EEGA,'Eega' is a movie from 2012 directed by S.S. R...,2,True
8,6108,SHAYDA,"""Shayda, directed by Noora Niasari, is a compe...",4,True
9,2040,THE DEPARTED,"""The Departed"" is a film by Martin Scorsese th...",1,True


In [20]:
print(f'number of records: {len(result.context_data["relationships"])}')
result.context_data["relationships"].head(50)

number of records: 66


Unnamed: 0,id,source,target,description,weight,rank,links,in_context
0,1837,MARVEL,SPIDER-MAN,Spider-Man is a character created by Marvel. [...,1.0,9,1,True
1,6862,SHAYDA,IRANIAN WOMEN,"Shayda, a compelling film, has garnered intern...",2.0,6,2,True
2,6861,SHAYDA,NOORA NIASARI,"Noora Niasari, a skilled and thoughtful direct...",2.0,5,1,True
3,6863,SHAYDA,PERSIAN NEW YEAR,The film Shayda features the Persian New Year ...,1.0,5,2,True
4,6756,TONI COLLETTE,DREAM HORSE,Toni Collette starred in the 2020 movie Dream ...,1.0,4,1,True
5,728,EEGA,S.S. RAJAMOULI,'Eega' is a movie directed by S.S. Rajamouli [...,1.0,3,1,True
6,4051,QUENTIN TARANTINO,MARTIN SCORSESE,"In the dynamic landscape of filmmaking, Quenti...",2.0,16,4,True
7,4055,MARTIN SCORSESE,THE DEPARTED,"Martin Scorsese's film ""The Departed"" is refer...",1.0,15,4,True
8,4056,MARTIN SCORSESE,THE WOLF OF WALL STREET,"Martin Scorsese, a renowned filmmaker, directe...",2.0,15,4,True
9,4060,MARTIN SCORSESE,LILY GLADSTONE,Lily Gladstone will appear in Martin Scorsese'...,1.0,15,4,True


In [24]:
result.context_data["reports"].head()

KeyError: 'reports'

In [21]:
result.context_data["sources"].head(50)

Unnamed: 0,id,text
0,1344,the film]. You can feel the joyous moments ev...
1,1343,"and process it,” she says. “But the thing is..."
2,1305,"“Toni just raised my game,” he says. Credit: ..."
3,695,"Happy Friday, Polygon readers! Each week, we r..."
4,631,"have appeared on floats, but only Spider-Man ..."
5,389,many viewings of and conversations about Kill...
6,387,"In 2021, Quentin Tarantino defended his intent..."
7,1439,Vampire fiction has a long and distinguished p...
8,781,"eyes suddenly lit up. “Oh, really? We never u..."
9,1304,troubled him that this person who had committ...


In [None]:
if "claims" in result.context_data:
    print(result.context_data["claims"].head())

### Question Generation

This function takes a list of user queries and generates the next candidate questions.

In [None]:
question_generator = LocalQuestionGen(
    llm=llm,
    context_builder=context_builder,
    token_encoder=token_encoder,
    llm_params=llm_params,
    context_builder_params=local_context_params,
)

In [None]:
question_history = []
candidate_questions = await question_generator.agenerate(
    question_history=question_history, context_data=None, question_count=50
)
print(*candidate_questions.response, sep='\n')