### Understanding GraphRAG output

For this analysis, we will look at the output files generated by a previous GraphRAG indexing run. To use data from your own run, update the `output_path` and `output_subfolder` variables accordingly.

In [None]:
import pandas as pd
import os

# Name of the indexing run's output folder to inspect.
output_subfolder = "20240812-215728"

# Folder this notebook writes its extracted files to, and folder the run's
# parquet artifacts are read from.
analysis_path = f"./../analysis/{output_subfolder}"
output_path = f"./../sample-output/output/{output_subfolder}/artifacts"


def _artifact(table_name):
    """Return the path of `<table_name>.parquet` inside `output_path`."""
    return f"{output_path}/{table_name}.parquet"


# create_base_* tables
base_documents_path = _artifact("create_base_documents")
base_text_units_path = _artifact("create_base_text_units")
base_extracted_entities_path = _artifact("create_base_extracted_entities")
base_entity_graph_path = _artifact("create_base_entity_graph")

# create_final_* tables
final_documents_path = _artifact("create_final_documents")
final_text_units_path = _artifact("create_final_text_units")
final_entities_path = _artifact("create_final_entities")
final_nodes_path = _artifact("create_final_nodes")
final_relationships_path = _artifact("create_final_relationships")
final_covariates_path = _artifact("create_final_covariates")

# Community tables
final_communities_path = _artifact("create_final_communities")
final_community_reports_path = _artifact("create_final_community_reports")

In [None]:
# Load the base documents table and preview its first rows
df_base_documents = pd.read_parquet(path=base_documents_path)
df_base_documents.head(n=5)

In [None]:
# Count the text units listed for the document at row label 0
text_units_list = df_base_documents['text_units'].loc[0]
print(f"Total count of text units: {len(text_units_list)}")

In [None]:
# Load the base text units table and preview its first rows.
# NOTE: despite the `_path` suffix, this name holds the loaded DataFrame.
df_base_text_units_path = pd.read_parquet(path=base_text_units_path)
df_base_text_units_path.head(n=5)

In [None]:
# Next, load the entities extracted from the text units and preview them.
# NOTE: despite the `_path` suffix, this name holds the loaded DataFrame.
df_base_extracted_entities_path = pd.read_parquet(path=base_extracted_entities_path)
df_base_extracted_entities_path.head(n=5)

In [None]:
# Interesting, the entities are saved in GraphML format. Let's extract the
# graph at row label 0 and save it as an XML file so that we can have a look at it.
base_extracted_entities = df_base_extracted_entities_path["entity_graph"]

base_extracted_entities = base_extracted_entities[0]
try:
    os.makedirs(analysis_path, exist_ok=True)
except OSError as e:
    print(f"Error creating directory '{analysis_path}': {e}")
    # Re-raise: without the directory the write below cannot succeed anyway,
    # and the original error is more informative than the follow-up one.
    raise
else:
    print(f"Directory '{analysis_path}' created successfully or already exists.")

# Write as UTF-8 explicitly; the platform default encoding may not be able to
# represent every character in the graph text.
with open(f"{analysis_path}/base_extracted_entities.xml", "w", encoding="utf-8") as f:
    f.write(base_extracted_entities)

In [None]:
# Next, load the base entity graph table and preview its first rows.
# NOTE: despite the `_path` suffix, this name holds the loaded DataFrame.
df_base_entity_graph_path = pd.read_parquet(path=base_entity_graph_path)
df_base_entity_graph_path.head(n=5)

In [None]:
# The clustered graphs are also saved in GraphML format, one per row. Let's
# save each of them as its own XML file so that we can have a look at them.
base_entity_graph = df_base_entity_graph_path["clustered_graph"]

# Make sure the target folder exists even if the earlier export cell was skipped.
os.makedirs(analysis_path, exist_ok=True)

for index, value in base_entity_graph.items():
    # Write as UTF-8 explicitly; the platform default encoding may not be able
    # to represent every character in the graph text.
    with open(f"{analysis_path}/base_entity_graph_cg{index}.xml", "w", encoding="utf-8") as f:
        f.write(value)

In [None]:
# In the base entity graph, the list of multiple descriptions is combined into a single description.
# Next, load the final documents table and preview its first rows.
df_final_documents = pd.read_parquet(path=final_documents_path)
df_final_documents.head(n=5)

In [None]:
# Load the final entities table and preview its first rows
df_final_entities = pd.read_parquet(path=final_entities_path)
df_final_entities.head(n=5)

In [None]:
# Load the final relationships table and preview its first rows
df_final_relationships = pd.read_parquet(path=final_relationships_path)
df_final_relationships.head(n=5)

In [None]:
# Load the final text units table and preview its first rows
df_final_text_units = pd.read_parquet(path=final_text_units_path)
df_final_text_units.head(n=5)

In [None]:
# Nice, but what are the covariates anyway? Load that table and preview its first rows
df_final_covariates = pd.read_parquet(path=final_covariates_path)
df_final_covariates.head(n=5)

In [None]:
# And finally, the communities and community reports; load the communities table first
df_final_communities = pd.read_parquet(path=final_communities_path)
df_final_communities.head(n=5)


In [None]:
# Load the final community reports table and preview its first rows
df_final_community_reports = pd.read_parquet(path=final_community_reports_path)
df_final_community_reports.head(n=5)