In [1]:
import json
import os
import random
import uuid
from pathlib import Path

import networkx as nx
import numpy as np
import ollama.client as client
import pandas as pd
import seaborn as sns
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pyvis.network import Network


In [2]:
# Directories used by the notebook: documents are read from input_path and
# generated artifacts are written under output_path.
input_path = Path("./input_data")
output_path = Path("./output_data")


In [3]:

def graphPrompt(input: str, metadata={}, model="mistral-openorca:latest"):
    """Ask the LLM to extract (node_1, node_2, edge) triples from a text chunk.

    Args:
        input: The text chunk to analyse; it is embedded in the user prompt
            between triple backticks.
        metadata: Extra key/value pairs merged into every extracted item
            (e.g. ``{"chunk_id": ...}``). It is only read, never mutated, and
            ``None`` is treated as empty.
        model: Model name passed to ``client.generate``. ``None`` falls back
            to ``"mistral-openorca:latest"``.

    Returns:
        A list of dicts (each parsed item merged with ``metadata``), or
        ``None`` when the model response cannot be parsed into such a list.
    """
    if model is None:
        model = "mistral-openorca:latest"

    SYS_PROMPT = (
        "You are a network graph maker who extracts terms and their relations from a given context. "
        "You are provided with a context chunk (delimited by ```) Your task is to extract the ontology "
        "of terms mentioned in the given context. These terms should represent the key concepts as per the context. \n"
        "Thought 1: While traversing through each sentence, Think about the key terms mentioned in it.\n"
            "\tTerms may include object, entity, location, organization, person, \n"
            "\tcondition, acronym, documents, service, concept, etc.\n"
            "\tTerms should be as atomistic as possible\n\n"
        "Thought 2: Think about how these terms can have one on one relation with other terms.\n"
            "\tTerms that are mentioned in the same sentence or the same paragraph are typically related to each other.\n"
            "\tTerms can be related to many other terms\n\n"
        "Thought 3: Find out the relation between each such related pair of terms. \n\n"
        "Format your output as a list of json. Each element of the list contains a pair of terms"
        "and the relation between them, like the follwing: \n"
        "[\n"
        "   {\n"
        '       "node_1": "A concept from extracted ontology",\n'
        '       "node_2": "A related concept from extracted ontology",\n'
        '       "edge": "relationship between the two concepts, node_1 and node_2 in one or two sentences"\n'
        "   }, {...}\n"
        "]"
    )

    USER_PROMPT = f"context: ```{input}``` \n\n output: "
    response, _ = client.generate(model_name=model, system=SYS_PROMPT, prompt=USER_PROMPT)

    extra_fields = metadata or {}
    try:
        # NOTE(review): the return shape of client.generate is not visible in
        # this notebook. A plain string is parsed as a whole (indexing it would
        # yield only its first character); otherwise the first element is used.
        raw_json = response if isinstance(response, str) else response[0]
        parsed = json.loads(raw_json)
        result = [dict(item, **extra_fields) for item in parsed]
    except Exception:
        # Best effort: a malformed response must not abort the whole run, but
        # unlike a bare except this lets KeyboardInterrupt/SystemExit through.
        print("\n\n\nERROR ### Here is the buggy response: ", response, "\n\n")
        result = None
    return result


def df2Graph(dataframe: pd.DataFrame, model=None) -> list:
    """Run graphPrompt on every chunk and flatten the results into one list.

    Args:
        dataframe: Frame with a ``text`` column (chunk content) and a
            ``chunk_id`` column (attached to every extracted item).
        model: Model name forwarded to graphPrompt.

    Returns:
        A flat list of the items returned by graphPrompt across all rows.
        Rows for which graphPrompt returned ``None`` are skipped, and the
        result is ``[]`` when the frame is empty or no row produced a result.
    """
    concept_list = []
    for text, chunk_id in zip(dataframe["text"], dataframe["chunk_id"]):
        extracted = graphPrompt(text, {"chunk_id": chunk_id}, model)
        # graphPrompt signals an unparseable response with None.
        if extracted is None:
            continue
        concept_list.extend(extracted)
    return concept_list


def graph2Df(nodes_list) -> pd.DataFrame:
    """Turn a list of extracted relation dicts into a cleaned DataFrame.

    Cells equal to a single space are treated as missing, rows lacking
    ``node_1`` or ``node_2`` are dropped, and both node columns are
    lower-cased (non-string values are converted with ``str`` first).

    Args:
        nodes_list: Iterable of dicts, normally with ``node_1``, ``node_2``
            and ``edge`` keys plus any attached metadata.

    Returns:
        The cleaned DataFrame. For empty input (or input without node keys)
        an empty frame that still has ``node_1`` and ``node_2`` columns.
    """
    node_columns = ["node_1", "node_2"]

    ## Remove all NaN entities
    graph_dataframe = pd.DataFrame(nodes_list).replace(" ", np.nan)

    # An empty nodes_list yields a frame without columns; add the node columns
    # so the dropna below does not fail with a KeyError.
    for column in node_columns:
        if column not in graph_dataframe.columns:
            graph_dataframe[column] = np.nan

    graph_dataframe = graph_dataframe.dropna(subset=node_columns).copy()
    for column in node_columns:
        # str() guards against non-string node values (e.g. numbers).
        graph_dataframe[column] = graph_dataframe[column].apply(
            lambda value: str(value).lower()
        )

    return graph_dataframe

def documents_to_dataframe(documents) -> pd.DataFrame:
    """Build one DataFrame row per document chunk.

    Each row holds the chunk's ``page_content`` under ``text``, every entry of
    its ``metadata`` mapping, and a freshly generated ``chunk_id``
    (``uuid4`` hex string).

    Args:
        documents: Iterable of objects exposing ``page_content`` and
            ``metadata`` attributes.

    Returns:
        A DataFrame with one row per element of ``documents``.
    """
    records = [
        {
            "text": document.page_content,
            **document.metadata,
            "chunk_id": uuid.uuid4().hex,
        }
        for document in documents
    ]
    return pd.DataFrame(records)




In [4]:
# Chunking parameters; sizes are measured with len().
CHUNK_SIZE = 1200  # Length of each chunk
CHUNK_OVERLAP = 200  # Overlap between chunks
SAMPLE_CHUNK_INDEX = 3  # Which chunk to print as a sanity check

# Load every document found under input_path.
loader = DirectoryLoader(input_path, show_progress=True)
data_documents = loader.load()

# Split the loaded documents into overlapping chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
    length_function=len,
    is_separator_regex=False,
)
text_chunks = text_splitter.split_documents(data_documents)

print(f"Total chunks generated: {len(text_chunks)}")
if len(text_chunks) > SAMPLE_CHUNK_INDEX:
    print("Sample chunk content:", text_chunks[SAMPLE_CHUNK_INDEX].page_content)
    

 50%|██████████████████████████████████████████████████                                                  | 1/2 [00:10<00:10, 10.63s/it]

Total chunks generated: 29
Sample chunk content: Introduction And Background India’s health indicators have improved in recent times but continue to lag behind those of its peer nations. The country has an estimated active health workers’ density much lower than the WHO recommended thresholds [1]. The issue is compounded by the skewed inter-state, urban-rural, and public-private sector divide. The paucity of skilled personnel is a multi-factorial issue and needs to be addressed if India is to accelerate its progress toward achieving universal health coverage and its sustainable development goals (SDGs). The authors describe these issues by providing an overview of the public and private sectors and the growing divide between them due to their divergent strategies, with the latter now having a booming medical tourism industry and a burgeoning number of medical schools. They identify the opportunities available within the newly created National Medical Council and the recent increase in 




In [5]:
# One row per chunk: its text, the loader metadata and a generated chunk_id.
dataframe_chunks = documents_to_dataframe(text_chunks)

chunk_preview = dataframe_chunks.head()
print(f"DataFrame shape: {dataframe_chunks.shape}")
print(chunk_preview)


DataFrame shape: (29, 3)
                                                text  \
0  Abstract India’s health indicators have improv...   
1  analyze data, distill findings (READ) approach...   
2  Categories: Public Health, Epidemiology/Public...   
3  Introduction And Background India’s health ind...   
4  made to address the paucity of quality health ...   

                             source                          chunk_id  
0  input_data/Input_Health_data.txt  a385297388cd4951a66335992ec9ff54  
1  input_data/Input_Health_data.txt  7f0d78974f5d4c49a724215e69f1f170  
2  input_data/Input_Health_data.txt  a69bb00de768469a9101c5b11f420657  
3  input_data/Input_Health_data.txt  e5bd3170d0c54d26868cf2d4823be15a  
4  input_data/Input_Health_data.txt  82b23693e48c4257887f12706d43b3d6  


In [None]:
# Set rebuild_graph to True to re-run the LLM-based graph extraction;
# set it to False to reuse the graph.csv written by a previous run.
rebuild_graph = True

if rebuild_graph:
    # Extract nodes and edges using the LLM
    concepts = df2Graph(dataframe_chunks, model='zephyr:latest')
    relation_df = graph2Df(concepts)

    # Create the output directory if needed; exist_ok avoids the
    # check-then-create race of a separate os.path.exists test.
    os.makedirs(output_path, exist_ok=True)

    # Save the DataFrames for future use
    relation_df.to_csv(output_path / "graph.csv", sep="|", index=False)
    dataframe_chunks.to_csv(output_path / "chunks.csv", sep="|", index=False)
else:
    # Load previously saved data
    relation_df = pd.read_csv(output_path / "graph.csv", sep="|")

# Data cleaning and filtering: treat empty strings as missing, drop incomplete
# relations, and tag every remaining (direct) relation with weight 4.
# Building a new frame instead of mutating in place keeps the cell re-runnable.
relation_df = (
    relation_df.replace("", np.nan)
    .dropna(subset=["node_1", "node_2", "edge"])
    .assign(weight=4)  # Assign higher weights to direct relationships
)

print("Processed relationships shape: ", relation_df.shape)
print(relation_df.head())


[
   {
       "node_1": "India's health indicators",
       "node_2": "Universal health coverage",
       "edge": "The improvement in India's health indicators is crucial for achieving universal health coverage, which is a key component of the sustainable development goals."
   }, {
       "node_1": "India's population",
       "node_2": "Health workers density",
       "edge": "With a population of 1.3 billion, India has an estimated active health workers density of doctors and nurses/midwives of 5.0 and 6.0 respectively, for 10,000 persons, which is much lower than the WHO threshold."
   }, {
       "node_1": "Health indicators",
       "node_2": "Peer nations",
       "edge": "India's health indicators continue to lag behind those of its peer nations."
   }, {
       "node_1": "Active health workers density",
       "node_2": "WHO threshold",
       "edge": "The issue is compounded by the fact that India's estimated active health workers density falls significantly below the WHO thr

In [None]:
def calculate_contextual_proximity(df: pd.DataFrame) -> pd.DataFrame:
    """Derive "contextual proximity" edges between nodes sharing a chunk.

    Every node (from ``node_1`` or ``node_2``) is paired with every other,
    different node that appears under the same ``chunk_id``. Pairs are then
    counted across all chunks and only pairs seen more than once are kept.

    Args:
        df: Relations frame with ``node_1``, ``node_2`` and ``chunk_id``.

    Returns:
        A frame with columns ``node_1``, ``node_2``, ``chunk_id`` (the
        comma-joined chunk ids of all occurrences), ``count`` and ``edge``
        (always ``"contextual proximity"``).
    """
    # Long format: one (chunk_id, node) row per node mention.
    node_mentions = df.melt(
        id_vars=["chunk_id"], value_vars=["node_1", "node_2"], value_name="node"
    ).drop(columns=["variable"])

    # Self-join on chunk_id pairs up all mentions within the same chunk.
    node_pairs = node_mentions.merge(
        node_mentions, on="chunk_id", suffixes=("_start", "_end")
    )
    is_self_pair = node_pairs["node_start"] == node_pairs["node_end"]
    node_pairs = node_pairs[~is_self_pair]

    # Per node pair: joined chunk ids and number of occurrences.
    pair_stats = (
        node_pairs.groupby(["node_start", "node_end"])
        .agg({"chunk_id": [",".join, "count"]})
        .reset_index()
    )
    pair_stats.columns = ["node_1", "node_2", "chunk_id", "count"]
    pair_stats["edge"] = "contextual proximity"

    # Pairs that co-occur only once are considered too weak to keep.
    return pair_stats[pair_stats["count"] > 1]

# Derive the proximity-based edges from the direct relations and show the last few.
proximity_relations = calculate_contextual_proximity(relation_df)
proximity_tail = proximity_relations.tail()
print("Contextual proximity edges: ", proximity_tail)


In [None]:
# Combine both types of relationships.
# Direct (LLM-extracted) relations carry their strength in "weight", while
# proximity relations carry theirs in "count". Only "count" is summed below,
# so copy "weight" into "count" for the direct relations first; otherwise they
# would contribute NaN (i.e. nothing) to the final edge weight.
direct_relations = relation_df.copy()
if "count" not in direct_relations.columns and "weight" in direct_relations.columns:
    direct_relations["count"] = direct_relations["weight"]
combined_relations = pd.concat([direct_relations, proximity_relations], axis=0)

# Group by node pairs to sum weights and concatenate edge labels
final_graph_df = combined_relations.groupby(["node_1", "node_2"]).agg(
    {"chunk_id": ",".join, "edge": ",".join, "count": "sum"}
).reset_index()

# Create a NetworkX graph
graph = nx.Graph()

# Add nodes and edges with attributes.
# NOTE(review): nx.Graph is undirected and adding an edge that already exists
# updates its attributes, so when both (a, b) and (b, a) rows are present the
# row processed last decides the edge's title and weight.
for _, row in final_graph_df.iterrows():
    graph.add_edge(
        str(row["node_1"]),
        str(row["node_2"]),
        title=row["edge"],
        weight=row["count"] / 4
    )



In [None]:
# Detect communities: take the first partition yielded by Girvan-Newman and
# sort both the members of each community and the communities themselves.
community_generator = nx.community.girvan_newman(graph)
first_partition = next(community_generator)
communities = sorted(sorted(members) for members in first_partition)

# Assign colors to communities
def assign_colors_to_communities(communities) -> pd.DataFrame:
    """Give every community its own colour and 1-based group number.

    A ``"hls"`` seaborn palette with one colour per community is generated and
    shuffled, so the colour a community receives varies between runs.

    Args:
        communities: Sequence of communities, each an iterable of node names.

    Returns:
        A DataFrame with one row per node and the columns ``node``, ``color``
        (hex string) and ``group``.
    """
    shades = sns.color_palette("hls", len(communities)).as_hex()
    random.shuffle(shades)

    # Colours are handed out from the end of the shuffled palette.
    records = [
        {"node": member, "color": shade, "group": group_number}
        for group_number, (members, shade) in enumerate(
            zip(communities, reversed(shades)), start=1
        )
        for member in members
    ]
    return pd.DataFrame(records)

community_colors = assign_colors_to_communities(communities)

# Copy group and colour onto each graph node; node size is the node's degree.
for _, color_row in community_colors.iterrows():
    node_name = color_row["node"]
    node_attributes = graph.nodes[node_name]
    node_attributes["group"] = color_row["group"]
    node_attributes["color"] = color_row["color"]
    node_attributes["size"] = graph.degree[node_name]



In [None]:
# Destination of the exported HTML visualisation
output_file = "./output_data/index.html"

# Initialize PyVis graph
pyvis_graph = Network(
    height="900px",
    width="100%",
    notebook=False,
    cdn_resources="remote",
)

# Convert NetworkX graph to PyVis, then set layout and physics options
pyvis_graph.from_nx(graph)
pyvis_graph.force_atlas_2based(central_gravity=0.015, gravity=-31)

# Export to HTML
pyvis_graph.show(output_file)
