## Setup

In [1]:
import pandas as pd
import numpy as np
import os
from langchain.document_loaders import PyPDFLoader, UnstructuredPDFLoader, PyPDFium2Loader
from langchain.document_loaders import PyPDFDirectoryLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pathlib import Path
import random

## Input data directory
data_dir = "test"
inputdirectory = Path(f"./data_input/{data_dir}")
## This is where the output csv files will be written
out_dir = data_dir
outputdirectory = Path(f"./data_output/{out_dir}")

## Load Documents

In [2]:
## Dir PDF Loader
# loader = PyPDFDirectoryLoader(inputdirectory)
## File Loader
# loader = PyPDFLoader("./data/MedicalDocuments/orf-path_health-n1.pdf")
loader = DirectoryLoader(inputdirectory, show_progress=True)
documents = loader.load()
print(documents)

splitter = RecursiveCharacterTextSplitter(
    chunk_size=1500,
    chunk_overlap=150,
    length_function=len,
    is_separator_regex=False,
)

pages = splitter.split_documents(documents)
print("Number of chunks = ", len(pages))
# print(pages[3].page_content)


100%|██████████| 1/1 [00:05<00:00,  5.65s/it]

[Document(page_content="Open Access Review Article\n\nDOI: 10.7759/cureus.40274\n\nIndia’s Opportunity to Address Human Resource Challenges in Healthcare\n\nReview began 05/30/2023\n\n1 Sangeeta G. Saxena , Thomas Godfrey\n\n2\n\nReview ended 06/08/2023\n\nPublished 06/11/2023\n\n© Copyright 2023\n\n1. Public Health Sciences, Coastal Carolina University, Conway, USA 2. Public Health Sciences, Penn State College of Medicine, Hershey, USA\n\nSaxena et al. This is an open access article\n\ndistributed under the terms of the Creative\n\nCorresponding author: Sangeeta G. Saxena, sangeeta.cureus@gmail.com\n\nCommons Attribution License CC-BY 4.0.,\n\nwhich permits unrestricted use, distribution,\n\nand reproduction in any medium, provided\n\nthe original author and source are credited.\n\nAbstract India’s health indicators have improved in recent times but continue to lag behind those of its peer nations. The country with a population of 1.3 billion, has an estimated active health workers de




In [3]:
print(pages[3].page_content)

The authors describe these issues by providing an overview of the public and private sectors and the growing divide between them due to their divergent strategies, with the latter now having a booming medical tourism industry and a burgeoning number of medical schools. They identify the opportunities available within the newly created National Medical Council and the recent increase in the federal health budget [2]. The recommendations made to address the paucity of quality health personnel include the creation of transparent governance, strengthening the health infrastructure, upskilling the existing workforce, and creating partnerships with the much larger private sector. The methodology used is the READ approach [3], which is a systematic approach for document analysis in health policy research, consisting of readying one's materials, extracting the data, and analyzing it to distill the findings. An extensive literature search was performed, and 56 articles published in peer-reviewe

## Create a dataframe of all the chunks

In [4]:
from helpers.df_helpers import documents2Dataframe
df = documents2Dataframe(pages)
print(df.shape)
df.head()

(6, 3)


Unnamed: 0,text,source,chunk_id
0,Open Access Review Article\n\nDOI: 10.7759/cur...,data_input/test/cureus-0015-00000040274.pdf,205d5205960646eb8771e7cb7164e11b
1,"which permits unrestricted use, distribution,\...",data_input/test/cureus-0015-00000040274.pdf,77a5fd7072dc4dbe85152a2af207964d
2,The recent increase in the federal health budg...,data_input/test/cureus-0015-00000040274.pdf,f9a2caca51ff45328935c317132602ed
3,The authors describe these issues by providing...,data_input/test/cureus-0015-00000040274.pdf,6eac34c9ed934b23b138806e9044396c
4,Review Overview of the public and private heal...,data_input/test/cureus-0015-00000040274.pdf,96c8a9aeb7bb4b0caee7773df10fec93


## Extract Concepts

In [5]:
## This function uses the helpers/prompt function to extract concepts from text
from helpers.df_helpers import df2Graph
from helpers.df_helpers import graph2Df

If regenerate is set to True then the dataframes are regenerated and Both the dataframes are written in the csv format so we dont have to calculate them again. 

        dfne = dataframe of edges

        df = dataframe of chunks


Else the dataframes are read from the output directory

In [7]:
## To regenerate the graph with LLM, set this to True
regenerate = True


if regenerate:
    concepts_list = df2Graph(df, model='qwen1.5')
    dfg1 = graph2Df(concepts_list)
    if not os.path.exists(outputdirectory):
        os.makedirs(outputdirectory)
    
    dfg1.to_csv(outputdirectory/"graph.csv", sep="|", index=False)
    df.to_csv(outputdirectory/"chunks.csv", sep="|", index=False)
else:
    dfg1 = pd.read_csv(outputdirectory/"graph.csv", sep="|")

dfg1.replace("", np.nan, inplace=True)
dfg1.dropna(subset=["node_1", "node_2", 'edge'], inplace=True)
dfg1['count'] = 4 
## Increasing the weight of the relation to 4. 
## We will assign the weight of 1 when later the contextual proximity will be calculated.  
print(dfg1.shape)
dfg1.head()

[
  {
    "node_1": "India",
    "node_2": "healthcare human resource challenges",
    "edge": "owns",
  },
  {
    "node_1": "Public Health Sciences",
    "node_2": "Coastal Carolina University, Conway",
    "edge": "owns",
  },
  {
    "node_1": "Public Health Sciences",
    "node_2": "Penn State College of Medicine, Hershey",
    "edge": "owns",
  },
  {
    "node_1": "Healthcare",
    "node_2": "human resource challenges",
    "edge": "owns",
  },
  {
    "node_1": "Sangeeta G. Saxena",
    "node_2": "commons attribution license CC-BY 4.0.",
    "edge": "owns",
  },
]


ERROR ### Here is the buggy response:  [
  {
    "node_1": "India",
    "node_2": "healthcare human resource challenges",
    "edge": "owns",
  },
  {
    "node_1": "Public Health Sciences",
    "node_2": "Coastal Carolina University, Conway",
    "edge": "owns",
  },
  {
    "node_1": "Public Health Sciences",
    "node_2": "Penn State College of Medicine, Hershey",
    "edge": "owns",
  },
  {
    "node_1": "Healt

Unnamed: 0,node_1,node_2,edge,chunk_id,relationship,count
0,public health,epidemiology/public health,working conditions and health sector reform in...,f9a2caca51ff45328935c317132602ed,,4
1,public health,health policy,"ethnic and demographic diversity in India, pub...",f9a2caca51ff45328935c317132602ed,,4
2,public health sector,private health sector,definition of each sector,6eac34c9ed934b23b138806e9044396c,,4
3,public health sector,private health sector,diverging strategies of the two sectors,6eac34c9ed934b23b138806e9044396c,dividing,4
4,public health sector,private health sector,medical tourism industry,6eac34c9ed934b23b138806e9044396c,connection,4


## Calculating contextual proximity

In [8]:
def contextual_proximity(df: pd.DataFrame) -> pd.DataFrame:
    ## Melt the dataframe into a list of nodes
    dfg_long = pd.melt(
        df, id_vars=["chunk_id"], value_vars=["node_1", "node_2"], value_name="node"
    )
    dfg_long.drop(columns=["variable"], inplace=True)
    # Self join with chunk id as the key will create a link between terms occuring in the same text chunk.
    dfg_wide = pd.merge(dfg_long, dfg_long, on="chunk_id", suffixes=("_1", "_2"))
    # drop self loops
    self_loops_drop = dfg_wide[dfg_wide["node_1"] == dfg_wide["node_2"]].index
    dfg2 = dfg_wide.drop(index=self_loops_drop).reset_index(drop=True)
    ## Group and count edges.
    dfg2 = (
        dfg2.groupby(["node_1", "node_2"])
        .agg({"chunk_id": [",".join, "count"]})
        .reset_index()
    )
    dfg2.columns = ["node_1", "node_2", "chunk_id", "count"]
    dfg2.replace("", np.nan, inplace=True)
    dfg2.dropna(subset=["node_1", "node_2"], inplace=True)
    # Drop edges with 1 count
    dfg2 = dfg2[dfg2["count"] != 1]
    dfg2["edge"] = "contextual proximity"
    return dfg2


dfg2 = contextual_proximity(dfg1)
dfg2.tail()

Unnamed: 0,node_1,node_2,chunk_id,count,edge
3,health policy,public health,"f9a2caca51ff45328935c317132602ed,f9a2caca51ff4...",2,contextual proximity
4,private health sector,public health sector,"6eac34c9ed934b23b138806e9044396c,6eac34c9ed934...",100,contextual proximity
5,public health,epidemiology/public health,"f9a2caca51ff45328935c317132602ed,f9a2caca51ff4...",2,contextual proximity
6,public health,health policy,"f9a2caca51ff45328935c317132602ed,f9a2caca51ff4...",2,contextual proximity
7,public health sector,private health sector,"6eac34c9ed934b23b138806e9044396c,6eac34c9ed934...",100,contextual proximity


### Merge both the dataframes

In [9]:
dfg = pd.concat([dfg1, dfg2], axis=0)
dfg = (
    dfg.groupby(["node_1", "node_2"])
    .agg({"chunk_id": ",".join, "edge": ','.join, 'count': 'sum'})
    .reset_index()
)
dfg

Unnamed: 0,node_1,node_2,chunk_id,edge,count
0,epidemiology/public health,public health,"f9a2caca51ff45328935c317132602ed,f9a2caca51ff4...",contextual proximity,2
1,health policy,public health,"f9a2caca51ff45328935c317132602ed,f9a2caca51ff4...",contextual proximity,2
2,private health sector,public health sector,"6eac34c9ed934b23b138806e9044396c,6eac34c9ed934...",contextual proximity,100
3,public health,epidemiology/public health,"f9a2caca51ff45328935c317132602ed,f9a2caca51ff4...",working conditions and health sector reform in...,6
4,public health,health policy,"f9a2caca51ff45328935c317132602ed,f9a2caca51ff4...","ethnic and demographic diversity in India, pub...",6
5,public health sector,private health sector,"6eac34c9ed934b23b138806e9044396c,6eac34c9ed934...","definition of each sector,diverging strategies...",140


## Calculate the NetworkX Graph

In [10]:
nodes = pd.concat([dfg['node_1'], dfg['node_2']], axis=0).unique()
nodes.shape

(5,)

In [11]:
import networkx as nx
G = nx.Graph()

## Add nodes to the graph
for node in nodes:
    G.add_node(
        str(node)
    )

## Add edges to the graph
for index, row in dfg.iterrows():
    G.add_edge(
        str(row["node_1"]),
        str(row["node_2"]),
        title=row["edge"],
        weight=row['count']/4
    )

### Calculate communities for coloring the nodes

In [12]:
communities_generator = nx.community.girvan_newman(G)
top_level_communities = next(communities_generator)
next_level_communities = next(communities_generator)
communities = sorted(map(sorted, next_level_communities))
print("Number of Communities = ", len(communities))
print(communities)

Number of Communities =  4
[['epidemiology/public health'], ['health policy'], ['private health sector', 'public health sector'], ['public health']]


### Create a dataframe for community colors

In [14]:
import seaborn as sns
palette = "hls"

## Now add these colors to communities and make another dataframe
def colors2Community(communities) -> pd.DataFrame:
    ## Define a color palette
    p = sns.color_palette(palette, len(communities)).as_hex()
    random.shuffle(p)
    rows = []
    group = 0
    for community in communities:
        color = p.pop()
        group += 1
        for node in community:
            rows += [{"node": node, "color": color, "group": group}]
    df_colors = pd.DataFrame(rows)
    return df_colors


colors = colors2Community(communities)
colors

Unnamed: 0,node,color,group
0,epidemiology/public health,#db5f57,1
1,health policy,#a157db,2
2,private health sector,#91db57,3
3,public health sector,#91db57,3
4,public health,#57d3db,4


### Add colors to the graph

In [15]:
for index, row in colors.iterrows():
    G.nodes[row['node']]['group'] = row['group']
    G.nodes[row['node']]['color'] = row['color']
    G.nodes[row['node']]['size'] = G.degree[row['node']]

In [16]:
from pyvis.network import Network

graph_output_directory = "./docs/index.html"

net = Network(
    notebook=False,
    # bgcolor="#1a1a1a",
    cdn_resources="remote",
    height="900px",
    width="100%",
    select_menu=True,
    # font_color="#cccccc",
    filter_menu=False,
)

net.from_nx(G)
# net.repulsion(node_distance=150, spring_length=400)
net.force_atlas_2based(central_gravity=0.015, gravity=-31)
# net.barnes_hut(gravity=-18100, central_gravity=5.05, spring_length=380)
net.show_buttons(filter_=["physics"])

net.show(graph_output_directory, notebook=False)

./docs/index.html
