## Setup

In [1]:
#pip install langchain langchain-community

In [2]:
import pandas as pd
import numpy as np
import os
from langchain.document_loaders import PyPDFLoader, UnstructuredPDFLoader, PyPDFium2Loader
from langchain.document_loaders import PyPDFDirectoryLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pathlib import Path
import random

## Input data directory: name of the sub-folder under ./data_input to process
#data_dir = "cureus"
data_dir = 'tr'
inputdirectory = Path(f"./data_input/{data_dir}")

## Output directory: the csv files are written under ./data_output,
## in a sub-folder named after the input sub-folder
out_dir = data_dir
outputdirectory = Path(f"./data_output/{out_dir}")

In [3]:
inputdirectory

PosixPath('data_input/tr')

In [4]:
outputdirectory

PosixPath('data_output/tr')

## Load Documents

In [5]:
#pip install unstructured

In [6]:
## Load the files found under the input directory.
## Alternative loaders, kept for reference:
# loader = PyPDFDirectoryLoader(inputdirectory)
# loader = PyPDFLoader("./data/MedicalDocuments/orf-path_health-n1.pdf")
loader = DirectoryLoader(inputdirectory, show_progress=True)
documents = loader.load()

## Split the documents into overlapping chunks. Sizes are measured with len(),
## i.e. in characters.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=1500,
    chunk_overlap=150,
    length_function=len,
    is_separator_regex=False,
)

pages = splitter.split_documents(documents)
print("Number of chunks = ", len(pages))

## Preview one chunk as a sanity check. Chunk 3 is shown when it exists;
## shorter inputs fall back to their last chunk instead of raising IndexError,
## and an empty input prints no preview at all.
if pages:
    print(pages[min(3, len(pages) - 1)].page_content)


100%|██████████| 1/1 [00:02<00:00,  2.39s/it]

Number of chunks =  68
system as a whole informs local events as it were.ld as an expression of divine consciousness where the of um Theory, we have been exposed to a variety of intellectual 





In [7]:
documents

[Document(page_content='Metaphysics of Awareness\r\rIn many respects, one can consider the age that we live in, the Quantum Era, as one of intense intellectual \rcrisis and turmoil, very much analogous to the crisis that the intellectuals faced during the Enlightenment Era \rafter the world had almost literally been turned upside down. We see clear indications of this crisis, \rmanifestations as it were, on the world stage with \r\ri.\tglobal warming: a threat on a scale that we as a global community have ever faced before and has the \rpotential for devastating consequences, some of which are already starting to see. Changing climate \rleading to shortages of and lack of food and water, large scale people unrest and movement leading to \rfurther political instability. ii. nuclear proliferation: an increasing number of nation states have access to, and are actively working on, \rnuclear weaponry, \riii. rise of radical extremism: aka Terrorism, which primarily is looked at through an I

## Create a dataframe of all the chunks

In [8]:
#pip install yachalk

In [9]:
## Turn the list of chunks into a dataframe with the project helper.
## The output below shows one row per chunk with text, source and chunk_id columns.
from helpers.df_helpers import documents2Dataframe
df = documents2Dataframe(pages)
print(df.shape)
df.head()

(68, 3)


Unnamed: 0,text,source,chunk_id
0,"Metaphysics of Awareness\r\rIn many respects, ...",data_input/tr/tr.txt,ce1d386a3e194fb3a647e9b042b48221
1,the rise of nationalism: aka populism that is ...,data_input/tr/tr.txt,d53bb76260354f97b2371261f9d56b0f
2,"If we take them all together then, given their...",data_input/tr/tr.txt,f78ee4b81d7d447e95185a4c3fdf2432
3,"From a pure intellectual perspective, from a m...",data_input/tr/tr.txt,568f10f17e7b40edb122de1200e7f24e
4,"We also have at the same time, due in no small...",data_input/tr/tr.txt,b998d992c22c43d9b920a21b5424e4aa


## Extract Concepts

In [10]:
## This function uses the helpers/prompt function to extract concepts from text
from helpers.df_helpers import df2Graph
from helpers.df_helpers import graph2Df

If `regenerate` is set to True, the dataframes are regenerated and both of them are written to disk in CSV format so that we don't have to calculate them again.

        dfg1 = dataframe of edges

        df = dataframe of chunks


Otherwise the edge dataframe is read back from the output directory.

In [11]:
## To regenerate the graph with LLM, set this to True
regenerate = True

if regenerate:
    ## Extract concepts from every chunk with the chosen model, then flatten
    ## the result into an edge dataframe
    concepts_list = df2Graph(df, model='zephyr:latest')
    dfg1 = graph2Df(concepts_list)
    ## exist_ok=True creates the directory when it is missing without a separate
    ## existence check that could be stale by the time makedirs runs
    os.makedirs(outputdirectory, exist_ok=True)

    ## Cache both dataframes so later runs can skip the LLM step
    dfg1.to_csv(outputdirectory/"graph.csv", sep="|", index=False)
    df.to_csv(outputdirectory/"chunks.csv", sep="|", index=False)
else:
    dfg1 = pd.read_csv(outputdirectory/"graph.csv", sep="|")

## Treat empty strings as missing, drop edges that lack a node or a relation,
## and give every remaining LLM-extracted relation a count (weight) of 4.
## dfg1 is rebound rather than mutated in place so re-running the cell is safe.
## Contextual-proximity edges, calculated later, carry their co-occurrence count instead.
dfg1 = (
    dfg1.replace("", np.nan)
    .dropna(subset=["node_1", "node_2", "edge"])
    .assign(count=4)
)
print(dfg1.shape)
dfg1.head()

[
   {
       "node_1": "Quantum Era",
       "node_2": "Intense intellectual crisis and turmoil",
       "edge": "The age we live in, characterized by advancements in quantum physics, has led to a complex understanding of the world that has left many intellectuals grappling with new concepts and ideas. This intellectual upheaval can be compared to the Enlightenment Era when society was trying to come to terms with the consequences of scientific discoveries."
   },
   {
       "node_1": "Global warming",
       "node_2": "Intense intellectual crisis and turmoil",
       "edge": "The threat of global warming, on a scale unprecedented in human history, has posed a significant challenge for the global community. This issue has led to intense debates among scientists, policymakers, and other stakeholders as they try to understand its implications and find solutions."
   },
   {
       "node_1": "Global warming",
       "node_2": "Shortages of and lack of food and water",
       "edge": "Ri

Unnamed: 0,node_1,node_2,edge,chunk_id,node_3,node_4,count
0,rise of nationalism,brexit,Brexit is an example of the rise of nationalis...,d53bb76260354f97b2371261f9d56b0f,,,4
1,rise of nationalism,trumpism,Trumpism refers to the nationalistic ideology ...,d53bb76260354f97b2371261f9d56b0f,,,4
2,rise of nationalism,populism,Nationalism is a form of populism that emphasi...,d53bb76260354f97b2371261f9d56b0f,,,4
3,increasing wealth inequality,general unrest,Wealth inequality has been linked to social un...,d53bb76260354f97b2371261f9d56b0f,,,4
4,increasing wealth inequality,broad class inequality,Wealth inequality is a major contributor to cl...,d53bb76260354f97b2371261f9d56b0f,,,4


## Calculating contextual proximity

In [12]:
def contextual_proximity(df: pd.DataFrame) -> pd.DataFrame:
    """Link concepts that are mentioned in the same text chunk.

    Parameters
    ----------
    df : pd.DataFrame
        Edge dataframe with at least the columns ``chunk_id``, ``node_1`` and
        ``node_2``. It is not modified.

    Returns
    -------
    pd.DataFrame
        One row per ordered node pair that co-occurs more than once, with the
        columns ``node_1``, ``node_2``, ``chunk_id`` (comma-joined chunk ids,
        one per co-occurrence), ``count`` (number of co-occurrences) and
        ``edge`` (always ``"contextual proximity"``). Both orientations of a
        pair are present, and the index is not renumbered after filtering.
    """
    ## Melt the dataframe into a long list of (chunk_id, node) mentions
    mentions = pd.melt(
        df, id_vars=["chunk_id"], value_vars=["node_1", "node_2"], value_name="node"
    ).drop(columns=["variable"])
    # Self join with chunk id as the key will create a link between terms occurring in the same text chunk.
    pairs = pd.merge(mentions, mentions, on="chunk_id", suffixes=("_1", "_2"))
    # drop self loops
    pairs = pairs[pairs["node_1"] != pairs["node_2"]]
    ## Group and count edges.
    grouped = (
        pairs.groupby(["node_1", "node_2"])
        .agg(chunk_id=("chunk_id", ",".join), count=("chunk_id", "count"))
        .reset_index()
    )
    return (
        # Empty-string node names are treated as missing and removed
        grouped.replace("", np.nan)
        .dropna(subset=["node_1", "node_2"])
        # Drop edges with 1 count
        .loc[lambda frame: frame["count"] != 1]
        # assign() builds a new frame, so no column is set on a filtered slice
        .assign(edge="contextual proximity")
    )


## Build the co-occurrence ("contextual proximity") edges from the LLM edge dataframe
dfg2 = contextual_proximity(dfg1)
dfg2.tail()

Unnamed: 0,node_1,node_2,chunk_id,count,edge
3805,written word,expressive and intellectual power,"f62f8db6563747aaa0e938921f5e0018,f62f8db656374...",2,contextual proximity
3806,written word,intellectual construct,"f62f8db6563747aaa0e938921f5e0018,f62f8db656374...",2,contextual proximity
3807,written word,universe,"f62f8db6563747aaa0e938921f5e0018,f62f8db656374...",2,contextual proximity
3822,ātman,nous,"11d53ee631b148c4ba2dfe7412820c02,11d53ee631b14...",2,contextual proximity
3839,ṛta,nous,"11d53ee631b148c4ba2dfe7412820c02,11d53ee631b14...",2,contextual proximity


### Merge both the dataframes

In [13]:
## Stack the LLM edges on top of the contextual-proximity edges, then collapse
## rows that share the same (node_1, node_2) pair: chunk ids and edge texts are
## comma-joined, counts are summed.
dfg = (
    pd.concat([dfg1, dfg2], axis=0)
    .groupby(["node_1", "node_2"])
    .agg({"chunk_id": ",".join, "edge": ",".join, "count": "sum"})
    .reset_index()
)
dfg

Unnamed: 0,node_1,node_2,chunk_id,edge,count
0,a priori,assumptions,fb124986290242fca98717f4866cc82a,Kant pushed the predominant worldview into his...,4
1,a priori,experience,"fb124986290242fca98717f4866cc82a,fb12498629024...",contextual proximity,2
2,a priori knowledge,cognitive engine,"e27516625d5a428e9684ee6801adec8a,e27516625d5a4...",contextual proximity,2
3,a priori knowledge,context,"e27516625d5a428e9684ee6801adec8a,e27516625d5a4...",contextual proximity,2
4,a priori knowledge,human experience,"e27516625d5a428e9684ee6801adec8a,e27516625d5a4...",contextual proximity,2
...,...,...,...,...,...
1547,written word,intellectual construct,"f62f8db6563747aaa0e938921f5e0018,f62f8db656374...",The written word represents a fundamentally re...,6
1548,written word,universe,"f62f8db6563747aaa0e938921f5e0018,f62f8db656374...",contextual proximity,2
1549,yi,chinese philosophy,3c1d825c76c744f4958f5eee6ec5dfd8,"The yi, which is reflective of ancient Chinese...",4
1550,ātman,nous,"11d53ee631b148c4ba2dfe7412820c02,11d53ee631b14...",contextual proximity,2


## Calculate the NetworkX Graph

In [14]:
## Collect the distinct node names that appear at either end of an edge
nodes = pd.concat([dfg['node_1'], dfg['node_2']], axis=0).unique()
nodes.shape

(342,)

In [15]:
#pip install networkx

In [16]:
import networkx as nx
G = nx.Graph()

## Add nodes to the graph
for node in nodes:
    G.add_node(
        str(node)
    )

## Add edges to the graph
## G is undirected, so (a, b) and (b, a) address the same edge and a second
## add_edge() call overwrites the attributes stored by the first. dfg can hold
## both orientations of a pair (the contextual-proximity self join emits both),
## so an existing edge is only overwritten by a row that carries more weight.
## Otherwise the description of an LLM-extracted relation could be replaced by
## the plain "contextual proximity" label of its reversed duplicate.
for index, row in dfg.iterrows():
    source, target = str(row["node_1"]), str(row["node_2"])
    ## LLM-extracted relations were given count 4, so dividing by 4 maps one such relation to weight 1
    weight = row['count']/4
    if G.has_edge(source, target) and G[source][target]["weight"] >= weight:
        continue
    G.add_edge(source, target, title=row["edge"], weight=weight)

### Calculate communities for coloring the nodes

In [17]:
## girvan_newman returns a generator of community partitions of G;
## each next() call advances it by one level
communities_generator = nx.community.girvan_newman(G)
top_level_communities = next(communities_generator)
## The second partition produced is the one used for coloring
next_level_communities = next(communities_generator)
## Sort the members of each community, then the communities themselves, for a stable ordering
communities = sorted(map(sorted, next_level_communities))
print("Number of Communities = ", len(communities))
print(communities)

Number of Communities =  33
[['a priori', 'all beings, all existence', 'allegoresis', 'ancient philosophical systems', 'ancient philosophy', 'ancient theology', 'antiquity', 'aristotle', 'assumptions', 'aware', 'awareness', 'being', 'being and cognition', 'bohmian mechanics', 'brahman', 'brahmā', 'breath or spirit', 'change', 'classical concept', 'classical mechanics', 'cognition', 'cognitive experience', 'cognitive faculties', 'cognitive framework', 'cognitive process', 'comprehension', 'comprehension or understanding', 'conception of reality', 'consciousness', 'consistency', 'cosmic', 'cosmic reason', 'creation', 'current era of crisis', 'david bohm', 'de broglie-bohm theory', 'defining feature', 'definitive boundaries of knowledge', 'desire', 'dharma', 'direct experience of the ground of existence or ground of being, which is equivalent to god.', 'divine', 'divine being', 'divine intellect', 'divine intellect and the soul', 'dynamic quality', 'eastern and western philosophy', 'easte

### Create a dataframe for community colors

In [18]:
import seaborn as sns
## Name of the seaborn palette that the community colors are drawn from
palette = "hls"

## Now add these colors to communities and make another dataframe
def colors2Community(communities) -> pd.DataFrame:
    """Give every community one palette color and one group number.

    ``communities`` is a sized sequence of node collections. The result has
    one row per node with the columns ``node``, ``color`` (hex string) and
    ``group`` (1-based position of the node's community). Colors come from
    the module-level ``palette`` and are handed out in shuffled order, so the
    mapping depends on the state of the ``random`` module.
    """
    ## One hex color per community, in random order
    hex_colors = sns.color_palette(palette, len(communities)).as_hex()
    random.shuffle(hex_colors)
    ## Colors are consumed from the end of the shuffled list, one per community
    records = [
        {"node": member, "color": shade, "group": number}
        for number, (members, shade) in enumerate(
            zip(communities, reversed(hex_colors)), start=1
        )
        for member in members
    ]
    return pd.DataFrame(records)


## Build the node -> (color, group) lookup table for the detected communities
colors = colors2Community(communities)
colors

Unnamed: 0,node,color,group
0,a priori,#db7757,1
1,"all beings, all existence",#db7757,1
2,allegoresis,#db7757,1
3,ancient philosophical systems,#db7757,1
4,ancient philosophy,#db7757,1
...,...,...,...
337,successfully through the crisis of our time,#57db5f,31
338,quantum,#afdb57,32
339,slits,#afdb57,32
340,supracconsciousness,#d757db,33


### Add colors to the graph

In [19]:
## Copy each node's community group and color onto the graph,
## and size the node by its degree
for _, entry in colors.iterrows():
    name = entry['node']
    G.nodes[name].update(
        group=entry['group'],
        color=entry['color'],
        size=G.degree[name],
    )

In [20]:
#pip install pyvis

In [21]:
from pyvis.network import Network

## Despite its name, this is the path of the HTML file that will be written
graph_output_directory = "./docs/index.html"
## Create the containing folder first so that writing the HTML file does not
## fail on a fresh checkout where ./docs does not exist yet
Path(graph_output_directory).parent.mkdir(parents=True, exist_ok=True)

net = Network(
    notebook=False,
    # bgcolor="#1a1a1a",
    cdn_resources="remote",
    height="900px",
    width="100%",
    select_menu=True,
    # font_color="#cccccc",
    filter_menu=False,
)

## Import the nodes, edges and their attributes from the NetworkX graph
net.from_nx(G)
## Layout: force_atlas_2based is active; the commented calls are alternative layouts
# net.repulsion(node_distance=150, spring_length=400)
net.force_atlas_2based(central_gravity=0.015, gravity=-31)
# net.barnes_hut(gravity=-18100, central_gravity=5.05, spring_length=380)
## Show only the physics controls in the generated page
net.show_buttons(filter_=["physics"])

net.show(graph_output_directory, notebook=False)

./docs/index.html
