# Setup

In [1]:
import pandas as pd
import numpy as np
import os
#from langchain.document_loaders import PyPDFLoader, UnstructuredPDFLoader, PyPDFium2Loader
from langchain.document_loaders import PyPDFDirectoryLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pathlib import Path
import random

In [2]:
## Dataset folder name; the same name is used for the input and the output location.
data_dir = "cureus"
out_dir = data_dir

## Folder the source documents are read from.
inputdirectory = Path(f"./data_input/{data_dir}")
## Folder the generated csv files are written to.
outputdirectory = Path(f"./data_output/{out_dir}")

# Load Documents

Load

In [3]:
## Alternative loader for a folder of PDF files (disabled)
# loader = PyPDFDirectoryLoader(Path(f"./data_input"))

## Load the files found in the input directory, showing a progress bar while loading
loader = DirectoryLoader(inputdirectory, show_progress=True)
documents = loader.load()

100%|██████████| 1/1 [00:06<00:00,  6.79s/it]


定義分割器

In [5]:
## Splitter settings:
##   chunk_size         - maximum length of each chunk, as measured by length_function
##   chunk_overlap      - length shared by neighbouring chunks, which keeps context continuous
##   length_function    - function used to measure text length; Python's built-in len here
##   is_separator_regex - False: the separators are matched literally, not as regular expressions
splitter = RecursiveCharacterTextSplitter(
    is_separator_regex=False,
    length_function=len,
    chunk_overlap=150,
    chunk_size=1500,
)


分割文件

In [6]:
## Split the loaded documents into chunks and report how many were produced.
pages = splitter.split_documents(documents)
print("Number of chunks = ", len(pages))

## Print the text of the first chunk as a quick sanity check.
print(pages[0].page_content)

Number of chunks =  3
In a village on a hillside lived a boy named Peter, whose parents were sheep farmers. Everybody in the village had to take turns looking after the sheep. At 10, Peter was regarded as old enough to take his turn at shepherding.

However, Peter found it very dull on the hillside with only sheep for company. So he’d look for ways to amuse himself: running up rocks, climbing trees and even chasing sheep. Unfortunately, he still felt bored. Then a brilliant idea crossed his mind. He climbed to the top of the tallest tree, shouting towards the village: “Wolf! Wolf! Wooolf! Woohoolf!”

One of the villagers heard him and informed all the other people, who got together and armed themselves with axes and hoes. They ran out of the village to chase away the wolf and save their herd. However, when they got there, all they found was Peter perched high up in his tree with the sheep grazing peacefully nearby. Everyone was very annoyed with him.

For a while life went on again as 

# Create a dataframe of all the chunks

In [7]:
from helpers.df_helpers import documents2Dataframe

In [8]:
## Turn the chunks into a dataframe; the preview below shows the
## columns text, source and chunk_id.
df = documents2Dataframe(pages)
print(df.shape)
df.head()

(3, 3)


Unnamed: 0,text,source,chunk_id
0,In a village on a hillside lived a boy named P...,data_input\cureus\cureus-0015-00000040274.txt,1eaefb29ecfb40cc82c4afbffcb334d6
1,"Sure enough, somebody in the village heard, an...",data_input\cureus\cureus-0015-00000040274.txt,0196b7c7a6e34038aff70f7476f3429b
2,"This time, nobody believed that there was real...",data_input\cureus\cureus-0015-00000040274.txt,a3763b0c0dd84c3abe7346c9617d2551


## Extract Concepts

In [9]:
## This function uses the helpers/prompt function to extract concepts from text
from helpers.df_helpers import df2Graph
from helpers.df_helpers import graph2Df

If `regenerate` is set to True, the dataframes are regenerated and both of them are written to CSV files so that they do not have to be computed again.

        dfg1 = dataframe of edges

        df = dataframe of chunks


Else the dataframes are read from the output directory

In [10]:
## Set this to True to regenerate the graph with the LLM.
## Set it to False to reuse the csv files written by a previous run.
#regenerate = False
regenerate = True

## Both branches use the same file names, so a cached run reads exactly
## what a regenerating run wrote.
graph_csv = outputdirectory / "graph_1.csv"
chunks_csv = outputdirectory / "chunks_1.csv"

if regenerate:
    concepts_list = df2Graph(df, model='zephyr:latest')
    dfg1 = graph2Df(concepts_list)
    ## Create the output folder if it is missing; a no-op when it already exists.
    os.makedirs(outputdirectory, exist_ok=True)

    dfg1.to_csv(graph_csv, sep="|", index=False)
    df.to_csv(chunks_csv, sep="|", index=False)
else:
    ## Reload both cached dataframes, so the chunks and the edges come from the same run.
    dfg1 = pd.read_csv(graph_csv, sep="|")
    df = pd.read_csv(chunks_csv, sep="|")

[
   {
       "node_1": "Peter",
       "node_2": "Boredom",
       "edge": "At a young age, Peter found shepherding to be dull and uninteresting. He struggled with finding ways to amuse himself while on duty."
   },
   {
       "node_1": "Peter",
       "node_2": "Creativity",
       "edge": "In order to escape boredom, Peter came up with innovative ways to entertain himself while on shepherding duty."
   },
   {
       "node_1": "Peter",
       "node_2": "Ambiguity",
       "edge": "Initially, when villagers heard Peter's wolf cry, they mistakenly assumed it was a real wolf threat."
   },
   {
       "node_1": "Villagers",
       "node_2": "Irritation",
       "edge": "After the first incident, the other villagers became annoyed with Peter's false alarms."
   },
   {
       "node_1": "Peter",
       "node_2": "Attention-seeking",
       "edge": "Peter enjoyed receiving attention from his false wolf cries and continued to do so in the future."
   }
][
  {
    "node_1": "Peter",
    "n

In [11]:
## Treat empty strings as missing and discard every relation that lacks a node or an edge text.
## Each remaining relation then gets a weight ("count") of 4; the contextual
## proximity relations calculated later carry their own count.
dfg1 = (
    dfg1.replace("", np.nan)
    .dropna(subset=["node_1", "node_2", "edge"])
    .assign(count=4)
)
print(dfg1.shape)
dfg1.head()

(16, 5)


Unnamed: 0,node_1,node_2,edge,chunk_id,count
0,peter,boredom,"At a young age, Peter found shepherding to be ...",1eaefb29ecfb40cc82c4afbffcb334d6,4
1,peter,creativity,"In order to escape boredom, Peter came up with...",1eaefb29ecfb40cc82c4afbffcb334d6,4
2,peter,ambiguity,"Initially, when villagers heard Peter's wolf c...",1eaefb29ecfb40cc82c4afbffcb334d6,4
3,villagers,irritation,"After the first incident, the other villagers ...",1eaefb29ecfb40cc82c4afbffcb334d6,4
4,peter,attention-seeking,Peter enjoyed receiving attention from his fal...,1eaefb29ecfb40cc82c4afbffcb334d6,4


## Calculating contextual proximity

In [12]:
def contextual_proximity(df: pd.DataFrame) -> pd.DataFrame:
    """Build "contextual proximity" edges between nodes that share a text chunk.

    Every node that appears in a chunk (as ``node_1`` or as ``node_2``) is
    paired with every other node appearing in the same chunk. The pairs are
    grouped and counted, and pairs that were seen only once are discarded.

    Args:
        df: Edge dataframe with at least the columns ``node_1``, ``node_2``
            and ``chunk_id``. It is not modified.

    Returns:
        A new dataframe with the columns ``node_1``, ``node_2``, ``chunk_id``
        (comma-joined chunk ids behind the pair), ``count`` and ``edge``
        (always ``"contextual proximity"``). Both directions of a pair are
        present as separate rows, and the index keeps the labels of the
        grouped frame, so it can have gaps.
    """
    ## Melt the dataframe into a long list of (chunk_id, node) rows.
    node_mentions = pd.melt(
        df, id_vars=["chunk_id"], value_vars=["node_1", "node_2"], value_name="node"
    ).drop(columns=["variable"])
    ## Self join with chunk id as the key: this links the terms occurring in the same text chunk.
    pairs = pd.merge(node_mentions, node_mentions, on="chunk_id", suffixes=("_1", "_2"))
    ## Drop self loops.
    pairs = pairs[pairs["node_1"] != pairs["node_2"]].reset_index(drop=True)
    ## Group and count edges.
    proximity = (
        pairs.groupby(["node_1", "node_2"])
        .agg({"chunk_id": [",".join, "count"]})
        .reset_index()
    )
    proximity.columns = ["node_1", "node_2", "chunk_id", "count"]
    ## Treat empty node names as missing and drop them.
    proximity = proximity.replace("", np.nan).dropna(subset=["node_1", "node_2"])
    ## Drop edges with 1 count. The label is added with .assign, which returns a
    ## new frame instead of writing into a filtered slice (no SettingWithCopyWarning).
    return proximity[proximity["count"] != 1].assign(edge="contextual proximity")


## Derive the contextual proximity edges from the extracted relations and preview the last rows.
dfg2 = contextual_proximity(dfg1)
dfg2.tail()

Unnamed: 0,node_1,node_2,chunk_id,count,edge
110,wolf,men,"a3763b0c0dd84c3abe7346c9617d2551,a3763b0c0dd84...",2,contextual proximity
112,wolf,peter,"0196b7c7a6e34038aff70f7476f3429b,0196b7c7a6e34...",7,contextual proximity
113,wolf,sheep,"0196b7c7a6e34038aff70f7476f3429b,0196b7c7a6e34...",5,contextual proximity
115,wolf,village,"0196b7c7a6e34038aff70f7476f3429b,0196b7c7a6e34...",2,contextual proximity
117,wolf,villagers,"a3763b0c0dd84c3abe7346c9617d2551,a3763b0c0dd84...",2,contextual proximity


### Merge both the dataframes

In [13]:
## Stack the extracted relations on top of the contextual proximity relations.
dfg = pd.concat([dfg1, dfg2])
## Collapse the rows that share the same (node_1, node_2) pair: chunk ids and
## edge texts are comma-joined, counts are added up.
dfg = dfg.groupby(["node_1", "node_2"], as_index=False).agg(
    {"chunk_id": ",".join, "edge": ",".join, "count": "sum"}
)
dfg

Unnamed: 0,node_1,node_2,chunk_id,edge,count
0,ambiguity,peter,"1eaefb29ecfb40cc82c4afbffcb334d6,1eaefb29ecfb4...",contextual proximity,4
1,attention-seeking,peter,"1eaefb29ecfb40cc82c4afbffcb334d6,1eaefb29ecfb4...",contextual proximity,4
2,bell,wolf,"a3763b0c0dd84c3abe7346c9617d2551,a3763b0c0dd84...",contextual proximity,2
3,boredom,peter,"1eaefb29ecfb40cc82c4afbffcb334d6,1eaefb29ecfb4...",contextual proximity,4
4,creativity,peter,"1eaefb29ecfb40cc82c4afbffcb334d6,1eaefb29ecfb4...",contextual proximity,4
5,dead sheep's bodies,wolf,"a3763b0c0dd84c3abe7346c9617d2551,a3763b0c0dd84...",contextual proximity,2
6,irritation,peter,"1eaefb29ecfb40cc82c4afbffcb334d6,1eaefb29ecfb4...",contextual proximity,4
7,men,wolf,"a3763b0c0dd84c3abe7346c9617d2551,a3763b0c0dd84...",The men did not come out when Peter cried 'Wol...,6
8,old man,peter,"0196b7c7a6e34038aff70f7476f3429b,0196b7c7a6e34...",contextual proximity,5
9,old man,sheep,"0196b7c7a6e34038aff70f7476f3429b,0196b7c7a6e34...",contextual proximity,3


## Calculate the NetworkX Graph

In [14]:
## Collect the distinct node names that appear on either end of an edge.
nodes = pd.concat([dfg['node_1'], dfg['node_2']], axis=0).unique()
nodes.shape

(16,)

In [15]:
import networkx as nx

In [16]:
## Undirected graph that holds the concepts and their relations.
G = nx.Graph()

## Add nodes to the graph, each under its string name.
G.add_nodes_from(str(node) for node in nodes)

## Add edges to the graph: the edge text becomes the title and the count,
## divided by 4, becomes the weight.
## NOTE(review): the graph is undirected, so a row for (b, a) replaces the title
## and weight stored by an earlier row for (a, b) - confirm this is intended.
for source, target, label, count in zip(
    dfg["node_1"], dfg["node_2"], dfg["edge"], dfg["count"]
):
    G.add_edge(str(source), str(target), title=label, weight=count / 4)

In [17]:
## Print a one-line summary of the graph (number of nodes and edges).
print(G)

Graph with 16 nodes and 29 edges


### Calculate communities for coloring the nodes

In [18]:
## girvan_newman returns an iterator; each next() call yields one more partition of the graph.
communities_generator = nx.community.girvan_newman(G)
## The first partition is kept for reference; the second one is used for coloring.
top_level_communities = next(communities_generator)
next_level_communities = next(communities_generator)
## Sort the members of each community, then sort the communities themselves.
communities = sorted(sorted(members) for members in next_level_communities)
print("Number of Communities = ", len(communities))
print(communities)

Number of Communities =  3
[['ambiguity', 'attention-seeking', 'boredom', 'creativity', 'old man', 'peter', 'sheep', 'tree', 'village', 'villager'], ['bell', 'men', 'wolf'], ["dead sheep's bodies", 'irritation', 'villagers']]


### Create a dataframe for community colors

In [19]:
import seaborn as sns
palette = "hls"

## Give every community one color and one group number, one dataframe row per node
def colors2Community(communities) -> pd.DataFrame:
    """Map every node to the color and group number of its community.

    Args:
        communities: Sized sequence of communities, each an iterable of nodes.

    Returns:
        Dataframe with one row per node and the columns ``node``, ``color``
        (hex string) and ``group`` (position of the community, starting at 1).
        Colors are handed out in random order.
    """
    ## One hex color per community, shuffled.
    shuffled_hex = sns.color_palette(palette, len(communities)).as_hex()
    random.shuffle(shuffled_hex)
    ## Communities take their colors from the end of the shuffled list backwards.
    records = [
        {"node": member, "color": hex_color, "group": group_number}
        for group_number, (community, hex_color) in enumerate(
            zip(communities, reversed(shuffled_hex)), start=1
        )
        for member in community
    ]
    return pd.DataFrame(records)


## Build the per-node color and group table for the detected communities.
colors = colors2Community(communities)
colors

Unnamed: 0,node,color,group
0,ambiguity,#57db5f,1
1,attention-seeking,#57db5f,1
2,boredom,#57db5f,1
3,creativity,#57db5f,1
4,old man,#57db5f,1
5,peter,#57db5f,1
6,sheep,#57db5f,1
7,tree,#57db5f,1
8,village,#57db5f,1
9,villager,#57db5f,1


### Add colors to the graph

In [20]:
## Store the community group and color on every node; the node size is its degree.
for node_name, node_group, node_color in zip(
    colors["node"], colors["group"], colors["color"]
):
    G.nodes[node_name].update(
        group=node_group,
        color=node_color,
        size=G.degree[node_name],
    )

In [21]:
from pyvis.network import Network

## Path of the HTML file the interactive graph is written to.
graph_output_directory = "./docs/index1.html"
## Create the target folder if it is missing, so writing the file cannot fail
## because of a missing directory.
Path(graph_output_directory).parent.mkdir(parents=True, exist_ok=True)

net = Network(
    notebook=False,
    # bgcolor="#1a1a1a",
    cdn_resources="remote",
    height="900px",
    width="100%",
    select_menu=True,
    # font_color="#cccccc",
    filter_menu=False,
)

## Copy the nodes, the edges and their attributes from the NetworkX graph.
net.from_nx(G)
## Layout: force atlas 2 based physics; the other layouts are kept as disabled alternatives.
# net.repulsion(node_distance=150, spring_length=400)
net.force_atlas_2based(central_gravity=0.015, gravity=-31)
# net.barnes_hut(gravity=-18100, central_gravity=5.05, spring_length=380)
## Show only the physics controls in the generated page.
net.show_buttons(filter_=["physics"])

#net.show(graph_output_directory, notebook=False)
net.show(graph_output_directory)