# Setup

conda activate knowledge-graph

In [2]:
import pandas as pd
import numpy as np
import os
#from langchain.document_loaders import PyPDFLoader, UnstructuredPDFLoader, PyPDFium2Loader
from langchain.document_loaders import PyPDFDirectoryLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pathlib import Path
import random

In [3]:
## Input data directory
data_dir = "story"
inputdirectory = Path(f"./data_input/{data_dir}")
## This is where the output csv files will be written
out_dir = data_dir
outputdirectory = Path(f"./data_output/{out_dir}")

# Load Documents

Load

In [11]:
## PDF Loader
# loader = PyPDFDirectoryLoader(Path(f"./data_input"))

## File txt Loader
loader = DirectoryLoader(inputdirectory, show_progress=True)
documents = loader.load()

100%|██████████| 10/10 [00:00<00:00, 29.61it/s]


定義分割器

In [12]:
'''
chunk_size：每個段落的最大字數
chunk_overlap：每個段落的重疊字數，用於保持上下文連貫
length_function：用於計算文本長度的函數，這裡使用 Python 的內建函數 len
is_separator_regex：檢測段落分隔符的正則表達式，這裡設置為 False，表示不使用正則表達式，而是使用固定的 chunk_size 和 chunk_overlap 來分割文本
'''
splitter = RecursiveCharacterTextSplitter(
    chunk_size=50,
    chunk_overlap=20,
    length_function=len,
    is_separator_regex=False,
)


分割文件

In [30]:
# 初始化一個空列表，用於存所有切分后的片段
pages = []
# 定義要讀的文檔數量
num_documents = 1

# 遍歷每個文檔，分別進行切分，並將結果添加到 pages 列表
for i in range(num_documents):
    # 對單一文檔切分
    DocChunks = splitter.split_documents([documents[i]])
    print(f"Doc {i} : Number of chunks = {len(DocChunks)}")
    # 將切分後的片段添加到 pages 列表中
    pages.append(DocChunks)

Doc 0 : Number of chunks = 32


In [31]:
# pages[doc][chunks]
print(pages[0][7].page_content)

person's hand is positioned as if they are about


# Create a dataframe of all the chunks

In [32]:
from helpers.df_helpers import documents2Dataframe

In [33]:
df=[]
for page in pages:
    # 將每個文檔的片段轉換為 DataFrame，並將其添加到 pages 列表中
    page_df = documents2Dataframe(page)
    df.append(page_df)
    print(page_df.shape)

df[0].head()

(32, 3)


Unnamed: 0,text,source,chunk_id
0,The image shows a cat gazing up at a person in a,data_input\story\0.txt,354f855ab0a44d5bb8edd7c9fe1bd87d
1,up at a person in a blue shirt. The person's hand,data_input\story\0.txt,4647a23fb2be4d03b674e3a0c756b6c4
2,"The person's hand is raised in the air, and the",data_input\story\0.txt,55469b45796e445f887c00fa71cee81b
3,"in the air, and the cat's ears are perked up as",data_input\story\0.txt,0a1cbbbbc20b4630aecedc539d4d88f2
4,are perked up as if listening to a sound. The,data_input\story\0.txt,316469a1cccb4679aa9e4cf5367889e4


## Extract Concepts

In [34]:
## This function uses the helpers/prompt function to extract concepts from text
from helpers.df_helpers1 import df2Graph
from helpers.df_helpers1 import graph2Df

If regenerate is set to True then the dataframes are regenerated and Both the dataframes are written in the csv format so we dont have to calculate them again. 

        dfne = dataframe of edges

        df = dataframe of chunks


Else the dataframes are read from the output directory

In [None]:
# To regenerate the graph with LLM, set "regenerate" to True
# regenerate = False
regenerate = True

for i in range(len(df)):
    if regenerate:
        concepts_list = df2Graph(df[i], model='zephyr:latest')
        dfg1 = graph2Df(concepts_list)
        if not os.path.exists(outputdirectory):
            os.makedirs(outputdirectory)
        
        dfg1.to_csv(outputdirectory/f"graph_{i}.csv", sep="|", index=False)
        df[i].to_csv(outputdirectory/f"chunks_{i}.csv", sep="|", index=False)

[
  {
    "node_1": "cat",
    "node_2": "person",
    "edge": "gazes_at"
  }
]

Format: 
[
  {
    "source": "node_1", // can be replaced with id or index
    "target": "node_2", // can be replaced with id or index
    "label": "edge", // label for the relation between the nodes
    "weight": numeric value to indicate strength of the relationship (optional)
  }
]

In this case, the output would be:

[
  {
    "source": 0,
    "target": 1,
    "label": "gazes_at"
  }
]

Alternative format with id or index:

[
  {
    "id": "cat-ontology",
    "related_concepts": [
      {
        "id": "person-ontology",
        "relation": "gazes_at"
      }
    ]
  }
]

This would represent the same relationship between the nodes, where "cat-ontology" and "person-ontology" are the ids or indices assigned to each node in the ontology. The "related_concepts" key holds an array of related concepts for the current concept, represented by their respective ids/indices and relation keys

In [None]:
dfg1 = pd.read_csv(outputdirectory/"graph.csv", sep="|")

In [11]:
dfg1.replace("", np.nan, inplace=True)
dfg1.dropna(subset=["node_1", "node_2", 'edge'], inplace=True)
dfg1['count'] = 4 
## Increasing the weight of the relation to 4. 
## We will assign the weight of 1 when later the contextual proximity will be calculated.  
print(dfg1.shape)
dfg1.head()

(26, 5)


Unnamed: 0,node_1,node_2,edge,chunk_id,count
0,peter,village,lived,abf638054f5c4e37828abc3d9570851d,4
1,peter,sheep farmers,parents,abf638054f5c4e37828abc3d9570851d,4
2,sheep,sheep farmers,belong to,abf638054f5c4e37828abc3d9570851d,4
3,peter,sheep,responsibility,abf638054f5c4e37828abc3d9570851d,4
4,village,looking after sheep,requirement,abf638054f5c4e37828abc3d9570851d,4


## Calculating contextual proximity

In [12]:
def contextual_proximity(df: pd.DataFrame) -> pd.DataFrame:
    ## Melt the dataframe into a list of nodes
    dfg_long = pd.melt(
        df, id_vars=["chunk_id"], value_vars=["node_1", "node_2"], value_name="node"
    )
    dfg_long.drop(columns=["variable"], inplace=True)
    # Self join with chunk id as the key will create a link between terms occuring in the same text chunk.
    dfg_wide = pd.merge(dfg_long, dfg_long, on="chunk_id", suffixes=("_1", "_2"))
    # drop self loops
    self_loops_drop = dfg_wide[dfg_wide["node_1"] == dfg_wide["node_2"]].index
    dfg2 = dfg_wide.drop(index=self_loops_drop).reset_index(drop=True)
    ## Group and count edges.
    dfg2 = (
        dfg2.groupby(["node_1", "node_2"])
        .agg({"chunk_id": [",".join, "count"]})
        .reset_index()
    )
    dfg2.columns = ["node_1", "node_2", "chunk_id", "count"]
    dfg2.replace("", np.nan, inplace=True)
    dfg2.dropna(subset=["node_1", "node_2"], inplace=True)
    # Drop edges with 1 count
    dfg2 = dfg2[dfg2["count"] != 1]
    dfg2["edge"] = "contextual proximity"
    return dfg2


dfg2 = contextual_proximity(dfg1)
dfg2.tail()

Unnamed: 0,node_1,node_2,chunk_id,count,edge
207,wolf,village,"abf638054f5c4e37828abc3d9570851d,abf638054f5c4...",10,contextual proximity
208,wolf,villagers,"abf638054f5c4e37828abc3d9570851d,abf638054f5c4...",8,contextual proximity
214,wrongdoing,peter,"1953d0d0f3804271bd17d7bcb46b42c3,1953d0d0f3804...",3,contextual proximity
215,wrongdoing,sheep,"1953d0d0f3804271bd17d7bcb46b42c3,1953d0d0f3804...",2,contextual proximity
217,wrongdoing,village,"1953d0d0f3804271bd17d7bcb46b42c3,1953d0d0f3804...",2,contextual proximity


### Merge both the dataframes

In [13]:
dfg = pd.concat([dfg1, dfg2], axis=0)
dfg = (
    dfg.groupby(["node_1", "node_2"])
    .agg({"chunk_id": ",".join, "edge": ','.join, 'count': 'sum'})
    .reset_index()
)
dfg

Unnamed: 0,node_1,node_2,chunk_id,edge,count
0,another villager,peter,"1953d0d0f3804271bd17d7bcb46b42c3,1953d0d0f3804...",contextual proximity,3
1,another villager,sheep,"1953d0d0f3804271bd17d7bcb46b42c3,1953d0d0f3804...",contextual proximity,2
2,another villager,village,"1953d0d0f3804271bd17d7bcb46b42c3,1953d0d0f3804...","living in,contextual proximity",6
3,big hairy wolf,peter,"1953d0d0f3804271bd17d7bcb46b42c3,1953d0d0f3804...",contextual proximity,3
4,big hairy wolf,sheep,"1953d0d0f3804271bd17d7bcb46b42c3,1953d0d0f3804...","is chasing,contextual proximity",6
...,...,...,...,...,...
149,wolf,village,"abf638054f5c4e37828abc3d9570851d,abf638054f5c4...","false presence of,contextual proximity",14
150,wolf,villagers,"a5898d327a7c416eaf751b2c2cf35c13,abf638054f5c4...",trustworthiness undermined by previous false a...,12
151,wrongdoing,peter,"1953d0d0f3804271bd17d7bcb46b42c3,1953d0d0f3804...",contextual proximity,3
152,wrongdoing,sheep,"1953d0d0f3804271bd17d7bcb46b42c3,1953d0d0f3804...",contextual proximity,2


In [24]:
dfg.to_csv(outputdirectory/"graph_final.csv", sep="|", index=False)

## Calculate the NetworkX Graph

In [14]:
nodes = pd.concat([dfg['node_1'], dfg['node_2']], axis=0).unique()
nodes.shape

(19,)

In [15]:
import networkx as nx

In [18]:
G = nx.Graph()

## Add nodes to the graph
for node in nodes:
    G.add_node(
        str(node)
    )

## Add edges to the graph
for index, row in dfg.iterrows():
    G.add_edge(
        str(row["node_1"]),
        str(row["node_2"]),
        title=row["edge"],
        weight=row['count']/4
    )

In [19]:
print(G)

Graph with 19 nodes and 77 edges


### Calculate communities for coloring the nodes

In [20]:
communities_generator = nx.community.girvan_newman(G)
top_level_communities = next(communities_generator)
next_level_communities = next(communities_generator)
communities = sorted(map(sorted, next_level_communities))
print("Number of Communities = ", len(communities))
print(communities)

Number of Communities =  3
[['another villager', 'big hairy wolf', 'bleating', 'boredom', 'company', 'fields', 'looking after sheep', 'peter', "peter's trick", 'sheep', 'sheep farmers', 'sticks', 'tree', 'village', 'villagers', 'wolf', 'wrongdoing'], ['hillside'], ['old man']]


### Create a dataframe for community colors

In [21]:
import seaborn as sns
palette = "hls"

## Now add these colors to communities and make another dataframe
def colors2Community(communities) -> pd.DataFrame:
    ## Define a color palette
    p = sns.color_palette(palette, len(communities)).as_hex()
    random.shuffle(p)
    rows = []
    group = 0
    for community in communities:
        color = p.pop()
        group += 1
        for node in community:
            rows += [{"node": node, "color": color, "group": group}]
    df_colors = pd.DataFrame(rows)
    return df_colors


colors = colors2Community(communities)
colors

Unnamed: 0,node,color,group
0,another villager,#57db5f,1
1,big hairy wolf,#57db5f,1
2,bleating,#57db5f,1
3,boredom,#57db5f,1
4,company,#57db5f,1
5,fields,#57db5f,1
6,looking after sheep,#57db5f,1
7,peter,#57db5f,1
8,peter's trick,#57db5f,1
9,sheep,#57db5f,1


### Add colors to the graph

In [22]:
for index, row in colors.iterrows():
    G.nodes[row['node']]['group'] = row['group']
    G.nodes[row['node']]['color'] = row['color']
    G.nodes[row['node']]['size'] = G.degree[row['node']]

In [23]:
from pyvis.network import Network

graph_output_directory = "./docs/index2.html"

net = Network(
    notebook=False,
    # bgcolor="#1a1a1a",
    cdn_resources="remote",
    height="900px",
    width="100%",
    select_menu=True,
    # font_color="#cccccc",
    filter_menu=False,
)

net.from_nx(G)
# net.repulsion(node_distance=150, spring_length=400)
net.force_atlas_2based(central_gravity=0.015, gravity=-31)
# net.barnes_hut(gravity=-18100, central_gravity=5.05, spring_length=380)
net.show_buttons(filter_=["physics"])

#net.show(graph_output_directory, notebook=False)
net.show(graph_output_directory)