## Setup

In [1]:
import pandas as pd
import numpy as np
import os
from langchain.document_loaders import PyPDFLoader, UnstructuredPDFLoader, PyPDFium2Loader
from langchain.document_loaders import PyPDFDirectoryLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pathlib import Path
import random

## Name of the dataset folder; the input files live in ./data_input/<data_dir>
data_dir = "wind(11)"
inputdirectory = Path("./data_input") / data_dir
## The output csv files go to a folder of the same name under ./data_output
out_dir = data_dir
outputdirectory = Path("./data_output") / out_dir

## Load Documents

In [2]:
## Load every document found under the input directory.
## Alternatives: PyPDFDirectoryLoader(inputdirectory) for a folder of PDFs,
## or PyPDFLoader("<path to file>.pdf") for a single file.
loader = DirectoryLoader(inputdirectory, show_progress=True)
documents = loader.load()

## Split the documents into overlapping chunks of at most 1500 characters.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=1500,
    chunk_overlap=150,
    length_function=len,
    is_separator_regex=False,
)

pages = splitter.split_documents(documents)
print("Number of chunks = ", len(pages))

## Fail with an actionable message instead of an IndexError from pages[-1]
## when nothing could be loaded or split.
if not pages:
    raise ValueError(f"No text chunks were produced from {inputdirectory}")

## Preview the 4th chunk, or the last one when there are fewer than 4.
print(pages[min(3, len(pages) - 1)].page_content)


  0%|          | 0/1 [00:00<?, ?it/s]

  from .autonotebook import tqdm as notebook_tqdm
100%|██████████| 1/1 [00:07<00:00,  7.11s/it]

Number of chunks =  12
shield and his spear and went to war. He thought war was made of spears and shields and courage, and he brought them all.

But they gave him a gun, so he left the spear and the shield behind him and took the courage, and went where they sent him because they said this was his duty and he believed in duty. He believed in duty and in the kind of justice that he knew, and in all the things that were of the earth — like the voice of the forest, the right of a lion to kill a buck, the right of a buck to eat grass, and the right of a man to fight. He believed in many wives, young as he was, and in the telling of stories by the shade of the singiri.

He took the gun and held it the way they had told him to hold it, and walked where they told him to walk, smiling a little and looking for another man to fight.

He was shot and killed by the other man, who also believed in duty,

and he was buried where he fell. It was so simple and so unimportant.

But of course it meant 




## Create a dataframe of all the chunks

In [3]:
from helpers.df_helpers import documents2Dataframe
## Turn the list of chunks into a dataframe via the project helper,
## then show its shape and the first rows as a sanity check.
df = documents2Dataframe(pages)
print(df.shape)
df.head()

(12, 3)


Unnamed: 0,text,source,chunk_id
0,"103\n\nVIII\n\nAnd We be Playmates, Thou and I...",data_input\wind(11)\wind_split_104-112.pdf,2139206e74764cc99ab62a042d9ec0d3
1,104\n\nThe Protectorate fought a frontier war ...,data_input\wind(11)\wind_split_104-112.pdf,407a1a3340bf4bf9a649dbc9762f24c8
2,"The farm lived, but its voice was a whisper. I...",data_input\wind(11)\wind_split_104-112.pdf,39c09afa0ac242deb6c924a36c91039a
3,shield and his spear and went to war. He thoug...,data_input\wind(11)\wind_split_104-112.pdf,475d4ddaa6e74513a36f59a4301256bb
4,"‘You are very selfish, Kibii,’ I said. ‘I can ...",data_input\wind(11)\wind_split_104-112.pdf,cdc42f082d764054aa0fb3b0438e7c96


## Extract Concepts

In [4]:
## This function uses the helpers/prompt function to extract concepts from text
from helpers.df_helpers import df2Graph
from helpers.df_helpers import graph2Df

If `regenerate` is set to `True`, both dataframes are regenerated and written to the output directory in CSV format, so we don't have to calculate them again.

        dfg1 = dataframe of edges (saved as graph.csv)

        df = dataframe of chunks (saved as chunks.csv)


Otherwise, the edge dataframe is read back from graph.csv in the output directory.

In [5]:
## To regenerate the graph with LLM, set this to True
regenerate = False

graph_csv = outputdirectory/"graph.csv"
chunks_csv = outputdirectory/"chunks.csv"

if regenerate:
    print("start concepts")
    concepts_list = df2Graph(df, model='zephyr:latest')
    print("concepts done")
    dfg1 = graph2Df(concepts_list)
    ## exist_ok=True replaces the separate exists() check, which could race
    ## with another process creating the same folder.
    os.makedirs(outputdirectory, exist_ok=True)

    dfg1.to_csv(graph_csv, sep="|", index=False)
    df.to_csv(chunks_csv, sep="|", index=False)
else:
    ## Point the reader at the fix when the cached edge list is missing.
    if not graph_csv.exists():
        raise FileNotFoundError(
            f"{graph_csv} not found; set regenerate = True to build it first"
        )
    dfg1 = pd.read_csv(graph_csv, sep="|")

## Drop rows with a missing node or relation, then give every remaining
## relation a count (weight) of 4. Contextual-proximity edges computed
## later get their own, co-occurrence based counts.
dfg1 = (
    dfg1.replace("", np.nan)
    .dropna(subset=["node_1", "node_2", "edge"])
    .assign(count=4)
)
print(dfg1.shape)
dfg1.head()

(80, 6)


Unnamed: 0,node_1,node_2,edge,chunk_id,node_3,count
0,war,frontier weapons,The Protectorate fought a frontier war with fr...,7303b4d4f826473394216973b508486e,,4
1,war,men,War was different in the hinterland. It was a ...,7303b4d4f826473394216973b508486e,,4
2,east africa,lights,The lights had not only gone out 'all over Eur...,7303b4d4f826473394216973b508486e,,4
3,europe,lights,The lights had gone out 'all over Europe',7303b4d4f826473394216973b508486e,,4
4,men,frontier clothes,It was still dressed in frontier clothes.,7303b4d4f826473394216973b508486e,,4


## Calculating contextual proximity

In [6]:
def contextual_proximity(df: pd.DataFrame) -> pd.DataFrame:
    """Link every pair of nodes that are mentioned in the same text chunk.

    Args:
        df: Edge dataframe with at least the columns ``chunk_id``,
            ``node_1`` and ``node_2``. It is not modified.

    Returns:
        A dataframe with the columns ``node_1``, ``node_2``, ``chunk_id``
        (comma-joined chunk ids of the contributing pairs), ``count``
        (number of contributing pairs) and ``edge`` (always the string
        "contextual proximity"). Each pair appears in both directions.
        Pairs whose count is exactly 1 are left out. The index is the one
        of the grouped frame before that filter, i.e. it is not reset.
    """
    ## Melt the dataframe into a long list of (chunk_id, node) rows
    node_mentions = pd.melt(
        df, id_vars=["chunk_id"], value_vars=["node_1", "node_2"], value_name="node"
    ).drop(columns=["variable"])
    # Self join with chunk id as the key creates a link between terms
    # occurring in the same text chunk.
    pairs = pd.merge(
        node_mentions, node_mentions, on="chunk_id", suffixes=("_1", "_2")
    )
    # drop self loops
    self_loops = pairs[pairs["node_1"] == pairs["node_2"]].index
    pairs = pairs.drop(index=self_loops).reset_index(drop=True)
    ## Group and count edges.
    edges = (
        pairs.groupby(["node_1", "node_2"])
        .agg({"chunk_id": [",".join, "count"]})
        .reset_index()
    )
    edges.columns = ["node_1", "node_2", "chunk_id", "count"]
    edges = edges.replace("", np.nan).dropna(subset=["node_1", "node_2"])
    # Drop edges with 1 count. assign() works on a fresh copy, so the new
    # column is never written onto a filtered slice of `edges`
    # (which can raise SettingWithCopyWarning on pandas < 3).
    return edges[edges["count"] != 1].assign(edge="contextual proximity")


## Derive the co-occurrence edges from the LLM edge list and peek at the last rows
dfg2 = contextual_proximity(dfg1)
dfg2.tail()

Unnamed: 0,node_1,node_2,chunk_id,count,edge
1278,x,egret,"4f593de03e1440aba9b1902cedc6190f,4f593de03e144...",2,contextual proximity
1282,young girls,dance,"102340999cb54fe285789112620d96ff,102340999cb54...",5,contextual proximity
1287,young girls,leader,"102340999cb54fe285789112620d96ff,102340999cb54...",2,contextual proximity
1292,young men,dance,"102340999cb54fe285789112620d96ff,102340999cb54...",5,contextual proximity
1297,young men,leader,"102340999cb54fe285789112620d96ff,102340999cb54...",2,contextual proximity


### Merge both the dataframes

In [7]:
## Stack the LLM edges (dfg1) on top of the contextual-proximity edges (dfg2)
dfg = pd.concat([dfg1, dfg2], axis=0)
## Collapse rows that share the same ordered (node_1, node_2) pair:
## chunk ids and edge texts are comma-joined, counts are summed.
dfg = (
    dfg.groupby(["node_1", "node_2"])
    .agg({"chunk_id": ",".join, "edge": ','.join, 'count': 'sum'})
    .reset_index()
)
## For every LLM edge (node1 -> node2), look for the reversed pair
## (node2 -> node1) in the merged frame. When the first such row is a pure
## "contextual proximity" edge, it takes over the LLM edge text and the
## forward-direction row(s) are removed.
for row in dfg1.iterrows():
    ## iterrows() yields (index, Series); row[1] is the Series of the LLM edge
    node1 = row[1].get('node_1')
    node2 = row[1].get('node_2')
    nodes = dfg["node_1"]
    ## Cheap pre-check: node2 must occur as a source node at all
    if (node2 in nodes.values) :
        entry = dfg[(dfg["node_1"] == node2) & (dfg["node_2"] == node1)]
        if (entry.empty != True) :
            edge = entry.head(1)['edge']
            index = entry.head(1).index[0]
            ## Only rows whose edge text is exactly "contextual proximity" are rewritten
            if (edge.values[0] == "contextual proximity") :
                ## Append the proximity marker unless the LLM text already ends with it
                if row[1].get('edge').endswith("contextual proximity") == False:
                    out = row[1].get('edge')+", contextual proximity"
                else :
                    out = row[1].get('edge')
                dfg.at[index, 'edge'] = out
                ## Remove the forward-direction row(s) and renumber the index.
                ## NOTE(review): the count of the dropped row(s) is not added to the
                ## kept reversed row — confirm that losing this weight is intended.
                switch_drop = dfg[(dfg["node_1"] == node1) & (dfg["node_2"] == node2)].index
                dfg = dfg.drop(index=switch_drop).reset_index(drop=True)


dfg

Unnamed: 0,node_1,node_2,chunk_id,edge,count
0,africa,music,b88faa42b6d94a4d98938a11bb431a54,The author states that the music seemed withou...,4
1,agriculture,kibii,"c722f3a284e34ea98ac51b9ea7e37bc4,c722f3a284e34...",contextual proximity,2
2,agriculture,kikuyu,"c722f3a284e34ea98ac51b9ea7e37bc4,c722f3a284e34...","The Kikuyu are primarily agriculturists, which...",2
3,animals,dance,"102340999cb54fe285789112620d96ff,102340999cb54...",contextual proximity,5
4,animals,leader,"102340999cb54fe285789112620d96ff,102340999cb54...",contextual proximity,2
...,...,...,...,...,...
478,x,egret,"4f593de03e1440aba9b1902cedc6190f,4f593de03e144...",In other scholarly expositions upon similar su...,2
479,young girls,dance,"102340999cb54fe285789112620d96ff,102340999cb54...",The feet of the dancers began their rhythmic s...,5
480,young girls,leader,"102340999cb54fe285789112620d96ff,102340999cb54...",contextual proximity,2
481,young men,dance,"102340999cb54fe285789112620d96ff,102340999cb54...",The dance changed as many times as there were ...,5


## Calculate the NetworkX Graph

In [8]:
## Every distinct node label, whichever end of an edge it appears on
endpoint_columns = [dfg[column] for column in ("node_1", "node_2")]
nodes = pd.concat(endpoint_columns, axis=0).unique()
nodes.shape

(103,)

In [9]:
import networkx as nx

## Undirected graph with one vertex per unique node label (as a string)
G = nx.Graph()
G.add_nodes_from(str(label) for label in nodes)

## One link per row of dfg: the relation text is stored under `title`
## and a quarter of the row's count under `weight`
G.add_edges_from(
    (
        str(record["node_1"]),
        str(record["node_2"]),
        {"title": record["edge"], "weight": record["count"] / 4},
    )
    for _row_id, record in dfg.iterrows()
)

### Calculate communities for coloring the nodes

In [10]:
## The generator's first item is kept as the top level; the second item
## is the partition used for coloring below.
communities_generator = nx.community.girvan_newman(G)
top_level_communities = next(communities_generator)
next_level_communities = next(communities_generator)
## Sort the members inside each community, then the communities themselves
communities = sorted(sorted(members) for members in next_level_communities)
print("Number of Communities = ", len(communities))
print(communities)

Number of Communities =  15
[['africa', 'music'], ['agriculture', 'animals', 'author', "author's father", 'being a murani', 'blue starlings', 'bush-buck', 'chorus', 'dance', 'dawn', 'hero', 'hunting', 'i', 'kibii', 'kikuyu', 'leader', 'little man', 'mau forest', 'poison', 'the kikuyu dances', 'wandorobo huntsman', 'waxbills', 'wood-pigeons', 'world', 'young girls', 'young men'], ['arab maina', 'belief', 'blood', 'character', 'circumcision', 'courage', 'curdled milk', 'duty', 'earth', 'east africa', 'europe', 'father', 'friend', 'frontier clothes', 'frontier weapons', 'gun', 'important man', 'justice', 'killer', 'lights', 'men', 'murani', 'right of a lion to kill a buck', 'shields', 'shooting', 'spear', 'spears', 'unknown', 'voice of the forest', 'war'], ['black- and-white tails', 'colobus monkeys'], ['boer children', 'games', 'growing up', 'nandi games', 'old games', 'waiting', 'white child'], ['buttocks', 'skins of serval-cats'], ['chameleon', 'death', 'egret', 'first man', 'god', 'na

### Create a dataframe for community colors

In [11]:
import seaborn as sns
palette = "hls"

## Now add these colors to communities and make another dataframe
def colors2Community(communities, seed=None) -> pd.DataFrame:
    """Give every community its own colour and group number.

    Args:
        communities: Sized iterable of communities, each an iterable of
            node labels.
        seed: Optional seed for the colour shuffle. With the default
            ``None`` the module-level ``random`` state is used, exactly as
            before; pass an int to get the same colours on every run.

    Returns:
        A dataframe with one row per node and the columns ``node``,
        ``color`` (hex string) and ``group`` (1-based community number).
        An empty input yields an empty frame that still has these columns.
    """
    ## One colour per community, taken from the module-level `palette`
    shades = sns.color_palette(palette, len(communities)).as_hex()
    if seed is None:
        random.shuffle(shades)
    else:
        ## Private generator: reproducible without touching global random state
        random.Random(seed).shuffle(shades)
    rows = []
    for group, community in enumerate(communities, start=1):
        color = shades.pop()
        rows.extend(
            {"node": node, "color": color, "group": group} for node in community
        )
    return pd.DataFrame(rows, columns=["node", "color", "group"])


## Assign a colour and a group number to every node of the detected communities
colors = colors2Community(communities)
colors

Unnamed: 0,node,color,group
0,africa,#db5f57,1
1,music,#db5f57,1
2,agriculture,#57db5f,2
3,animals,#57db5f,2
4,author,#57db5f,2
...,...,...,...
98,rattles of metal,#db5784,13
99,long plaits of hair decorated with coloured fe...,#57b9db,14
100,resplendent,#57b9db,14
101,moon,#db57b9,15


### Add colors to the graph

In [12]:
## Copy group and colour onto each graph node and size it by its degree
for _row_id, entry in colors.iterrows():
    label = entry['node']
    G.nodes[label].update(
        group=entry['group'],
        color=entry['color'],
        size=G.degree[label],
    )

In [13]:
from pyvis.network import Network

## Folder that receives the rendered page for this dataset
dirName = f"./docs/{data_dir}"
indexOutputDirectory = Path(dirName)
## exist_ok=True replaces the separate exists() check, which could race
## with another process creating the same folder.
os.makedirs(indexOutputDirectory, exist_ok=True)

graph_output_directory = dirName + "/index.html"

net = Network(
    notebook=False,
    # bgcolor="#1a1a1a",
    cdn_resources="remote",
    height="900px",
    width="100%",
    select_menu=True,
    # font_color="#cccccc",
    filter_menu=False,
)

## Import the networkx graph together with its node and edge attributes
net.from_nx(G)
## Layout: force-atlas is active; the two commented calls are alternatives
# net.repulsion(node_distance=150, spring_length=400)
net.force_atlas_2based(central_gravity=0.015, gravity=-31)
# net.barnes_hut(gravity=-18100, central_gravity=5.05, spring_length=380)
net.show_buttons(filter_=["physics"])

## Write the page to graph_output_directory
net.show(graph_output_directory, notebook=False)

./docs/wind(11)/index.html
