## Setup

In [1]:
import pandas as pd
import numpy as np
import os
os.environ['NLTK_DATA'] = '/Users/shu/Desktop/shuyherecode/llmkg/knowledge_graph/nltk_data'

from langchain.document_loaders import PyPDFLoader, UnstructuredPDFLoader, PyPDFium2Loader
from langchain.document_loaders import PyPDFDirectoryLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pathlib import Path
import random

## Input data directory
data_dir = "poisionrag"
inputdirectory = Path(f"./data_input/{data_dir}")
## This is where the output csv files will be written
out_dir = data_dir
outputdirectory = Path(f"./data_output/{out_dir}")

In [2]:

import nltk
nltk.data.path

['/Users/shu/Desktop/shuyherecode/llmkg/knowledge_graph/nltk_data',
 '/Users/shu/nltk_data',
 '/Users/shu/opt/anaconda3/envs/langchain/nltk_data',
 '/Users/shu/opt/anaconda3/envs/langchain/share/nltk_data',
 '/Users/shu/opt/anaconda3/envs/langchain/lib/nltk_data',
 '/usr/share/nltk_data',
 '/usr/local/share/nltk_data',
 '/usr/lib/nltk_data',
 '/usr/local/lib/nltk_data']

## Load Documents

In [3]:
## Dir PDF Loader
# loader = PyPDFDirectoryLoader(inputdirectory)
## File Loader
# loader = PyPDFLoader("./data/MedicalDocuments/orf-path_health-n1.pdf")

loader = DirectoryLoader(inputdirectory, show_progress=True)
documents = loader.load()
print(documents)

splitter = RecursiveCharacterTextSplitter(
    chunk_size=1500,
    chunk_overlap=150,
    length_function=len,
    is_separator_regex=False,
)

pages = splitter.split_documents(documents)
print("Number of chunks = ", len(pages))
# print(pages[3].page_content)


100%|██████████| 1/1 [00:03<00:00,  3.22s/it]

[Document(page_content='Tim Cook is an American business executive who has been the Chief Executive Officer (CEO) of Apple Inc. since August 2011. He succeeded Steve Jobs, the company\'s co-founder and previous CEO, who resigned due to health issues and passed away in October 2011. Born on November 1, 1960, in Mobile, Alabama, Cook grew up in nearby Robertsdale and pursued a Bachelor of Science degree in Industrial Engineering from Auburn University in 1982 and an MBA from Duke University\'s Fuqua School of Business in 1988.\n\nBefore joining Apple, Tim Cook held several important positions at various companies. He served as the Director of North American Fulfillment at IBM for 12 years, where he significantly contributed to the company\'s logistics and operations. Later, he worked at Intelligent Electronics as the Chief Operating Officer (COO) of the Reseller Division and then at Compaq as the Vice President of Corporate Materials for about six months before being recruited by Steve J




In [4]:
print(pages[3].page_content)

DALL·E: A neural network-based model that can generate images from textual descriptions, demonstrating an advanced understanding of both language and visual concepts.

Codex: This AI system is capable of understanding and generating human-like code, powering tools like GitHub Copilot, which assists developers by suggesting code snippets and entire functions based on the context of the existing code.

Robotics: OpenAI has also conducted research in robotics, working on AI systems that can learn to manipulate objects with a level of dexterity previously thought to be exclusive to humans.

OpenAI has been at the forefront of ethical considerations in AI development, advocating for safety, transparency, and regulatory measures to ensure that AI technologies are deployed responsibly and without causing harm. The organization publishes research papers, releases software, and collaborates with other entities in the AI community to advance the field in a direction that is safe and beneficial f

## Create a dataframe of all the chunks

In [5]:
from helpers.df_helpers import documents2Dataframe
df = documents2Dataframe(pages)
print(df.shape)
df.head()

(4, 3)


Unnamed: 0,text,source,chunk_id
0,Tim Cook is an American business executive who...,data_input/poisionrag/test.txt,d2db081da09743fcb7827e1700933507
1,"As CEO, Tim Cook has overseen the launch of ma...",data_input/poisionrag/test.txt,5de5fdde12f745b0b7b14a90b837b22c
2,OpenAI is an artificial intelligence (AI) rese...,data_input/poisionrag/test.txt,08d3ccc42d4a44438ef62efc99c5039e
3,DALL·E: A neural network-based model that can ...,data_input/poisionrag/test.txt,7953803a1e5e41808226200daedda801


## Extract Concepts

In [6]:
## This function uses the helpers/prompt function to extract concepts from text
from helpers.df_helpers import df2Graph
from helpers.df_helpers import graph2Df

If regenerate is set to True then the dataframes are regenerated and Both the dataframes are written in the csv format so we dont have to calculate them again. 

        dfne = dataframe of edges

        df = dataframe of chunks


Else the dataframes are read from the output directory

In [7]:
## To regenerate the graph with LLM, set this to True
regenerate = True


if regenerate:
    concepts_list = df2Graph(df, model='gpt-3.5-turbo-16k')
    dfg1 = graph2Df(concepts_list)
    if not os.path.exists(outputdirectory):
        os.makedirs(outputdirectory)
    
    dfg1.to_csv(outputdirectory/"graph.csv", sep="|", index=False)
    df.to_csv(outputdirectory/"chunks.csv", sep="|", index=False)
else:
    dfg1 = pd.read_csv(outputdirectory/"graph.csv", sep="|")

dfg1.replace("", np.nan, inplace=True)
dfg1.dropna(subset=["node_1", "node_2", 'edge'], inplace=True)
dfg1['count'] = 4 
## Increasing the weight of the relation to 4. 
## We will assign the weight of 1 when later the contextual proximity will be calculated.  
print(dfg1.shape)
dfg1.head()

(61, 5)


Unnamed: 0,node_1,node_2,edge,chunk_id,count
0,tim cook,american business executive,Tim Cook is an American business executive,d2db081da09743fcb7827e1700933507,4
1,tim cook,ceo,Tim Cook has been the Chief Executive Officer ...,d2db081da09743fcb7827e1700933507,4
2,tim cook,apple inc.,Tim Cook has been the CEO of Apple Inc. since ...,d2db081da09743fcb7827e1700933507,4
3,tim cook,steve jobs,Tim Cook succeeded Steve Jobs as the CEO of Ap...,d2db081da09743fcb7827e1700933507,4
4,tim cook,ibm,Tim Cook served as the Director of North Ameri...,d2db081da09743fcb7827e1700933507,4


## Calculating contextual proximity

In [8]:
def contextual_proximity(df: pd.DataFrame) -> pd.DataFrame:
    ## Melt the dataframe into a list of nodes
    dfg_long = pd.melt(
        df, id_vars=["chunk_id"], value_vars=["node_1", "node_2"], value_name="node"
    )
    dfg_long.drop(columns=["variable"], inplace=True)
    # Self join with chunk id as the key will create a link between terms occuring in the same text chunk.
    dfg_wide = pd.merge(dfg_long, dfg_long, on="chunk_id", suffixes=("_1", "_2"))
    # drop self loops
    self_loops_drop = dfg_wide[dfg_wide["node_1"] == dfg_wide["node_2"]].index
    dfg2 = dfg_wide.drop(index=self_loops_drop).reset_index(drop=True)
    ## Group and count edges.
    dfg2 = (
        dfg2.groupby(["node_1", "node_2"])
        .agg({"chunk_id": [",".join, "count"]})
        .reset_index()
    )
    dfg2.columns = ["node_1", "node_2", "chunk_id", "count"]
    dfg2.replace("", np.nan, inplace=True)
    dfg2.dropna(subset=["node_1", "node_2"], inplace=True)
    # Drop edges with 1 count
    dfg2 = dfg2[dfg2["count"] != 1]
    dfg2["edge"] = "contextual proximity"
    return dfg2


dfg2 = contextual_proximity(dfg1)
dfg2.tail()

Unnamed: 0,node_1,node_2,chunk_id,count,edge
1109,tim cook,supplier responsibility programs,"d2db081da09743fcb7827e1700933507,d2db081da0974...",12,contextual proximity
1110,tim cook,teamwork,"5de5fdde12f745b0b7b14a90b837b22c,5de5fdde12f74...",23,contextual proximity
1111,tim cook,transparency,"5de5fdde12f745b0b7b14a90b837b22c,5de5fdde12f74...",23,contextual proximity
1134,transparency,tim cook,"5de5fdde12f745b0b7b14a90b837b22c,5de5fdde12f74...",23,contextual proximity
1147,wojciech zaremba,openai,"08d3ccc42d4a44438ef62efc99c5039e,08d3ccc42d4a4...",17,contextual proximity


### Merge both the dataframes

In [9]:
dfg = pd.concat([dfg1, dfg2], axis=0)
dfg = (
    dfg.groupby(["node_1", "node_2"])
    .agg({"chunk_id": ",".join, "edge": ','.join, 'count': 'sum'})
    .reset_index()
)
dfg

Unnamed: 0,node_1,node_2,chunk_id,edge,count
0,ai system,ceo,"7953803a1e5e41808226200daedda801,7953803a1e5e4...",contextual proximity,2
1,ai system,co-founder,"7953803a1e5e41808226200daedda801,7953803a1e5e4...",contextual proximity,2
2,ai system,codex,"7953803a1e5e41808226200daedda801,7953803a1e5e4...",contextual proximity,2
3,ai system,dall·e,"7953803a1e5e41808226200daedda801,7953803a1e5e4...",contextual proximity,2
4,ai system,ethical considerations,"7953803a1e5e41808226200daedda801,7953803a1e5e4...",contextual proximity,2
...,...,...,...,...,...
162,tim cook,supplier responsibility programs,"d2db081da09743fcb7827e1700933507,d2db081da0974...",Tim Cook led the company's supplier responsibi...,16
163,tim cook,teamwork,"5de5fdde12f745b0b7b14a90b837b22c,5de5fdde12f74...","Tim Cook emphasizes teamwork,contextual proximity",27
164,tim cook,transparency,"5de5fdde12f745b0b7b14a90b837b22c,5de5fdde12f74...","Tim Cook emphasizes transparency,contextual pr...",27
165,transparency,tim cook,"5de5fdde12f745b0b7b14a90b837b22c,5de5fdde12f74...",contextual proximity,23


## Calculate the NetworkX Graph

In [10]:
nodes = pd.concat([dfg['node_1'], dfg['node_2']], axis=0).unique()
nodes.shape

(60,)

In [12]:
import networkx as nx
G = nx.Graph()

## Add nodes to the graph
for node in nodes:
    G.add_node(
        str(node)
    )

## Add edges to the graph
for index, row in dfg.iterrows():
    G.add_edge(
        str(row["node_1"]),
        str(row["node_2"]),
        title=row["edge"],
        weight=row['count']/4
    )

### Calculate communities for coloring the nodes

In [13]:
communities_generator = nx.community.girvan_newman(G)
top_level_communities = next(communities_generator)
next_level_communities = next(communities_generator)
communities = sorted(map(sorted, next_level_communities))
print("Number of Communities = ", len(communities))
print(communities)

Number of Communities =  3
[['ai system', 'artificial general intelligence (agi)', 'artificial intelligence (ai)', 'capped profit model', 'co-founder', 'codex', 'dall·e', 'digital intelligence', 'elon musk', 'ethical considerations', 'friendly ai', 'github copilot', 'gpt (generative pretrained transformer) series', 'gpt-3', 'greg brockman', 'humanity', 'ilya sutskever', 'john schulman', 'neural network-based model', 'openai', 'openai inc', 'openai lp', 'organization', 'research papers', 'sam altman', 'software', 'wojciech zaremba'], ['american business executive', 'apple culture', 'apple inc.', 'apple music', 'apple pay', 'apple tv+', 'apple watch', 'carbon footprint', 'compaq', 'coo', 'encryption', 'environmental sustainability', 'equality', 'human rights', 'ibm', 'intelligent electronics', 'iphone x', 'leadership style', 'lgbtq+ representation', 'macintosh division', 'personal values', 'privacy', 'renewable energy', 'senior vice president for worldwide operations', 'social issues', '

### Create a dataframe for community colors

In [15]:
import seaborn as sns
palette = "hls"

## Now add these colors to communities and make another dataframe
def colors2Community(communities) -> pd.DataFrame:
    ## Define a color palette
    p = sns.color_palette(palette, len(communities)).as_hex()
    random.shuffle(p)
    rows = []
    group = 0
    for community in communities:
        color = p.pop()
        group += 1
        for node in community:
            rows += [{"node": node, "color": color, "group": group}]
    df_colors = pd.DataFrame(rows)
    return df_colors


colors = colors2Community(communities)
colors

Unnamed: 0,node,color,group
0,ai system,#db5f57,1
1,artificial general intelligence (agi),#db5f57,1
2,artificial intelligence (ai),#db5f57,1
3,capped profit model,#db5f57,1
4,co-founder,#db5f57,1
5,codex,#db5f57,1
6,dall·e,#db5f57,1
7,digital intelligence,#db5f57,1
8,elon musk,#db5f57,1
9,ethical considerations,#db5f57,1


### Add colors to the graph

In [16]:
for index, row in colors.iterrows():
    G.nodes[row['node']]['group'] = row['group']
    G.nodes[row['node']]['color'] = row['color']
    G.nodes[row['node']]['size'] = G.degree[row['node']]

In [18]:
from pyvis.network import Network

graph_output_directory = "./docs/index—openaiceo.html"

net = Network(
    notebook=False,
    # bgcolor="#1a1a1a",
    cdn_resources="remote",
    height="900px",
    width="100%",
    select_menu=True,
    # font_color="#cccccc",
    filter_menu=False,
)

net.from_nx(G)
# net.repulsion(node_distance=150, spring_length=400)
net.force_atlas_2based(central_gravity=0.015, gravity=-31)
# net.barnes_hut(gravity=-18100, central_gravity=5.05, spring_length=380)
net.show_buttons(filter_=["physics"])

net.show(graph_output_directory, notebook=False)

./docs/index—openaiceo.html
