In [55]:
import pandas as pd
import openai
import json
import os
import chromadb
from llama_index import SimpleDirectoryReader
from llama_index.node_parser import SimpleNodeParser
from llama_index.langchain_helpers.text_splitter import TokenTextSplitter
from llama_index.node_parser.extractors import KeywordExtractor, MetadataExtractor
from chromadb.utils import embedding_functions
import nltk
import neo4j
from nltk.tokenize import sent_tokenize
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/semantic-
[nltk_data]     server/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
OPENAI_ORGANIZATION_KEY = os.environ.get("OPENAI_ORGANIZATION_KEY")
openai.organization = OPENAI_ORGANIZATION_KEY
openai.api_key = OPENAI_API_KEY

In [81]:
def print_text_with_line_breaks(text, line_length=100):
    for i in range(0, len(text), line_length):
        print(text[i:i+line_length])
def split_into_sentences(text):
    sentences = sent_tokenize(text)
    return sentences

### Graph-based Memory Structure 

In [None]:
## A structure of an event-based memory structure ##

event = {
  "title": "title",
  "summary": "summary",
  "entities": [
    {"entity_1": "name", "relation": ["relation_1", "relation_2"], "tag": "tag"},
    {"entity_2": "name", "relation": ["relation_1", "relation_2"], "tag": "tag"},
  ]
}

## The Goal of structure is to find most relevent memory given a pair of message.

## A structure of an idea-based memory structure ##



In [50]:
def get_openai_event_extraction(sentence: str):
    try:
        systemMessage = """
        Given a text, extract a following information.
        (1) events.
        (2) related named entities and it's tag.
        (3) relation of each entity to event in verb form.
        Try to extract every possible information.
        
        List of entity tags
        [ "ARTWORK", "ARTIST", "DATE", "LOCATION", "MEDIUM", "ART_STYLE", "CRITIQUE", "HISTORICAL_EVENT", "INSPIRATION", "SYMBOLISM", "DIMENSIONS", "MATERIALS", "EXHIBITION", "INTERPRETATION", "INFLUENCE", "FILM", "LEGACY", "PERSON", "ORGANIZATION", "ACCOMPOLISHMENT", "INTEREST", "LANGUAGE", "MONEY", "QUANTITY", "PRODUCT", "OBJECT", "MISC"]
        
        Return in following format (DO NOT OMIT)
        [{"title": "text", "summary": "text", "entities": [{"entity": "text", "tag": "tag", "relation": ["text", "text"]}, {"entity": "text", "tag": "tag", "relation": ["text", "text"]}]}, {"title": "text", "summary": "text", "entities": [{"entity": "text", "tag": "tag", "relation": ["text", "text"]}, {"entity": "text", "tag": "tag", "relation": ["text", "text"]}]}]
        
        If you cannot find any, return {"value": "None"}
        """
    
        completion = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "system", "content": systemMessage},
                {"role": "user", "content": "In 1895, Picasso was traumatized when his seven-year-old sister, Conchita, died of diphtheria."},
                {"role": "assistant", "content": '[{"title": "Death of Conchita", "summary": "The sister of picasso died of diphtheria at age of seven.", "entities": [{"entity": "Picasso", "relation": ["traumatized"], "tag": "ARTIST"}, {"entity": "Conchita", "relation": ["died"], "tag": "PERSON"}]}]'},
                {"role": "user", "content": "In 1897, Picasso's realism began to show a Symbolist influence, for example, in a series of landscape paintings rendered in non-naturalistic violet and green tones."},
                {"role": "assistant", "content": '[{"title": "Symbolist influence on Picasso"s realism", "summary": "In 1897, Picassos realism began to show a Symbolist influence.", "entities": [{"entity": "Picasso", "relation": ["show", "influenced"]. "tag": "ARTIST"}, {"entity": "Symbolist", "relation": ["influences"], "tag": "ART_STYLE"}]}]'},
                {"role": "user", "content": sentence},
            ],
        )
        
        res = completion["choices"][0]["message"]["content"]
        res_jsonify = json.loads(res)
        if("value" in res_jsonify and res_jsonify["value"] == "None"): return []
        return(res_jsonify)
    except Exception as e:
        print(e)
        return []

In [19]:
def get_openai_coref_resolution(sentence: str):
    try:
        systemMessage = """
        Given a text, change all coreference to object it indicates on every possible word. Freely modify the tokens and infer from sentence.
        Return as a plain text.
        """
    
        completion = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "system", "content": systemMessage},
                {"role": "user", "content": "After returning from his honeymoon and in need of money, Picasso started his exclusive relationship with the French-Jewish art dealer Paul Rosenberg. As part of his first duties, Rosenberg agreed to rent the couple an apartment in Paris at his own expense, which was located next to his own house. This was the start of a deep brother-like friendship between two very different men, that would last until the outbreak of World War II."},
                {"role": "assistant", "content": "After returning from Picasso's honeymoon and in need of money, Picasso started Picasso's exclusive relationship with the French-Jewish art dealer Paul Rosenberg. As part of Paul Rosenberg's first duties, Paul Rosenberg agreed to rent the couple an apartment in Paris at Paul Rosenberg's own expense, which was located next to Paul Rosenberg's own house. This was the start of a deep brother-like friendship between two very different Pablo Picasso and Paul Rosenberg, that would last until the outbreak of World War II."},
                {"role": "user", "content": sentence},
            ],
        )
        
        res = completion["choices"][0]["message"]["content"]
        return res
    except Exception as e:
        print(e)
        return ""

### Parse and Tokenize Data as DB

In [None]:
import wikipediaapi

def check_string_in_list(input_string, string_list):
    for item in string_list:
        if item in input_string:
            return True
    return False

def get_wikipedia_article_sentences(article_title):
    wiki_wiki = wikipediaapi.Wikipedia('MyProjectName (hyd6623@naver.com)', 'en')
    page = wiki_wiki.page(article_title)

    if not page.exists():
        raise ValueError("The Wikipedia page doesn't exist.")

    for section in page.sections:
        if check_string_in_list(section.title, ['See also', 'External links', 'References and sources']):
            page.sections.remove(section)

    return page.text

In [60]:
required_text = [".txt"]
reader = SimpleDirectoryReader(
    input_dir="../data/documents/kg/wikipedia", required_exts=required_text, recursive=True
)
documents = reader.load_data()
print(f"Loaded {len(documents)}")

Loaded 2


In [61]:
text_splitter = TokenTextSplitter(separator=".", chunk_size=256, chunk_overlap=32)
parser = SimpleNodeParser(text_splitter=text_splitter)
nodes = parser.get_nodes_from_documents(documents=documents)

In [62]:
print(len(nodes))

69


In [63]:
client = chromadb.PersistentClient(path="../src/representations/")
client.heartbeat()
sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="all-mpnet-base-v2")

In [64]:
collection = client.get_or_create_collection(name="picasso_kg_collection_wikipedia_core", embedding_function=sentence_transformer_ef)

In [65]:
for node in nodes:
    document = node.get_content()
    id = node.id_
    collection.add(
        documents=[document],
        ids=[id],
    )

In [68]:
collection = client.get_or_create_collection(name="picasso_kg_collection_wikipedia_core", embedding_function=sentence_transformer_ef)
collection.count()

69

In [75]:
from llama_index import VectorStoreIndex, SimpleDirectoryReader, ServiceContext
from llama_index.vector_stores import ChromaVectorStore
from llama_index.storage.storage_context import StorageContext
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from llama_index.embeddings import LangchainEmbedding

In [76]:
embed_model = LangchainEmbedding(
    HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
)

In [77]:
vector_store = ChromaVectorStore(chroma_collection=collection)
storage_context = StorageContext.from_defaults(vector_store=vector_store)
service_context = ServiceContext.from_defaults(embed_model=embed_model)

In [78]:
index = VectorStoreIndex.from_vector_store(
    vector_store=vector_store, storage_context=storage_context, service_context=service_context
)

In [87]:
from llama_index.indices.vector_store.retrievers import VectorIndexRetriever

retriever = VectorIndexRetriever(index=index, similarity_top_k=5)
node_reps = retriever.retrieve("Picasso movement surrealism")

for node_rep in node_reps:
    print(node_rep)
    # print_text_with_line_breaks(node_rep.node.get_content())
    print("======")

node=TextNode(id_='76a30283-1306-4028-8a22-50d1b6387c9c', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, hash='24132cf320df0fde319e862071b31bbaf5207ce5c1b4be75f722de78287fa502', text='Les Demoiselles was reproduced for the first time in Europe in the same issue. Yet Picasso exhibited Cubist works at the first Surrealist group exhibition in 1925; the concept of \'psychic automatism in its pure state\' defined in the Manifeste du surréalisme never appealed to him entirely. He did at the time develop new imagery and formal syntax for expressing himself emotionally, "releasing the violence, the psychic fears and the eroticism that had been largely contained or sublimated since 1909", writes art historian Melissa McQuillan. Although this transition in Picasso\'s work was informed by Cubism for its spatial relations, "the fusion of ritual and abandon in the imagery recalls the primitivism of the Demoiselles and the elusive psych

In [48]:
from llama_index.embeddings import LangchainEmbedding
from langchain.embeddings.huggingface import HuggingFaceEmbeddings

embed_model = LangchainEmbedding(
    HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
)

  from .autonotebook import tqdm as notebook_tqdm
  return torch._C._cuda_getDeviceCount() > 0


In [90]:
v1 = embed_model._get_text_embedding("Pablo Picasso")
v2 = embed_model._get_text_embedding("""
The Oxford English Dictionary derives the word straightforwardly from Old English butorflēoge, butter-fly; similar names in Old Dutch and Old High German show that the name is ancient, but modern Dutch and German use different words (vlinder and Schmetterling) and the common name often varies substantially between otherwise closely-related languages. A possible source of the name is the bright yellow male of the brimstone (Gonepteryx rhamni); another is that butterflies were on the wing in meadows during the spring and summer butter season while the grass was growing.
""")
print(v1)
print(v2)

[0.03678399324417114, 0.040112853050231934, -0.0356547050178051, -0.00607075821608305, -0.012495141476392746, -0.025410495698451996, 0.02019895240664482, -0.022528959438204765, 0.0010341688757762313, 0.042296040803194046, 0.0514521561563015, 0.03037460334599018, 0.0014998434344306588, -0.09978123754262924, -0.03404739126563072, -0.02963130921125412, -0.028802242130041122, 0.004321264103055, 0.04940218850970268, -0.024865156039595604, -0.07318923622369766, 0.0039985100738704205, 0.0532587394118309, 0.011898871511220932, 0.032137662172317505, -0.043688755482435226, 0.005282954778522253, 0.027777444571256638, -0.05560794845223427, 0.008879033848643303, 0.0022068603429943323, 0.014995310455560684, -0.04461028426885605, 0.006795097608119249, 1.4342510894493898e-06, 0.015711603686213493, -0.0014237144496291876, -0.047490835189819336, 0.09516052156686783, -0.029460003599524498, -0.002337805461138487, 0.04007486253976822, -0.02276497334241867, -0.015564501285552979, -0.008264465257525444, 0.00

In [91]:
import numpy as np

def cosine_similarity(a, b):
    dot_product = np.dot(a, b)
    magnitude_a = np.linalg.norm(a)
    magnitude_b = np.linalg.norm(b)
    return dot_product / (magnitude_a * magnitude_b)

# Calculate cosine similarity
similarity_score = cosine_similarity(v1, v2)
print("Cosine Similarity:", similarity_score)

Cosine Similarity: 0.08536487487484673


### Save to Local File

In [27]:
def save_json_as_file(file_path:str, item: dict):
    with open(file_path, 'w') as json_file:
        json.dump(item, json_file)  

save_json_as_file("./jsons/temp.json", {"text": "value"})

In [52]:
def replace_whitespace(str: str):
    return str.replace(" ", "-")

In [63]:
for i, node in enumerate(nodes[19:]):
  print(f"working on {i}th node of {len(nodes)}")
  node_coref_resolution = get_openai_coref_resolution(node.text)
  sentences = split_into_sentences(node_coref_resolution)
  for j, sentence in enumerate(sentences):
    print(f"working on {j}th sentence of {len(sentences)} in {i}th node")
    events = get_openai_event_extraction(sentence)
    for event in events:
      event["node_id"] = node.id_
      filename = replace_whitespace(event['title'])
      save_json_as_file(f"./jsons/{node.id_ + '-' + filename}.json", event)

working on 0th node of 69
working on 0th sentence of 8 in 0th node
working on 1th sentence of 8 in 0th node
working on 2th sentence of 8 in 0th node
working on 3th sentence of 8 in 0th node
working on 4th sentence of 8 in 0th node
working on 5th sentence of 8 in 0th node
working on 6th sentence of 8 in 0th node
working on 7th sentence of 8 in 0th node
working on 1th node of 69
working on 0th sentence of 8 in 1th node
working on 1th sentence of 8 in 1th node
working on 2th sentence of 8 in 1th node
working on 3th sentence of 8 in 1th node
working on 4th sentence of 8 in 1th node
working on 5th sentence of 8 in 1th node
working on 6th sentence of 8 in 1th node
working on 7th sentence of 8 in 1th node
working on 2th node of 69
working on 0th sentence of 6 in 2th node
working on 1th sentence of 6 in 2th node
working on 2th sentence of 6 in 2th node
working on 3th sentence of 6 in 2th node
working on 4th sentence of 6 in 2th node
working on 5th sentence of 6 in 2th node
working on 3th node 

In [51]:
v = get_openai_event_extraction("Prominent in the composition are a gored horse, a bull, screaming women, a dead baby, a dismembered soldier, and flames.")
print(v)

[{'title': 'Prominent elements in the composition', 'summary': 'The composition includes a gored horse, a bull, screaming women, a dead baby, a dismembered soldier, and flames.', 'entities': [{'entity': 'gored horse', 'relation': ['included'], 'tag': 'OBJECT'}, {'entity': 'bull', 'relation': ['included'], 'tag': 'ANIMAL'}, {'entity': 'screaming women', 'relation': ['included'], 'tag': 'OBJECT'}, {'entity': 'dead baby', 'relation': ['included'], 'tag': 'OBJECT'}, {'entity': 'dismembered soldier', 'relation': ['included'], 'tag': 'OBJECT'}, {'entity': 'flames', 'relation': ['included'], 'tag': 'OBJECT'}]}]


### Send to Neo4j Database

In [3]:
NEO4J_PASSWORD = os.environ.get("NEO4J_PASSWORD")
NEO4J_LINK = os.environ.get("NEO4J_LINK")

def connect_to_auradb():
    uri = NEO4J_LINK
    username = "neo4j"
    password = NEO4J_PASSWORD

    driver = neo4j.GraphDatabase.driver(uri, auth=(username, password))
    return driver

driver = connect_to_auradb()

In [4]:
print(driver.get_server_info())
print(driver.verify_connectivity())

<neo4j.api.ServerInfo object at 0x7fc971534850>
None


In [42]:
def replace_ud(input_string):
    if input_string == 'Picasso': input_string = 'Pablo_Picasso'
    return input_string.replace(" ", "_").replace("'", "").replace("(", "").replace(")", "").replace("-", "_")

In [44]:
def add_event_and_entities(event, entities):
    parameters = {}
    event_name = replace_ud(event["title"])
    event_summary = replace_ud(event["summary"])
    node_id = event["node_id"]

    try:
        ## Add Event
        summary = driver.execute_query(
            f"""
            MERGE (p:EVENT {"{name: '" + event_name + "'}"})
            SET p.summary = '{event_summary}'
            SET p.id = '{node_id}'
            """,
            parameters_=parameters,
            database_="neo4j",  
        ).summary

        print("Created {nodes_created} nodes in {time} ms.".format(
            nodes_created=summary.counters.nodes_created,
            time=summary.result_available_after
        ))
        
        for entity in entities:
            entity_name = replace_ud(entity["entity"])
            entity_type = entity["tag"]
            
            summary = driver.execute_query(
                f"""
                MERGE (p:{entity_type} {"{name: '" + entity_name + "'}"})
                """,
                database_="neo4j",  
            ).summary

            print("Created {nodes_created} nodes in {time} ms.".format(
                nodes_created=summary.counters.nodes_created,
                time=summary.result_available_after
            ))
            
            relations = entity["relation"]
            
            for relation in relations:
                relation_name = replace_ud(relation)
                summary = driver.execute_query(
                    f"""
                    MATCH (p:EVENT {"{name: '" + event_name + "'}"}), (m:{entity_type} {"{name: '" + entity_name + "'}"})
                    MERGE (p)-[:{relation_name}]->(m)
                    """,
                    database_="neo4j",  
                ).summary

            print("Created {relationships_created} relationships in {time} ms.".format(
                relationships_created=summary.counters.relationships_created,
                time=summary.result_available_after
            ))
    except Exception as e:
        print(e)



In [None]:
def add_two_node_and_relationship(head_node: str, head_node_type: str, tail_node: str, tail_node_type: str, relation: str):
    parameters = {}

    try:
        summary = driver.execute_query(
            f"""
            MERGE (:{head_node_type} {"{name: '" + head_node + "'}"})
            MERGE (:{tail_node_type} {"{name: '" + tail_node + "'}"})
            """,
            parameters_=parameters,
            database_="neo4j",  
        ).summary

        print("Created {nodes_created} nodes in {time} ms.".format(
            nodes_created=summary.counters.nodes_created,
            time=summary.result_available_after
        ))

        summary = driver.execute_query(
            f"""
            MATCH (p:{head_node_type} {"{name: '" + head_node + "'}"}), (m:{tail_node_type} {"{name: '" + tail_node + "'}"})
            MERGE (p)-[:{relation}]->(m)
            """,
            database_="neo4j",  
        ).summary

        print("Created {relationships_created} relationships in {time} ms.".format(
            relationships_created=summary.counters.relationships_created,
            time=summary.result_available_after
        ))
    except Exception as e:
        print(e)



In [15]:

def load_json_files_from_directory(directory_path):
    json_data_list = []

    for filename in os.listdir(directory_path):
        if filename.endswith('.json'):
            file_path = os.path.join(directory_path, filename)
            with open(file_path, 'r') as file:
                json_data = json.load(file)
                json_data_list.append(json_data)

    return json_data_list

In [18]:
json_list = load_json_files_from_directory("../data/jsons/events-attempt-1")

In [45]:
for json in json_list:
  add_event_and_entities(json, json["entities"])

Created 1 nodes in 10 ms.
Created 1 nodes in 20 ms.
Created 1 relationships in 73 ms.
Created 1 nodes in 19 ms.
Created 1 relationships in 63 ms.
Created 1 nodes in 9 ms.
Created 0 nodes in 2 ms.
Created 1 relationships in 66 ms.
Created 1 nodes in 19 ms.
Created 1 relationships in 43 ms.
Created 1 nodes in 18 ms.
Created 1 relationships in 38 ms.
Created 1 nodes in 12 ms.
Created 0 nodes in 1 ms.
Created 1 relationships in 80 ms.
Created 1 nodes in 19 ms.
Created 1 relationships in 82 ms.
Created 1 nodes in 36 ms.
Created 1 relationships in 70 ms.
Created 1 nodes in 33 ms.
Created 1 relationships in 76 ms.
Created 1 nodes in 30 ms.
Created 1 relationships in 66 ms.
Created 1 nodes in 9 ms.
Created 0 nodes in 2 ms.
Created 1 relationships in 67 ms.
Created 1 nodes in 19 ms.
Created 1 relationships in 64 ms.
Created 1 nodes in 11 ms.
Created 0 nodes in 2 ms.
Created 1 relationships in 78 ms.
Created 1 nodes in 32 ms.
Created 1 relationships in 77 ms.
Created 1 nodes in 22 ms.
Created 1 

Failed to read from defunct connection IPv4Address(('4c6bde68.databases.neo4j.io', 7687)) (ResolvedIPv4Address(('34.126.64.110', 7687)))
Failed to read from defunct connection ResolvedIPv4Address(('34.126.64.110', 7687)) (ResolvedIPv4Address(('34.126.64.110', 7687)))
Unable to retrieve routing information
Transaction failed and will be retried in 0.9138294258973667s (Unable to retrieve routing information)


Failed to read from defunct connection IPv4Address(('4c6bde68.databases.neo4j.io', 7687)) (ResolvedIPv4Address(('34.126.64.110', 7687)))


Unable to retrieve routing information
Transaction failed and will be retried in 2.3100131868602185s (Unable to retrieve routing information)
Unable to retrieve routing information
Transaction failed and will be retried in 3.3175943480776544s (Unable to retrieve routing information)
Unable to retrieve routing information
Transaction failed and will be retried in 8.876702018971553s (Unable to retrieve routing information)


KeyboardInterrupt: 