In [36]:
from langchain.prompts.chat import (
    ChatPromptTemplate,
    HumanMessagePromptTemplate,
    SystemMessagePromptTemplate,
)
from langchain.schema import HumanMessage, SystemMessage
from langchain_openai import ChatOpenAI
from langchain_openai import OpenAIEmbeddings
from graphdatascience import GraphDataScience
from getpass import getpass
import time

# Set up connection information

In [37]:
# Read the OpenAI API key interactively so it is never written into the notebook.
# An explicit prompt distinguishes this request from the Neo4j password prompt.
openai_api_key = getpass("OpenAI API key: ")

 ········


In [38]:
# Read the Neo4j password interactively so it is never written into the notebook.
# An explicit prompt distinguishes this request from the OpenAI API key prompt.
neo4j_password = getpass("Neo4j password: ")

 ········


In [39]:
# Connection details for the Neo4j database; the password was read with getpass() earlier.
neo4j_uri = "neo4j+s://2fe3bf28.databases.neo4j.io"
neo4j_user = "neo4j"
# Client object used by the later cells to run Cypher through gds.run_cypher(...).
gds = GraphDataScience(neo4j_uri, auth=(neo4j_user, neo4j_password))

Failed to write data to connection ResolvedIPv4Address(('34.28.32.244', 7687)) (ResolvedIPv4Address(('34.28.32.244', 7687)))
Failed to write data to connection IPv4Address(('2fe3bf28.databases.neo4j.io', 7687)) (ResolvedIPv4Address(('34.28.32.244', 7687)))


# Get ThemeGroup nodes that need a summary

In [68]:
# For every ThemeGroup whose summary is still null, gather the distinct
# title/overview maps of the nodes `m` reached via HAS_THEME -> IN_GROUP
# (presumably movies), then return the group's id and descriptions together
# with items picked by apoc.coll.randomItems, limited by $resultLimit.
theme_groups_without_summary_query = """
MATCH (g:ThemeGroup)<-[:IN_GROUP]-()<-[:HAS_THEME]-(m)
WHERE g.summary IS null
WITH g, collect(distinct m{.title, .overview}) AS movieData
RETURN g.id AS id, g.descriptions as themes, apoc.coll.randomItems(movieData, $resultLimit) AS movie_data"""
query_parameters = {"resultLimit": 20}
movie_info = gds.run_cypher(theme_groups_without_summary_query, query_parameters)

In [69]:
# Inspect the query result: one row per theme group with its id, themes and sampled movie_data.
movie_info

Unnamed: 0,id,themes,movie_data
0,4166,"[Annihilator, Exterminate, annihilation, exter...","[{'title': 'KUEBIPUTO', 'overview': 'A punk sc..."
1,4176,"[bar, bars, box, boxes, boxing]","[{'title': 'Jamel Herring vs. Nick Molina', 'o..."
2,4179,"[Captures, captivating, captive, captivity, ca...","[{'title': 'Red Fox', 'overview': 'In a baron ..."
3,4180,"[Cheerleader, Cheerleading, cheer, cheerful]","[{'title': 'Holy Rosita', 'overview': 'Rosita ..."
4,269,"[Christmas celebration, Christmas celebrations]","[{'title': 'Aunt Virginia', 'overview': 'Aunt ..."
...,...,...,...
2910,7085,[cruising],[{'title': 'How cruise ships became a catastro...
2911,7086,[countess],[{'title': 'Le Nozze Di Figaro - Opera Ballet ...
2912,7087,[Ariana Grande],"[{'title': 'yes, and?', 'overview': 'music vid..."
2913,7088,[wheat],"[{'title': 'The Songs of Red Tresses: Woman, l..."


# Set up summary chain

In [70]:
# Chat model that writes the theme-group summaries; it is piped after the prompt in the chain below.
chat = ChatOpenAI(temperature=0, model="gpt-3.5-turbo", openai_api_key=openai_api_key)

In [71]:
# System instructions for the summariser. The model is told to describe what
# the themes have in common, using the example movies only as guidance and
# without naming any titles.
system_message = SystemMessage(
    content="""You are a movie expert. 
    You will be given a list of themes and a list of movies that contain those themes.
    Write two or three sentences that describe what the themes have in common.
    Use the example movie information to guide your description of the themes but do not include the titles of any movies in your sentences.""")
# Full prompt: the fixed system message followed by a human message whose
# {themes} and {movie_data} placeholders are filled in per theme group.
final_prompt = ChatPromptTemplate.from_messages(
    [system_message,
     ("human", """themes: {themes}
     movie information: {movie_data}""")])

## Test the chain

In [72]:
# Themes (column 1) of row 10, the row used for the test invocation below.
movie_info.iloc[10,1]

['Guest', 'Visitors', 'guests', 'visitor']

In [73]:
# Pipe the prompt into the chat model so one call fills in the prompt and queries the model.
chain = final_prompt | chat

In [74]:
# Try the chain on row 10: column 2 holds movie_data and column 1 holds themes.
response = chain.invoke({"movie_data": movie_info.iloc[10,2], "themes": movie_info.iloc[10,1]})

In [75]:
# Show the text of the model's reply.
response.content

"The themes of 'Guest' and 'Visitor' both involve individuals who are not permanent residents or members of a particular place or group. These themes often introduce new characters or elements into a story, creating opportunities for unexpected interactions, revelations, or conflicts. Whether it's a special guest at a ministerial service or an unexpected visitor in someone's dreams, these themes can add intrigue and depth to a narrative."

In [76]:
# Embedding client used later (embed_documents) to turn each summary into a vector.
embeddings = OpenAIEmbeddings(model="text-embedding-3-small", openai_api_key=openai_api_key)

# Send summaries with their embeddings to Neo4j

In [None]:
# Create (if it does not already exist) a cosine-similarity vector index over the
# summaryEmbedding property of ThemeGroup nodes.
# NOTE(review): 1536 must equal the length of the vectors stored in summaryEmbedding;
# presumably this is the output size of the embedding model configured above - confirm.
gds.run_cypher("""CREATE VECTOR INDEX theme_group_summary_vectors IF NOT EXISTS 
                  FOR (g:ThemeGroup)
                  ON (g.summaryEmbedding)
                  OPTIONS {indexConfig: 
                      {`vector.dimensions`: 1536,
                       `vector.similarity_function`: 'cosine'
                       }}
                       """)

In [79]:
def get_theme_group_description(start_index, end_index):
    """Summarise a slice of ``movie_info`` and write the results to Neo4j.

    Rows ``start_index`` (inclusive) to ``end_index`` (exclusive) of the
    notebook-level ``movie_info`` frame are processed as one batch:

    1. ``chain.batch`` is called with each row's ``movie_data`` and ``themes``.
    2. The ``content`` of every response becomes that row's ``summary``.
    3. ``embeddings.embed_documents`` turns the summaries into vectors.
    4. A single Cypher statement sets ``summary`` and the ``summaryEmbedding``
       vector property on the ``ThemeGroup`` node whose ``id`` matches.

    Args:
        start_index: Positional index of the first row of the batch.
        end_index: Positional index one past the last row of the batch.

    Returns:
        A copy of the selected rows with ``response``, ``summary`` and
        ``embedding`` columns added; ``movie_info`` itself is not modified.
    """
    # Work on a copy so the new columns never leak into movie_info.
    batch = movie_info.iloc[start_index:end_index].copy()

    prompt_inputs = batch[['movie_data', 'themes']].to_dict("records")
    batch['response'] = chain.batch(prompt_inputs)
    batch['summary'] = batch['response'].map(lambda message: message.content)

    summary_texts = batch['summary'].tolist()
    batch['embedding'] = embeddings.embed_documents(summary_texts)

    # One round trip per batch: UNWIND fans the records out to their nodes.
    write_query = """
        UNWIND $data AS row
        MATCH (g:ThemeGroup {id:row['id']})
        SET g.summary = row['summary']
        WITH g, row
        CALL db.create.setNodeVectorProperty(g, 'summaryEmbedding', row['embedding']) """
    records = batch[["id", "summary", "embedding"]].to_dict("records")
    gds.run_cypher(write_query, {"data": records})
    return batch

In [81]:
# Summarise every theme group, BATCH_SIZE rows at a time.
# Stepping the range by BATCH_SIZE visits each row exactly once and never
# schedules an empty trailing batch when the row count is a multiple of
# BATCH_SIZE (the last slice is simply shorter when it is not).
BATCH_SIZE = 10
for batch_number, start_index in enumerate(range(0, len(movie_info), BATCH_SIZE), start=1):
    get_theme_group_description(start_index, start_index + BATCH_SIZE)
    # Pause between batches - presumably to stay within API rate limits; confirm.
    time.sleep(3)
    # Report progress every fifth batch.
    if batch_number % 5 == 0:
        print(f"Finished row {start_index + BATCH_SIZE}")



Finished row 50




Finished row 100




Finished row 150




Finished row 200




Finished row 250




Finished row 300




Finished row 350




Finished row 400




Finished row 450




Finished row 500




Finished row 550




Finished row 600




Finished row 650




Finished row 700




Finished row 750




Finished row 800




Finished row 850




Finished row 900




Finished row 950




Finished row 1000




Finished row 1050




Finished row 1100




Finished row 1150




Finished row 1200




Finished row 1250




Finished row 1300




Finished row 1350




Finished row 1400




Finished row 1450




Finished row 1500




Finished row 1550




Finished row 1600




Finished row 1650




Finished row 1700




Finished row 1750




Finished row 1800




Finished row 1850




Finished row 1900




Finished row 1950




Finished row 2000




Finished row 2050




Finished row 2100




Finished row 2150




Finished row 2200




Finished row 2250




Finished row 2300




Finished row 2350




Finished row 2400




Finished row 2450




Finished row 2500




Finished row 2550




Finished row 2600




Finished row 2650




Finished row 2700




Finished row 2750




Finished row 2800




Finished row 2850




Finished row 2900


