In [2]:
from langchain_openai import OpenAIEmbeddings
from graphdatascience import GraphDataScience
from getpass import getpass
import pandas as pd
import os
import numpy as np
import nltk
from nltk.stem import WordNetLemmatizer
from IPython.display import display
import string

  from .autonotebook import tqdm as notebook_tqdm


# Set up connection to Neo4j

In [3]:
neo4j_password = getpass()

 ········


In [4]:
openai_api_key = getpass()

 ········


In [5]:
# Connection settings for the Neo4j instance. The NEO4J_URI / NEO4J_USER environment
# variables, when set, override the defaults below so the notebook can be pointed at
# another database without editing this cell. The password comes from the getpass prompt.
neo4j_uri = os.environ.get("NEO4J_URI", "neo4j+s://2fe3bf28.databases.neo4j.io")
neo4j_user = os.environ.get("NEO4J_USER", "neo4j")
gds = GraphDataScience(neo4j_uri, auth=(neo4j_user, neo4j_password))

# Find themes with common stems

In [6]:
# Make sure the WordNet corpus is available locally, then create the lemmatizer
# that is applied to the theme descriptions in the cells below.
nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/nathansmith/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [7]:
# Load the description of every Theme node, lower-cased, into a one-column DataFrame.
theme_df = gds.run_cypher("""
    MATCH (t:Theme) 
    RETURN toLower(t.description) AS description 
    """)

In [8]:
# Lemmatize each lower-cased description; the lemmatizer is called once per
# description with the whole string as its argument.
theme_df['stem'] = theme_df['description'].map(lemmatizer.lemmatize)

In [9]:
# Translation table that deletes every ASCII punctuation character
# (each code point maps to None, which str.translate treats as "remove").
table = {ord(ch): None for ch in string.punctuation}

In [10]:
# Build the grouping key: the stem with spaces removed and punctuation stripped,
# so that e.g. "rock star" and "rockstar" end up with the same key.
theme_df['no_space_stem'] = (
    theme_df['stem']
    .str.replace(" ", "")
    .str.translate(table)
)

In [11]:
theme_df[theme_df['description'] == 'haunted house']

Unnamed: 0,description,stem,no_space_stem
28937,haunted house,haunted house,hauntedhouse


In [12]:
# For every no_space_stem key, collect the distinct descriptions that share it.
# np.unique receives each group's 'description' values (the grouping column is
# excluded via include_groups=False) and returns them de-duplicated and sorted.
grouped_themes = theme_df[['description', 'no_space_stem']].groupby('no_space_stem').apply(np.unique, include_groups=False)

In [13]:
# Keep only the keys that are shared by more than one distinct description and
# turn the result into a two-column frame: the key ('stem') and its descriptions.
shared_stems = grouped_themes.loc[grouped_themes.map(len) > 1]
shared_stems_df = shared_stems.reset_index().set_axis(['stem', 'descriptions'], axis=1)
shared_stems_df.shape

(1533, 2)

In [14]:
shared_stems_df.sample(10)

Unnamed: 0,stem,descriptions
774,lily,"[lilies, lily]"
523,fly,"[flies, fly]"
1156,rumor,"[rumor, rumors]"
1069,punk,"[punk, punks]"
294,copper,"[copper, coppers]"
1325,summer,"[summer, summers]"
1146,rockstar,"[rock star, rockstar]"
1440,undertaker,"[undertaker, undertakers]"
257,colonial,"[colonial, colonials]"
35,american,"[american, americans]"


## Send stems shared by multiple themes to Neo4j

In [14]:
# Declare Stem.stem as a node key (created only if the constraint does not exist yet),
# so each Stem node is identified by its stem value.
gds.run_cypher("""CREATE CONSTRAINT stem_node_key IF NOT EXISTS FOR (s:Stem) REQUIRE s.stem IS NODE KEY""")

In [15]:
# For each shared stem: MERGE its Stem node, attach every Theme whose lower-cased
# description appears in the stem's description list with a HAS_STEM relationship,
# and store that list on the Stem node. stemmedCount is the number of result rows.
gds.run_cypher("""UNWIND $data AS row
MERGE (s:Stem {stem:row['stem']})
WITH row, s
UNWIND row['descriptions'] AS description
MATCH (t:Theme)
WHERE toLower(t.description) = description
MERGE (t)-[:HAS_STEM]->(s)
SET s.descriptions = row['descriptions']
RETURN count(*) AS stemmedCount""",
              {"data": shared_stems_df.to_dict("records")})

Unnamed: 0,stemmedCount
0,3111


## Aggregate theme vectors for themes that share a stem

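Set the stem's vector to the embedding of whichever of its themes shows up most often in movies. (The cells that follow then overwrite this value with the mean of the embeddings of all themes that share the stem.)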

In [16]:
# For each Stem, order its themes by the number of incoming HAS_THEME relationships
# (descending) and copy the embedding of the first one onto the Stem node.
# NOTE: a later cell writes the mean embedding to the same property, replacing this value.
gds.run_cypher("""
MATCH (s:Stem)
CALL {
WITH s
MATCH (s)<-[:HAS_STEM]-(t)
WITH s, t
ORDER BY count{(t)<-[:HAS_THEME]-()} DESC
WITH s, collect(t) AS themes
CALL db.create.setNodeVectorProperty(s, "embedding", themes[0].embedding)
}
""")

In [17]:
# Fetch, for every Stem, the list of embeddings of the nodes linked to it by HAS_STEM.
shared_stem_df = gds.run_cypher("""
MATCH (s:Stem)<-[:HAS_STEM]-(t)
RETURN s.stem AS stem, collect(t.embedding) AS embeddings
""")

In [18]:
def get_mean_vector(row):
    """Return the element-wise mean of the vectors in ``row['embeddings']``.

    ``row`` is any mapping-like object (e.g. a DataFrame row) whose
    ``'embeddings'`` entry is a sequence of equal-length numeric vectors.
    The result is a plain Python list with one value per vector component.
    """
    stacked = np.array(row['embeddings'])
    centroid = stacked.mean(axis=0)
    return centroid.tolist()

In [19]:
# Average each stem's collected embeddings, one row at a time, into a new column.
shared_stem_df['mean_embedding'] = shared_stem_df.apply(get_mean_vector, axis=1)

In [20]:
# Write the averaged vector to each Stem's "embedding" property. This replaces the
# value set earlier from the stem's most frequent theme.
gds.run_cypher("""
    UNWIND $data AS row
    MATCH (s:Stem {stem: row['stem']})
    CALL db.create.setNodeVectorProperty(s, "embedding", row['mean_embedding'])""",
                   {"data":shared_stem_df[['stem', 'mean_embedding']].to_dict("records")})


# Create projection
Include stems and individual themes that do not share a stem with another theme.

Use the theme embedding and the aggregated embedding for stems.

In [22]:
# Add the Groupable label to every Stem or Theme node that has no outgoing HAS_STEM
# relationship. HAS_STEM is created from Theme to Stem above, so this selects the
# stems plus the themes that were not attached to any stem.
gds.run_cypher("""
MATCH (t:Stem|Theme)
WHERE NOT EXISTS {(t)-[:HAS_STEM]->()}
SET t:Groupable""")

In [24]:
# Create (if missing) a cosine vector index over the 1536-dimensional embedding
# property of Groupable nodes.
# NOTE(review): the index name is spelled "groupbale_vectors"; the vector query later
# in this notebook uses the same spelling, so rename both together or not at all.
gds.run_cypher("""CREATE VECTOR INDEX groupbale_vectors IF NOT EXISTS 
                  FOR (t:Groupable)
                  ON (t.embedding)
                  OPTIONS {indexConfig: 
                      {`vector.dimensions`: 1536,
                       `vector.similarity_function`: 'cosine'
                       }}
                       """)

In [23]:
# Project all Groupable nodes, with their embedding property, into an in-memory GDS
# graph named "themes". The target node is null, so no relationships are projected;
# `result` holds the projection summary returned by the client.
g_themes, result = gds.graph.cypher.project("""
MATCH (t:Groupable)
RETURN gds.graph.project("themes", 
t, null,
{sourceNodeProperties: t{.embedding},
targetNodeProperties: null}
)""")

In [24]:
result

{'relationshipCount': 0,
 'graphName': 'themes',
 'query': '\nMATCH (t:Groupable)\nRETURN gds.graph.project(******, \nt, ******,\n{sourceNodeProperties: t{.embedding},\ntargetNodeProperties: ******}\n)',
 'projectMillis': 4917,
 'configuration': {'readConcurrency': 4,
  'undirectedRelationshipTypes': [],
  'jobId': '75218b37-abf9-41fd-a6c2-567806a51835',
  'logProgress': True,
  'query': '\nMATCH (t:Groupable)\nRETURN gds.graph.project(******, \nt, ******,\n{sourceNodeProperties: t{.embedding},\ntargetNodeProperties: ******}\n)',
  'inverseIndexedRelationshipTypes': [],
  'creationTime': neo4j.time.DateTime(2024, 3, 25, 14, 2, 56, 569765655, tzinfo=<UTC>)},
 'nodeCount': 38513}

# Test some theme pairs to find a score above which you think themes are probably duplicated
Get a sense of the range of similarity values for the top 3 nearest neighbors of each node (`topK=3` in the cell below).

Note that KNN returns cosine similarity normalized to the range [0, 1]. 

If you run gds.similarity.cosine() you get similarity in the range [-1,1]

In [25]:
# Run KNN in stats mode over the embedding property with 3 neighbours per node;
# only summary statistics (including the similarity distribution) are returned.
knn_stats = gds.knn.stats(g_themes, topK=3, nodeProperties = "embedding")

In [26]:
knn_stats['similarityDistribution']

{'min': 0.6232261657714844,
 'p5': 0.6851768493652344,
 'max': 0.9789695739746094,
 'p99': 0.9272689819335938,
 'p1': 0.6668815612792969,
 'p10': 0.6967124938964844,
 'p90': 0.8597068786621094,
 'p50': 0.7637672424316406,
 'p25': 0.722442626953125,
 'p75': 0.8146820068359375,
 'p95': 0.8844757080078125,
 'mean': 0.7717795717981278,
 'p100': 0.9789695739746094,
 'stdDev': 0.06212606247669753}

## Given a theme, check to see how many neighbors are duplicates and the similarity score for the first neighbor you would consider to be a different theme.

db.index.vector.queryNodes() returns cosine similarity normalized to the range [0, 1]

In [28]:
# For every Groupable node that matches one of the probe themes (either through its
# own description or through membership in its descriptions list), ask the vector
# index for its k + 1 nearest nodes, drop the node itself, and return readable names
# with the similarity score. Nodes without a description are shown as their
# descriptions list joined with ", ".
top_k_df = gds.run_cypher("""
MATCH (t1:Groupable) WHERE t1.description IN $themeList OR ANY(str IN $themeList WHERE str IN t1.descriptions)
CALL db.index.vector.queryNodes("groupbale_vectors", $k + 1, t1.embedding) YIELD node, score
WITH t1, node AS t2, score
WHERE t1 <> t2
RETURN coalesce(t1.description, reduce(str=t1.descriptions[0], x in t1.descriptions[1..] | str + ", " + x)) AS theme1,
coalesce(t2.description, reduce(str=t2.descriptions[0], x in t2.descriptions[1..] | str + ", " + x)) AS theme2,
score
ORDER BY theme1, score DESC""",
               {"k": 20,
                "themeList": ['friend', 'dance floor', 'fast food', 'Influential artist', 'antics', 
                              'ants', 'underwater', 'Africa', 'chatgpt', 'coming-of-age']})

In [26]:
top_k_df['theme1'].unique()

array(['Africa', 'Fast-food burger', 'Influential artist', 'antics',
       'ants', 'chat-gpt, chatgpt', 'coming of age, coming-of-age',
       'dance floor, dancefloor', 'friend, friends', 'underwater'],
      dtype=object)

In [29]:
top_k_df[top_k_df['theme1'] == 'friend, friends']
# top 2 cutoff 0.825685

Unnamed: 0,theme1,theme2,score
140,"friend, friends",friend group,0.843112
141,"friend, friends","friendship, friendships",0.836408
142,"friend, friends",Friendly,0.825685
143,"friend, friends",school friends,0.823544
144,"friend, friends",close friends,0.818371
145,"friend, friends",partner,0.817516
146,"friend, friends",old friend,0.811989
147,"friend, friends",true friend,0.811861
148,"friend, friends",rich friends,0.809388
149,"friend, friends",best friend,0.807411


In [30]:
top_k_df[top_k_df['theme1'] == 'dance floor, dancefloor']
# top 0 cutoff 0.871256

Unnamed: 0,theme1,theme2,score
120,"dance floor, dancefloor",dance party,0.871256
121,"dance floor, dancefloor",Dance music,0.83875
122,"dance floor, dancefloor","dance, dances",0.838403
123,"dance floor, dancefloor",dance time,0.824521
124,"dance floor, dancefloor",go-go dancer,0.820104
125,"dance floor, dancefloor",dancing,0.817867
126,"dance floor, dancefloor",dance routines,0.815567
127,"dance floor, dancefloor",dance class,0.814394
128,"dance floor, dancefloor",Dance diva,0.81409
129,"dance floor, dancefloor",dance studio,0.813564


In [29]:
top_k_df[top_k_df['theme1'] == 'fast food']
# top 2 cutoff 0.804882

Unnamed: 0,theme1,theme2,score
140,fast food,fast food restaurant,0.936906
141,fast food,Fast-food burger,0.845039
142,fast food,street food,0.804882
143,fast food,bad food,0.793561
144,fast food,night food,0.779074
145,fast food,family restaurant,0.768764
146,fast food,"burger, burgers",0.767863
147,fast food,food service,0.767081
148,fast food,KFC,0.765815
149,fast food,traditional food,0.763055


In [27]:
top_k_df[top_k_df['theme1'] == 'Influential artist']
# top 9 cutoff 0.787514

Unnamed: 0,theme1,theme2,score
40,Influential artist,influential figure,0.858566
41,Influential artist,influential man,0.857857
42,Influential artist,influential designer,0.839318
43,Influential artist,influential,0.835377
44,Influential artist,Prominent artists,0.832661
45,Influential artist,Era-defining artists,0.813614
46,Influential artist,legendary artists,0.810524
47,Influential artist,visionary artist,0.802965
48,Influential artist,Art pioneers,0.791674
49,Influential artist,Music legend,0.787514


In [33]:
top_k_df[top_k_df['theme1'] == 'antics']
# top 0 cutoff 0.854217

Unnamed: 0,theme1,theme2,score
40,antics,ants,0.854217
41,antics,scope,0.829578
42,antics,aliases,0.827421
43,antics,iteration,0.823729
44,antics,logic,0.822214
45,antics,dream project,0.821021
46,antics,webs,0.819675
47,antics,ambiguous,0.815373
48,antics,genesis,0.813637
49,antics,Elsewhere,0.809501


In [34]:
top_k_df[top_k_df['theme1'] == 'ants']
# top 0 cutoff 0.854217

Unnamed: 0,theme1,theme2,score
60,ants,antics,0.854217
61,ants,"agent, agents",0.836309
62,ants,"rat, rats",0.814807
63,ants,webs,0.812752
64,ants,ads,0.807464
65,ants,pies,0.80552
66,ants,target,0.804613
67,ants,stocks,0.802675
68,ants,mutations,0.797931
69,ants,"name, names",0.795486


In [35]:
top_k_df[top_k_df['theme1'] == 'underwater']
# top 4 cutoff 0.880904

Unnamed: 0,theme1,theme2,score
180,underwater,undersea,0.946337
181,underwater,subaquatic,0.905288
182,underwater,underwater world,0.896511
183,underwater,undersea world,0.881088
184,underwater,underwater music,0.880904
185,underwater,underwater adventure,0.875346
186,underwater,underwater exploration,0.873639
187,underwater,deep sea,0.857809
188,underwater,underwater ballet,0.847281
189,underwater,submarine,0.846147


In [36]:
top_k_df[top_k_df['theme1'] == 'Africa']
# top 0 cutoff 0.829958

Unnamed: 0,theme1,theme2,score
0,Africa,Asia,0.829958
1,Africa,Europe,0.793405
2,Africa,African,0.79219
3,Africa,"america, americas",0.780679
4,Africa,Atlantic,0.775041
5,Africa,Asian,0.750807
6,Africa,African Union,0.750721
7,Africa,Pan-African,0.749782
8,Africa,African nation,0.748957
9,Africa,Horn of Africa,0.748596


In [37]:
top_k_df[top_k_df['theme1'] == 'chat-gpt, chatgpt']
# top 1 cutoff 0.725718

Unnamed: 0,theme1,theme2,score
80,"chat-gpt, chatgpt",GPT-3,0.82086
81,"chat-gpt, chatgpt",Hangouts,0.725718
82,"chat-gpt, chatgpt",G,0.721377
83,"chat-gpt, chatgpt",CG,0.718669
84,"chat-gpt, chatgpt",GeGeGe,0.713018
85,"chat-gpt, chatgpt","game show, gameshow",0.711933
86,"chat-gpt, chatgpt",CGI,0.711643
87,"chat-gpt, chatgpt",competitive gaming,0.709213
88,"chat-gpt, chatgpt",Online Game,0.708129
89,"chat-gpt, chatgpt",GUI,0.706613


In [38]:
top_k_df[top_k_df['theme1'] == 'coming of age, coming-of-age']
# top 3 cutoff 0.792982

Unnamed: 0,theme1,theme2,score
100,"coming of age, coming-of-age","growing up, growing-up",0.826383
101,"coming of age, coming-of-age",adolescence,0.815145
102,"coming of age, coming-of-age",puberty,0.797296
103,"coming of age, coming-of-age",boyhood,0.792982
104,"coming of age, coming-of-age",post-adolescence,0.786747
105,"coming of age, coming-of-age",lost youth,0.770889
106,"coming of age, coming-of-age",youthful adventure,0.768444
107,"coming of age, coming-of-age",Transition to adulthood,0.763974
108,"coming of age, coming-of-age",teenage angst,0.762358
109,"coming of age, coming-of-age",youth romance,0.759862


# Based on the exploration above run KNN with a similarityCutoff you choose

In [39]:
# Similarity threshold chosen from the exploration above; it is reused later when the
# relationship weights are rescaled.
similarity_cutoff = 0.83

# Add IS_SIMILAR relationships (weighted by "similarity") to the in-memory graph:
# at most 2 neighbours per node, and only pairs scoring at least similarity_cutoff.
knn_result = gds.knn.mutate(g_themes,
                            mutateRelationshipType = "IS_SIMILAR",
                            mutateProperty = "similarity",
                            nodeProperties = "embedding",
                            topK = 2,
                            sampleRate = 1.0,
                            perturbationRate = 0.5,
                            deltaThreshold = 0.00001,
                            similarityCutoff = similarity_cutoff)

## Find themes that discovered no neighbors above the similarity threshold you selected

In [40]:
# List the Theme nodes that are neither source nor target of any IS_SIMILAR
# relationship in the "themes" projection and that have no HAS_STEM relationship,
# together with the number of incoming HAS_THEME relationships, most used first.
no_sim_df = gds.run_cypher("""
CALL gds.graph.relationships.stream("themes", ["IS_SIMILAR"]) yield sourceNodeId, targetNodeId
UNWIND [gds.util.asNode(sourceNodeId), gds.util.asNode(targetNodeId)] AS nodeWithSim
WITH collect(DISTINCT nodeWithSim) AS nodesWithSim
MATCH (t:Theme) WHERE NOT t IN nodesWithSim
AND NOT EXISTS {(t)-[:HAS_STEM]->()}
RETURN t.description AS description, count{ (t)<-[:HAS_THEME]-() } AS movieCount
ORDER BY movieCount desc""")

In [41]:
no_sim_df

Unnamed: 0,description,movieCount
0,revenge,175
1,nature,148
2,art,79
3,love story,78
4,forbidden love,74
...,...,...
28178,viagem,1
28179,vibrant plea,1
28180,vibraphonist,1
28181,vicar scandal,1


In [42]:
no_sim_df[no_sim_df['movieCount'] > 1]['description'].count()

4725

## Examine theme pairs for high degree nodes
For Stem nodes, turn their list of descriptions into a comma-separated string before exporting.

In [43]:
# Stream every IS_SIMILAR relationship with its similarity from the "themes"
# projection and resolve both endpoints to a readable name: the node's description,
# or, when that is missing, its descriptions list joined with ", ".
# Rows are ordered by ascending similarity.
similarity_result = gds.run_cypher("""
    CALL gds.graph.relationshipProperty.stream("themes", "similarity", ["IS_SIMILAR"])
    YIELD sourceNodeId, targetNodeId, propertyValue AS similarity
    WITH gds.util.asNode(sourceNodeId) AS source, gds.util.asNode(targetNodeId) AS target, similarity
    RETURN coalesce(source.description, reduce(s = source.descriptions[0], d in source.descriptions[1..] | s + ", " + d)) AS source, 
    coalesce(target.description, reduce(s = target.descriptions[0], d in target.descriptions[1..] | s + ", " + d)) AS target, similarity
    ORDER BY similarity""")
                    

What are the nodes with the most inbound IS_SIMILAR relationships?

In [44]:
similarity_result.groupby("target")['source'].count().sort_values(ascending=False).head(20)

target
Christmas magic          16
Christmas spirit         14
family relationships     13
haunted mansion          12
childhood                12
marriage struggles       11
surreal                  11
aging queer              11
Indigenous cultures      11
filmmaking               11
Wild journey             10
missing mother           10
elderly woman            10
relationship troubles    10
mother, mothers          10
future world             10
Gay professional         10
crime gangs              10
queer                    10
Art space                10
Name: source, dtype: int64

In [45]:
similarity_result[similarity_result['target']=="haunted house"]

Unnamed: 0,source,target,similarity
6126,Haunted train,haunted house,0.856283
8272,haunted book,haunted house,0.869158
11382,haunted museum,haunted house,0.895871
11449,haunted hospital,haunted house,0.896431
11925,haunted villa,haunted house,0.90346
13448,haunted mansion,haunted house,0.944185


In [46]:
similarity_result[similarity_result['target']=="hidden secrets"]

Unnamed: 0,source,target,similarity
2472,hidden,hidden secrets,0.839962
5796,shocking secrets,hidden secrets,0.854482
7633,unsettling secrets,hidden secrets,0.865214
7940,hidden feelings,hidden secrets,0.866977
8262,secrets exposed,hidden secrets,0.86908
9915,forbidden secrets,hidden secrets,0.881313
11526,past secrets,hidden secrets,0.897386
11575,Dirty secrets,hidden secrets,0.898116
12324,dark secrets,hidden secrets,0.90953
12415,"secret, secrets",hidden secrets,0.910885


In [47]:
similarity_result[similarity_result['target']=="Christmas magic"]

Unnamed: 0,source,target,similarity
4547,Christmas songs,Christmas magic,0.848337
7137,Christmas story,Christmas magic,0.86215
8239,Christmas competition,Christmas magic,0.868947
8547,Christmas traditions,Christmas magic,0.871038
9156,Xmas,Christmas magic,0.875206
9240,Christmas mystery,Christmas magic,0.875824
9555,Christmas tradition,Christmas magic,0.878228
9962,christmas,Christmas magic,0.881683
10449,Christmas ambience,Christmas magic,0.885654
10761,Christmas tree,Christmas magic,0.888673


In [48]:
similarity_result[similarity_result['target']=="documentary video"]

Unnamed: 0,source,target,similarity
4243,meta-documentary,documentary video,0.84684
7911,animated documentary,documentary video,0.866787
9931,documentary crew,documentary video,0.88145
11977,documentary interviews,documentary video,0.903989
13339,"documentaries, documentary",documentary video,0.938533
13528,documentary film,documentary video,0.950835


# Write weakly connected components with threshold at your KNN similarityCutoff or a higher value
I'm bumping the threshold up to 0.875

In [49]:
# Compute weakly connected components over IS_SIMILAR, using "similarity" as the
# weight with a threshold of 0.875, and write the component id to the database as
# wccId. minComponentSize=2 restricts the written results to components with at
# least two nodes (see nodePropertiesWritten vs. componentCount in the output).
gds.wcc.write(g_themes,
              writeProperty = "wccId",
              relationshipTypes = ["IS_SIMILAR"],
              relationshipWeightProperty = "similarity",
              threshold = 0.875,
              minComponentSize=2)

writeMillis                                                             41
nodePropertiesWritten                                                 3605
componentCount                                                       36237
componentDistribution    {'min': 1, 'p5': 1, 'max': 29, 'p999': 7, 'p99...
postProcessingMillis                                                    34
preProcessingMillis                                                      0
computeMillis                                                           23
configuration            {'writeProperty': 'wccId', 'jobId': '9615c40e-...
Name: 0, dtype: object

## Examine the largest communities at the KNN value you selected

In [50]:
pd.set_option('display.max_colwidth', None)

In [51]:
# Show the 30 largest components among Theme nodes that received a wccId,
# with the descriptions of the themes in each. Only :Theme nodes are matched here.
gds.run_cypher("""
MATCH (t:Theme)
WHERE t.wccId IS NOT NULL
WITH t.wccId AS id, collect(t.description) AS themes
RETURN id, size(themes) AS themeCount, themes
ORDER BY themeCount DESC
LIMIT 30""")

Unnamed: 0,id,themeCount,themes
0,1964,29,"[Christmas ambience, Christmas break, Christmas celebration, Christmas celebrations, Christmas competition, Christmas decorations, Christmas eve, Christmas gift, Christmas love, Christmas magic, Christmas market, Christmas memories, Christmas movie, Christmas mystery, Christmas nightmare, Christmas party, Christmas setting, Christmas spirit, Christmas terror, Christmas tradition, Christmas traditions, Christmas tree, Christmas wedding, Christmas wish, Christmas wishes, Holiday traditions, Holiday wishes, Xmas, christmas]"
1,653,25,"[Russian Gas, Russian actor, Russian army, Russian cinema, Russian composer, Russian criminals, Russian culture, Russian defiance, Russian director, Russian forces, Russian literature, Russian mission, Russian mother, Russian occupation, Russian oligarch, Russian owner, Russian performers, Russian setting, Russian soldier, Russian spies, Russian students, Russian teacher, Russian team, Russian town, Russian troops]"
2,2094,23,"[Gay Agenda, Gay awakening, Gay conversion, Gay couple, Gay culture, Gay elder, Gay history, Gay killer, Gay life, Gay love, Gay professional, Gay relationship, Gay relationships, LGBTQ+ couple, Queer couple, Same-sex love, gay artist, gay identity, gay lord, gay man, gay rights, gay romance, same-sex relationships]"
3,1167,18,"[Horrifying nightmare, Surreal dream, Surreal fantasy, Surreal imagery, Surreal landscapes, bad dream, mystical landscape, nightmare begins, subconscious nightmares, surreal, surreal animation, surreal dreams, surreal landscape, surreal nightmare, surreal nightmares, surreal vision, surrealism, surrealist]"
4,1681,17,"[Dirty secret, Dirty secrets, Hidden desire, Secret desire, Secret passion, dark secret, dark secrets, dark truth, forbidden desires, forbidden secrets, hidden secrets, past secrets, secret desires, secret knowledge, secret living, secret past, secret relationship]"
5,1495,16,"[Countryside cottage, Countryside village, Rural community, Rural lifestyle, country life, countryside, countryside life, rural, rural areas, rural backdrop, rural communities, rural countryside, rural life, rural scenery, rural setting, rural world]"
6,50,15,"[Unexplainable events, Weird events, enigmatic event, mysterious events, mysterious incident, strange, strange behavior, strange encounters, strange events, strange performance, strange phenomena, strange phenomenon, strange places, strange things, weird]"
7,203,13,"[Weird incidents, bizarre incident, eerie encounters, mysterious encounters, serendipitous encounter, strange encounter, strange event, strange occurrences, surreal encounters, unexpected encounter, unexpected encounters, unexplained events, weird incident]"
8,72,13,"[Crime Gang, Criminal family, Criminal group, armed gang, crime families, crime family, crime gangs, crime lords, crime syndicate, criminal empire, criminal gang, criminal groups, street gangs]"
9,1112,12,"[Elderly spinster, Elderly widow, elderly, elderly care, elderly gentleman, elderly lady, elderly life, elderly man, elderly mother, elderly woman, old man, older woman]"


# Create undirected graph for Leiden

The largest WCC communities are too big. For example, "enchanted forest" and "buried body" do not belong in the same community. We'll use Leiden to make smaller communities.

Leiden requires undirected relationships, so convert the directed IS_SIMILAR relationship to an undirected relationship called "UNDIRECTED_SIMILAR".

In [52]:
# Derive an undirected relationship type UNDIRECTED_SIMILAR from the directed
# IS_SIMILAR relationships in the "themes" projection, aggregating with MAX.
gds.run_cypher("""
call gds.graph.relationships.toUndirected("themes", {relationshipType: "IS_SIMILAR", mutateRelationshipType: "UNDIRECTED_SIMILAR", aggregation: "MAX"})""")

Unnamed: 0,inputRelationships,relationshipsWritten,mutateMillis,postProcessingMillis,preProcessingMillis,computeMillis,configuration
0,13736,16944,0,0,0,34,"{'aggregation': 'MAX', 'relationshipType': 'IS_SIMILAR', 'jobId': 'bf7b8eaf-5c7f-4c5e-be0c-6f878df1fb42', 'logProgress': True, 'concurrency': 4, 'sudo': False, 'mutateRelationshipType': 'UNDIRECTED_SIMILAR'}"


## Transform relationship weights
Because of our similarity cutoff in KNN, all of our relationship scores are between the cutoff (`similarity_cutoff`, set to 0.83 above) and 1.0. Apply a transformation to spread them out between 0.0 and 1.0.

In [53]:
gds.set_database("neo4j")

In [54]:
# Build a second in-memory graph, "reweight_themes", from the UNDIRECTED_SIMILAR
# relationships of the first projection: take the maximum similarity per
# (source, target) pair, rescale it as (score - cutoff) / (1 - cutoff), and project
# the pairs as undirected IS_SIMILAR relationships carrying the rescaled similarity.
gds.run_cypher("""
CALL gds.graph.relationshipProperties.stream($graphName, ["similarity"], ["UNDIRECTED_SIMILAR"])
YIELD sourceNodeId, targetNodeId, propertyValue
WITH sourceNodeId AS source, 
targetNodeId AS target,
max(propertyValue) AS score
WITH gds.graph.project("reweight_themes",
source,
target,
{relationshipType: "IS_SIMILAR",
relationshipProperties: {similarity: (score-$similarityCutoff)/(1-$similarityCutoff)}},
{undirectedRelationshipTypes: ["IS_SIMILAR"]}) AS g
RETURN g.graphName AS graphName, g.relationshipCount AS relationshipCount, g.nodeCount AS nodeCount""",
              {"graphName": g_themes.name(), "similarityCutoff": similarity_cutoff})

Unnamed: 0,graphName,relationshipCount,nodeCount
0,reweight_themes,33888,9319


In [55]:
g_themes2 = gds.graph.get("reweight_themes")

# Test Leiden with different values of gamma
This function will run the Leiden algorithm with a value of gamma that you provide. It will return the size of the biggest Leiden communities and the themes in those communities. It will also check for the smallest communities that contain only one document.

In [56]:
def test_leiden_gamma(gamma):
    property_name = f"leidenGamma{gamma}"
    try:
        gds.graph.nodeProperties.drop(g_themes2, property_name)
    except:
        pass
    leiden_result = gds.leiden.mutate(g_themes2,
                      mutateProperty = property_name,
                      relationshipTypes = ["IS_SIMILAR"],
                      relationshipWeightProperty = "similarity",
                      gamma=gamma)
    biggest_communities = gds.run_cypher("""
        CALL gds.graph.nodeProperty.stream("reweight_themes", $propName)
        YIELD nodeId, propertyValue
        WITH propertyValue, collect(gds.util.asNode(nodeId)) AS nodeList, count(*) as themeCount
        ORDER BY themeCount desc
        LIMIT 20
        RETURN themeCount, [n in nodeList | coalesce(n.description, n.descriptions)] AS themes""",
                                        {"propName": property_name})
    stats_df = gds.run_cypher("""
        CALL gds.graph.nodeProperty.stream("reweight_themes", "leidenGamma1.0")
        YIELD nodeId, propertyValue
        WITH gds.util.asNode(nodeId) as n, propertyValue AS leidenId
        WITH leidenId, collect(n) AS nodeList
        WHERE size(nodeList) = 1
        WITH leidenId, nodeList[0] AS n
        WITH leidenId,
        COUNT{MATCH (n)<-[:HAS_THEME]-(d) RETURN DISTINCT d} AS themeCount,
        COUNT{MATCH (n)<-[:HAS_STEM]-()<-[:HAS_THEME]-(d) RETURN DISTINCT d} AS stemCount
        WHERE themeCount + stemCount = 1
        RETURN count(*) AS singleDocCommunities""",
                                           {"propName": property_name})
    stats_df.loc[0,'community_count'] = leiden_result['communityCount']
    stats_df.loc[0, 'max_community_size'] = leiden_result['communityDistribution']['max']
    stats_df.loc[0, 'mean_community_size'] = leiden_result['communityDistribution']['mean']
    stats_df.loc[0, 'median_community_size'] = leiden_result['communityDistribution']['p50']
    stats_df.loc[0, 'gamma'] = gamma
    return {"biggest_communities": biggest_communities, "stats": stats_df}
                                           


In [57]:
# Sweep gamma over the powers of two from 1.0 (2**0) up to 1024.0 (2**10)
gamma_tests = {
    gamma: test_leiden_gamma(gamma)
    for gamma in (2.0 ** exponent for exponent in range(11))
}

In [58]:
# Stack the one-row stats frame of every gamma run into a single comparison table
stats_df = pd.concat([run['stats'] for run in gamma_tests.values()])
stats_df[[
    'gamma',
    'singleDocCommunities',
    'community_count',
    'max_community_size',
    'mean_community_size',
    'median_community_size',
]]

Unnamed: 0,gamma,singleDocCommunities,community_count,max_community_size,mean_community_size,median_community_size
0,1.0,0,2384.0,93.0,3.908977,2.0
0,2.0,0,2398.0,87.0,3.886155,2.0
0,4.0,0,2409.0,65.0,3.86841,2.0
0,8.0,0,2439.0,53.0,3.820828,2.0
0,16.0,0,2472.0,40.0,3.769822,2.0
0,32.0,0,2527.0,33.0,3.687772,2.0
0,64.0,0,2606.0,25.0,3.575979,2.0
0,128.0,0,2720.0,19.0,3.426103,2.0
0,256.0,0,2872.0,15.0,3.244777,2.0
0,512.0,0,3146.0,12.0,2.962174,2.0


In [59]:
# For each gamma, show the first five rows of its biggest-communities table
# (the table is already ordered from largest community to smallest).
for gamma, result in gamma_tests.items():
    print(f"Gamma value: {gamma}")
    display(result['biggest_communities'].head())

Gamma value: 1.0


Unnamed: 0,themeCount,themes
0,93,"[secret observation, secret relationship, dark stories, dark secrets, dark fantasies, secret romance, secret love, Uncovered secrets, Dirty secrets, secret mission, secret identity, secret task, unfulfilled desires, unresolved feelings, dark forest, dark path, dark walls, hidden secrets, shocking secrets, forbidden secrets, unsettling secrets, [secret, secrets], past secrets, hidden, secrets exposed, hidden feelings, Dark realities, Dark interest, Dark realm, dark presence, Dark pasts, Darker truth, dark characters, dark motives, dark history, Ancient secrets, inner darkness, dark place, darkness, Darkness beneath, lurking darkness, dark abyss, dark energy, dark force, dark, Deep dark, Dark Prison, dark moon, Dark Highway, forbidden room, forbidden desires, dark house, dark room, dark web, dark network, darknet, Old secret, Dirty secret, Secret passion, secret desires, Secret desire, invisibility, invisible, dark themes, dark time, dark twist, dark scheme, Darkness to light, Dark Light, secret recipe, secret daughters, secret identities, secret living, secret world, darkness vs. light, unresolved trauma, unresolved grief, dark thoughts, dark secret, dark truth, secret knowledge, Unspoken Desires, Hidden desire, Secret dish, Dark Void, children's secrets, buried truth, secret past, secret agenda, dark tunnel, secret online life, secret revealed, disturbing secret]"
1,87,"[Women's work, women's community, women's crafts, estranged mother, estranged parents, missing mother, ailing father, missing father, ailing son, father's fight, late father, abusive father, Lonely father, Mom, [mother, mothers], grieving neighbor, grieving father, missing daughter, estranged daughter, Daughter's struggle, missing sister, Women's support, [mother's support, mothers' support], ailing mother, lost mother, Devoted mother, alcoholic mother, Irish mother, mother's departure, mother's sacrifice, mother's return, birth mother, mother's death, loss of mother, mother's passing, deceased mother, mother disappears, mother's debt, mother's loss, father's death, loss of father, dying father, dying parent, missing mom, deceased father, Fatherhood loss, mother's son, mother's life, Maternal, motherhood, mother's, separated, estranged, mother's love, mother's quest, Estranged son, father-daughter, [father, fathers], father-daughter relationship, abusive husband, father figure, father search, maternal love, new mother, motherhood struggles, mother-daughter, Mother-daughter vacation, estranged sisters, estranged friends, young mother, Great Mother, mother's sacrifices, father-son reconciliation, mother's absence, estrangement, Suicidal mother, grieving mother, Absent mother, mother's struggle, grief-stricken, grieving, Mothers and daughters, Struggling mother, mother's vow, postpartum anxiety, postpartum struggles, Irish life]"
2,75,"[strange phenomenon, strange events, strange phenomena, strange performance, old stories, global stories, tall tales, untold stories, street tales, true stories, children's stories, funny stories, Strange creature, bizarre creature, strange creatures, Gruesome creature, true story, mysterious deaths, Mysterious disappearance, mysterious events, enigmatic, enigmatic key, enigmatic figure, [enigma, enigmas], enigmatic woman, enigmatic past, old couple, married couple, mysterious object, Mysterious phenomenon, [disappearance, disappearances], mystery creature, [mysteries, mystery], enigmatic castle, Enigmatic Writer, Unnatural occurrences, Unexplainable events, mysterious past, enigmatic event, mysterious incident, mysterious celebration, Strange happenings, bizarre events, Strange situations, Weird events, strange encounters, strange things, Children disappearing, strange couple, lesbian couple, Broken couple, Loving Couple, unsolved mystery, medical mystery, Misteri, strange behavior, Outrageous situation, Crazy situations, bizarre stories, crazy events, bizarre event, Inexplicable, strange places, Gruesome effects, Grisly, unfinished stories, bizarre, strange, children disappear, mutant creature, weird, strange object, strange town, strange terrain, unexplained]"
3,71,"[Queer visions, Queer thoughts, queer art, Queer artists, Queer filmmaking, Queer creatives, Queer legacy, Queer visibility, Queer Hero, queer media, queer community, queer culture, Queer comedy, queer woman, queer identity, queer love, Transgender narrative, gender transition, queer, aging queer, genderqueer, queer youth, queer life, queer horror, queer history, queer girl, queercore, queer artist, Queer joy, queer nightlife, queerness, LGBTQIA+, Queer lovers, Queer Sexuality, Queer mom, Queer eroticism, queer role model, queer experience, queer women, gender expression, gender identity, gender neutral, gender affirmation, LGBTQIA+ experiences, LGBTQ history, Queer sensibilities, Queer experiences, Queer resistance, Queer desires, queer liberation, queer identities, Queer protagonists, Queer journey, gender dysphoria, sexuality, homosexuality, homosexual romance, homoerotic, queer romance, Queer subculture, Queer mecca, queer struggle, queer representations, queer activism, LGBTQI+ community, Queer struggles, Queer Scene, Queer genealogy, Femme identities, Queer existences, Queer utopias]"
4,68,"[parents' home, family home, family agency, family provider, Family baggage, Family mystery, Christmas holidays, holiday magic, Holiday cheer, package holidays, familial bonds, Family bond, three families, [families, family], family secrets, Familial secrets, family memory, family ties, family traditions, family bonds, family roots, familial relationships, family relationships, Family antics, Family adventure, holiday spirit, holiday joy, holiday season, holiday gifts, family holiday, family secret, Family conversations, Family story, family wedding, family love, family life, Family pain, family values, family protection, family support, Winter holiday, [holiday, holidays], holiday special, family tradition, family business, winter break, Winter Movie, family bonding, Female bonding, family vacation home, family man, vacation friends, Family survival, family dynamics, Traditional values, conservative values, traditional family, family expectations, family reflections, family history, family move, family journey, family dignity, Conservative views, family visit, family legend, family divisions, Family culture]"


Gamma value: 2.0


Unnamed: 0,themeCount,themes
0,87,"[Women's work, women's community, women's crafts, estranged mother, estranged parents, missing mother, ailing father, missing father, ailing son, father's fight, late father, abusive father, Lonely father, Mom, [mother, mothers], grieving neighbor, grieving father, missing daughter, estranged daughter, Daughter's struggle, missing sister, Women's support, [mother's support, mothers' support], ailing mother, lost mother, Devoted mother, alcoholic mother, Irish mother, mother's departure, mother's sacrifice, mother's return, birth mother, mother's death, loss of mother, mother's passing, deceased mother, mother disappears, mother's debt, mother's loss, father's death, loss of father, dying father, dying parent, missing mom, deceased father, Fatherhood loss, mother's son, mother's life, Maternal, motherhood, mother's, separated, estranged, mother's love, mother's quest, Estranged son, father-daughter, [father, fathers], father-daughter relationship, abusive husband, father figure, father search, maternal love, new mother, motherhood struggles, mother-daughter, Mother-daughter vacation, estranged sisters, estranged friends, young mother, Great Mother, mother's sacrifices, father-son reconciliation, mother's absence, estrangement, Suicidal mother, grieving mother, Absent mother, mother's struggle, grief-stricken, grieving, Mothers and daughters, Struggling mother, mother's vow, postpartum anxiety, postpartum struggles, Irish life]"
1,73,"[strange phenomenon, strange events, strange phenomena, strange performance, old stories, global stories, tall tales, untold stories, street tales, true stories, children's stories, funny stories, Strange creature, bizarre creature, strange creatures, Gruesome creature, true story, mysterious deaths, Mysterious disappearance, mysterious events, enigmatic, enigmatic key, enigmatic figure, [enigma, enigmas], enigmatic woman, enigmatic past, old couple, married couple, mysterious object, Mysterious phenomenon, [disappearance, disappearances], mystery creature, [mysteries, mystery], enigmatic castle, Enigmatic Writer, Unnatural occurrences, Unexplainable events, mysterious past, enigmatic event, mysterious incident, mysterious celebration, Strange happenings, bizarre events, Strange situations, Weird events, strange encounters, strange things, strange couple, lesbian couple, Broken couple, Loving Couple, unsolved mystery, medical mystery, Misteri, strange behavior, Outrageous situation, Crazy situations, bizarre stories, crazy events, bizarre event, Inexplicable, strange places, Gruesome effects, Grisly, unfinished stories, bizarre, strange, mutant creature, weird, strange object, strange town, strange terrain, unexplained]"
2,65,"[Queer visions, Queer thoughts, queer art, Queer artists, Queer filmmaking, Queer creatives, Queer legacy, Queer visibility, Queer Hero, queer media, queer community, queer culture, Queer comedy, queer woman, queer identity, queer love, queer, aging queer, genderqueer, queer youth, queer life, queer horror, queer history, queer girl, queercore, queer artist, Queer joy, queer nightlife, queerness, LGBTQIA+, Queer lovers, Queer Sexuality, Queer mom, Queer eroticism, queer role model, queer experience, queer women, gender neutral, LGBTQIA+ experiences, LGBTQ history, Queer sensibilities, Queer experiences, Queer resistance, Queer desires, queer liberation, queer identities, Queer protagonists, Queer journey, sexuality, homosexuality, homosexual romance, homoerotic, queer romance, Queer subculture, Queer mecca, queer struggle, queer representations, queer activism, LGBTQI+ community, Queer struggles, Queer Scene, Queer genealogy, Femme identities, Queer existences, Queer utopias]"
3,57,"[Alien landscape, surreal landscape, mystical, nightmare begins, [nightmare, nightmares], mystical terror, supernatural terror, Surreal fantasy, Surreal dream, surreal, fantasy world, Surreal comedy, Surreal visions, surreal nightmare, Surreal building, Surreal associations, surreal flight, surreal dreams, shared subconscious, Subconsciousness, Subconscious world, mystical landscape, surreal animation, surrealism, dreamlike reality, Surreal imagery, surrealist, surreal vision, action adventure, surreal adventure, mysticism, imaginary world, Kafkaesque nightmare, Horrifying nightmare, surreal nightmares, bad dream, wild landscapes, wilderness, desolate landscapes, wild nature, extraordinary landscapes, wilderness captivity, Wilderness lodge, Artificial world, Mobile Nightmare, mystical brew, Psychedelic visions, Psychedelicized, mythological, Unfamiliar landscape, subconscious nightmares, Escalating nightmare, Surreal landscapes, contrasting landscapes, Surreal Voyage, turbulent dreams, morbid dreams]"
4,55,"[secret observation, secret relationship, dark stories, dark secrets, dark fantasies, secret romance, secret love, Uncovered secrets, Dirty secrets, secret mission, secret identity, secret task, unfulfilled desires, unresolved feelings, hidden secrets, shocking secrets, forbidden secrets, unsettling secrets, [secret, secrets], past secrets, hidden, secrets exposed, hidden feelings, dark motives, dark history, Ancient secrets, forbidden room, forbidden desires, Old secret, Dirty secret, Secret passion, secret desires, Secret desire, invisibility, invisible, secret recipe, secret daughters, secret identities, secret living, secret world, unresolved trauma, unresolved grief, dark secret, dark truth, secret knowledge, Unspoken Desires, Hidden desire, Secret dish, children's secrets, buried truth, secret past, secret agenda, secret online life, secret revealed, disturbing secret]"


Gamma value: 4.0


Unnamed: 0,themeCount,themes
0,65,"[Women's work, women's community, women's crafts, estranged mother, estranged parents, missing mother, Mom, [mother, mothers], missing daughter, estranged daughter, Daughter's struggle, missing sister, Women's support, [mother's support, mothers' support], ailing mother, lost mother, Devoted mother, alcoholic mother, Irish mother, mother's departure, mother's sacrifice, mother's return, birth mother, mother's death, loss of mother, mother's passing, deceased mother, mother disappears, mother's debt, mother's loss, father's death, missing mom, deceased father, mother's son, mother's life, Maternal, motherhood, mother's, separated, estranged, mother's love, mother's quest, Estranged son, maternal love, new mother, motherhood struggles, mother-daughter, Mother-daughter vacation, estranged sisters, estranged friends, young mother, Great Mother, mother's sacrifices, mother's absence, estrangement, Suicidal mother, grieving mother, Absent mother, mother's struggle, Mothers and daughters, Struggling mother, mother's vow, postpartum anxiety, postpartum struggles, Irish life]"
1,55,"[secret observation, secret relationship, dark stories, dark secrets, dark fantasies, secret romance, secret love, Uncovered secrets, Dirty secrets, secret mission, secret identity, secret task, unfulfilled desires, unresolved feelings, hidden secrets, shocking secrets, forbidden secrets, unsettling secrets, [secret, secrets], past secrets, hidden, secrets exposed, hidden feelings, dark motives, dark history, Ancient secrets, forbidden room, forbidden desires, Old secret, Dirty secret, Secret passion, secret desires, Secret desire, invisibility, invisible, secret recipe, secret daughters, secret identities, secret living, secret world, unresolved trauma, unresolved grief, dark secret, dark truth, secret knowledge, Unspoken Desires, Hidden desire, Secret dish, children's secrets, buried truth, secret past, secret agenda, secret online life, secret revealed, disturbing secret]"
2,55,"[Queer visions, Queer thoughts, queer art, Queer legacy, Queer visibility, Queer Hero, queer media, queer community, queer culture, Queer comedy, queer woman, queer identity, queer love, queer, aging queer, genderqueer, queer youth, queer life, queer horror, queer history, queer girl, queercore, queer artist, Queer joy, queer nightlife, queerness, LGBTQIA+, Queer lovers, Queer Sexuality, Queer mom, Queer eroticism, queer role model, queer experience, queer women, gender neutral, LGBTQIA+ experiences, LGBTQ history, queer liberation, Queer journey, sexuality, homosexuality, homosexual romance, homoerotic, queer romance, Queer subculture, Queer mecca, queer struggle, queer representations, queer activism, LGBTQI+ community, Queer struggles, Queer Scene, Queer genealogy, Queer existences, Queer utopias]"
3,54,"[strange event, strange presence, strange encounter, weird incident, Weird incidents, bizarre incident, strange artifact, strange figure, Absurd encounters, unexpected encounters, Brutal crime, Brutal executions, brutal murders, Disturbing crime, Unusual occurrence, Bizarre phenomena, strange occurrences, Disturbing past, Disturbing events, Disturbing event, Eerie relationship, eerie encounters, spiritual encounters, mysterious encounters, surreal encounters, spiritual medium, Disturbing presence, uncanny presence, serendipitous encounter, unexpected encounter, unexpected friendship, Random encounters, First Encounter, Strange existence, Eccentric stranger, strange person, Disturbing truth, uncanny, gruesome deaths, unlikely friendship, strange murders, strange woman, strange bird, Unseen presence, Disturbing encounter, unsolved murders, unexplained events, alien encounter, disturbing book, Horrific crime, Strange company, strange action, ghostly presences, strange acquaintances]"
4,49,"[Orphan son, orphan's journey, elderly inmates, elderly, elderly woman, Elderly spinster, elderly Dorothy, Elderly widow, elderly love, elderly lady, elderly maid, elderly mother, elderly student, older woman, elderly gentleman, elderly man, Widow's heart, [widow, widows], widow's journey, Widow's plight, Elderly uncle, Elderly father, Elderly daughter, elderly care, [senior, seniors], aging parents, elderly life, elderly parent, old age home, Aged care, Grieving widow, widower, Widowed father, [elder, elders], Elderly guardian, Widowhood, widowed mother, widowed, Fatherless journey, old man, grandfather, Older man, senior woman, Senior citizen, old lady, Elderly priest, grandson, elderly couple, elderly wrestler]"


Gamma value: 8.0


Unnamed: 0,themeCount,themes
0,53,"[Queer visions, Queer thoughts, queer art, Queer legacy, Queer visibility, Queer Hero, queer media, queer community, queer culture, Queer comedy, queer woman, queer identity, queer love, queer, aging queer, genderqueer, queer youth, queer life, queer horror, queer history, queer girl, queercore, queer artist, Queer joy, queer nightlife, queerness, Queer lovers, Queer Sexuality, Queer mom, Queer eroticism, queer role model, queer experience, queer women, gender neutral, LGBTQIA+ experiences, LGBTQ history, queer liberation, Queer journey, sexuality, homosexuality, homosexual romance, homoerotic, queer romance, Queer subculture, Queer mecca, queer struggle, queer representations, queer activism, Queer struggles, Queer Scene, Queer genealogy, Queer existences, Queer utopias]"
1,40,"[parents' home, family home, family agency, family provider, three families, [families, family], family secrets, Familial secrets, family memory, family ties, family traditions, family bonds, family roots, familial relationships, family relationships, family holiday, family wedding, family love, family life, Family pain, family values, family protection, family support, family tradition, family business, family bonding, Female bonding, family vacation home, family man, vacation friends, Family survival, family dynamics, Traditional values, conservative values, traditional family, family history, family dignity, Conservative views, family legend, family divisions]"
2,36,"[Dramedy, [comedies, comedy], drama, courtroom drama, Chaotic comedy, Farcical comedy, Hilarious chaos, Veteran comedian, comedy legend, comedy of errors, drama teacher, drama graduate, psychological thriller, psychological drama, [comedy rock, comedy-rock], Comedy awards, meta comedy, comedy roast, Comedy movie, comedy performance, comedy world, comedic love, comedy tour, musical comedy, historical drama, political drama, Gritty drama, Comedy showcase, psychological game, comedy special, folk comedy, live comedy, raw humor, flat humor, Comedy Cellar, action-comedy]"
3,36,"[elderly inmates, elderly, elderly woman, Elderly spinster, elderly Dorothy, Elderly widow, elderly love, elderly lady, elderly maid, elderly mother, elderly student, older woman, elderly gentleman, elderly man, Elderly uncle, Elderly father, Elderly daughter, elderly care, [senior, seniors], aging parents, elderly life, elderly parent, old age home, Aged care, [elder, elders], Elderly guardian, old man, grandfather, Older man, senior woman, Senior citizen, old lady, Elderly priest, grandson, elderly couple, elderly wrestler]"
4,36,"[strange phenomenon, strange events, strange phenomena, strange performance, mysterious deaths, mysterious events, enigmatic, enigmatic key, enigmatic figure, [enigma, enigmas], enigmatic woman, enigmatic past, mysterious object, Mysterious phenomenon, enigmatic castle, Enigmatic Writer, Unnatural occurrences, Unexplainable events, mysterious past, enigmatic event, mysterious incident, mysterious celebration, Weird events, strange encounters, strange things, strange couple, strange behavior, Inexplicable, strange places, bizarre, strange, weird, strange object, strange town, strange terrain, unexplained]"


Gamma value: 16.0


Unnamed: 0,themeCount,themes
0,40,"[parents' home, family home, family agency, family provider, three families, [families, family], family secrets, Familial secrets, family memory, family ties, family traditions, family bonds, family roots, familial relationships, family relationships, family holiday, family wedding, family love, family life, Family pain, family values, family protection, family support, family tradition, family business, family bonding, Female bonding, family vacation home, family man, vacation friends, Family survival, family dynamics, Traditional values, conservative values, traditional family, family history, family dignity, Conservative views, family legend, family divisions]"
1,36,"[strange phenomenon, strange events, strange phenomena, strange performance, mysterious deaths, mysterious events, enigmatic, enigmatic key, enigmatic figure, [enigma, enigmas], enigmatic woman, enigmatic past, mysterious object, Mysterious phenomenon, enigmatic castle, Enigmatic Writer, Unnatural occurrences, Unexplainable events, mysterious past, enigmatic event, mysterious incident, mysterious celebration, Weird events, strange encounters, strange things, strange couple, strange behavior, Inexplicable, strange places, bizarre, strange, weird, strange object, strange town, strange terrain, unexplained]"
2,32,"[nightmare begins, [nightmare, nightmares], Surreal fantasy, Surreal dream, surreal, Surreal comedy, Surreal visions, surreal nightmare, Surreal building, Surreal associations, surreal flight, surreal dreams, surreal animation, surrealism, dreamlike reality, Surreal imagery, surrealist, surreal vision, action adventure, surreal adventure, Kafkaesque nightmare, Horrifying nightmare, surreal nightmares, bad dream, Mobile Nightmare, Psychedelic visions, Psychedelicized, subconscious nightmares, Escalating nightmare, Surreal Voyage, turbulent dreams, morbid dreams]"
3,32,"[Dark realities, Dark interest, Dark realm, dark presence, Dark pasts, Darker truth, dark characters, inner darkness, dark place, darkness, Darkness beneath, lurking darkness, dark abyss, dark energy, dark force, dark, Deep dark, Dark Prison, dark moon, Dark Highway, dark house, dark room, dark themes, dark time, dark twist, dark scheme, Darkness to light, Dark Light, darkness vs. light, dark thoughts, Dark Void, dark tunnel]"
4,32,"[broken family, family suffering, Parental Murder, family murder, family dysfunction, family struggles, family chaos, Family fragments, Interpersonal conflicts, family conflicts, family desperation, family problems, family struggle, family issues, family turmoil, family stability, family pressures, Family interference, Family rift, family disconnection, family tragedy, family death, Family deaths, family estrangement, family separation, Family collapse, personal problems, family trauma, family conflict, family tension, environmental trauma, emotional trauma]"


Gamma value: 32.0


Unnamed: 0,themeCount,themes
0,33,"[parents' home, family home, family agency, family provider, three families, [families, family], family ties, Family fragments, family bonds, family roots, familial relationships, family relationships, family holiday, family wedding, family love, family life, Family pain, family values, family protection, family support, Family interference, Family rift, family vacation home, family man, vacation friends, Family survival, family dynamics, Traditional values, conservative values, traditional family, family dignity, Conservative views, family divisions]"
1,32,"[Dark realities, Dark interest, Dark realm, dark presence, Dark pasts, Darker truth, dark characters, inner darkness, dark place, darkness, Darkness beneath, lurking darkness, dark abyss, dark energy, dark force, dark, Deep dark, Dark Prison, dark moon, Dark Highway, dark house, dark room, dark themes, dark time, dark twist, dark scheme, Darkness to light, Dark Light, darkness vs. light, dark thoughts, Dark Void, dark tunnel]"
2,30,"[Mom, [mother, mothers], Daughter's struggle, ailing mother, lost mother, Devoted mother, alcoholic mother, Irish mother, mother's departure, mother's sacrifice, mother's return, birth mother, Maternal, motherhood, mother's, mother's love, mother's quest, maternal love, new mother, motherhood struggles, mother-daughter, Mother-daughter vacation, young mother, Great Mother, mother's struggle, Mothers and daughters, Struggling mother, postpartum anxiety, postpartum struggles, Irish life]"
3,28,"[Christmas magic, Christmas competition, christmas, Christmas traditions, Christmas celebration, Christmas decorations, Christmas spirit, Christmas mystery, Christmas story, Christmas tree, Xmas, Christmas ambience, Christmas terror, Christmas love, Christmas songs, Christmas nightmare, Christmas curse, Christmas wish, Christmas proposal, Christmas gift, Christmas carols, Christmas Eve wedding, Christmas break, Christmas Variety, Christmas setting, Christmas trees, Family Christmas, Christmas romance]"
4,28,"[strange phenomenon, strange events, strange phenomena, strange performance, mysterious deaths, mysterious events, mysterious object, Mysterious phenomenon, Unnatural occurrences, Unexplainable events, mysterious past, enigmatic event, mysterious incident, mysterious celebration, Weird events, strange encounters, strange things, strange couple, strange behavior, Inexplicable, strange places, bizarre, strange, weird, strange object, strange town, strange terrain, unexplained]"


Gamma value: 64.0


Unnamed: 0,themeCount,themes
0,25,"[indigenous peoples, Indigenous perspectives, Indigenous cultures, Indigenous scientists, indigenous culture, Aboriginal culture, indigenous community, Indigenous heritage, indigenous folklore, indigenous people, Indigenous identity, Indigenous trauma, Indigenous woman, Indigenous migration, Indigenous activism, Indigenous women, Indigenous sovereignty, Afro-Indigenous, Indigenous Australians, Indigenous roots, Indigenous history, indigenous territory, Indigenous language, intangible heritage, Indigenous beliefs]"
1,25,"[Queer visions, Queer thoughts, queer art, queer, aging queer, genderqueer, queer youth, queer life, queer horror, queer history, queer girl, queer artist, queer nightlife, queerness, Queer lovers, Queer Sexuality, Queer mom, Queer eroticism, gender neutral, Queer mecca, queer representations, Queer struggles, Queer genealogy, Queer existences, Queer utopias]"
2,23,"[art film, artist film, experimental film, Italian Filmmaker, experimental filmmaker, documentary filmmaker, experimental filmmakers, Experimental films, unfinished film, experimental cinema, [short film, short-film, shortfilm], Dedicated filmmaker, Missing filmmaker, Avant-garde cinema, budding filmmaker, Home film, original film, film within a film, silent film, flicker film, touching film, girl filmmaker, aspiring filmmaker]"
3,22,"[Scary Tales, horror stories, teen horror, Real horror, [horror, horrors], body horror, psycho-horror, horror movie, horror convention, horror footage, Horror filming, horror film, Horror setting, Creeping horror, horror films, [horror comedy, horror-comedy], horror genre, Horror classic, Horror homage, horror realm, horror franchise, Horror legend]"
4,22,"[Christmas magic, christmas, Christmas traditions, Christmas spirit, Christmas mystery, Christmas story, Christmas tree, Xmas, Christmas ambience, Christmas terror, Christmas love, Christmas songs, Christmas nightmare, Christmas curse, Christmas wish, Christmas proposal, Christmas gift, Christmas break, Christmas Variety, Christmas trees, Family Christmas, Christmas romance]"


Gamma value: 128.0


Unnamed: 0,themeCount,themes
0,19,"[queer, aging queer, genderqueer, queer youth, queer life, queer horror, queer history, queer girl, queer artist, queer nightlife, queerness, Queer lovers, Queer Sexuality, Queer mom, Queer eroticism, gender neutral, Queer mecca, Queer struggles, Queer genealogy]"
1,18,"[1970, 1960s-1970s, 70s, 1970s, 1970s counterculture, 1970s Rome, 1970s era, seventies, 1970s Iran, 1970s Britain, 1960s, 1960s setting, 1960s NYC, 60s, 1976-1980, 60's, 70s glory, Eighties]"
2,18,"[indigenous peoples, Indigenous perspectives, Indigenous cultures, Indigenous scientists, indigenous culture, Aboriginal culture, Indigenous heritage, indigenous folklore, Indigenous identity, Indigenous trauma, Indigenous woman, Indigenous migration, Indigenous Australians, Indigenous roots, Indigenous history, Indigenous language, intangible heritage, Indigenous beliefs]"
3,17,"[Ancient identity, Ancient, Ancient power, ancient style, ancient world, ancient professions, ancient tradition, otherworldly powers, ancient powers, ancient curses, Ancient Crown, ancient origins, ancient myth, ancient literature, ancient spirit, ancient practice, Creation myth]"
4,17,"[elderly inmates, elderly, elderly student, elderly gentleman, elderly man, elderly care, [senior, seniors], aging parents, elderly life, old age home, Aged care, old man, grandfather, Older man, grandson, elderly couple, elderly wrestler]"


Gamma value: 256.0


Unnamed: 0,themeCount,themes
0,15,"[Twisted thriller, [twist, twists], Twisted love, twisted games, Unexpected turns, unexpected twist, twisted lesson, twisted, Strange twist, Twist ending, unexpected ending, gut-wrenching twists, time twist, harsh twist, twisted battle]"
1,14,"[alien creature, mysterious creature, alien monster, classic monsters, [monster, monsters], giant monster, giant kraken, human monster, forest monster, miniature monsters, Monstro, deformed monster, animated monsters, Monster within]"
2,14,"[Mom, [mother, mothers], birth mother, Maternal, motherhood, mother's, mother's love, maternal love, new mother, mother-daughter, Mother-daughter vacation, young mother, Great Mother, Mothers and daughters]"
3,14,"[[comedies, comedy], comedy of errors, [comedy rock, comedy-rock], meta comedy, comedy roast, Comedy movie, comedy performance, comedy world, comedic love, comedy tour, musical comedy, folk comedy, live comedy, action-comedy]"
4,14,"[1970, 1960s-1970s, 1970s, 1970s counterculture, 1970s Rome, 1970s era, 1970s Iran, 1970s Britain, 1960s, 1960s setting, 1960s NYC, 60s, 1976-1980, 60's]"


Gamma value: 512.0


Unnamed: 0,themeCount,themes
0,12,"[ordinary life, routine life, structured life, home life, traditional life, easy life, monotonous life, quiet life, Busy life, modest life, simple life, Carefree life]"
1,11,"[Haunted chapel, Haunted market, Haunted temple, Haunted terrace, Haunted carriage, Haunted manor, Haunted artifact, haunted hotel, Haunted pizzeria, haunted building, Gothic manor]"
2,11,"[[comedies, comedy], Veteran comedian, comedy legend, comedy of errors, [comedy rock, comedy-rock], meta comedy, Comedy movie, comedy world, comedic love, comedy tour, musical comedy]"
3,10,"[future world, future dystopia, future warfare, dystopian future, future catastrophe, Future crime, planetary future, future past, ancient future, chaotic future]"
4,10,"[childhood, childhood friends, childhood roots, childhood game, childhood joy, childhood contemplation, childhood body, childhood photo, infância, childhood passion]"


Gamma value: 1024.0


Unnamed: 0,themeCount,themes
0,9,"[novice dancer, professional dancer, go-go dancer, American dancer, Avant-garde dancer, dancer-choreographer, choreography, Dance diva, exotic dancer]"
1,9,"[parents' home, family home, family agency, family provider, three families, [families, family], family vacation home, family man, vacation friends]"
2,9,"[Wealth privilege, Wealthy man, wealthy, Wealthy lifestyles, Wealthy couple, Wealthy married woman, wealthy men, wealthy schoolkids, Wealth obsession]"
3,8,"[college memories, College move, college days, college enrollment, college sweetheart, college romance, college classmates, college fraternity]"
4,8,"[unfinished film, [short film, short-film, shortfilm], Home film, original film, film within a film, silent film, flicker film, touching film]"


## For individual themes, see what other themes drop out of their communities as you increase gamma

In [60]:
def get_proprety_df(property_name):
    """Stream one projected node property together with a readable label.

    Streams ``property_name`` from the ``g_themes2`` projection along with
    the ``description`` and ``descriptions`` database properties, then
    collapses the two label columns into a single ``description`` column.

    Args:
        property_name: Name of the projected node property to stream,
            e.g. ``"leidenGamma512.0"``.

    Returns:
        The streamed DataFrame without the ``descriptions`` column. Rows
        whose ``description`` is missing or empty carry the comma-joined
        ``descriptions`` list in ``description`` instead.
    """
    property_df = gds.graph.nodeProperties.stream(g_themes2,
                                                  [property_name],
                                                  db_node_properties=["description", "descriptions"],
                                                  separate_property_columns=True)

    # Flatten each list of descriptions into one comma-separated label.
    has_list = property_df['descriptions'].notna()
    property_df.loc[has_list, 'descriptions'] = property_df.loc[has_list, 'descriptions'].map(", ".join)

    # Fall back to the joined list when there is no usable description.
    # A bare truthiness test is not enough: NaN is truthy, so a missing value
    # encoded as NaN would be kept instead of replaced.
    description = property_df['description']
    is_null = description.isna()
    has_description = ~is_null & description.mask(is_null, "").map(bool)
    property_df['description'] = description.where(has_description, property_df['descriptions'])

    return property_df.drop(columns='descriptions')

# Stream each gamma's community assignment once and keep the frames keyed by
# property name, so the lookups below do not hit the database again.
property_dfs = dict(
    (leiden_property, get_proprety_df(leiden_property))
    for leiden_property in map("leidenGamma{}".format,
                               (32.0, 64.0, 128.0, 256.0, 512.0, 1024.0))
)

In [76]:
def get_cluster_for_theme(property_name, theme):
    """Return the descriptions that share a community with ``theme``.

    The lookup uses the cached frame for ``property_name`` in
    ``property_dfs``. The first row whose ``description`` equals ``theme``
    decides the community id. An empty list is returned when no row matches.
    """
    communities = property_dfs[property_name]
    labels = communities['description']

    if theme not in labels.to_list():
        return []

    community_id = communities.loc[labels == theme, property_name].iloc[0]
    in_same_community = communities[property_name] == community_id
    return communities.loc[in_same_community, "description"].to_list()

In [77]:
def compare_gamma_for_theme(theme, gammas):
    """Tabulate how the community containing ``theme`` changes across gammas.

    Args:
        theme: Description to look up in each gamma's community assignment.
        gammas: Gamma values in the order they should be compared; each one
            maps to the property name ``leidenGamma{gamma}``.

    Returns:
        A DataFrame with columns ``final_themes``, ``dropped_themes`` and
        ``added_themes``. The first row is labelled with the first property
        name and holds only ``final_themes``. Each later row is labelled
        ``"<previous> to <current>"`` and lists the members lost and gained
        relative to the previous gamma plus the current membership. An empty
        ``gammas`` yields an empty frame with the same columns.
    """
    columns = ["final_themes", "dropped_themes", "added_themes"]
    column_names = [f"leidenGamma{gamma}" for gamma in gammas]
    clusters = [get_cluster_for_theme(column, theme) for column in column_names]

    # Collect every row first and build the frame once, instead of enlarging
    # a DataFrame one cell at a time.
    labels = []
    rows = []
    previous = None
    for column, members in zip(column_names, clusters):
        row = {"final_themes": ", ".join(members)}
        if previous is None:
            labels.append(column)
        else:
            previous_column, previous_members = previous
            # Sets give constant-time membership; iterating the lists keeps
            # the original ordering in the joined output.
            current_set = set(members)
            previous_set = set(previous_members)
            labels.append(f"{previous_column} to {column}")
            row["dropped_themes"] = ", ".join(
                member for member in previous_members if member not in current_set)
            row["added_themes"] = ", ".join(
                member for member in members if member not in previous_set)
        rows.append(row)
        previous = (column, members)

    return pd.DataFrame(rows, index=labels, columns=columns)

In [78]:
# Track the community around "christmas" as gamma increases.
compare_gamma_for_theme(
    theme="christmas",
    gammas=[32.0, 64.0, 128.0, 256.0, 512.0, 1024.0],
)

Unnamed: 0,final_themes,dropped_themes,added_themes
leidenGamma32.0,"Christmas magic, Christmas competition, christmas, Christmas traditions, Christmas celebration, Christmas decorations, Christmas spirit, Christmas mystery, Christmas story, Christmas tree, Xmas, Christmas ambience, Christmas terror, Christmas love, Christmas songs, Christmas nightmare, Christmas curse, Christmas wish, Christmas proposal, Christmas gift, Christmas carols, Christmas Eve wedding, Christmas break, Christmas Variety, Christmas setting, Christmas trees, Family Christmas, Christmas romance",,
leidenGamma32.0 to leidenGamma64.0,"Christmas magic, christmas, Christmas traditions, Christmas spirit, Christmas mystery, Christmas story, Christmas tree, Xmas, Christmas ambience, Christmas terror, Christmas love, Christmas songs, Christmas nightmare, Christmas curse, Christmas wish, Christmas proposal, Christmas gift, Christmas break, Christmas Variety, Christmas trees, Family Christmas, Christmas romance","Christmas competition, Christmas celebration, Christmas decorations, Christmas carols, Christmas Eve wedding, Christmas setting",
leidenGamma64.0 to leidenGamma128.0,"christmas, Xmas","Christmas magic, Christmas traditions, Christmas spirit, Christmas mystery, Christmas story, Christmas tree, Christmas ambience, Christmas terror, Christmas love, Christmas songs, Christmas nightmare, Christmas curse, Christmas wish, Christmas proposal, Christmas gift, Christmas break, Christmas Variety, Christmas trees, Family Christmas, Christmas romance",
leidenGamma128.0 to leidenGamma256.0,"christmas, Xmas",,
leidenGamma256.0 to leidenGamma512.0,"christmas, Xmas",,
leidenGamma512.0 to leidenGamma1024.0,"christmas, Xmas",,


In [79]:
# Track the community around "Beatles" as gamma increases.
compare_gamma_for_theme(
    theme="Beatles",
    gammas=[32.0, 64.0, 128.0, 256.0, 512.0, 1024.0],
)

Unnamed: 0,final_themes,dropped_themes,added_themes
leidenGamma32.0,"The Beatles, Beatles",,
leidenGamma32.0 to leidenGamma64.0,"The Beatles, Beatles",,
leidenGamma64.0 to leidenGamma128.0,"The Beatles, Beatles",,
leidenGamma128.0 to leidenGamma256.0,"The Beatles, Beatles",,
leidenGamma256.0 to leidenGamma512.0,"The Beatles, Beatles",,
leidenGamma512.0 to leidenGamma1024.0,"The Beatles, Beatles",,


In [81]:
# Track the community around "Family bond" as gamma increases.
compare_gamma_for_theme(
    theme="Family bond",
    gammas=[32.0, 64.0, 128.0, 256.0, 512.0, 1024.0],
)

Unnamed: 0,final_themes,dropped_themes,added_themes
leidenGamma32.0,"Family baggage, Family mystery, familial bonds, Family bond, Family antics, Family adventure, family secret, Family conversations, Family story, family expectations, family reflections, family move, family journey, family visit, Family culture",,
leidenGamma32.0 to leidenGamma64.0,"Family baggage, Family mystery, familial bonds, Family bond, Family antics, Family adventure, family secret, Family conversations, Family story, family expectations, family reflections, family move, family journey, family visit, Family culture",,
leidenGamma64.0 to leidenGamma128.0,"Family baggage, Family mystery, familial bonds, Family bond, Family antics, Family adventure, family secret, Family conversations, Family story, family expectations, family reflections, family move, family journey, family visit, Family culture",,
leidenGamma128.0 to leidenGamma256.0,"Family baggage, Family mystery, familial bonds, Family bond, Family antics, family secret, Family conversations, Family story","Family adventure, family expectations, family reflections, family move, family journey, family visit, Family culture",
leidenGamma256.0 to leidenGamma512.0,"Family baggage, Family mystery, familial bonds, Family bond, Family antics, family secret, Family conversations, Family story",,
leidenGamma512.0 to leidenGamma1024.0,"familial bonds, Family bond, Family conversations","Family baggage, Family mystery, Family antics, family secret, Family story",


In [82]:
# Track the community around "dark place" as gamma increases.
compare_gamma_for_theme(
    theme="dark place",
    gammas=[32.0, 64.0, 128.0, 256.0, 512.0, 1024.0],
)

Unnamed: 0,final_themes,dropped_themes,added_themes
leidenGamma32.0,"Dark realities, Dark interest, Dark realm, dark presence, Dark pasts, Darker truth, dark characters, inner darkness, dark place, darkness, Darkness beneath, lurking darkness, dark abyss, dark energy, dark force, dark, Deep dark, Dark Prison, dark moon, Dark Highway, dark house, dark room, dark themes, dark time, dark twist, dark scheme, Darkness to light, Dark Light, darkness vs. light, dark thoughts, Dark Void, dark tunnel",,
leidenGamma32.0 to leidenGamma64.0,"inner darkness, dark place, darkness, Darkness beneath, lurking darkness, dark abyss, dark energy, dark force, dark, Deep dark, dark moon, dark house, dark room, dark themes, dark time, dark twist, dark scheme, darkness vs. light, dark thoughts, dark tunnel","Dark realities, Dark interest, Dark realm, dark presence, Dark pasts, Darker truth, dark characters, Dark Prison, Dark Highway, Darkness to light, Dark Light, Dark Void",
leidenGamma64.0 to leidenGamma128.0,"inner darkness, dark place, darkness, Darkness beneath, lurking darkness, dark abyss, dark moon, dark house, dark room, dark themes, dark time, dark twist, dark scheme, darkness vs. light, dark thoughts, dark tunnel","dark energy, dark force, dark, Deep dark",
leidenGamma128.0 to leidenGamma256.0,"dark place, dark abyss, dark moon, dark house, dark room, dark themes, dark time, dark twist, dark scheme, dark thoughts, dark tunnel","inner darkness, darkness, Darkness beneath, lurking darkness, darkness vs. light",
leidenGamma256.0 to leidenGamma512.0,"dark place, dark abyss, dark house, dark room, dark twist, dark scheme","dark moon, dark themes, dark time, dark thoughts, dark tunnel",
leidenGamma512.0 to leidenGamma1024.0,"dark place, dark house, dark room, dark twist, dark scheme",dark abyss,


In [83]:
# Track the community around the stem label "lgbtq, lgbtq+" as gamma increases.
compare_gamma_for_theme(
    theme="lgbtq, lgbtq+",
    gammas=[32.0, 64.0, 128.0, 256.0, 512.0, 1024.0],
)

Unnamed: 0,final_themes,dropped_themes,added_themes
leidenGamma32.0,"lgbtq identity, lgbtq+ identity, Gay dating, Gay community, lgbtq, lgbtq+, LGBTQ struggles, LGBTQIA+ community, LGBTQ2IA+, Gay/Lesbian, LGBTQ activism, LGBTQ+ pageant",,
leidenGamma32.0 to leidenGamma64.0,"lgbtq identity, lgbtq+ identity, Gay dating, Gay community, lgbtq, lgbtq+, LGBTQ struggles, LGBTQIA+ community, LGBTQ2IA+, Gay/Lesbian, LGBTQ activism, LGBTQ+ pageant",,
leidenGamma64.0 to leidenGamma128.0,"lgbtq identity, lgbtq+ identity, lgbtq, lgbtq+, LGBTQ struggles, LGBTQIA+ community, LGBTQ2IA+, Gay/Lesbian, LGBTQ activism, LGBTQ+ pageant","Gay dating, Gay community",
leidenGamma128.0 to leidenGamma256.0,"lgbtq identity, lgbtq+ identity, lgbtq, lgbtq+, LGBTQ struggles, LGBTQIA+ community, LGBTQ2IA+, Gay/Lesbian, LGBTQ activism, LGBTQ+ pageant",,
leidenGamma256.0 to leidenGamma512.0,"lgbtq, lgbtq+, LGBTQIA+ community, LGBTQ2IA+, Gay/Lesbian, LGBTQ+ pageant","lgbtq identity, lgbtq+ identity, LGBTQ struggles, LGBTQ activism",
leidenGamma512.0 to leidenGamma1024.0,"lgbtq, lgbtq+, Gay/Lesbian, LGBTQ+ pageant","LGBTQIA+ community, LGBTQ2IA+",


In [84]:
# Track the community around the stem label "friendship, friendships" as gamma increases.
compare_gamma_for_theme(
    theme="friendship, friendships",
    gammas=[32.0, 64.0, 128.0, 256.0, 512.0, 1024.0],
)

Unnamed: 0,final_themes,dropped_themes,added_themes
leidenGamma32.0,"best friend, strong friendship, childhood friendship, male friendship, intergenerational friendship, female friendship, girl friendships, unusual friendship, friendship, friendships, friendship bond",,
leidenGamma32.0 to leidenGamma64.0,"best friend, strong friendship, male friendship, female friendship, girl friendships, unusual friendship, friendship, friendships, friendship bond","childhood friendship, intergenerational friendship",
leidenGamma64.0 to leidenGamma128.0,"best friend, strong friendship, male friendship, female friendship, girl friendships, unusual friendship, friendship, friendships, friendship bond",,
leidenGamma128.0 to leidenGamma256.0,"best friend, strong friendship, unusual friendship, friendship, friendships, friendship bond","male friendship, female friendship, girl friendships",
leidenGamma256.0 to leidenGamma512.0,"best friend, strong friendship, unusual friendship, friendship, friendships, friendship bond",,
leidenGamma512.0 to leidenGamma1024.0,"friendship, friendships, friendship bond","best friend, strong friendship, unusual friendship",


In [86]:
# Track the community around "high school friends" as gamma increases.
compare_gamma_for_theme(
    theme="high school friends",
    gammas=[32.0, 64.0, 128.0, 256.0, 512.0, 1024.0],
)

Unnamed: 0,final_themes,dropped_themes,added_themes
leidenGamma32.0,"best friends, school friends, close friends, high school friends, high school, high-school, highschool, college friends, High school hierarchy, rich friends, friend group",,
leidenGamma32.0 to leidenGamma64.0,"best friends, school friends, close friends, high school friends, high school, high-school, highschool, college friends, High school hierarchy, rich friends, friend group",,
leidenGamma64.0 to leidenGamma128.0,"school friends, high school friends, high school, high-school, highschool, college friends, High school hierarchy, friend group","best friends, close friends, rich friends",
leidenGamma128.0 to leidenGamma256.0,"school friends, high school friends, high school, high-school, highschool, college friends, High school hierarchy, friend group",,
leidenGamma256.0 to leidenGamma512.0,"school friends, high school friends, college friends, friend group","high school, high-school, highschool, High school hierarchy",
leidenGamma512.0 to leidenGamma1024.0,"school friends, high school friends, college friends, friend group",,


In [88]:
# Track the community around the stem label "musician, musicians" as gamma increases.
compare_gamma_for_theme(
    theme="musician, musicians",
    gammas=[32.0, 64.0, 128.0, 256.0, 512.0, 1024.0],
)

Unnamed: 0,final_themes,dropped_themes,added_themes
leidenGamma32.0,"Musician's journey, musician, musicians, Old musician, cult musician, busker, street musician, LA musician, guitarist, troubled musician",,
leidenGamma32.0 to leidenGamma64.0,"Musician's journey, musician, musicians, Old musician, cult musician, busker, street musician, LA musician, guitarist, troubled musician",,
leidenGamma64.0 to leidenGamma128.0,"Musician's journey, musician, musicians, Old musician, cult musician, busker, street musician, LA musician, guitarist, troubled musician",,
leidenGamma128.0 to leidenGamma256.0,"Musician's journey, musician, musicians, Old musician, cult musician, busker, street musician, LA musician, guitarist, troubled musician",,
leidenGamma256.0 to leidenGamma512.0,"Musician's journey, musician, musicians, Old musician, cult musician, busker, street musician, LA musician, guitarist, troubled musician",,
leidenGamma512.0 to leidenGamma1024.0,"Musician's journey, musician, musicians, LA musician, guitarist","Old musician, cult musician, busker, street musician, troubled musician",


In [89]:
# Track the community around the stem label "horse, horses" as gamma increases.
compare_gamma_for_theme(
    theme="horse, horses",
    gammas=[32.0, 64.0, 128.0, 256.0, 512.0, 1024.0],
)

Unnamed: 0,final_themes,dropped_themes,added_themes
leidenGamma32.0,"horse, horses, horse dreams, horseback, white horses, real horse",,
leidenGamma32.0 to leidenGamma64.0,"horse, horses, horse dreams, horseback, white horses, real horse",,
leidenGamma64.0 to leidenGamma128.0,"horse, horses, horse dreams, horseback, white horses, real horse",,
leidenGamma128.0 to leidenGamma256.0,"horse, horses, horse dreams, horseback, white horses, real horse",,
leidenGamma256.0 to leidenGamma512.0,"horse, horses, horse dreams, horseback, white horses, real horse",,
leidenGamma512.0 to leidenGamma1024.0,"horse, horses, horse dreams, horseback, white horses, real horse",,


# Write best Leiden communities to Neo4j
Based on the examples above, 512.0 seems like a good level for gamma.
Write the `leidenGamma512.0` property from the graph projection to the on-disk graph as `leidenId`.
Drop any leidenId values from a previous run.

In [91]:
# Clear any leidenId already stored on Theme and Stem nodes.
gds.run_cypher("MATCH (t:Theme|Stem) SET t.leidenId = null")
# Write the projection's leidenGamma512.0 values to the database under the
# property name leidenId.
gds.graph.nodeProperties.write(g_themes2, {"leidenGamma512.0": "leidenId"})

writeMillis                       59
graphName            reweight_themes
nodeProperties            [leidenId]
propertiesWritten               9319
Name: 0, dtype: object

Create ThemeGroup nodes connecting themes that are duplicates.

In [92]:
# Declare ThemeGroup.id as a node key. IF NOT EXISTS makes the cell safe to
# run again once the constraint is in place.
gds.run_cypher("""CREATE CONSTRAINT theme_group_node_key IF NOT EXISTS FOR (n:ThemeGroup) REQUIRE n.id IS NODE KEY""")

In [93]:
# One ThemeGroup per leidenId found on Theme nodes: themes are ordered by
# description, collected per leidenId, and the group's descriptions list is
# set to the collected descriptions. Every collected theme is then linked to
# its group with IN_GROUP (MERGE, so existing links are reused).
gds.run_cypher("""
MATCH (t:Theme)
WHERE t.leidenId IS NOT NULL
WITH t ORDER BY t.description
WITH t.leidenId AS id, collect(t) AS themeNodes, collect(t.description) AS descriptions
MERGE (g:ThemeGroup {id:id}) 
SET g.descriptions = descriptions
WITH g, themeNodes
FOREACH (t in themeNodes | MERGE (t)-[:IN_GROUP]->(g))
""")

In [94]:
# For every Stem with a leidenId: find or create the ThemeGroup with that id,
# append the stem's descriptions to the group's descriptions, and link each
# node that points at the stem via HAS_STEM into the group.
# NOTE(review): descriptions are appended, not replaced, so running this cell
# again adds the same stem descriptions a second time unless the group's
# descriptions were reset in between.
gds.run_cypher("""
MATCH (s:Stem)
WHERE s.leidenId IS NOT NULL
CALL
{
WITH s
MERGE (g:ThemeGroup {id:s.leidenId}) 
SET g.descriptions = coalesce(g.descriptions, []) + s.descriptions
WITH s, g
MATCH (s)<-[:HAS_STEM]-(t)
MERGE (t)-[:IN_GROUP]->(g)
}
""")

Create ThemeGroup nodes for Theme nodes that are not already grouped in a theme group if the theme is referenced in more than one document.

In [95]:
# Give every Theme that also carries the Groupable label and has no IN_GROUP
# relationship its own ThemeGroup. New ids continue after the current maximum
# ThemeGroup id (maxId + idx + 1), and the group's descriptions hold just that
# theme's description. Returns the number of rows processed.
# NOTE(review): max(g.id) is null when no ThemeGroup exists yet, so this
# presumably relies on the earlier cells having created groups - confirm.
gds.run_cypher("""
MATCH (g:ThemeGroup)
WITH max(g.id) AS maxId
MATCH (t:Theme&Groupable)
WHERE NOT EXISTS {(t)-[:IN_GROUP]->()}
WITH maxId, COLLECT(t) AS needsGroup
UNWIND range(0, size(needsGroup)-1) AS idx
WITH maxId, idx, needsGroup[idx] AS t
MERGE (g:ThemeGroup {id:maxId + idx + 1})
SET g.descriptions = [t.description]
WITH g, t
MERGE (t)-[:IN_GROUP]->(g)
RETURN COUNT(*) AS singleThemeGroups
""")

Unnamed: 0,singleThemeGroups
0,28183


In [96]:
# Stems without a leidenId each get their own ThemeGroup, numbered after the
# current maximum ThemeGroup id. The group's descriptions are copied from the
# stem, and every node pointing at the stem via HAS_STEM is linked to the
# group with IN_GROUP.
gds.run_cypher("""
MATCH (g:ThemeGroup)
WITH max(g.id) AS maxId
MATCH (s:Stem)
WHERE s.leidenId IS NULL
WITH maxId, COLLECT(s) AS needsGroup
UNWIND range(0, size(needsGroup)-1) AS idx
WITH maxId, idx, needsGroup[idx] AS s
CALL
{
WITH s, maxId, idx
MERGE (g:ThemeGroup {id:maxId + idx + 1}) 
SET g.descriptions = s.descriptions
WITH s, g
MATCH (s)<-[:HAS_STEM]-(t)
MERGE (t)-[:IN_GROUP]->(g)
}
""")

For the remaining Theme nodes that are not yet in a theme group (themes that appear in only one document), create ThemeGroup nodes and add a simple summary.

In [None]:
# Every Theme still lacking an IN_GROUP relationship gets its own ThemeGroup,
# numbered after the current maximum ThemeGroup id. The group stores the
# theme's description and a templated summary ("A movie about <description>").
# The writes are committed in batches of 10000 rows, and the new group ids and
# summaries come back as a DataFrame for embedding.
single_doc_df = gds.run_cypher("""
    MATCH (g:ThemeGroup)
    WITH max(g.id) AS maxId
    MATCH (t:Theme) WHERE NOT EXISTS {(t)-[:IN_GROUP]->()} 
    WITH maxId, COLLECT(t) AS needsGroup
    UNWIND range(0, size(needsGroup)-1) AS idx
    WITH maxId, idx, needsGroup[idx] AS t
    CALL
    {
    WITH t, maxId, idx
    MERGE (g:ThemeGroup {id:maxId + idx + 1})
    SET g.descriptions = [t.description]
    SET g.summary = "A movie about " + t.description
    MERGE (t)-[:IN_GROUP]->(g)
    RETURN g
    } IN TRANSACTIONS OF 10000 ROWS
    RETURN g.id AS groupId, g.summary AS summary""")

In [99]:
# Embedding client used below to embed the ThemeGroup summary strings.
embeddings = OpenAIEmbeddings(model="text-embedding-3-small", openai_api_key=openai_api_key)

In [None]:
# Embed every single-theme summary in one call and keep the vectors next to
# their group ids.
single_doc_df['embedding'] = embeddings.embed_documents(
    single_doc_df['summary'].to_list()
)

In [None]:
# Send one record per group (groupId + embedding) and store each vector on the
# matching ThemeGroup as summaryEmbedding, committing in batches of 10000 rows.
gds.run_cypher(
    """
UNWIND $data AS row
CALL
{
WITH row
MATCH (g:ThemeGroup {id:row['groupId']})
CALL db.create.setNodeVectorProperty(g, 'summaryEmbedding', row['embedding'])
} IN TRANSACTIONS OF 10000 ROWS
""",
    {"data": single_doc_df.loc[:, ['groupId', 'embedding']].to_dict(orient="records")},
)

## Test simple summaries for other groups

In [97]:
# Fill in a templated summary for every ThemeGroup that has none yet:
#   one description    -> "Movies about <d0>"
#   several            -> "Movies about <d0>, <d1>, ..., and <dLast>"
# The query writes g.summary and returns the affected group ids with their new
# summaries so they can be embedded next.
needs_summary_df = gds.run_cypher("""
MATCH (g:ThemeGroup) WHERE g.summary IS NULL
SET g.summary = 
    CASE WHEN size(g.descriptions) = 1 THEN "Movies about " + g.descriptions[0]
    ELSE reduce(s = "Movies about " + g.descriptions[0], d in g.descriptions[1..-1] | s + ", " + d) + ", and " + g.descriptions[-1]
    END
RETURN g.id AS groupId, g.summary AS summary""")

In [100]:
# Embed the templated summaries in one call and keep the vectors next to
# their group ids.
needs_summary_df['embedding'] = embeddings.embed_documents(
    needs_summary_df['summary'].to_list()
)



In [101]:
# Send one record per group (groupId + embedding) and store each vector on the
# matching ThemeGroup as summaryEmbedding, committing in batches of 10000 rows.
gds.run_cypher(
    """
UNWIND $data AS row
CALL
{
WITH row
MATCH (g:ThemeGroup {id:row['groupId']})
CALL db.create.setNodeVectorProperty(g, 'summaryEmbedding', row['embedding'])
} IN TRANSACTIONS OF 10000 ROWS
""",
    {"data": needs_summary_df.loc[:, ['groupId', 'embedding']].to_dict(orient="records")},
)

# Clean up projections

In [102]:
# Drop the graph projection referenced by g_themes; it is not used after this point.
gds.graph.drop(g_themes)

graphName                                                                                                                                                                                                                                                                                                                                                                                                                        themes
database                                                                                                                                                                                                                                                                                                                                                                                                                          neo4j
databaseLocation                                                                                                                                        

In [103]:
# Drop the graph projection referenced by g_themes2, which held the
# leidenGamma* properties streamed and written above.
gds.graph.drop(g_themes2)

graphName                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                               

In [104]:
# Strip the Groupable label from every node that still carries it.
gds.run_cypher("MATCH (g:Groupable) REMOVE g:Groupable")

In [105]:
# Drop the index named groupbale_vectors. IF EXISTS keeps this cleanup cell
# re-runnable, in line with the IF NOT EXISTS used by the schema statements
# elsewhere in this notebook.
# NOTE(review): the name is spelled "groupbale"; it has to match the name the
# index was created under - confirm against the cell that creates it.
gds.run_cypher("DROP INDEX groupbale_vectors IF EXISTS")

# Add mean vector representation to each ThemeGroup

In [106]:
# For every ThemeGroup, collect the embedding property of each node linked to
# it through IN_GROUP; one row per group id.
group_embeddings_df = gds.run_cypher("""
MATCH (g:ThemeGroup)<-[:IN_GROUP]-(t)
RETURN g.id AS groupId, collect(t.embedding) AS embeddings
""")

In [107]:
# Apply get_mean_vector to each row and store the result as mean_embedding.
# NOTE(review): get_mean_vector is defined earlier in the notebook; presumably
# it averages the row's `embeddings` list - confirm.
group_embeddings_df['mean_embedding'] = group_embeddings_df.apply(get_mean_vector, axis=1)

In [108]:
# Send one record per group (groupId + mean_embedding) and store each vector
# on the matching ThemeGroup as meanEmbedding, committing in batches of 10000 rows.
gds.run_cypher(
    """
    UNWIND $data AS row
    CALL
    {
    WITH row
    MATCH (g:ThemeGroup {id: row['groupId']})
    CALL db.create.setNodeVectorProperty(g, "meanEmbedding", row['mean_embedding'])
    } IN TRANSACTIONS OF 10000 ROWS""",
    {"data": group_embeddings_df.loc[:, ['groupId', 'mean_embedding']].to_dict(orient="records")},
)


In [109]:
# Vector index over ThemeGroup.meanEmbedding: 1536 dimensions, cosine
# similarity. IF NOT EXISTS makes the cell safe to run again.
gds.run_cypher("""CREATE VECTOR INDEX theme_group_mean_vectors IF NOT EXISTS 
                  FOR (t:ThemeGroup)
                  ON (t.meanEmbedding)
                  OPTIONS {indexConfig: 
                      {`vector.dimensions`: 1536,
                       `vector.similarity_function`: 'cosine'
                       }}
                       """)