In [None]:
#
# Which two actors starred the most together across the dataset?
#
import pandas as pd
import polars as pl
import numpy as np
import networkx as nx
import json
import time
import os
from rtsvg import *
# Base filename handed to the ontology framework (kept as a named constant
# so the data location is easy to spot and change)
ONTOLOGY_BASE_FILENAME = '../../../data/kaggle_imdb_600k/20240519_ontology'

# RACETrack instance used for all rendering / graph helpers below
rt  = RACETrack()
# Ontology framework instance built from the base filename above
ofi = rt.ontologyFrameworkInstance(base_filename=ONTOLOGY_BASE_FILENAME)

In [None]:
# Plot the degree distribution of the 'castMemberOf' graph
# ... only nodes with degree below 100 are shown
df = ofi.df_triples.filter(pl.col('vrb') == 'castMemberOf')
g  = rt.createNetworkXGraph(df, [('sbj','obj','vrb')])
# g.degree() iterates (node, degree) pairs in node order
degrees = [_deg_ for _node_, _deg_ in g.degree()]
_df_degrees_   = pl.DataFrame({'degree':degrees})
_df_under_100_ = _df_degrees_.filter(pl.col('degree') < 100)
rt.xy(_df_under_100_, 'degree', 'degree', render_x_distribution=100, w=1024,
      distribution_style='inside', distribution_h_perc=1.0)

In [None]:
# Repeatedly strip every node whose degree is 0 or 1 from g (in place)
# ... removing a node can drop its neighbors' degrees, so keep going
#     until a pass finds nothing left to remove
# ... the size of each pass's removal list is printed as progress
print(f'\n{len(g.nodes())=} | {len(g.edges())=}')
while True:
    to_be_removed = [_node_ for _node_, _deg_ in g.degree() if _deg_ <= 1]
    print(f'{len(to_be_removed)}', end='... ')
    if not to_be_removed:
        break
    g.remove_nodes_from(to_be_removed)
print(f'\n{len(g.nodes())=} | {len(g.edges())=}')

In [None]:
# Plot the degree distribution again, now for the current state of g
# ... only nodes with degree below 100 are shown
degrees = [_deg_ for _node_, _deg_ in g.degree()]
_df_degrees_   = pl.DataFrame({'degree':degrees})
_df_under_100_ = _df_degrees_.filter(pl.col('degree') < 100)
rt.xy(_df_under_100_, 'degree', 'degree', render_x_distribution=100, w=1024,
      distribution_style='inside', distribution_h_perc=1.0)

In [None]:
# Build an ascending list of (degree, id) tuples for the unique 'sbj' values
# ... _counts_      -- how many entries the degree view yielded
# ... _max_degrees_ -- the largest degree seen (0 if there were none)
_as_list_ = list(df['sbj'].unique())
_degrees_ = nx.degree(g, _as_list_)
_sorter_  = sorted((_deg_, _node_) for _node_, _deg_ in _degrees_)
_counts_  = len(_sorter_)
# Ascending order puts the highest degree in the last tuple
_max_degrees_ = _sorter_[-1][0] if _sorter_ else 0
_counts_, _max_degrees_

In [None]:
# Do the neighbor intersections from highest degree to lowest
# ... _sorter_ is ascending, so walk it from the back; each node is compared
#     against every node already visited (i.e., those at higher indices)
# ... early terminate once the degree of the node under current focus is
#     below the max intersection found -- an intersection can never be
#     larger than either node's degree, so nothing lower in the ordering
#     could exceed the max intersection
# ... this doesn't consider the total number of movies each has been
#     in ... just the ones that have been in the most together...
# ... if no pair shares a neighbor, actor_id_1 / actor_id_2 stay 0
i       = len(_sorter_)-1
nbor_lu, nbor_calcs_performed = {}, 0
# Set form of each neighbor list, built once per node so the inner loop
# doesn't re-create the same sets for every pair
_nbor_sets_ = {}
max_intersection_found, actor_id_1, actor_id_2 = 0, 0, 0
while i >= 0:
    _degree_i_, _node_i_ = _sorter_[i]
    nbor_lu[_node_i_] = list(nx.neighbors(g, _node_i_))
    _set_i_ = _nbor_sets_[_node_i_] = set(nbor_lu[_node_i_])
    nbor_calcs_performed += 1
    for j in range(i+1, len(_sorter_)):
        _node_j_ = _sorter_[j][1]
        _shared_ = len(_set_i_ & _nbor_sets_[_node_j_])
        # Strictly greater: on ties the first pair found is kept
        if _shared_ > max_intersection_found:
            max_intersection_found = _shared_
            actor_id_1 = _node_i_
            actor_id_2 = _node_j_
    if _degree_i_ < max_intersection_found:
        break
    i -= 1
len(nbor_lu.keys()), len(_sorter_), nbor_calcs_performed

In [None]:
# Display the results: the two ids with the largest neighbor intersection
# found by the search above, and the size of that intersection
# ... both ids are still 0 if the search never found a shared neighbor
actor_id_1, actor_id_2, max_intersection_found

In [None]:
# Draw the localized graph for those two actors
# ... keep only the triples whose subject is one of the two ids
df_show_it = df.filter((pl.col('sbj') == actor_id_1) | (pl.col('sbj') == actor_id_2))
g_show_it  = rt.createNetworkXGraph(df_show_it, [('sbj','obj','vrb')])
# ... spring_layout starts from random positions; a fixed seed makes the
#     picture identical on every Restart & Run All (the value is arbitrary)
g_pos      = nx.spring_layout(g_show_it, seed=0)
rt.linkNode(df_show_it, [('sbj','obj','vrb')], g_pos, link_arrow=False, w=1024, h=768)

In [None]:
# Index ofi.uid_lu with the two ids found above
# ... presumably maps the framework's uid to the original identifier -- TODO confirm
ofi.uid_lu[actor_id_1], ofi.uid_lu[actor_id_2]

In [None]:
# Index ofi.labeling_uids with the two ids found above
# ... presumably returns the display label for each uid -- TODO confirm
ofi.labeling_uids[actor_id_1], ofi.labeling_uids[actor_id_2]

In [None]:
# Index ofi.labeling_sbjs with two hard-coded string keys
# NOTE(review): these keys are literals, not derived from actor_id_1 /
# actor_id_2 -- presumably copied from an earlier cell's output; they will
# go stale if the data or the search result changes -- TODO confirm / derive
ofi.labeling_sbjs['nm0046850'], ofi.labeling_sbjs['nm0006982']