In [32]:
import json
import os

import networkx as nx

In [33]:
# Load the two movie datasets (each is a JSON list of movie records).
with open("data/tmdb_movies.json", "r", encoding="utf-8") as f:
    movies = json.load(f)

with open("data/recommended_movie_data.json", "r", encoding="utf-8") as f:
    recommended_data = json.load(f)

# Build lookups keyed by movie id.
recommended_lookup = {movie["id"]: movie for movie in recommended_data}
movie_dict = {movie["id"]: movie for movie in movies}

# Merge recommended movies into a unified mapping, but only if not already
# present: when an id exists in both files, the record from tmdb_movies.json
# is kept. A plain {**movie_dict, **recommended_lookup} would do the opposite
# and let the recommended record replace it. Iteration order is unchanged:
# movie_dict entries first, then the recommended-only entries.
all_movies = {
    **movie_dict,
    **{
        movie_id: movie
        for movie_id, movie in recommended_lookup.items()
        if movie_id not in movie_dict
    },
}

In [34]:
def countries_parser(movie):
    """Collect every country code attached to a movie record, lower-cased.

    Codes are read from three places:
      * ``origin_country`` -- iterated directly for string codes,
      * ``production_countries`` -- the ``iso_3166_1`` field of each dict entry,
      * ``production_companies`` -- the ``origin_country`` field of each dict entry.

    Args:
        movie: Movie record as a dict. Any of the three keys may be missing
            or set to ``None``; both are treated as "no entries".

    Returns:
        set[str]: The lower-cased codes. Non-dict list entries and non-string
        codes are skipped. An empty-string code in the record is kept as
        ``""``, so callers that count codes should filter it out.
    """
    # `or []` makes a key that is present but null behave like a missing key;
    # iterating None would raise TypeError.
    origin_codes = movie.get("origin_country") or []

    production_codes = [
        entry.get("iso_3166_1")
        for entry in movie.get("production_countries") or []
        if isinstance(entry, dict)
    ]

    company_codes = [
        entry.get("origin_country")
        for entry in movie.get("production_companies") or []
        if isinstance(entry, dict)
    ]

    # A missing field comes back from .get() as None and is dropped by the
    # isinstance filter, together with any other non-string value.
    return {
        code.lower()
        for code in (*origin_codes, *production_codes, *company_codes)
        if isinstance(code, str)
    }


def get_all_countries(all_movies):
    """Count, per country code, how many movies carry that code.

    Args:
        all_movies: Mapping whose values are movie records; the keys are
            ignored.

    Returns:
        dict: Lower-cased country code -> number of movies whose
        ``countries_parser`` result contains it. Empty and non-string codes
        are not counted.
    """
    tally = {}
    for record in all_movies.values():
        for code in countries_parser(record):
            if not isinstance(code, str):
                continue
            key = code.lower()
            if not key:
                # Skip blank codes.
                continue
            tally[key] = tally.get(key, 0) + 1
    return tally

def isBrazilianRelated(movie):
    """Tell whether ``"br"`` is among the movie's country codes.

    Args:
        movie: Movie record, passed straight to ``countries_parser``.

    Returns:
        bool: True when the set returned by ``countries_parser`` contains
        ``"br"``, False otherwise.
    """
    return "br" in countries_parser(movie)


In [35]:
def get_features(movie):
    """Extract the feature sets used to compare two movies.

    Args:
        movie: Movie record as a dict. A section that is missing or set to
            ``None`` (``genres``, ``credits`` and its ``cast``/``crew``,
            ``keywords`` and its inner ``keywords`` list,
            ``production_companies``, ``production_countries``) is treated
            as empty.

    Returns:
        tuple: Six sets, in this order: genre names, cast ids, crew ids,
        keyword ids, production-company ids, and production-country
        ``iso_3166_1`` codes.

    Raises:
        KeyError: If an entry in one of the lists lacks the field being read.
    """
    def _collect(entries, field):
        # `or []` turns a null list into "no entries" instead of a TypeError.
        return {entry[field] for entry in entries or []}

    # Same null tolerance for the two nested containers.
    credits = movie.get("credits") or {}
    keyword_block = movie.get("keywords") or {}

    return (
        _collect(movie.get("genres"), "name"),
        _collect(credits.get("cast"), "id"),
        _collect(credits.get("crew"), "id"),
        _collect(keyword_block.get("keywords"), "id"),
        _collect(movie.get("production_companies"), "id"),
        _collect(movie.get("production_countries"), "iso_3166_1"),
    )

In [None]:
G = nx.DiGraph()

# One node per movie in the merged collection, keyed by movie id.
for movie_id, movie in all_movies.items():
    # `or ""` also covers a key that is present but null: None has no
    # .strip(), and the GEXF writer only accepts a fixed set of attribute
    # value types.
    title = (movie.get("title") or "").strip()
    release_date = movie.get("release_date") or ""
    genres, _, _, keywords, _, countries = get_features(movie)

    G.add_node(
        movie_id,
        title=title,
        label=title,
        brazilian=isBrazilianRelated(movie),
        release_date=release_date,
        # Sets have no stable order; sorting gives the same attribute text
        # on every run.
        genres=",".join(sorted(genres)),
        # True for movies that come from the primary dataset (movie_dict).
        is_top=(movie_id in movie_dict),
        # get_features returns keyword *ids*; str.join raises TypeError on
        # non-str items, so convert first (TMDB ids are integers -- confirm
        # against the data files).
        keywords=",".join(sorted(str(keyword_id) for keyword_id in keywords)),
    )

In [40]:
# Parse every movie's feature sets once up front, instead of re-parsing the
# target movie for each recommendation that points at it.
features_by_id = {
    movie_id: get_features(movie) for movie_id, movie in all_movies.items()
}

# Add a directed edge movie -> recommended movie, weighted by feature overlap.
for id1, movie1 in all_movies.items():
    features1 = features_by_id[id1]

    # `or` fallbacks treat a null "recommendations"/"results" like a missing one.
    recs = (movie1.get("recommendations") or {}).get("results") or []
    for rec in recs:
        id2 = rec.get("id")
        # Only link to movies we hold a record for.
        if id2 not in all_movies:
            continue

        features2 = features_by_id[id2]

        # Number of shared values per feature set, in get_features order.
        shared_counts = [
            len(f1 & f2) for f1, f2 in zip(features1, features2)
        ]
        weight = sum(shared_counts)

        # Recommendations with no shared feature get no edge.
        if weight > 0:
            G.add_edge(id1, id2, weight=weight, label=str(weight))

In [41]:
# Create the output folder first so the export also works on a fresh checkout
# where "networks/" does not exist yet.
os.makedirs("networks", exist_ok=True)
nx.write_gexf(G, "networks/movies.gexf")