In [458]:
import os
import gzip
import json
import pandas as pd
import numpy as np
from collections import defaultdict
from joblib import Parallel, delayed
from tqdm import tqdm
from tqdm_joblib import tqdm_joblib
import pandas as pd
import nltk

from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.sentiment import SentimentIntensityAnalyzer

import netwulf as nw
import networkx as nx
from networkx.readwrite import json_graph

import community as community_louvain
import matplotlib.pyplot as plt

import ast
from collections import defaultdict, Counter
import re

#for louvain algorithm
np.random.seed(seed=100)

In [4]:
with open("network.json", "r") as f:
    graph_data = json.load(f)

Graph = json_graph.node_link_graph(graph_data)

The default value will be changed to `edges="edges" in NetworkX 3.6.


  nx.node_link_graph(data, edges="links") to preserve current behavior, or
  nx.node_link_graph(data, edges="edges") for forward compatibility.


# Partitioning into communities:

In [327]:
partition = community_louvain.best_partition(Graph)

In [None]:

communities = defaultdict(list)
for node, comm_id in partition.items():
    communities[comm_id].append(node)

communities = defaultdict(list, {
    k: v for k, v in communities.items() if len(v) > 10
})

for comm_id, members in communities.items():
    print(f"Community {comm_id}: {members}")

Community 0: ['B0002J58ME', 'B001F76OKC', 'B0042FDCMW', 'B00B5LIE4C', 'B003X2O79W', 'B00BSB9NDK', 'B006IRQU5S', 'B001OSC4FG', 'B00612FR1I', 'B003ULW71Q', 'B000U0C9UE', 'B00012FNBI', 'B000BR4UQ2', 'B000I2JCB8', 'B00012FNB8', 'B003AFPWI2', 'B07P67V245', 'B0089PCXAS', 'B0009WFFPO', 'B00R8HWMEU', 'B00KE77JJ4', 'B005BX3O0O', 'B00049QPII', 'B00FRPK4GI', 'B00HUAGZKW', 'B00247XURC', 'B00KASQQTQ', 'B000RL21OA', 'B003HC9JJ6', 'B00429C1QQ', 'B08MV1RJXG', 'B08M2M6WK2', 'B07K7SSSPM', 'B07KGJX6N7', 'B07KN9TM7K', 'B07KNB6D5F', 'B07G3KRYZK', 'B07DS8M656', 'B076ZS4LLX', 'B0765925D8', 'B00B2TWDWG', 'B00AZ2N906', 'B00IIOLQXU', 'B00QX5FAQA', 'B01GKKI38Y', 'B01BKTG026', 'B0044CM7YC', 'B00FMO7QIS', 'B00PRABXFO', 'B000059H6M', 'B0009WIY4S', 'B00FXCIS90', 'B00OZOIIYM', 'B00E8A0PNK', 'B0061S2T6S', 'B00R0HUEWK', 'B07DB1LKTH', 'B01M6ATZSG', 'B06Y12RK1V', 'B0012YN35M', 'B00MU1YEWW', 'B00007D00L', 'B00467182C', 'B00W85TAHE', 'B005GYXNT0', 'B000WC38CS', 'B00005N8AM', 'B0006BHP9M', 'B00006FMB2', 'B00009YXBK', 'B0000

Community 1 and 2 are still very large so we should split those into subgraphs to try and find more meaningful communities:

In [333]:
def print_community_sizes(community_dict):
    for community_id, asin_list in community_dict.items():
        print(f"Community {community_id}: {len(asin_list)} items")

print_community_sizes(communities)

Community 0: 1050 items
Community 1: 17149 items
Community 2: 13726 items
Community 3: 911 items
Community 4: 615 items
Community 5: 491 items
Community 6: 115 items
Community 7: 223 items


In [334]:
community1_sub_nodes = [node for node, comm in partition.items() if comm == 1]
community2_sub_nodes = [node for node, comm in partition.items() if comm == 2]

# Extract subgraph
subgraph1 = Graph.subgraph(community1_sub_nodes).copy()
subgraph2 = Graph.subgraph(community2_sub_nodes).copy()

# Run Louvain again on the subgraph
subgraph1_communities = community_louvain.best_partition(subgraph1)
subgraph2_communities = community_louvain.best_partition(subgraph2)

In [340]:
# First sub graph:
sub_communities1 = defaultdict(list)
for node, comm_id in subgraph1_communities.items():
    sub_communities1[comm_id].append(node)

sub_communities1 = defaultdict(list, {
    k: v for k, v in sub_communities1.items() if len(v) > 10
})

# Second sub graph:
sub_communities2 = defaultdict(list)
for node, comm_id in subgraph2_communities.items():
    sub_communities2[comm_id].append(node)

sub_communities2 = defaultdict(list, {
    k: v for k, v in sub_communities1.items() if len(v) > 10
})

In [None]:
def merge_communities_sequential(partition, sub_communities1, sub_communities2, min_size=10):
    final = {}
    used_nodes = set()

    # Step 1: Add original communities, excluding 1 and 2
    original_communities = defaultdict(list)
    for node, comm_id in partition.items():
        if comm_id not in {1, 2}:
            original_communities[comm_id].append(node)

    for comm_id, nodes in original_communities.items():
        unique_nodes = [n for n in nodes if n not in used_nodes]
        if len(unique_nodes) >= min_size:
            final[comm_id] = unique_nodes
            used_nodes.update(unique_nodes)

    # Step 2: Add sub-communities from the two splits
    next_id = max(final.keys(), default=0) + 1
    for nodes in list(sub_communities1.values()) + list(sub_communities2.values()):
        unique_nodes = [n for n in nodes if n not in used_nodes]
        if len(unique_nodes) >= min_size:
            final[next_id] = unique_nodes
            used_nodes.update(unique_nodes)
            next_id += 1

    return dict(final)

communities = merge_communities_sequential(
    partition,          # Original
    sub_communities1,   # From splitting community 1 
    sub_communities2,   # From splitting community 2
    min_size=1000
)

#We are left with the 6 largest communities in the graph:
print_community_sizes(communities)


Community 0: 1050 items
Community 1: 1580 items
Community 2: 3928 items
Community 3: 3757 items
Community 4: 2475 items
Community 5: 1669 items
Community 6: 1230 items


# Finding top movie titles in each community based on genre:

In [None]:
#This function parses the categories column, we will use this later to construct a whitelist
#of terms to catch genres:
def extract_unique_category_terms(df, column='categories'):
    all_terms = set()
    
    for raw_entry in df[column].dropna():
        try:
            # If the entry is stringified like a list, we safely convert it back to a list
            parsed = eval(raw_entry)
            if isinstance(parsed, list):
                all_terms.update(parsed)
        except Exception as e:
            print(f"Error parsing entry: {raw_entry}, Error: {e}")
    
    return all_terms

In [358]:
df = pd.read_csv("Merged_Reviews_and_Metadata.csv")
all_category_terms = extract_unique_category_terms(df)

  df = pd.read_csv("Merged_Reviews_and_Metadata.csv")


In [None]:
#These are all unique terms in the column:
print(all_category_terms)

{'Malicious', 'Warner Home Video', 'Joel, Billy', 'Special Interests', 'Nail-Biting', 'All Sony Pictures Titles', 'MTV', '20th Century Fox Home Entertainment', 'Strange', 'Movies & TV Halloween Store', 'Contemplative', 'Introspective', 'Queen', 'Classics Kids Love', 'MGM Home Entertainment', 'Criterion Collection', 'Dreamlike', 'Harrowing', 'Discovery Channel', 'Studio Specials', 'Drama', 'Joyous', 'Fully Loaded DVDs', 'All', 'Anime', 'Monthly Deals', 'Military and War', 'The Comedy Central Store', 'Westerns', 'Dark', 'Adventure', 'Shakespeare on DVD Store', 'Educational', 'Family Features', 'Fox Featured Deals', 'Warner DVD & Blu-ray Deals', 'Exotic', 'All HBO Titles', 'Disney Channel', 'Fitness & Yoga', 'All FX Shows', 'Historical', 'Prince', 'Science Fiction & Fantasy', 'Timeless Holiday Favorites', 'Yoga', 'Television', 'Harry Potter and the Goblet of Fire', 'MOD CreateSpace Video', 'Box Sets', 'Art House & International', 'The Big DVD Sale', 'Oscar Nominees', 'Emmy Nominees', 'Osc

In [None]:
#Helper function to speed up the next code block
def build_asin_to_categories(df):
    """Preprocess the dataframe to create a fast lookup dictionary."""
    asin_to_categories = {}
    for _, row in df.iterrows():
        asin = row['parent_asin']
        try:
            categories = ast.literal_eval(row['categories']) if pd.notna(row['categories']) else []
        except (ValueError, SyntaxError, TypeError):
            categories = []
        asin_to_categories[asin] = categories
    return asin_to_categories

asin_to_categories = build_asin_to_categories(df)

In [None]:
# Using an LLM to parse for genre-like terms + manual curation to get the list:
valid_genres = [
    'action', 'adventure', 'comedy', 'drama', 'fantasy', 'sci-fi', 'science fiction',
    'horror', 'thriller', 'romance', 'mystery', 'animation', 'documentary', 'family', 
    'musicals', 'biography', 'historical', 'crime', 'music', 'war', 'western', 'kids', 
    'educational', 'reality', 'anime', 'fitness', 'yoga', 'exercise', 'sports', 
]

# This function counts the number of matches from valid_genres in the categories column
# for a certain community:
def get_top_genres_for_community(community_asins, asin_to_categories, top_n):
    genre_counter = Counter()

    for asin in community_asins:
        categories = asin_to_categories.get(asin, [])
        for cat in categories:
            cat_lower = cat.lower()
            for genre in valid_genres:
                if genre in cat_lower:
                    genre_counter[genre] += 1

    return dict(genre_counter.most_common(top_n))

# Try it out here by putting in a community id:
# We can also use this code to get some future plots of the actual statistics
# If that makes sense, like make a for loop for each id.
community_id = 1
asin_list = communities[community_id]
top_genres = get_top_genres_for_community(asin_list, asin_to_categories, 10)
print(top_genres)

{'action': 168, 'adventure': 140, 'science fiction': 122, 'comedy': 120, 'drama': 117, 'fantasy': 80, 'family': 77, 'kids': 74, 'war': 71, 'music': 51}


In [None]:
#Future code we will use to make the image plots:

# Load the metadata dataset that contains image URLs
meta_df = pd.read_csv('meta_Movies_and_TV_Full.csv')

asin_to_image = {}

for _, row in meta_df.iterrows():
    asin = row['parent_asin']
    image_data = row['images']
    
    # Check if the image_data is a list of dictionaries
    if isinstance(image_data, str):
        try:
            image_urls = eval(image_data)
            for image in image_urls:
                if isinstance(image, dict):  # Check if it's a dictionary
                    if 'large' in image:
                        asin_to_image[asin] = image['large']
                        break
                    elif '360w' in image:
                        asin_to_image[asin] = 'no image' #These images aren't very good so left out
                        break
        except (ValueError, SyntaxError, TypeError):
            continue

  meta_df = pd.read_csv('meta_Movies_and_TV_Full.csv')


In [None]:
# We can grab the image URL for any ASIN here, only works for "Movies", Prime Video 
# Doesn't have proper images but random stills from the movie, left out for now.

asin_example = 'B001JV5BF8'
image_url = asin_to_image.get(asin_example, 'Invalid Key')
print(f"Image URL for ASIN {asin_example}: {image_url}")

Image URL for ASIN B001JV5BF8: https://m.media-amazon.com/images/I/515qo5iCUzL.jpg


In [None]:
# This code uses regular expressions to filter out titles in order to avoid duplicates
# In the get_top_titles functions:
def normalize_title(title):
    if not isinstance(title, str):
        return ''
    title = title.lower()
    
    # Keep part after comma if it exists and the part after is short (likely the title)
    if ',' in title:
        parts = title.split(',')
        if len(parts[1].strip().split()) <= 4:
            title = parts[1]
    
    title = re.sub(r'\[.*?\]', '', title)              # remove [DVD], [Blu-ray], etc.
    title = re.sub(r'\(.*?edition\)', '', title)       # remove (Special Edition), etc.
    title = re.sub(r'\(.*?format\)', '', title)        # remove other bracketed formats
    title = re.sub(r'[^a-z0-9\s]', '', title)          # remove punctuation
    title = re.sub(r'\s+', ' ', title).strip()         # normalize whitespace
    return title

# Simply returns the top titles, based on the rating_number
# Probably not going to use this function much, but it's nice to have.
def get_top_titles_for_community(community_asins, df, top_n=10):
    df['rating_number'] = df['rating_number'].apply(pd.to_numeric, errors='coerce')
    matching_rows = df[df['parent_asin'].isin(community_asins)]
    
    deduped = matching_rows.sort_values('rating_number', ascending=False).drop_duplicates('parent_asin')
    
    top_rows = deduped.sort_values('rating_number', ascending=False)
    top_rows = top_rows[top_rows['movie_title'].notna()]
    
    seen = set()
    unique_top_titles = []
    
    for _, row in top_rows.iterrows():
        norm_title = normalize_title(row['movie_title'])
        if norm_title not in seen:
            seen.add(norm_title)
            unique_top_titles.append((row['movie_title'], row['rating_number']))
        if len(unique_top_titles) >= top_n:
            break

    return unique_top_titles


In [None]:
# Test it here:
community_id = 10
asin_list = communities[community_id]
top_titles = get_top_titles_for_community(asin_list, df)
for t in top_titles:
    print(t)

('Elf (DVD)', 123483.0)
('Buena Vista Home Video, Hocus Pocus', 85704.0)
('Bohemian Rhapsody [Blu-ray]', 81249.0)
('Once upon a Time in Hollywood', 81153.0)
('News of the World [DVD]', 59395.0)
('The Nightmare Before Christmas', 54779.0)
('Get Out [DVD]', 52876.0)
("Avatar (Three-Disc Extended Collector's Edition)", 52341.0)
('A Beautiful Day in the Neighborhood', 47823.0)
('Fantastic Beasts: The Crimes of Grindelwald (DVD)', 45964.0)


In [None]:
# This code block filters the top tiles WITH the top genre to get a better
# overview of the type of movie in a community. Why? Because the communities generally
# Share genres like action/drama so the same types of movies will otherwise dominate
# in every communities top hits.

def get_top_titles_by_genre_for_community(community_asins, df, asin_to_categories, top_n=10):
    # First get the top genre for this community
    top_genres = get_top_genres_for_community(community_asins, asin_to_categories, 1)
    top_genre = next(iter(top_genres.keys()))
    
    # Get matching rows for this community
    matching_rows = df[df['parent_asin'].isin(community_asins)]
    
    # Filter rows that have the top genre in their categories
    filtered_rows = []
    for _, row in matching_rows.iterrows():
        categories = asin_to_categories.get(row['parent_asin'], [])
        categories_lower = [cat.lower() for cat in categories]
        if any(top_genre in cat for cat in categories_lower):
            filtered_rows.append(row)
    
    # Create a DataFrame from filtered rows
    filtered_df = pd.DataFrame(filtered_rows)
    
    # Deduplicate and sort by rating_number
    deduped = filtered_df.sort_values('rating_number', ascending=False).drop_duplicates('parent_asin')
    top_rows = deduped.sort_values('rating_number', ascending=False)
    top_rows = top_rows[top_rows['movie_title'].notna()]
    
    # Get unique titles
    seen = set()
    unique_top_titles = []
    
    for _, row in top_rows.iterrows():
        norm_title = normalize_title(row['movie_title'])
        if norm_title not in seen:
            seen.add(norm_title)
            unique_top_titles.append((row['movie_title'], row['rating_number']))
        if len(unique_top_titles) >= top_n:
            break

    return unique_top_titles, top_genre

In [465]:
community_id = 6
asin_list = communities[community_id]
top_titles, top_genre = get_top_titles_by_genre_for_community(asin_list, df, asin_to_categories, 10)

print(f"Top {top_genre} movies in community {community_id}:")
for title, rating_count in top_titles:
    print(f"  > {title} (Ratings: {rating_count})")

Top science fiction movies in community 6:
  > Star Trek (Two-Disc Edition) (Ratings: 28422.0)
  > Shrek 2 (Widescreen Edition) (Ratings: 25002.0)
  > Iron Man (Ultimate 2 Disc Edition) (Ratings: 21704.0)
  > Unbreakable (Two-Disc Vista Series) (Ratings: 20912.0)
  > Blade Runner (Four-Disc Collector's Edition) (Ratings: 20471.0)
  > Transformers: Revenge of the Fallen (Ratings: 19375.0)
  > Star Wars: The Original Trilogy (Episodes IV-VI) [Region 2 DVD] (Ratings: 17015.0)
  > Armageddon (Ratings: 16047.0)
  > War of the Worlds (Ratings: 15510.0)
  > The Last Starfighter (Ratings: 12599.0)


Community 0: training style movies. 

Community 1: popular action type movies.

Community 2: More popular action movies, not sure what the difference is with 1.

Community 3: Comedy movies, christmas movies are the most popular

Community 4: Drama stuff

Community 5: More drama stuff.

Community 6: Predominantly Science Fiction adjacent stuff (and shrek 2)