In [1]:
# Standard library imports
import os
import gzip
import json
import ast
import re
from collections import defaultdict, Counter

# Data manipulation and numerical operations
import numpy as np
import pandas as pd

# Parallel processing and progress tracking
from joblib import Parallel, delayed
from tqdm import tqdm
from tqdm_joblib import tqdm_joblib

# Natural Language Processing
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer

# Graph and network analysis
import networkx as nx
from networkx.readwrite import json_graph
import netwulf as nw
import community as community_louvain  # Louvain algorithm for community detection

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Fetch the VADER lexicon used by SentimentIntensityAnalyzer
# (NLTK reports "already up-to-date" when the package is present).
nltk.download('vader_lexicon')

# Seed NumPy's global RNG so NumPy-based sampling in this notebook is repeatable.
# NOTE(review): this only affects code that draws from NumPy's global generator.
# Whether the Louvain implementation does so is not visible here; confirm, or
# pass an explicit random_state to the Louvain call instead.
np.random.seed(seed=100)


  from tqdm.autonotebook import tqdm
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Friis\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


# Part 1
Motivation


# Part 2

Our dataset is a subset of the "Amazon Reviews" dataset collected in 2023 by McAuley Lab:

Link to dataset: https://amazon-reviews-2023.github.io/

We are not interested in the entire set of 571M+ reviews; we look specifically at the "Movies_and_TV" subset. Each subset is split into two files, one with the reviews and one with the product metadata, and we convert both to .csv below.

The review file contains - as the name suggests - the reviews themselves. With zero cleaning, the combined dataset is around 7GB. It is almost immediately clear that large portions of the dataset are not relevant to us, so we shrink it to get a much cleaner and more potent dataset for the purpose of creating the co-reviewer graph.

Firstly, an overview of the columns in each dataset:

| Reviews             | Description                                                                 |
|--------------------|-----------------------------------------------------------------------------|
| `rating`           | Star rating given by the reviewer (e.g., 1 to 5)                            |
| `title`            | Title or headline of the review                                             |
| `text`             | The main body text of the review                                            |
| `images`           | Image URLs or attachments included with the review (if any)                 |
| `asin`             | Amazon Standard Identification Number for the specific product              |
| `parent_asin`      | Group identifier to cluster product variants                                |
| `user_id`          | Anonymized unique identifier for the reviewer                               |
| `timestamp`        | Time the review was posted (UNIX or ISO format)                             |
| `helpful_vote`     | Number of people who found the review helpful                               |
| `verified_purchase`| Boolean indicating whether the purchase was verified by Amazon             |

| Metadata           | Description                                                                 |
|------------------|-----------------------------------------------------------------------------|
| `main_category`  | The top-level category the product belongs to (e.g., "Movies & TV")         |
| `title`          | The main product title (often the movie/show title)                         |
| `subtitle`       | Optional subtitle for the product (e.g., edition/version info)              |
| `average_rating` | The average customer rating for the product                                 |
| `rating_number`  | Total number of ratings the product has received                            |
| `features`       | A list of product features (e.g., language, format)                         |
| `description`    | A longer description or summary of the product                              |
| `price`          | The listed price of the product                                              |
| `images`         | A list of image URLs associated with the product                            |
| `videos`         | Video media links (e.g., trailers or previews), if available                |
| `store`          | The Amazon store/subcategory under which the product is listed              |
| `categories`     | List of categories/tags assigned to the product (e.g., genre, theme)        |
| `details`        | Additional technical or marketing metadata (format, region, etc.)           |
| `parent_asin`    | A group-level identifier for versions of the same product                   |


Filtering on a couple of these columns, `helpful_vote` and `verified_purchase`, drastically reduces the size of the dataset. We removed reviews without a helpful vote because they are probably not very high quality, or are irrelevant. Similarly, if the purchase can't be verified we can't be certain that the review comes from a real user. Reviews with fewer than 10 words were also deemed to be of lesser quality; the goal is to find descriptive reviews to facilitate language processing.

Lastly, we removed products that had fewer than 15 total reviews. These would needlessly bloat our dataset and make it noisier. A large portion of the dataset consists of products with minimal engagement, which would not facilitate community detection well. We summarize the reduction in the table:

| Step                                                  | Description                                                  | Count / Shape          |
|-------------------------------------------------------|--------------------------------------------------------------|------------------------|
| **Initial raw dataset**                               | Total number of reviews loaded                               | 17,328,314             |
|                                                       | Initial dataframe shape                                      | (17,328,314, 10)       |
|                                                       | Unique users (`user_id`)                                     | 6,503,429              |
|                                                       | Unique `parent_asin` values                                  | 747,764                |
| **After filtering reviews with `helpful_vote ≥ 1`**   | Removed unhelpful or unused reviews                          | (4,325,435, 10)        |
| **After removing short reviews (< 10 words)**         | Kept only meaningful reviews                                 | (3,795,557, 10)        |
| **After keeping only `verified_purchase == True`**    | Removed potentially fake/unreliable reviews                  | (2,428,871, 10)        |
| **Total words in cleaned dataset**                    | Word count of all remaining reviews                          | 196,353,406            |
| **After removing products with < 15 reviews**         | Ensured statistical validity of products                     | (1,341,856, 10)        |
|                                                       | Unique ASINs after filtering                                 | 34,333                 |

Thus we are left with 34,333 unique movies/shows (ASINs) spread over 1,341,856 review rows. Next, we need to decide how many features to use. As can be seen from the feature tables above, many of the columns contain redundant information we won't be needing. After the initial removal of rows, the metadata .csv is matched to the reviews on `parent_asin`.

Then, we removed non-essential or redundant columns such as `verified_purchase`, `subtitle`, `images_x`, `features`, `images_y`, `videos`, `store`, `details`, `bought_together`, and `author`. After this cleanup, we're left with the following columns in our final merged dataset: `rating`, `review_title`, `text`, `asin`, `parent_asin`, `user_id`, `timestamp`, `helpful_vote`, `main_category`, `movie_title`, `average_rating`, `rating_number`, `description`, `price`, and `categories`.

We have also added sentiment scores for each review using Vader NLTK.

The final dataset has the shape (1341856, 17) (with sentiment scores) and takes up ~0.75GB.

In [None]:
# Download link: https://amazon-reviews-2023.github.io/


def _append_rows_to_csv(rows, csv_path, is_first_chunk):
    """Write `rows` (a list of dicts) to `csv_path`.

    The first chunk creates/overwrites the file and writes the header; later
    chunks are appended without a header.
    """
    pd.DataFrame(rows).to_csv(csv_path, mode='w' if is_first_chunk else 'a',
                              header=is_first_chunk, index=False)


def _jsonl_gz_to_csv(src_path, dst_path, label, chunk_size=150000):
    """Convert a gzipped JSON-lines file to CSV, `chunk_size` rows at a time.

    Args:
        src_path: Path of the ``.jsonl.gz`` input (one JSON object per line).
        dst_path: Path of the CSV to write.
        label: Description shown next to the progress bar.
        chunk_size: Rows buffered in memory before each write. Depends on
            available RAM; if set too high it can cause crashing.
    """
    # First pass only counts lines so the progress bar can show a total.
    with gzip.open(src_path, 'rt', encoding='utf-8') as f:
        total_lines = sum(1 for _ in f)

    buffer = []
    is_first_chunk = True
    with gzip.open(src_path, 'rt', encoding='utf-8') as f:
        for i, line in tqdm(enumerate(f, 1), total=total_lines, desc=label, unit="line"):
            buffer.append(json.loads(line))

            if i % chunk_size == 0:
                _append_rows_to_csv(buffer, dst_path, is_first_chunk)
                buffer = []
                is_first_chunk = False

    # Final (partial) chunk. This lives inside the helper so it can only ever
    # flush rows that were read from `src_path` during this call.
    if buffer:
        _append_rows_to_csv(buffer, dst_path, is_first_chunk)


# Depends on available RAM, if set too high can cause crashing.
chunk_size = 150000

# REVIEWS DATASET:::

input_path = "Movies_and_TV.jsonl.gz"
# Give the output CSV a name:
output_path = "Movies_and_TV_Full.csv" #5.8GB file

# Skip conversion if the output file already exists
if os.path.exists(output_path):
    print(f"The file '{output_path}' already exists.")
else:
    _jsonl_gz_to_csv(input_path, output_path, "Loading reviews", chunk_size)
    print("Reviews conversion finished")

# METADATA DATASET:::

meta_input_path = "meta_Movies_and_TV.jsonl.gz"
meta_output_path = "meta_Movies_and_TV_Full.csv"

# Skip conversion if file already exists
if os.path.exists(meta_output_path):
    print(f"The file '{meta_output_path}' already exists.")
else:
    _jsonl_gz_to_csv(meta_input_path, meta_output_path, "Converting metadata", chunk_size)
    print("Metadata conversion finished.")

In [None]:
# Load the converted reviews CSV and print a few summary statistics.

df = pd.read_csv("Movies_and_TV_Full.csv")

# Dataset dimensions
print(f"Total reviews: {df.shape[0]:,}")
print(f"Dataframe shape before clean: {df.shape}")
print()

# Number of distinct values in each identifier column
for label, column in (("users (user_id)", "user_id"), ("products (asin)", "asin")):
    print(f"- Unique {label}: {df[column].nunique():,}")

# parent_asin is only reported when the column is present
if 'parent_asin' in df.columns:
    print(f"- Unique parent_asin: {df['parent_asin'].nunique():,}")

In [None]:
# Clean the review dataframe before it is merged with the metadata.

review_length = 10
review_threshold = 15  # greatly varies the size of the final network
helpful_threshold = 1


def _word_count(text):
    """Number of whitespace-separated tokens in `text` (stringified first)."""
    return len(str(text).split())


# Keep reviews with at least `helpful_threshold` helpful votes
has_helpful_votes = df['helpful_vote'] >= helpful_threshold
df_cleaned = df[has_helpful_votes]
print(f"Shape after removing unhelpful: {df_cleaned.shape}")

# Keep reviews whose text has at least `review_length` words
is_long_enough = df_cleaned['text'].apply(_word_count) >= review_length
df_cleaned = df_cleaned[is_long_enough]
print(f"Shape after removing sub {review_length} word reviews: {df_cleaned.shape}")

# Keep verified purchases only
is_verified = df_cleaned['verified_purchase'] == True
df_cleaned = df_cleaned[is_verified]
print(f"Shape after removing unverified purchases: {df_cleaned.shape}")

# Total number of words left in the 'text' column
total_words = df_cleaned['text'].apply(_word_count).sum()
print(f"Total number of words in the dataset after cleaning: {total_words:,}")

# Reviews per ASIN, then the ASINs that reach the review threshold
asin_review_counts = df_cleaned['asin'].value_counts()
valid_asins = asin_review_counts.loc[asin_review_counts >= review_threshold].index

# Keep only rows whose ASIN is among the valid ones
df_cleaned = df_cleaned[df_cleaned['asin'].isin(valid_asins)]
print(f"Shape after filtering products with less than {review_threshold} reviews: {df_cleaned.shape}")

print(f"Unique ASINs after filtering: {df_cleaned['asin'].nunique()}")

In [None]:
# Left-join the product metadata onto the cleaned reviews via parent_asin.

df_meta = pd.read_csv("meta_Movies_and_TV_Full.csv")
df_merged = pd.merge(left=df_cleaned, right=df_meta, how='left', on='parent_asin')
print("Merged shape (before dropping):", df_merged.shape)

# Columns we don't need at the moment; names missing from the frame are ignored
columns_to_drop = [
    'verified_purchase',
    'subtitle',
    'images_x',
    'features',
    'images_y',
    'videos',
    'store',
    'details',
    'bought_together',
    'author',
]
df_merged = df_merged.drop(columns=columns_to_drop, errors='ignore')

# Both inputs have a `title` column; give the suffixed copies clear names
title_renames = {'title_x': 'review_title', 'title_y': 'movie_title'}
df_merged = df_merged.rename(columns=title_renames)

print("Merge Complete:")
print("Merged shape:", df_merged.shape)
df_merged.to_csv("Merged_Reviews_and_Metadata.csv", index=False)

In [None]:
# Score every review with NLTK's VADER analyzer, in parallel batches.

threads = 8
neutral_threshold = 0.1  # scores within [-0.1, 0.1] are labelled neutral


def analyze_batch_sentiment(batch):
    """Return the VADER compound score of each text in `batch`, in order.

    Each item is stringified before scoring. A new analyzer is created per call.
    """
    analyzer = SentimentIntensityAnalyzer()
    compound_scores = []
    for text in batch:
        compound_scores.append(analyzer.polarity_scores(str(text))['compound'])
    return compound_scores


# More is faster but uses more resources
batch_size = 10000

# Consecutive slices of the text column, `batch_size` rows each
batch_starts = range(0, len(df_merged), batch_size)
batches = [df_merged['text'][start:start + batch_size] for start in batch_starts]

with tqdm_joblib(desc="Sentiment Analysis", total=len(batches)):
    sentiment_scores_batches = Parallel(n_jobs=threads)(
        delayed(analyze_batch_sentiment)(batch) for batch in batches
    )

# Concatenate the per-batch results back into one flat list
sentiment_scores = []
for batch_scores in sentiment_scores_batches:
    sentiment_scores.extend(batch_scores)


def _sentiment_label(score):
    """Map a compound score to 'positive', 'negative' or 'neutral'."""
    if score > neutral_threshold:
        return 'positive'
    if score < -neutral_threshold:
        return 'negative'
    return 'neutral'


# Add sentiment score and category to dataframe
df_merged['sentiment_score'] = sentiment_scores
df_merged['sentiment_category'] = df_merged['sentiment_score'].apply(_sentiment_label)

In [None]:
# Write the merged, sentiment-annotated reviews to disk (no index column).
output_path = "Movies_and_TV_Cleaned_Sentiment.csv"
df_merged.to_csv(path_or_buf=output_path, index=False)

Next up is our graph. The choice of graph is what we call a "Co-Reviews" graph. "Co-review" refers to the fact that an edge is formed when two products have been reviewed by the same person. So, if movie X and movie Y have both been reviewed by some user, the edge between them receives +1 weight. In this way, we create a graph where each node is a movie/show (essentially an ASIN product code). Each product has at least 15 reviews attached to it, and this is where the text analysis comes into play.

Therefore, our graph connects movies that the same communities enjoy watching - which is one of the goals of this project. The immediate issue with this approach is that a few people are very prolific, whereas the vast majority write few reviews. A small number of individuals can therefore balloon the edge count, making the graph noisy. We therefore prune all edges with weight less than 2, so that two or more people must have reviewed each pair of ASINs for the edge (and the nodes it connects) to survive in our final graph.

This leaves us with; Node count: 20711 and edge count: 91728.


In [None]:
# Build the co-review graph: nodes are products (parent_asin) and an edge's
# weight is the number of users who reviewed both endpoints.
# Pair de-duplication uses a precomputed position map; repeated list.index()
# scans over all products made this step quadratic.

# 1. Load and preprocess data efficiently
print("Loading data...")
df_graph = df_merged[['user_id', 'parent_asin']].dropna().drop_duplicates()

# 2. Create bidirectional mappings
print("Creating mappings...")
asin_to_reviewers = defaultdict(set)  # product -> users who reviewed it
reviewer_to_asins = defaultdict(set)  # user -> products they reviewed

# Walk the two columns directly; DataFrame.iterrows() builds a Series per row.
for user_id, parent_asin in tqdm(
    zip(df_graph['user_id'], df_graph['parent_asin']), total=len(df_graph)
):
    asin_to_reviewers[parent_asin].add(user_id)
    reviewer_to_asins[user_id].add(parent_asin)

# 3. Create graph and add nodes
G = nx.Graph()
G.add_nodes_from(asin_to_reviewers.keys())


# 4. Parallel edge computation
def compute_edges_for_asin(asin, asin_to_reviewers, all_asins, asin_rank=None):
    """Return weighted edges from `asin` to every later-ranked co-reviewed product.

    Reads the module-level `reviewer_to_asins` mapping.

    Args:
        asin: Product whose edges are computed.
        asin_to_reviewers: Mapping product -> set of users who reviewed it.
        all_asins: Sequence of all products. Its order decides which endpoint
            of a pair emits the edge, so each pair is produced only once.
        asin_rank: Optional mapping product -> position in `all_asins`. Pass a
            precomputed dict when calling this for many products; when omitted
            it is rebuilt from `all_asins` on every call.

    Returns:
        List of ``(asin, other_asin, weight)`` tuples, where weight is the
        number of reviewers the two products share.
    """
    reviewers = asin_to_reviewers[asin]

    # All other products that share at least one reviewer with `asin`
    co_reviewed_asins = set()
    for reviewer in reviewers:
        co_reviewed_asins.update(reviewer_to_asins[reviewer])
    co_reviewed_asins.discard(asin)
    if not co_reviewed_asins:
        return []

    if asin_rank is None:
        asin_rank = {}
        for position, candidate in enumerate(all_asins):
            # setdefault keeps the first position, like list.index() would
            asin_rank.setdefault(candidate, position)

    own_rank = asin_rank[asin]
    edges = []
    for other_asin in co_reviewed_asins:
        # Pairs with an earlier-ranked product are emitted by that product
        if asin_rank[other_asin] <= own_rank:
            continue
        weight = len(reviewers & asin_to_reviewers[other_asin])
        if weight > 0:
            edges.append((asin, other_asin, weight))
    return edges


print("Computing edges in parallel...")
all_asins = list(asin_to_reviewers.keys())
# Position of every product, computed once and shared by all workers
asin_rank = {product: position for position, product in enumerate(all_asins)}
results = Parallel(n_jobs=-1, prefer="threads")(
    delayed(compute_edges_for_asin)(asin, asin_to_reviewers, all_asins, asin_rank)
    for asin in tqdm(all_asins)
)

# 5. Add edges to graph
print("Building graph...")
for edges in tqdm(results):
    for asin1, asin2, weight in edges:
        G.add_edge(asin1, asin2, weight=weight)

print(f"Graph constructed with {G.number_of_nodes()} nodes and {G.number_of_edges()} edges")

In [None]:
# Convert the graph to node-link form and dump it to network.json
graph_data = json_graph.node_link_data(G)
with open("network.json", mode="w") as f:
    json.dump(obj=graph_data, fp=f, indent=4)

In [None]:
# Re-create the graph from network.json if needed:
with open("network.json", mode="r") as f:
    graph_data = json.loads(f.read())

G = json_graph.node_link_graph(graph_data)

In [None]:
# Prune the graph: keep only edges backed by enough shared reviewers.
MIN_SHARED_REVIEWERS = 2

# Collect the qualifying edges first, then insert them into a fresh graph.
# Nodes without a qualifying edge are therefore absent from G_filtered.
strong_edges = [
    (u, v, data['weight'])
    for u, v, data in G.edges(data=True)
    if data['weight'] >= MIN_SHARED_REVIEWERS
]

G_filtered = nx.Graph()
for u, v, weight in strong_edges:
    G_filtered.add_edge(u, v, weight=weight)

print(f"Filtered Graph: {G_filtered.number_of_nodes()} nodes, {G_filtered.number_of_edges()} edges")

In [None]:
# Dump the pruned graph to network_filtered.json in node-link form
graph_data = json_graph.node_link_data(G_filtered)
with open("network_filtered.json", mode="w") as f:
    json.dump(obj=graph_data, fp=f, indent=4)

# Part III

The following are the graph statistics used in the project. Here we show how all the plots work, and how we calculated the coefficients and network measures:

In [None]:
# Reload the cleaned, sentiment-annotated reviews used by the plots:
df = pd.read_csv(filepath_or_buffer="Movies_and_TV_Cleaned_Sentiment.csv")

In [None]:
# Side-by-side histograms: reviews per product and reviews per user.

# Reviews per product and per user
asin_review_counts = df['asin'].value_counts()
user_review_counts = df['user_id'].value_counts()
# Users with more than 10,000 reviews are left out of the second panel
filtered_user_counts = user_review_counts[user_review_counts <= 10000]

# Plot styling
sns.set(style="whitegrid")
fig, axes = plt.subplots(1, 2, figsize=(16, 6), sharey=True)


def _draw_count_histogram(ax, counts, title, xlabel, **hist_kwargs):
    """Draw a 50-bin histogram of `counts` on `ax` with a log y-axis and titles."""
    sns.histplot(counts, bins=50, color="#4C72B0", edgecolor='white', ax=ax, **hist_kwargs)
    ax.set_yscale("log")
    ax.set_title(title, fontsize=14)
    ax.set_xlabel(xlabel, fontsize=12)


# --- Plot 1: Reviews per Product ---
_draw_count_histogram(
    axes[0],
    asin_review_counts,
    "Reviews per Product",
    "Number of Reviews",
    binrange=(0, asin_review_counts.max()),
)
axes[0].set_ylabel("Number of Products (log scale)", fontsize=12)
axes[0].grid(True, linestyle='--', linewidth=0.5)

# --- Plot 2: Reviews per User (Filtered) ---
_draw_count_histogram(
    axes[1],
    filtered_user_counts,
    "Reviews per User",
    "Number of Reviews by User",
)
axes[1].set_ylabel("")  # y-axis is shared and labelled on the left panel
axes[1].grid(True, linestyle='--', linewidth=0.5)

plt.tight_layout()
plt.show()

In [None]:
# Plot made to resemble the style in the course notes :::
# This plot shows us the degree distribution with a comparison to a random graph

G_real = G_filtered

# --- Step 2: Generate a random graph for comparison
n = G_real.number_of_nodes()
p = (2 * G_real.number_of_edges()) / (n * (n - 1))  # match average degree
# fast_gnp_random_graph samples the same G(n, p) model as erdos_renyi_graph but
# is the generator networkx recommends for small p (it avoids testing every
# node pair). An explicit seed (same value as the NumPy seed set at the top of
# the notebook) keeps the baseline repeatable.
# NOTE(review): networkx generators are not expected to follow np.random.seed
# on their own; confirm against the installed networkx version.
G_random = nx.fast_gnp_random_graph(n, p, seed=100)

# --- Step 3: Get degree distributions
def degree_distribution(graph):
    """Return the degrees that occur in `graph` and the share of nodes with each.

    Args:
        graph: Object whose ``degree()`` yields ``(node, degree)`` pairs.

    Returns:
        Tuple ``(degrees, probabilities)`` of equal-length arrays: the degree
        values held by at least one node (ascending) and the fraction of nodes
        having each of those degrees.
    """
    node_degrees = [degree for _, degree in graph.degree()]
    counts = np.bincount(node_degrees)  # counts[k] = number of nodes with degree k
    fractions = counts / sum(counts)
    occurs = fractions > 0
    return np.nonzero(fractions)[0], fractions[occurs]

deg_real, pk_real = degree_distribution(G_real)
deg_rand, pk_rand = degree_distribution(G_random)

# --- Step 4: Plotting (random network first, then the real one)
plt.figure(figsize=(10, 6))
curves = (
    (deg_rand, pk_rand, 'red', "Random Network"),
    (deg_real, pk_real, 'blue', "Amazon Network"),
)
for degrees, probabilities, colour, name in curves:
    plt.loglog(degrees, probabilities, 'o-', color=colour, label=name)

# --- Step 5: Annotate with average/median degree
real_degrees = [d for _, d in G_real.degree()]
avg_deg_real = np.mean(real_degrees)
med_deg_real = np.median(real_degrees)
avg_deg_rand = np.mean([d for _, d in G_random.degree()])

# Vertical reference lines
reference_lines = (
    (avg_deg_rand, "yellow", f"Avg Degree (Random): {avg_deg_rand:.2f}"),
    (avg_deg_real, "teal", f"Avg Degree (Amazon Network): {avg_deg_real:.2f}"),
    (med_deg_real, "black", f"Median Degree (Amazon Network): {med_deg_real:.0f}"),
)
for position, colour, text in reference_lines:
    plt.axvline(position, linestyle="--", color=colour, label=text)

# Labels and legend
plt.xlabel("Degree k (node)")
plt.ylabel("Probability p(k) of degree k")
plt.title("Degree Distribution of Amazon Reviews Network vs. Random Network")
plt.legend()
plt.grid(True, which="both", linestyle='--', linewidth=0.5)
plt.tight_layout()
plt.show()

In [None]:
# Code morphed from previous assignments:
# Calculating the clustering coefficients
# This shows clear evidence of the small world property!

# Step 1: Extract giant component of real network
largest_cc = max(nx.connected_components(G_filtered), key=len)
G_giant = G_filtered.subgraph(largest_cc).copy()

# Step 2: Generate a G(n, p) random network with the same number of nodes and
# the same expected number of edges (p is the giant component's edge density)
n = G_giant.number_of_nodes()
m = G_giant.number_of_edges()
p = (2 * m) / (n * (n - 1))
# fast_gnp_random_graph samples the same G(n, p) model as erdos_renyi_graph but
# is the generator networkx recommends for small p. The explicit seed (same
# value as the NumPy seed set at the top of the notebook) keeps the baseline
# repeatable.
# NOTE(review): networkx generators are not expected to follow np.random.seed
# on their own; confirm against the installed networkx version.
G_random = nx.fast_gnp_random_graph(n, p, seed=100)

# Step 3: Approximate average path length (sampling)
def approximate_avg_path_length(graph, samples=10000):
    """Estimate the mean shortest-path length from randomly sampled node pairs.

    Args:
        graph: Graph exposing ``nodes()``. Distances come from
            ``nx.shortest_path_length`` called without a weight argument.
        samples: Number of node pairs to draw (pairs may repeat across draws).

    Returns:
        Mean path length over the sampled pairs that are connected, or
        ``float('inf')`` when no sampled pair is connected, including graphs
        with fewer than two nodes.
    """
    nodes = list(graph.nodes())
    n_nodes = len(nodes)
    if n_nodes < 2:
        # No pair of distinct nodes exists; np.random.choice would raise here.
        return float('inf')

    total_length = 0
    valid_samples = 0
    for _ in range(samples):
        # Draw two distinct positions instead of sampling the node list itself:
        # np.random.choice(nodes, ...) converts the whole list to an array on
        # every draw and returns NumPy-coerced labels rather than the originals.
        i, j = np.random.choice(n_nodes, 2, replace=False)
        try:
            length = nx.shortest_path_length(graph, source=nodes[i], target=nodes[j])
        except nx.NetworkXNoPath:
            # Disconnected pair: skip it rather than count it
            continue
        total_length += length
        valid_samples += 1
    return total_length / valid_samples if valid_samples > 0 else float('inf')

# Step 4: Compute metrics (path lengths first, then clustering)
real_path = approximate_avg_path_length(G_giant)
rand_path = approximate_avg_path_length(G_random)

real_clustering = nx.average_clustering(G_giant)
rand_clustering = nx.average_clustering(G_random)

# Step 5: Print comparison
report_lines = [
    "=== Small-World Property Comparison ===",
    f"Approx. Avg Path Length (Amazon Network): {real_path:.4f}",
    f"Approx. Avg Path Length (Random Network): {rand_path:.4f}",
    "",
    f"Clustering Coefficient (Amazon Network):  {real_clustering:.4f}",
    f"Clustering Coefficient (Random Network):  {rand_clustering:.4f}",
]
for report_line in report_lines:
    print(report_line)

In [None]:
# Compare the observed edge density with the 1/n and log(n)/n thresholds.
n = G_filtered.number_of_nodes()
m = G_filtered.number_of_edges()

ordered_pairs = n * (n - 1)
p_actual = (2 * m) / ordered_pairs
p_critical = 1 / n
p_connected = np.log(n) / n

above_critical = p_actual > p_critical
above_connected = p_actual > p_connected

summary_lines = [
    "=== Connectivity Analysis ===",
    f"Number of nodes (n):                    {n}",
    f"Number of edges (m):                    {m}",
    f"Actual edge probability (p):            {p_actual:.6f}",
    f"Critical threshold (1/n):               {p_critical:.6f}",
    f"Connected threshold (log(n)/n):         {p_connected:.6f}",
    "",
    f"Giant Component Exists?   {above_critical}",
    f"Graph Likely Fully Connected?   {above_connected}",
]
for summary_line in summary_lines:
    print(summary_line)

To summarize, it's clear that the network falls into the "supercritical" regime. There is clear evidence of the small world property, as is expected with natural networks. Additionally, we demonstrate how it differs from the random network that lacks the same clustering coefficient. 