<a href="https://colab.research.google.com/github/sXeSociety/algorithms-massive-data-pagerank/blob/main/notebooks/notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### **Algorithms for Massive Data – Project 3**
*Notebook 01 – Dataset Download, Setup & Initial Exploration*


In [None]:
# Start from /content, the directory the repository is cloned into further down in this cell
%cd /content

# Import main libraries
import os
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import importlib

repo_name = "algorithms-massive-data-pagerank"
repo_url = f"https://github.com/sXeSociety/{repo_name}.git"
repo_path = f"/content/{repo_name}"

# Check if repo already exists in /content. If it doesn't, clone the GitHub repo into /content
if not os.path.exists(repo_path):
    print(f"Repository not found in /content. Cloning {repo_name}...")
    !git clone $repo_url /content/{repo_name}
else:
    print(f"Repository already exists at {repo_path}. Skipping clone.")

# Change current working directory to the project root
%cd $repo_path

from src.utils_io import ensure_dirs
from src.load_data import download_dataset, load_ratings
from src.stats import describe_reviews
from src.preprocessing import build_core_dataset, build_core_subset
from src.mapping_ids import build_id_mappings
from src.graph_construction import build_book_cooccurrence_edges

/content
Repository already exists at /content/algorithms-massive-data-pagerank. Skipping clone.
/content/algorithms-massive-data-pagerank


In [None]:
# The previous cell changed into the repository, so the working directory is the project root
project_root = os.getcwd()
print("Project root:", project_root)

# Register the project root on sys.path (once) so the src package can be imported
_root_on_path = project_root in sys.path
if not _root_on_path:
    sys.path.append(project_root)

Project root: /content/algorithms-massive-data-pagerank


In [None]:
# Data lives in <project_root>/data, split into raw/ and processed/ sub-folders
data_dir = os.path.join(project_root, "data")
raw_dir, processed_dir = (
    os.path.join(data_dir, sub_folder) for sub_folder in ("raw", "processed")
)

In [None]:
# Create the data, raw and processed directories when they are missing
project_dirs = [data_dir, raw_dir, processed_dir]
ensure_dirs(project_dirs)

Directory already exists: /content/algorithms-massive-data-pagerank/data
Directory already exists: /content/algorithms-massive-data-pagerank/data/raw
Directory already exists: /content/algorithms-massive-data-pagerank/data/processed


In [None]:
# Global settings forwarded to load_ratings further down
seed = 42                   # random seed passed along with the subsampling options
use_subsample = True        # switch for working on a subsample of the ratings
subsample_fraction = 0.05   # fraction used when use_subsample is enabled

In [None]:
# Kaggle credentials must never be typed into this cell: whatever is written here
# is stored in the saved .ipynb and ends up in the public GitHub history.
# They are taken from the environment when already set (e.g. exported before the
# kernel starts) and are otherwise asked for interactively, without echoing.
# NOTE(review): an API key was previously committed in this cell; it should be
# revoked and regenerated from the Kaggle account settings.
from getpass import getpass

for _env_var, _prompt in (
    ("KAGGLE_USERNAME", "Kaggle username: "),
    ("KAGGLE_KEY", "Kaggle API key: "),
):
    # Only prompt for values that are missing or empty, so re-running the cell is a no-op
    if not os.environ.get(_env_var):
        os.environ[_env_var] = getpass(_prompt)

In [None]:
# Set the Kaggle dataset identifier (written as "<owner>/<dataset>");
# it is the value handed to download_dataset in the next cell
kaggle_dataset = "mohamedbakhet/amazon-books-reviews"

In [None]:
# Fetch the Kaggle dataset into raw_dir; the helper reports and skips the
# download when the files are already present
download_dataset(raw_dir, kaggle_dataset)

# Options controlling where ratings are read from / written to and how they are subsampled
ratings_options = {
    "raw_dir": raw_dir,
    "processed_dir": processed_dir,
    "use_subsample": use_subsample,
    "subsample_fraction": subsample_fraction,
    "seed": seed,
}

# Load the ratings data, create a subsample and a cleaned version
df_ratings_clean = load_ratings(**ratings_options)

Directory already exists: /content/algorithms-massive-data-pagerank/data/raw
Dataset already present in raw_dir, skipping download.
Directory already exists: /content/algorithms-massive-data-pagerank/data/processed
Found existing cleaned ratings at: /content/algorithms-massive-data-pagerank/data/processed/ratings_subsample_clean.csv
Shape df_ratings_clean (loaded from disk): (150000, 3)
          user_id     book_id  rating
0  A303XPDO694V6X  B0006CR6U4     4.0
1  A3780H4TM9RMB8  0897166159     2.0
2  A1AX6VPDQQZDPV  0736693408     5.0
3  A35RQKCCCQ62O0  0395051029     4.0
4  A2IJQDE1I4SIJT  4770016050     5.0


In [None]:
# Compute and print basic statistics on users, books and reviews:
# distinct counts, reviews-per-user / reviews-per-book summaries and the number
# of users and books with at least 2 reviews (see the output below).
# The return value is kept in stats_dict — presumably the same figures as a dict; confirm in src/stats.py
stats_dict = describe_reviews(df_ratings_clean)

Distinct users: 95746
Distinct books: 53940

Reviews per user:
min: 1
median: 1.0
mean: 1.28
max: 290

Reviews per book:
min: 1
median: 1.0
mean: 2.78
max: 339

Users with >= 2 reviews: 11647
Books with >= 2 reviews: 19848


In [None]:
# Review-count threshold handed to build_core_dataset
# (presumably applied to both users and books — confirm in src/preprocessing.py)
min_reviews = 2

# Derive the core dataset used for the graph and PageRank; the helper also
# saves it as a CSV under processed_dir
df_core = build_core_dataset(
    min_reviews=min_reviews,
    processed_dir=processed_dir,
    df_ratings_clean=df_ratings_clean,
)

Shape df_core: (30736, 3)
Distinct users in core: 11104
Distinct books in core: 10592
           user_id     book_id  rating
3   A35RQKCCCQ62O0  0395051029     4.0
13  A319KYEIAZ3SON  0670569798     5.0
16  A2X86K2EZCV0U1  B000GRDY1O     4.0
19  A1JLKPA3EPLFCP  0774032448     5.0
32  A1I2O9Y3X3HXLS  B000GRORC4     5.0
Core dataset saved in: /content/algorithms-massive-data-pagerank/data/processed/ratings_core_for_graph.csv


*Notebook 02 – Graph construction*


In [None]:
# Load df_core from the CSV written by build_core_dataset.
# The ID columns are read explicitly as strings: many book IDs are digit-only
# codes with leading zeros (e.g. "0395051029"), and pandas' type inference may
# otherwise parse them as integers and silently drop the zeros.
core_path = os.path.join(processed_dir, "ratings_core_for_graph.csv")
df_core = pd.read_csv(core_path, dtype={"user_id": str, "book_id": str})

# Build the small dataset (limited to max_users users) and save it under processed_dir
df_core_small = build_core_subset(
    df_core=df_core,
    processed_dir=processed_dir,
    max_users=2000,
    save_name="ratings_core_small_for_graph.csv"
)


[build_core_subset] Limiting to first 2000 users.

[build_core_subset] Subset stats after filtering
Subset ratings: 5875
Subset distinct users: 2000
           user_id     book_id  rating
3   A1JLKPA3EPLFCP  0774032448     5.0
4   A1I2O9Y3X3HXLS  B000GRORC4     5.0
11  A11J17A2HNP5FQ  B0006EAC1C     4.0
12  A16R2I6AGL4NQW  1578210615     5.0
14  A1F8GH7CR68P59  B0007DWEOU     4.0
Core subset dataset saved in: /content/algorithms-massive-data-pagerank/data/processed/ratings_core_small_for_graph.csv


In [None]:
# File names (inside processed_dir) for the three artefacts written by build_id_mappings
mapping_file_names = {
    "user_mapping_name": "user_id_mapping_small.csv",
    "book_mapping_name": "book_id_mapping_small.csv",
    "ratings_indexed_name": "ratings_core_small_indexed.csv",
}

# Create integer index mappings for users and books, plus the indexed ratings table
user_mapping, book_mapping, df_indexed = build_id_mappings(
    df_core_small=df_core_small,
    processed_dir=processed_dir,
    **mapping_file_names,
)

Directory already exists: /content/algorithms-massive-data-pagerank/data/processed

[build_id_mappings] Creating user/book integer index mappings...
[build_id_mappings] Saved user mapping:   /content/algorithms-massive-data-pagerank/data/processed/user_id_mapping_small.csv
[build_id_mappings] Saved book mapping:   /content/algorithms-massive-data-pagerank/data/processed/book_id_mapping_small.csv
[build_id_mappings] Saved indexed ratings: /content/algorithms-massive-data-pagerank/data/processed/ratings_core_small_indexed.csv


In [None]:
# Build the book co-occurrence edge list from the indexed small core dataset
edges_df = build_book_cooccurrence_edges(
    df_indexed=df_indexed,
    processed_dir=processed_dir,
    save_name="edges_books_core_small.csv",
    max_books_per_user=50,
    min_weight=1,
)

# Show both summaries of the edge weights. A notebook cell only renders its
# last bare expression, so without display() the describe() result was
# computed and then silently discarded.
edge_weights = edges_df["weight"]
display(edge_weights.describe())
display(edge_weights.value_counts().head(10))

Directory already exists: /content/algorithms-massive-data-pagerank/data/processed

[build_book_cooccurrence_edges] Building book co-occurrence graph...
[build_book_cooccurrence_edges] Number of distinct edges: 13367

[build_book_cooccurrence_edges] Edge list (first rows):
   src_book_idx  dst_book_idx  weight
0          3338          3602       3
1          2599          3243       3
2          2605          3558       1
3          1378          3504       2
4          1798          3488       1
[build_book_cooccurrence_edges] Edge list saved in: /content/algorithms-massive-data-pagerank/data/processed/edges_books_core_small.csv


Unnamed: 0_level_0,count
weight,Unnamed: 1_level_1
1,13230
2,110
3,21
4,5
5,1
