<a href="https://colab.research.google.com/github/sXeSociety/algorithms-massive-data-pagerank/blob/main/notebooks/01_dataset_download.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### **Algorithms for Massive Data – Project 3**
*Notebook 01 – Dataset Download & Initial Exploration*


In [None]:
# Change current working directory to the project root.
# NOTE(review): this absolute path is Colab-specific and assumes the repository
# is already checked out there (no clone step is visible in this notebook) — confirm.
%cd /content/algorithms-massive-data-pagerank

# Import main libraries
import os
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import importlib
from src.utils_io import ensure_dirs
from src.load_data import download_dataset, load_ratings
from src.stats import describe_reviews
from src.preprocessing import build_core_dataset

/content/algorithms-massive-data-pagerank


In [None]:
# Record the current working directory as the project root
project_root = os.getcwd()
print("Project root:", project_root)

# Register the root on the import path (only once) so that `src.*` modules resolve
if sys.path.count(project_root) == 0:
    sys.path.append(project_root)

Project root: /content/algorithms-massive-data-pagerank


In [None]:
# Data lives under <project_root>/data, split into "raw" and "processed" sub-folders
data_dir = os.path.join(project_root, "data")
raw_dir, processed_dir = (
    os.path.join(data_dir, subfolder) for subfolder in ("raw", "processed")
)

In [None]:
# Make sure the data, raw and processed folders are all in place
# (ensure_dirs comes from src.utils_io; it creates whatever is missing)
required_dirs = [data_dir, raw_dir, processed_dir]
ensure_dirs(required_dirs)

Directory already exists: /content/algorithms-massive-data-pagerank/data
Directory already exists: /content/algorithms-massive-data-pagerank/data/raw
Directory already exists: /content/algorithms-massive-data-pagerank/data/processed


In [None]:
# Global settings for the data-loading step; all three are passed to load_ratings below
use_subsample = True  # the recorded run reports "Using subsample mode." with this value
subsample_fraction = 0.05  # fraction of rows kept (recorded run: 3,000,000 -> 150,000 rows)
seed = 42  # presumably fixes the random subsample for reproducibility — confirm in src.load_data

In [None]:
# Kaggle credentials.
# In the version saved on GitHub, keep "xxxxxx" for username and key.
# When running locally on Colab, either replace "xxxxxx" below with your actual
# credentials, or set KAGGLE_USERNAME / KAGGLE_KEY in the environment before
# running this cell.
kaggle_credentials = {
    "KAGGLE_USERNAME": "xxxxxx",
    "KAGGLE_KEY": "xxxxxx",
}

# A real value typed above always wins. The "xxxxxx" placeholder is only written
# when the variable is not set at all, so credentials that are already present in
# the environment are never overwritten by the placeholder.
for _name, _value in kaggle_credentials.items():
    if _value != "xxxxxx" or _name not in os.environ:
        os.environ[_name] = _value

In [None]:
# Set the Kaggle dataset identifier (passed to download_dataset below).
# Presumably in Kaggle's "<owner>/<dataset-slug>" form — confirm.
kaggle_dataset = "mohamedbakhet/amazon-books-reviews"

In [None]:
# Fetch the Kaggle dataset into raw_dir (skipped when it is already there)
download_dataset(raw_dir, kaggle_dataset)

# Gather the loading options in one place, then load the ratings;
# the call also creates the subsample and its cleaned version
load_options = {
    "raw_dir": raw_dir,
    "processed_dir": processed_dir,
    "use_subsample": use_subsample,
    "subsample_fraction": subsample_fraction,
    "seed": seed,
}
df_ratings_clean = load_ratings(**load_options)

Directory already exists: /content/algorithms-massive-data-pagerank/data/raw
Dataset already present in raw_dir, skipping download.
Directory already exists: /content/algorithms-massive-data-pagerank/data/processed
Using subsample mode.
Full dataset shape before subsample: (3000000, 10)
Subsampled dataset shape: (150000, 10)
Subsample saved to: /content/algorithms-massive-data-pagerank/data/processed/ratings_subsample.csv
Shape df_ratings_clean: (150000, 3)
                user_id     book_id  rating
2945667  A303XPDO694V6X  B0006CR6U4     4.0
2352586  A3780H4TM9RMB8  0897166159     2.0
1531260  A1AX6VPDQQZDPV  0736693408     5.0
941910   A35RQKCCCQ62O0  0395051029     4.0
2582125  A2IJQDE1I4SIJT  4770016050     5.0
Clean subsample saved in: /content/algorithms-massive-data-pagerank/data/processed/ratings_subsample_clean.csv


In [None]:
# Compute and print basic statistics on users, books and reviews.
# The printed summary covers the distinct user/book counts, reviews per user and
# reviews per book (min / median / mean / max), and how many users and books
# have at least 2 reviews.
# NOTE(review): stats_dict is not used again in the cells visible here.
stats_dict = describe_reviews(df_ratings_clean)

Distinct users: 95746
Distinct books: 53940

Reviews per user:
min: 1
median: 1.0
mean: 1.28
max: 290

Reviews per book:
min: 1
median: 1.0
mean: 2.78
max: 339

Users with >= 2 reviews: 11647
Books with >= 2 reviews: 19848


In [None]:
# Review-count threshold handed to build_core_dataset
# (presumably the minimum a user/book needs to be kept — confirm in src.preprocessing)
min_reviews = 2

# Derive the core dataset that feeds the graph construction and PageRank
df_core = build_core_dataset(
    min_reviews=min_reviews, processed_dir=processed_dir, df_ratings_clean=df_ratings_clean
)

Shape df_core: (30736, 3)
Distinct users in core: 11104
Distinct books in core: 10592
                user_id     book_id  rating
941910   A35RQKCCCQ62O0  0395051029     4.0
2379833  A319KYEIAZ3SON  0670569798     5.0
2911594  A2X86K2EZCV0U1  B000GRDY1O     4.0
2682023  A1JLKPA3EPLFCP  0774032448     5.0
2306673  A1I2O9Y3X3HXLS  B000GRORC4     5.0
Core dataset saved in: /content/algorithms-massive-data-pagerank/data/processed/ratings_core_for_graph.csv
