<a href="https://colab.research.google.com/github/sXeSociety/algorithms-massive-data-pagerank/blob/main/notebooks/01_dataset_download.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### **Algorithms for Massive Data – Project 3**
*Notebook 01 – Dataset Download & Initial Exploration*


In [27]:
# Import main libraries
import getpass
import os

import numpy as np
import pandas as pd

In [28]:
# Mirror the repository's data layout under Colab's /content workspace:
# one root folder with a "raw" and a "processed" sub-folder.
data_dir = "/content/data"
raw_dir, processed_dir = (
    os.path.join(data_dir, sub_folder) for sub_folder in ("raw", "processed")
)

In [29]:
# Make sure the data root and its two sub-folders exist, reporting what was done for each
for folder in (data_dir, raw_dir, processed_dir):
  if os.path.exists(folder):
    print(f"Directory already exists: {folder}")
    continue
  os.makedirs(folder)
  print(f"Created directory: {folder}")

Directory already exists: /content/data
Directory already exists: /content/data/raw
Directory already exists: /content/data/processed


In [30]:
# Notebook-wide settings
seed = 42                  # random state used when sampling rows
subsample_fraction = 0.01  # fraction of rows kept when subsampling is enabled
use_subsample = True       # False loads the full ratings file instead of a sample

In [31]:
# Summarise the active configuration in a single print call
recap_lines = [
    "\nSetup Recap",
    f"use_subsample = {use_subsample}",
    f"raw_dir       = {raw_dir}",
    f"processed_dir = {processed_dir}",
]
print("\n".join(recap_lines))


Setup Recap
use_subsample = True
raw_dir       = /content/data/raw
processed_dir = /content/data/processed


In [32]:
# Kaggle API credentials are taken from the environment and are never written into the notebook.
# If KAGGLE_USERNAME / KAGGLE_KEY are not already set (e.g. exported before the kernel was
# started), each missing value is requested through a hidden prompt, so nothing sensitive ends
# up in the saved cell source or in its output.
# NOTE(security): a real API key used to be hard-coded in this cell. Any key that has been
# committed must be treated as exposed and regenerated from the Kaggle account settings.
for credential_var in ("KAGGLE_USERNAME", "KAGGLE_KEY"):
  if not os.environ.get(credential_var):
    os.environ[credential_var] = getpass.getpass(f"{credential_var}: ")

In [33]:
# Dataset slug handed to `kaggle datasets download -d` in the download cell
kaggle_dataset = "mohamedbakhet/amazon-books-reviews"

In [34]:
# Check if the dataset already exists in RAW_DIR
existing_files = []
for f in os.listdir(raw_dir):
    if f.lower().endswith(".csv"):
        existing_files.append(f)

if existing_files:
  print("Dataset already present in raw_dir, skipping download.")
else:
  print("Dataset not found in raw_dir, downloading from Kaggle.")
  !kaggle datasets download -d {kaggle_dataset} -p {raw_dir} --unzip
  print("Download extraction completed.")
  print("Files now in raw_dir:")
  for f in os.listdir(raw_dir):
    print(" -", f)

Dataset already present in raw_dir, skipping download.


In [35]:
# Load the ratings CSV into `df_ratings`, either in full or as a reproducible random
# subsample (controlled by `use_subsample`, `subsample_fraction` and `seed`).
ratings_path = os.path.join(raw_dir, "Books_rating.csv")
# Seed NumPy's global RNG; the sampling below also receives `seed` explicitly via random_state.
np.random.seed(seed)

# Stop here if the ratings file is missing: only printing a message would leave
# `df_ratings` undefined (or stale from an earlier run) for every cell that follows.
if not os.path.isfile(ratings_path):
  raise FileNotFoundError(
      f"Ratings file not found at {ratings_path}; run the download cell first."
  )

if use_subsample:
  print("Using subsample mode")
  # Load the whole file first, so the sample is drawn from all rows
  df_ratings = pd.read_csv(ratings_path)
  print(f"Full dataset shape before subsample: {df_ratings.shape}")
  # Keep a random fraction of the rows; random_state makes the draw repeatable
  df_ratings = df_ratings.sample(
      frac = subsample_fraction,
      random_state = seed)
  print(f"Subsampled dataset shape: {df_ratings.shape}")
  # Persist the subsample under processed/ (row index is not written)
  subsample_path = os.path.join(processed_dir, "ratings_subsample.csv")
  df_ratings.to_csv(subsample_path, index = False)
  print(f"Subsample saved to: {subsample_path}")
else:
  print("Loading full dataset (no subsample)...")
  df_ratings = pd.read_csv(ratings_path)
  print(f"Full dataset shape: {df_ratings.shape}")

# Some diagnostic information about the dataframe that the rest of the notebook will use
print("Final dataframe shape:", df_ratings.shape)
print(df_ratings.columns)
print(df_ratings.head())

Using subsample mode
Full dataset shape before subsample: (3000000, 10)
Subsampled dataset shape: (30000, 10)
Subsample saved to: /content/data/processed/ratings_subsample.csv
Final dataframe shape: (30000, 10)
Index(['Id', 'Title', 'Price', 'User_id', 'profileName', 'review/helpfulness',
       'review/score', 'review/time', 'review/summary', 'review/text'],
      dtype='object')
                 Id                                              Title  Price  \
2945667  B0006CR6U4  A dictionary of the Targumim, the Talmud Babli...    NaN   
2352586  0897166159           Espresso Coffee: Professional Techniques    NaN   
1531260  0736693408  The First King of Shannara (The Sword of Shann...    NaN   
941910   0395051029             Wuthering Heights (Riverside editions)    NaN   
2582125  4770016050  A Cat, a Man, and Two Women (Japans Modern Wri...    NaN   

                User_id                 profileName review/helpfulness  \
2945667  A303XPDO694V6X                       Ariel    