In [5]:
import os
import requests
import zipfile
import pandas as pd

def download_file(url, output_path, timeout=30):
    """Download ``url`` to ``output_path`` unless that path already exists.

    The payload is streamed into a temporary ``<output_path>.part`` file and
    only renamed to ``output_path`` once the transfer has finished, so an
    interrupted download never leaves a truncated file that a later call
    would mistake for a complete one.

    Args:
        url: Address of the file to fetch.
        output_path: Destination path on disk.
        timeout: Seconds to wait for the server to connect / send data
            before giving up (passed straight to ``requests.get``).

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection problems or timeouts.
        OSError: If the destination cannot be written.
    """
    if os.path.exists(output_path):
        print(f"File already exists: {output_path}")
        return

    print(f"Downloading from {url} ...")
    partial_path = f"{output_path}.part"
    try:
        # The context manager releases the streamed connection even when
        # the transfer fails half-way through.
        with requests.get(url, stream=True, timeout=timeout) as response:
            response.raise_for_status()
            with open(partial_path, "wb") as f:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:  # skip keep-alive chunks
                        f.write(chunk)
        # Atomic on the same filesystem: the final name appears only when
        # the content is complete.
        os.replace(partial_path, output_path)
    finally:
        # After a successful rename the partial file is gone; on failure
        # this removes the incomplete download.
        if os.path.exists(partial_path):
            os.remove(partial_path)
    print(f"Downloaded file to {output_path}.")

def extract_zip(zip_path, extract_to):
    """Unpack every member of the archive ``zip_path`` into ``extract_to``.

    The destination directory is created first when nothing exists at that
    path yet. A confirmation line is printed once extraction has finished.
    """
    destination_missing = not os.path.exists(extract_to)
    if destination_missing:
        os.makedirs(extract_to, exist_ok=True)

    archive = zipfile.ZipFile(zip_path, mode='r')
    with archive:
        archive.extractall(path=extract_to)

    print(f"Extracted {zip_path} to {extract_to}.")

def find_file_in_directory(directory, filename):
    """Return the path of the first file named ``filename`` under ``directory``.

    The tree is walked with ``os.walk``; the first directory that contains
    a matching file wins. ``None`` is returned when there is no match.
    """
    matches = (
        os.path.join(folder, filename)
        for folder, _subdirs, names in os.walk(directory)
        if filename in names
    )
    return next(matches, None)

# --- MovieLens 1M Dataset ---
# Fetch the archive (skipped when it is already on disk) and unpack it.
movielens_url = "https://files.grouplens.org/datasets/movielens/ml-1m.zip"
movielens_zip_path = "ml-1m.zip"
movielens_extract_dir = "ml-1m"

download_file(movielens_url, movielens_zip_path)
extract_zip(movielens_zip_path, movielens_extract_dir)

# Search the extracted tree instead of assuming a fixed folder layout.
movies_path = find_file_in_directory(movielens_extract_dir, "movies.dat")
ratings_path = find_file_in_directory(movielens_extract_dir, "ratings.dat")

if not (ratings_path is not None and movies_path is not None):
    raise FileNotFoundError("Could not find ratings.dat or movies.dat")

# Both files separate fields with "::"; a multi-character separator needs
# the python parsing engine.
ratings = pd.read_csv(
    ratings_path,
    sep="::",
    engine='python',
    names=["UserID", "MovieID", "Rating", "Timestamp"],
)
# The movies file is read as latin-1.
movies = pd.read_csv(
    movies_path,
    sep="::",
    engine='python',
    names=["MovieID", "Title", "Genres"],
    encoding='latin-1',
)

# Each call prints a heading line followed by the table on the next line.
print("=== MovieLens Ratings ===", ratings.head(), sep="\n")
print("\n=== MovieLens Movies ===", movies.head(), sep="\n")
print("\n=== Ratings Summary ===", ratings.describe(), sep="\n")

# --- MIND Dataset ---
# The MIND-small training archive has to be downloaded manually (e.g. from
# Kaggle) and placed in the working directory under the file name stored
# in `mind_zip_path`.

mind_zip_path = "MINDsmall_train.zip"
mind_extract_dir = "MINDsmall"

if os.path.exists(mind_zip_path):
    extract_zip(mind_zip_path, mind_extract_dir)
else:
    # Not fatal at this point: the files may already have been extracted
    # by an earlier run. The checks below raise if they are truly missing.
    print(f"{mind_zip_path} not found. Please download it from Kaggle and place it in the working directory.")

# Prefer the files directly inside the extraction directory; otherwise fall
# back to a recursive search (which yields None when nothing is found).
behaviors_path = os.path.join(mind_extract_dir, "behaviors.tsv")
news_path = os.path.join(mind_extract_dir, "news.tsv")

if not os.path.exists(behaviors_path):
    behaviors_path = find_file_in_directory(mind_extract_dir, "behaviors.tsv")
if not os.path.exists(news_path):
    news_path = find_file_in_directory(mind_extract_dir, "news.tsv")

if behaviors_path is None or news_path is None:
    raise FileNotFoundError("Could not locate behaviors.tsv or news.tsv in MIND dataset.")

# Per the MIND dataset description, the TSV files carry no header row.
# Without header=None pandas would consume the first record as column
# names, so the columns are named explicitly instead.
behaviors = pd.read_csv(
    behaviors_path,
    sep="\t",
    header=None,
    names=["ImpressionID", "UserID", "Time", "History", "Impressions"],
)
news = pd.read_csv(
    news_path,
    sep="\t",
    header=None,
    names=[
        "NewsID",
        "Category",
        "SubCategory",
        "Title",
        "Abstract",
        "URL",
        "TitleEntities",
        "AbstractEntities",
    ],
)

print("\n=== MIND Behaviors ===")
print(behaviors.head())
print("\n=== MIND News ===")
print(news.head())
print("\n=== News Data Summary ===")
print(news.describe(include='all'))


File already exists: ml-1m.zip
Extracted ml-1m.zip to ml-1m.
=== MovieLens Ratings ===
   UserID  MovieID  Rating  Timestamp
0       1     1193       5  978300760
1       1      661       3  978302109
2       1      914       3  978301968
3       1     3408       4  978300275
4       1     2355       5  978824291

=== MovieLens Movies ===
   MovieID                               Title                        Genres
0        1                    Toy Story (1995)   Animation|Children's|Comedy
1        2                      Jumanji (1995)  Adventure|Children's|Fantasy
2        3             Grumpier Old Men (1995)                Comedy|Romance
3        4            Waiting to Exhale (1995)                  Comedy|Drama
4        5  Father of the Bride Part II (1995)                        Comedy

=== Ratings Summary ===
             UserID       MovieID        Rating     Timestamp
count  1.000209e+06  1.000209e+06  1.000209e+06  1.000209e+06
mean   3.024512e+03  1.865540e+03  3.581564e+00 