# In [1]:  (Jupyter cell marker from the notebook export; kept as a comment so the file parses as Python)
# -*- coding: utf-8 -*-
# Book-Crossing + Preprocessed book side-info → Artifacts (PKL/H5/YAML/JSON)
# INPUT (your exact paths):
#   C:\Users\sagni\Downloads\archive\Books Data with Category Language and Summary\Preprocessed_data.csv
#   C:\Users\sagni\Downloads\archive\Book reviews\Book reviews\BX_Books.csv
#   C:\Users\sagni\Downloads\archive\Book reviews\Book reviews\BX-Book-Ratings.csv
#   C:\Users\sagni\Downloads\archive\Book reviews\Book reviews\BX-Users.csv
#
# OUTPUT (artifacts):
#   C:\Users\sagni\Downloads\AI Book Recommender with Mood Detection\bx_artifacts.pkl
#   C:\Users\sagni\Downloads\AI Book Recommender with Mood Detection\bx_interactions.h5
#   C:\Users\sagni\Downloads\AI Book Recommender with Mood Detection\bx_config.yaml
#   C:\Users\sagni\Downloads\AI Book Recommender with Mood Detection\bx_summary.json

import os, sys, json, pickle, time, platform, re, warnings
from pathlib import Path

import numpy as np
import pandas as pd

import h5py
import yaml
from scipy.sparse import csr_matrix

# -----------------------
# Paths (edit if needed)
# -----------------------
# Directory that receives every generated artifact.
BASE_DIR = r"C:\Users\sagni\Downloads\AI Book Recommender with Mood Detection"

# Input CSVs: the preprocessed side-info file and the three Book-Crossing tables.
PP_PATH = r"C:\Users\sagni\Downloads\archive\Books Data with Category Language and Summary\Preprocessed_data.csv"
BX_BOOKS = r"C:\Users\sagni\Downloads\archive\Book reviews\Book reviews\BX_Books.csv"
BX_RATINGS = r"C:\Users\sagni\Downloads\archive\Book reviews\Book reviews\BX-Book-Ratings.csv"
BX_USERS = r"C:\Users\sagni\Downloads\archive\Book reviews\Book reviews\BX-Users.csv"

# Artifact paths are kept as plain strings; they are later written verbatim
# into the YAML config.
_artifact_dir = Path(BASE_DIR)
PKL_OUT = str(_artifact_dir / "bx_artifacts.pkl")
H5_OUT = str(_artifact_dir / "bx_interactions.h5")
YAML_OUT = str(_artifact_dir / "bx_config.yaml")
JSON_OUT = str(_artifact_dir / "bx_summary.json")

# Create the output directory (and any missing parents) up front.
_artifact_dir.mkdir(parents=True, exist_ok=True)

# -----------------------
# Helpers
# -----------------------
def read_csv_robust(path, **kw):
    """
    Read a CSV with pandas, trying encodings from strictest to most permissive.

    Order matters: ``utf-8-sig`` goes first (it also reads plain UTF-8 and
    strips a BOM), then ``cp1252``, and ``latin-1`` last. ``latin-1`` maps
    every possible byte to a character, so it never raises a decode error;
    tried first it would "succeed" on UTF-8 files and return mojibake
    (e.g. "JosÃ©" instead of "José").

    path: location of the CSV file.
    **kw: forwarded unchanged to ``pd.read_csv`` (sep, on_bad_lines, ...).

    Returns the parsed DataFrame.
    Raises the last ``UnicodeError`` if no encoding can decode the file.
    Any other error (missing file, parser error) propagates immediately,
    because retrying with a different encoding cannot fix it.
    """
    encodings = ["utf-8-sig", "cp1252", "latin-1"]
    last_err = None
    for enc in encodings:
        try:
            return pd.read_csv(path, encoding=enc, **kw)
        except UnicodeError as e:
            # Wrong guess for this file's encoding; fall through to the next one.
            last_err = e
    raise last_err

def norm_isbn(s):
    """
    Return the ISBN as text with surrounding whitespace and all spaces removed.

    Missing values (as judged by ``pd.isna``) become the empty string. The
    value is only converted with ``str``, never parsed as a number, so the
    leading zeros of string inputs are preserved.
    """
    return "" if pd.isna(s) else str(s).strip().replace(" ", "")

def clean_text(s):
    """
    Return ``s`` as trimmed text with every whitespace run collapsed to one space.

    Missing values (as judged by ``pd.isna``) become the empty string.
    """
    if not pd.isna(s):
        trimmed = str(s).strip()
        return re.sub(r"\s+", " ", trimmed)
    return ""

# -----------------------
# Load BX datasets
# -----------------------
# All three Book-Crossing CSVs are mandatory; stop early with a clear message.
_bx_inputs = (BX_BOOKS, BX_RATINGS, BX_USERS)
if not all(Path(p).exists() for p in _bx_inputs):
    raise SystemExit("Missing BX files. Check BX_BOOKS, BX_RATINGS, BX_USERS paths.")

print("[INFO] Loading BX files...")
# The BX files are ';'-separated; rows pandas cannot parse are skipped.
_bx_read_opts = dict(sep=";", on_bad_lines="skip", low_memory=False)
books = read_csv_robust(BX_BOOKS, **_bx_read_opts)
ratings = read_csv_robust(BX_RATINGS, **_bx_read_opts)
users = read_csv_robust(BX_USERS, **_bx_read_opts)

# Standardize BX columns
# Common BX schema:
# - BX-Books: ISBN; Book-Title; Book-Author; Year-Of-Publication; Publisher; Image-URL-S; Image-URL-M; Image-URL-L
# - BX-Book-Ratings: User-ID; ISBN; Book-Rating (0-10; 0 = implicit)
# - BX-Users: User-ID; Location; Age
books.columns = [c.strip() for c in books.columns]
ratings.columns = [c.strip() for c in ratings.columns]
users.columns = [c.strip() for c in users.columns]

# Books: normalize the key and the text columns; a column that is absent
# from the file is created empty so later code can rely on it.
books["ISBN"] = books["ISBN"].map(norm_isbn) if "ISBN" in books.columns else ""
for _text_col in ("Book-Title", "Book-Author"):
    if _text_col in books.columns:
        books[_text_col] = books[_text_col].map(clean_text)
    else:
        books[_text_col] = ""

if "Year-Of-Publication" in books.columns:
    # sometimes messy; coerce numeric (unparseable values become NaN)
    books["Year-Of-Publication"] = pd.to_numeric(books["Year-Of-Publication"], errors="coerce")
else:
    # Default it like Title/Author above: the merge step selects this column
    # by name and would otherwise fail with a KeyError.
    books["Year-Of-Publication"] = np.nan

if "Publisher" not in books.columns:
    # Same reason as above: the merge step selects "Publisher" by name.
    books["Publisher"] = ""

# Ratings: all three columns are required, then coerce types
_required_rating_cols = ("User-ID", "ISBN", "Book-Rating")
if any(col not in ratings.columns for col in _required_rating_cols):
    raise SystemExit("BX-Book-Ratings.csv is missing required columns (User-ID, ISBN, Book-Rating).")

ratings["User-ID"] = ratings["User-ID"].astype(str)
ratings["ISBN"] = ratings["ISBN"].map(norm_isbn)
# Unparseable ratings become 0, which the BX schema above treats as implicit.
ratings["Book-Rating"] = pd.to_numeric(ratings["Book-Rating"], errors="coerce").fillna(0).astype(int)

# Users: IDs as strings, matching ratings["User-ID"]
if "User-ID" in users.columns:
    users["User-ID"] = users["User-ID"].astype(str)
else:
    users["User-ID"] = ""
# -----------------------
# Load Preprocessed side-info
# -----------------------
def _pick_column(columns_by_key, matches):
    """Return the original name of the first column whose lowered name satisfies ``matches``, else None."""
    for key, original in columns_by_key.items():
        if matches(key):
            return original
    return None


if not Path(PP_PATH).exists():
    warnings.warn(f"Preprocessed_data.csv NOT found at: {PP_PATH}. Proceeding without side-info.")
    # Same schema as the branch below. The title column must be "Title_pp":
    # a plain "Title" column collides with the books' own "Title" in the ISBN
    # merge (Title_x/Title_y) and the script then fails when it reads "Title".
    pp = pd.DataFrame(columns=["ISBN", "Title_pp", "Category", "Language", "Summary"])
else:
    print("[INFO] Loading preprocessed side-info...")
    # this file is usually UTF-8/UTF-16; try robustly
    try:
        pp = read_csv_robust(PP_PATH, low_memory=False)
    except Exception:
        # last resort (deliberate best effort): pandas defaults
        pp = pd.read_csv(PP_PATH, low_memory=False)

    # Map lowered/stripped column name -> original column name
    pp_cols = {c.lower().strip(): c for c in pp.columns}

    # Identify the likely columns: exact names for ISBN/title, substring
    # matches for category/language/summary.
    isbn_col = _pick_column(pp_cols, lambda k: k in ("isbn", "book_isbn"))
    title_col = _pick_column(pp_cols, lambda k: k in ("title", "book_title", "book-name", "name"))
    cat_col = _pick_column(pp_cols, lambda k: "category" in k)
    lang_col = _pick_column(pp_cols, lambda k: "lang" in k)
    sum_col = _pick_column(pp_cols, lambda k: "summary" in k or "desc" in k or "synopsis" in k)

    # Build a clean side-info frame. Starting from pp's index makes the ""
    # defaults broadcast to every row; on an index-less frame a scalar
    # assignment creates an empty column that later turns into NaN.
    pp2 = pd.DataFrame(index=pp.index)
    # Without an ISBN column the ISBN stays "" and the join falls back to titles.
    pp2["ISBN"] = pp[isbn_col].map(norm_isbn) if isbn_col else ""
    pp2["Title_pp"] = pp[title_col].map(clean_text) if title_col else ""
    pp2["Category"] = pp[cat_col].map(clean_text) if cat_col else ""
    pp2["Language"] = pp[lang_col].map(clean_text) if lang_col else ""
    pp2["Summary"] = pp[sum_col].map(clean_text) if sum_col else ""
    pp = pp2

# -----------------------
# Merge Books + Side-info
# -----------------------
side_cols = ["Category", "Language", "Summary"]

# reindex (instead of books[[...]]) so a books file lacking e.g. "Publisher"
# yields an all-NaN column rather than a KeyError.
books_small = books.reindex(
    columns=["ISBN", "Book-Title", "Book-Author", "Year-Of-Publication", "Publisher"]
).rename(columns={
    "Book-Title": "Title",
    "Book-Author": "Author",
    "Year-Of-Publication": "Year",
})

# Side-info limited to the columns used here, as plain strings ("" = missing).
# Columns pp does not have (e.g. "Title_pp" when the file was not found)
# become "" so both joins below simply find nothing.
side = pd.DataFrame(
    {
        col: (pp[col].fillna("").astype(str) if col in pp.columns else "")
        for col in ["ISBN", "Title_pp"] + side_cols
    },
    index=pp.index,
)
# Rows carrying no Category/Language/Summary cannot enrich any book.
side = side[(side[side_cols] != "").any(axis=1)]

# 1) ISBN join. The side-info may hold many rows per ISBN, so keep one row
#    per ISBN first; otherwise the left join would multiply book rows.
by_isbn = (
    side.loc[side["ISBN"] != "", ["ISBN"] + side_cols]
    .drop_duplicates(subset="ISBN", keep="first")
)
merged = books_small.merge(by_isbn, on="ISBN", how="left")
for c in side_cols:
    merged[c] = merged[c].fillna("")

# 2) Title fallback (case-insensitive), applied ONLY to rows the ISBN join
#    left without any side-info, so ISBN matches are never overwritten.
by_title = side.loc[side["Title_pp"] != "", ["Title_pp"] + side_cols].copy()
by_title["Title_norm"] = by_title["Title_pp"].str.strip().str.lower()
by_title = (
    by_title[by_title["Title_norm"] != ""]
    .drop_duplicates(subset="Title_norm", keep="first")  # unique keys for Series.map
    .set_index("Title_norm")
)

mask_missing = (merged[side_cols] == "").all(axis=1)
if mask_missing.any() and not by_title.empty:
    title_norm = (
        merged.loc[mask_missing, "Title"].fillna("").astype(str).str.strip().str.lower()
    )
    for c in side_cols:
        # Titles absent from the side-info map to NaN and become "".
        merged.loc[mask_missing, c] = title_norm.map(by_title[c]).fillna("").to_numpy()

# Guarantee blank strings (never NaN) in the side-info columns
for c in side_cols:
    merged[c] = merged[c].fillna("")

# -----------------------
# Interaction Matrix (CSR)
# -----------------------
print("[INFO] Building user/book encoders and CSR interactions...")
# Keep only ratings whose ISBN is a non-empty ISBN present in the book table.
_isbn_text = merged["ISBN"].astype(str)
valid_isbn_mask = _isbn_text.str.len() > 0
merged_valid_isbn = set(_isbn_text[valid_isbn_mask].tolist())

ratings = ratings[ratings["ISBN"].isin(merged_valid_isbn)].copy()
if ratings.empty:
    raise SystemExit("No ratings remain after filtering by valid ISBNs from books. Check data consistency.")

# csr_matrix SUMS values that share the same (row, col). A repeated
# (user, ISBN) pair would therefore yield ratings above 10 and "binary" values
# above 1, and n_ratings would no longer equal the number of stored entries.
# Keep one rating per pair (the last occurrence in file order).
ratings = ratings.drop_duplicates(subset=["User-ID", "ISBN"], keep="last")

# Encoders: index = position of first appearance in the filtered ratings
users_unique = ratings["User-ID"].astype(str).unique().tolist()
books_unique = ratings["ISBN"].astype(str).unique().tolist()

user2idx = {u: i for i, u in enumerate(users_unique)}
isbn2idx = {b: i for i, b in enumerate(books_unique)}

# CSR components (one entry per remaining rating row)
rows = ratings["User-ID"].map(user2idx).to_numpy(np.int64)
cols = ratings["ISBN"].map(isbn2idx).to_numpy(np.int64)
data = ratings["Book-Rating"].astype(np.float32).to_numpy()

n_users = len(users_unique)
n_items = len(books_unique)
R = csr_matrix((data, (rows, cols)), shape=(n_users, n_items))

# Also build a binarized version (explicit >= 6 as "liked"); it reuses the
# same (rows, cols), so entries below the threshold are passed in as 0.0.
data_bin = (ratings["Book-Rating"].to_numpy() >= 6).astype(np.float32)
R_bin = csr_matrix((data_bin, (rows, cols)), shape=(n_users, n_items))

# -----------------------
# Book metadata aligned to interactions
# -----------------------
# One metadata row per rated ISBN, ordered exactly like books_unique so that
# position i in every array below describes books_unique[i].
meta = (
    merged[merged["ISBN"].isin(books_unique)]
    .copy()
    .drop_duplicates(subset=["ISBN"], keep="first")
    .set_index("ISBN")
    .reindex(books_unique)
    .reset_index()
)

# Text columns: missing values become "" and everything is cast to str.
_text_lists = {
    name: meta[name].fillna("").astype(str).to_list()
    for name in ("Title", "Author", "Publisher", "Category", "Language")
}
titles_arr = _text_lists["Title"]
authors_arr = _text_lists["Author"]
# Year: missing values become 0 before the int32 cast.
years_arr = meta["Year"].fillna(0).astype(np.int32).to_list()
pubs_arr = _text_lists["Publisher"]
cats_arr = _text_lists["Category"]
langs_arr = _text_lists["Language"]

# -----------------------
# Stats / Small samples
# -----------------------
# Number of rating rows and the share of the user x item grid they fill (percent).
n_ratings = ratings.shape[0]
density = (100.0 * n_ratings) / (n_users * n_items)

# Stored entries per user (row) and per item (column) of R.
user_activity = np.asarray(R.getnnz(axis=1)).astype(int)
item_pop = np.asarray(R.getnnz(axis=0)).astype(int)


def _item_entry(pos):
    """Describe the item at index ``pos`` for the summary."""
    return {
        "isbn": books_unique[pos],
        "title": titles_arr[pos],
        "author": authors_arr[pos],
        "ratings_count": int(item_pop[pos]),
    }


def _user_entry(pos):
    """Describe the user at index ``pos`` for the summary."""
    return {
        "user_id": users_unique[pos],
        "ratings_count": int(user_activity[pos]),
    }


# Top 20 by descending count (argsort on the negated counts).
top_items_idx = np.argsort(-item_pop)[:20]
top_items = [_item_entry(pos) for pos in top_items_idx]

top_users_idx = np.argsort(-user_activity)[:20]
top_users = [_user_entry(pos) for pos in top_users_idx]

# Averages fall back to 0.0 when there is nothing to average.
if n_users > 0:
    _avg_per_user = float(np.mean(user_activity))
else:
    _avg_per_user = 0.0
if n_items > 0:
    _avg_per_item = float(np.mean(item_pop))
else:
    _avg_per_item = 0.0

summary = {
    "n_users": n_users,
    "n_items": n_items,
    "n_ratings": int(n_ratings),
    "matrix_density_percent": float(round(density, 6)),
    "avg_ratings_per_user": _avg_per_user,
    "avg_ratings_per_item": _avg_per_item,
    "top_items_by_count": top_items,
    "top_users_by_count": top_users,
}

# -----------------------
# SAVE: PKL (encoders, metadata, stats)
# -----------------------
# Per-item arrays; each is index-aligned with books_unique.
_item_fields = {
    "items_isbn": books_unique,
    "items_title": titles_arr,
    "items_author": authors_arr,
    "items_year": years_arr,
    "items_publisher": pubs_arr,
    "items_category": cats_arr,
    "items_language": langs_arr,
}

# Where the inputs came from.
_source_paths = {
    "preprocessed": PP_PATH,
    "bx_books": BX_BOOKS,
    "bx_ratings": BX_RATINGS,
    "bx_users": BX_USERS,
}

# Interpreter / library versions used for this build.
_build_env = {
    "python": sys.version,
    "platform": platform.platform(),
    "pandas": pd.__version__,
    "numpy": np.__version__,
}

# Encoders first, then the item arrays, then stats and provenance.
artifact = {
    "user2idx": user2idx,
    "isbn2idx": isbn2idx,
    "users": users_unique,
    **_item_fields,
    "summary": summary,
    "source_paths": _source_paths,
    "meta": _build_env,
}

with open(PKL_OUT, "wb") as pkl_file:
    pickle.dump(artifact, pkl_file)
print(f"[OK] Wrote PKL → {PKL_OUT}")

# -----------------------
# SAVE: H5 (CSR + metadata arrays)
# -----------------------
def _write_csr(h5_parent, group_name, matrix):
    """Store a CSR matrix as gzip-compressed data/indices/indptr datasets plus a "shape" attribute."""
    grp = h5_parent.create_group(group_name)
    for part in ("data", "indices", "indptr"):
        grp.create_dataset(part, data=getattr(matrix, part), compression="gzip")
    grp.attrs["shape"] = matrix.shape


with h5py.File(H5_OUT, "w") as hf:
    dt = h5py.string_dtype(encoding="utf-8")

    # CSR for ratings (dense matrices are too big; store CSR pieces)
    _write_csr(hf, "ratings", R)
    _write_csr(hf, "ratings_binary", R_bin)

    # Aligned metadata arrays (index-aligned with items/books_unique)
    _string_datasets = (
        ("items_isbn", books_unique),
        ("items_title", titles_arr),
        ("items_author", authors_arr),
        ("items_publisher", pubs_arr),
        ("items_category", cats_arr),
        ("items_language", langs_arr),
    )
    for _ds_name, _values in _string_datasets:
        hf.create_dataset(_ds_name, data=np.array(_values, dtype=object), dtype=dt)
    hf.create_dataset("items_year", data=np.array(years_arr, dtype=np.int32))

    # Users (ID strings aligned with rows)
    hf.create_dataset("users", data=np.array(users_unique, dtype=object), dtype=dt)

    # Misc file-level attributes
    hf.attrs["created_unix"] = int(time.time())
    hf.attrs["notes"] = "CSR interactions + aligned metadata for Book-Crossing."

print(f"[OK] Wrote H5  → {H5_OUT}")

# -----------------------
# SAVE: YAML (config/schema)
# -----------------------
# Each section is its own dict object and is referenced exactly once below.
_cfg_artifacts = {
    "pkl": PKL_OUT,
    "h5": H5_OUT,
    "yaml": YAML_OUT,
    "json": JSON_OUT,
}
_cfg_inputs = {
    "preprocessed_csv": PP_PATH,
    "bx_books_csv": BX_BOOKS,
    "bx_ratings_csv": BX_RATINGS,
    "bx_users_csv": BX_USERS,
}
_cfg_schema = {
    "ratings": {
        "csr_groups": ["ratings", "ratings_binary"],
        "shape": [int(n_users), int(n_items)],
        "rating_scale": "0-10 (0=implicit); binary>=6 considered liked",
    },
    "book_meta_alignment": "All item arrays are index-aligned with items_isbn",
    "user_indexing": "Row i in CSR corresponds to users[i]",
}
_cfg_build_meta = {
    "python": sys.version,
    "platform": platform.platform(),
    "pandas": pd.__version__,
    "numpy": np.__version__,
}

cfg = {
    "project": "AI Book Recommender with Mood Detection",
    "artifacts": _cfg_artifacts,
    "inputs": _cfg_inputs,
    "schema": _cfg_schema,
    "build_meta": _cfg_build_meta,
}

# sort_keys=False keeps the section order defined above.
with open(YAML_OUT, "w", encoding="utf-8") as yaml_file:
    yaml.dump(cfg, yaml_file, sort_keys=False)
print(f"[OK] Wrote YAML → {YAML_OUT}")

# -----------------------
# SAVE: JSON (summary + samples)
# -----------------------
# The JSON report carries the full summary plus the first five top items/users.
sample_items = top_items[:5]
sample_users = top_users[:5]
_report = {
    "summary": summary,
    "sample_top_items": sample_items,
    "sample_top_users": sample_users,
}
with open(JSON_OUT, "w", encoding="utf-8") as json_file:
    # ensure_ascii=False writes non-ASCII characters as-is instead of \u escapes.
    json.dump(_report, json_file, ensure_ascii=False, indent=2)
print(f"[OK] Wrote JSON → {JSON_OUT}")

# Final one-line recap of the build.
print("\nAll artifacts created successfully ✅")
print(f"Users: {n_users:,} | Items: {n_items:,} | Ratings: {n_ratings:,} | Density: {summary['matrix_density_percent']:.6f}%")


# --- Output of the cell above, captured from the notebook run (kept as comments so the file parses) ---
# [INFO] Loading BX files...
# [INFO] Loading preprocessed side-info...
# [INFO] Building user/book encoders and CSR interactions...
# [OK] Wrote PKL → C:\Users\sagni\Downloads\AI Book Recommender with Mood Detection\bx_artifacts.pkl
# [OK] Wrote H5  → C:\Users\sagni\Downloads\AI Book Recommender with Mood Detection\bx_interactions.h5
# [OK] Wrote YAML → C:\Users\sagni\Downloads\AI Book Recommender with Mood Detection\bx_config.yaml
# [OK] Wrote JSON → C:\Users\sagni\Downloads\AI Book Recommender with Mood Detection\bx_summary.json
#
# All artifacts created successfully ✅
# Users: 92,108 | Items: 270,170 | Ratings: 1,031,190 | Density: 0.004144%
