In [2]:
import os
import pandas as pd

In [7]:
# Load df_final_with_genres.csv into `df` for interactive inspection.
# NOTE(review): absolute, machine-specific path; this cell will not run on
# another machine without editing it.
df = pd.read_csv('/home/moshtasa/Research/phd-svd-recsys/SVD/Book/data/df_final_with_genres.csv')

In [5]:
# Bare expression as the last line of the cell: Jupyter renders a truncated
# preview (first and last rows) of the frame.
df

Unnamed: 0,user_id,book_id,rating,decade,original_title,authors,genres
0,1,258,5,2000,La sombra del viento,"Carlos Ruiz Zafón, Lucia Graves","Mystery, Historical"
1,2,4081,4,2000,,,
2,2,260,5,1930,How to Win Friends and Influence People,Dale Carnegie,"Nonfiction, Drama"
3,2,9296,5,1970,Das Drama des begabten Kindes und die Suche na...,"Alice Miller, Ruth Ward","Horror, Mystery"
4,2,2318,3,1990,The Millionaire Next Door: The Surprising Secr...,"Thomas J. Stanley, William D. Danko","Nonfiction, Drama"
...,...,...,...,...,...,...,...
5976474,49925,510,5,1990,The Great Hunt,Robert Jordan,"Fantasy, Adventure"
5976475,49925,528,4,1990,The Dragon Reborn,Robert Jordan,"Classics, Drama"
5976476,49925,722,4,1990,The Shadow Rising,Robert Jordan,"Adventure, Drama"
5976477,49925,949,5,1990,The Fires of Heaven,Robert Jordan,"Fantasy, Adventure"


In [10]:
#!/usr/bin/env python3
import os
from pathlib import Path
from collections import defaultdict
import pandas as pd
from typing import Union

# ====== CONFIG ======
# Input consumed by load_df(): a string/Path to a CSV file (or an http(s) URL),
# an in-memory pandas.DataFrame, or a file-like object exposing .read.
# NOTE: the annotation below omits the file-like case that load_df() also accepts.
# NOTE(review): the default is a machine-specific absolute path; the notebook
# will not run elsewhere without editing it.
INPUT_SOURCE: Union[str, Path, pd.DataFrame] = "/home/moshtasa/Research/phd-svd-recsys/SVD/Book/data/df_final_with_genres.csv"
# Destination of the per-genre count table written by main(). It is a relative
# name, so the file lands in the current working directory.
OUTPUT_CSV = "genre_counts_all.csv"
# ====================

def load_df(source: Union[str, Path, pd.DataFrame]):
    """Return a DataFrame for ``source``.

    Parameters
    ----------
    source : str, Path, pandas.DataFrame, or file-like
        * DataFrame: returned as-is (the same object, not a copy).
        * str / Path: a local CSV path (a leading ``~`` is expanded to the
          user's home directory) or an ``http://`` / ``https://`` URL, read
          with ``pd.read_csv(..., low_memory=False)``.
        * any object with a ``read`` attribute: handed directly to
          ``pd.read_csv(..., low_memory=False)``.

    Returns
    -------
    pandas.DataFrame

    Raises
    ------
    ValueError
        If the path string is empty or whitespace-only.
    FileNotFoundError
        If a non-URL path does not exist on disk.
    TypeError
        If ``source`` is none of the supported kinds.
    """
    if isinstance(source, pd.DataFrame):
        return source

    if isinstance(source, (str, Path)):
        src = str(source)
        if not src.strip():
            raise ValueError("Empty path provided for INPUT_SOURCE.")
        if not src.startswith(("http://", "https://")):
            # Expand "~" *before* the existence check: os.path.exists() does not
            # understand "~", so "~/data.csv" used to be rejected even when the
            # file was present.
            src = os.path.expanduser(src)
            if not os.path.exists(src):
                raise FileNotFoundError(f"Path not found: {src}")
        return pd.read_csv(src, low_memory=False)

    # File-like objects with .read
    if hasattr(source, "read"):
        return pd.read_csv(source, low_memory=False)

    raise TypeError(
        "INPUT_SOURCE must be a DataFrame, path-like (str/Path), or file-like object. "
        f"Got: {type(source)}"
    )

def split_genres(val):
    """Turn a comma-separated genre string into an ordered list of unique genres.

    Anything ``pd.isna`` reports as missing yields an empty list. Otherwise the
    value is stringified and split on commas; each piece is whitespace-stripped,
    blank pieces are discarded, and a repeated genre keeps only its first
    occurrence (comparison is exact, so case matters).
    """
    if pd.isna(val):
        return []
    pieces = (piece.strip() for piece in str(val).split(","))
    # dict keys preserve insertion order, so this de-duplicates without reordering
    return list(dict.fromkeys(name for name in pieces if name))

def main():
    """Count, per genre, how many distinct books list it and in which position.

    Loads the table named by ``INPUT_SOURCE`` via ``load_df``, keeps the first
    row seen for each ``book_id``, parses that row's ``genres`` string with
    ``split_genres``, and tallies for every genre:

    * ``total_books_with_genre`` - books listing the genre anywhere,
    * ``as_first_genre``         - books listing it first,
    * ``as_later_genre``         - books listing it in 2nd or later position.

    The table is sorted by total, then by first-position count (both
    descending); its first 20 rows are printed and the whole table is written
    to ``OUTPUT_CSV`` without the index.

    Raises
    ------
    KeyError
        If the input lacks a ``book_id`` or ``genres`` column.
    """
    df = load_df(INPUT_SOURCE)

    # Keep one row per book
    if "book_id" not in df.columns or "genres" not in df.columns:
        raise KeyError("Input must contain 'book_id' and 'genres' columns.")
    books = (
        df[["book_id", "genres"]]
        .dropna(subset=["book_id"])
        .drop_duplicates(subset="book_id", keep="first")
        .copy()
    )
    books["genre_list"] = books["genres"].apply(split_genres)

    genre_total = defaultdict(int)
    genre_first = defaultdict(int)
    genre_later = defaultdict(int)

    for genres in books["genre_list"]:
        if not genres:
            # book without any usable genre contributes to no counter
            continue
        # count first position
        genre_first[genres[0]] += 1
        # count totals; split_genres already removed repeats, so a genre is
        # counted at most once per book
        for g in genres:
            genre_total[g] += 1
        # count later positions (2nd+)
        for g in genres[1:]:
            genre_later[g] += 1

    # Build results table for ALL genres seen
    result_columns = [
        "genre",
        "total_books_with_genre",
        "as_first_genre",
        "as_later_genre",
    ]
    rows = [
        {
            "genre": g,
            "total_books_with_genre": genre_total[g],
            "as_first_genre": genre_first.get(g, 0),
            "as_later_genre": genre_later.get(g, 0),
        }
        for g in sorted(genre_total)
    ]
    # Passing columns= explicitly keeps the frame well-formed when no genre was
    # found at all: a column-less frame would make sort_values raise KeyError.
    result_df = pd.DataFrame(rows, columns=result_columns).sort_values(
        by=["total_books_with_genre", "as_first_genre"], ascending=[False, False]
    )

    # Show a quick preview
    print(result_df.head(20))
    # Save to CSV
    result_df.to_csv(OUTPUT_CSV, index=False)
    print(f"\n✅ Saved: {OUTPUT_CSV}")

# Script-style entry point. When this cell is executed in the notebook the
# guard is true, so main() runs immediately (its printed preview follows the
# cell); the guard only suppresses the call if the code is imported as a module.
if __name__ == "__main__":
    main()


              genre  total_books_with_genre  as_first_genre  as_later_genre
4             Drama                    3006             229            2777
8           Mystery                    2563            1315            1248
10          Romance                    2131            1704             427
5           Fantasy                    2088            1794             294
1         Adventure                    1789             185            1604
12         Thriller                    1606             418            1188
9        Nonfiction                    1071             878             193
3          Classics                     901             392             509
2        Children's                     863             694             169
6        Historical                     857             497             360
11  Science Fiction                     855             776              79
7            Horror                     769             427             342
0           