In [2]:
import os
import pandas as pd

In [33]:
df = pd.read_csv('/home/moshtasa/Research/phd-svd-recsys/SVD/Book/data/df_final_with_genres.csv')

In [34]:
df

Unnamed: 0,user_id,book_id,rating,decade,original_title,authors,genres
0,1,258,5,2000,La sombra del viento,"Carlos Ruiz Zafón, Lucia Graves","Mystery, Historical"
1,2,4081,4,2000,,,
2,2,260,5,1930,How to Win Friends and Influence People,Dale Carnegie,"Nonfiction, Drama"
3,2,9296,5,1970,Das Drama des begabten Kindes und die Suche na...,"Alice Miller, Ruth Ward","Horror, Mystery"
4,2,2318,3,1990,The Millionaire Next Door: The Surprising Secr...,"Thomas J. Stanley, William D. Danko","Nonfiction, Drama"
...,...,...,...,...,...,...,...
5976474,49925,510,5,1990,The Great Hunt,Robert Jordan,"Fantasy, Adventure"
5976475,49925,528,4,1990,The Dragon Reborn,Robert Jordan,"Classics, Drama"
5976476,49925,722,4,1990,The Shadow Rising,Robert Jordan,"Adventure, Drama"
5976477,49925,949,5,1990,The Fires of Heaven,Robert Jordan,"Fantasy, Adventure"


In [36]:
import pandas as pd
import numpy as np

# --- Your canonical 13 genres, fixed order for rows/cols ---
GENRES_13 = [
    "Adult", "Adventure", "Children's", "Classics", "Drama",
    "Fantasy", "Historical", "Horror", "Mystery", "Nonfiction",
    "Romance", "Science Fiction", "Thriller"
]

def parse_genres(s):
    if pd.isna(s):
        return []
    return [g.strip() for g in str(s).split(",") if g.strip()]

# 1) Collapse to UNIQUE books and keep one ordered genre list per book
#    (first non-null genres string per book_id, preserving its order)
books = (
    df[['book_id', 'genres']]
      .dropna(subset=['book_id'])
      .sort_values('book_id')
      .drop_duplicates(subset=['book_id'], keep='first')
      .copy()
)

# Parse + filter to canonical 13, preserving order within each book
books['genre_list'] = books['genres'].apply(parse_genres).apply(
    lambda gl: [g for g in gl if g in GENRES_13]
)
books = books[books['genre_list'].map(len) > 0].copy()

# 2) Count and print how many UNIQUE canonical genres are actually present in df
present = []
for gl in books['genre_list']:
    present.extend(gl)
present = sorted(set(present), key=lambda g: GENRES_13.index(g))

print(f"Unique genres present (from canonical 13): {len(present)} / 13")
print(present)

# 3) Build an ORDER-SENSITIVE co-occurrence matrix (Gi -> Gj), Gi != Gj only
idx = {g: i for i, g in enumerate(GENRES_13)}
n = len(GENRES_13)
co_mat = np.zeros((n, n), dtype=int)

for gl in books['genre_list']:
    # First occurrence positions per genre in this book (preserve order)
    pos = {}
    for k, g in enumerate(gl):
        if g not in pos:
            pos[g] = k

    # Count ordered pairs where Gi occurs BEFORE Gj (Gi != Gj). No diagonal.
    genres_in_book = list(pos.keys())
    for gi in genres_in_book:
        for gj in genres_in_book:
            if gi == gj:
                continue
            if pos[gi] < pos[gj]:
                co_mat[idx[gi], idx[gj]] += 1

# Wrap into a 13x13 DataFrame with fixed order (diagonal is guaranteed 0)
co_df = pd.DataFrame(co_mat, index=GENRES_13, columns=GENRES_13)

# Show a quick peek (optional)
display(co_df)

# Optional: save to CSV
# co_df.to_csv("ordered_genre_cooccurrence_unique_books_13x13.csv", index=True)


Unique genres present (from canonical 13): 13 / 13
['Adult', 'Adventure', "Children's", 'Classics', 'Drama', 'Fantasy', 'Historical', 'Horror', 'Mystery', 'Nonfiction', 'Romance', 'Science Fiction', 'Thriller']


Unnamed: 0,Adult,Adventure,Children's,Classics,Drama,Fantasy,Historical,Horror,Mystery,Nonfiction,Romance,Science Fiction,Thriller
Adult,0,0,0,4,91,0,2,0,3,6,0,0,0
Adventure,0,0,9,46,35,6,17,1,27,14,1,6,23
Children's,0,284,0,137,93,66,30,4,56,11,13,0,0
Classics,1,17,5,0,286,11,10,6,23,25,7,1,0
Drama,53,16,1,83,0,1,21,0,26,18,8,0,2
Fantasy,2,812,99,14,98,0,35,193,236,16,232,17,40
Historical,3,31,9,14,286,4,0,1,38,44,59,1,7
Horror,0,6,8,15,33,42,0,0,194,3,23,6,97
Mystery,5,45,11,37,246,14,67,38,0,3,50,0,799
Nonfiction,18,62,6,88,585,7,35,1,19,0,10,41,6


In [37]:
import pandas as pd
import numpy as np
from itertools import combinations

# --- Fixed 13-genre vocabulary (rows/cols will be exactly these 13) ---
GENRES_13 = [
    "Adult", "Adventure", "Children's", "Classics", "Drama",
    "Fantasy", "Historical", "Horror", "Mystery", "Nonfiction",
    "Romance", "Science Fiction", "Thriller"
]

def parse_genres(s: str):
    if pd.isna(s):
        return []
    return [g.strip() for g in str(s).split(",") if g.strip()]

# 1) Collapse to UNIQUE books; keep one genres string per book_id
books = (
    df[['book_id', 'genres']]
      .dropna(subset=['book_id'])
      .sort_values('book_id')
      .drop_duplicates(subset=['book_id'], keep='first')
      .copy()
)

# 2) Parse & filter to canonical 13; also de-duplicate within a book
def to_canonical_set(gstr):
    glist = [g for g in parse_genres(gstr) if g in GENRES_13]
    # de-duplicate while (optionally) preserving first-seen order
    return list(dict.fromkeys(glist))

books['genre_list'] = books['genres'].apply(to_canonical_set)
books = books[books['genre_list'].map(len) > 0].copy()

# 3) Report how many of the canonical 13 actually appear
present = sorted(set(g for gl in books['genre_list'] for g in gl), key=lambda g: GENRES_13.index(g))
print(f"Unique genres present (from canonical 13): {len(present)} / 13")
print(present)

# 4) Build a SYMMETRIC co-occurrence matrix over UNIQUE books
#    Cell (Gi, Gj) = number of UNIQUE books that contain BOTH Gi and Gj (order-agnostic)
idx = {g: i for i, g in enumerate(GENRES_13)}
n = len(GENRES_13)
co_mat = np.zeros((n, n), dtype=int)

for gl in books['genre_list']:
    s = list(set(gl))           # ensure uniqueness before making pairs
    for g1, g2 in combinations(s, 2):
        i, j = idx[g1], idx[g2]
        # increment both directions to enforce symmetry
        co_mat[i, j] += 1
        co_mat[j, i] += 1

# Zero the diagonal (no GiGi counts)
np.fill_diagonal(co_mat, 0)

# 5) Wrap in DataFrame
co_df = pd.DataFrame(co_mat, index=GENRES_13, columns=GENRES_13)

# Peek / use
display(co_df)

# Optional: save
# co_df.to_csv("symmetric_genre_cooccurrence_unique_books_13x13.csv", index=True)


Unique genres present (from canonical 13): 13 / 13
['Adult', 'Adventure', "Children's", 'Classics', 'Drama', 'Fantasy', 'Historical', 'Horror', 'Mystery', 'Nonfiction', 'Romance', 'Science Fiction', 'Thriller']


Unnamed: 0,Adult,Adventure,Children's,Classics,Drama,Fantasy,Historical,Horror,Mystery,Nonfiction,Romance,Science Fiction,Thriller
Adult,0,0,0,5,144,2,5,0,8,24,139,0,4
Adventure,0,0,293,63,51,818,48,7,72,76,19,264,78
Children's,0,293,0,142,94,165,39,12,67,17,22,12,0
Classics,5,63,142,0,369,25,24,21,60,113,44,35,0
Drama,144,51,94,369,0,99,307,33,272,603,941,65,28
Fantasy,2,818,165,25,99,0,39,235,250,23,351,41,40
Historical,5,48,39,24,307,39,0,1,105,79,189,8,13
Horror,0,7,12,21,33,235,1,0,232,4,55,46,123
Mystery,8,72,67,60,272,250,105,232,0,22,257,130,1088
Nonfiction,24,76,17,113,603,23,79,4,22,0,30,69,11


In [40]:
import re
import pandas as pd

# collapse to unique books
books_raw = (
    df[['book_id', 'original_title', 'authors', 'genres']]
      .dropna(subset=['book_id'])
      .sort_values('book_id')
      .drop_duplicates(subset=['book_id'], keep='first')
      .copy()
)

def split_genres(s):
    if pd.isna(s):
        return []
    return [g.strip() for g in str(s).split(",") if g.strip()]

tmp = books_raw.dropna(subset=['genres']).copy()
tmp['genre_list_raw'] = tmp['genres'].apply(split_genres)

def has_both(gl, a, b):
    s = set(gl)
    return a in s and b in s

pair = ("Adventure", "Fantasy")
mask_pair = tmp['genre_list_raw'].apply(lambda gl: has_both(gl, *pair))
subset = tmp[mask_pair]

print(f"Books with BOTH {pair[0]} and {pair[1]}:", len(subset))
display(subset[['book_id','original_title','authors','genres']].head(20))


Books with BOTH Adventure and Fantasy: 818


Unnamed: 0,book_id,original_title,authors,genres
3246005,6,The Fault in Our Stars,John Green,"Fantasy, Adventure"
1635898,19,The Fellowship of the Ring,J.R.R. Tolkien,"Fantasy, Adventure"
123194,21,Harry Potter and the Order of the Phoenix,"J.K. Rowling, Mary GrandPré","Fantasy, Adventure"
3940906,23,Harry Potter and the Chamber of Secrets,"J.K. Rowling, Mary GrandPré","Fantasy, Adventure"
2917592,24,Harry Potter and the Goblet of Fire,"J.K. Rowling, Mary GrandPré","Fantasy, Adventure"
2740016,25,Harry Potter and the Deathly Hallows,"J.K. Rowling, Mary GrandPré","Fantasy, Adventure"
3034951,27,Harry Potter and the Half-Blood Prince,"J.K. Rowling, Mary GrandPré","Fantasy, Adventure"
3679511,36,The Giver,Lois Lowry,"Fantasy, Adventure"
576279,38,The Time Traveler's Wife,Audrey Niffenegger,"Fantasy, Adventure"
427492,41,The Lightning Thief,Rick Riordan,"Fantasy, Adventure"


## adding 