In [2]:
# Phase1 : Create Base dataframe from Kaggle
# https://www.kaggle.com/datasets/joshjms/kawaii


# Install dependencies as needed:
# pip install kagglehub[pandas-datasets]
import kagglehub
from kagglehub import KaggleDatasetAdapter

# Set the path to the file you'd like to load
manga_file_path = "manga.csv"

# Load the latest version of manga data
manga_df = kagglehub.load_dataset(
  KaggleDatasetAdapter.PANDAS,
  "joshjms/kawaii",
  manga_file_path,
)

# check
print("First 5 records:", manga_df.head())

  manga_df = kagglehub.load_dataset(


Downloading from https://www.kaggle.com/api/v1/datasets/download/joshjms/kawaii?dataset_version_number=1&file_name=manga.csv...


100%|██████████| 2.53M/2.53M [00:00<00:00, 46.9MB/s]

Extracting zip of manga.csv...





First 5 records:    id                    title                 title_en  \
0   1                  Monster                  Monster   
1   2                  Berserk                  Berserk   
2   3        20th Century Boys        20th Century Boys   
3   4  Yokohama Kaidashi Kikou  Yokohama Kaidashi Kikou   
4   7           Hajime no Ippo                      NaN   

                                           image_url  chapters  publishing  \
0  https://cdn.myanimelist.net/images/manga/3/258...       162       False   
1  https://cdn.myanimelist.net/images/manga/1/157...         0        True   
2  https://cdn.myanimelist.net/images/manga/5/260...       249       False   
3  https://cdn.myanimelist.net/images/manga/1/171...       142       False   
4  https://cdn.myanimelist.net/images/manga/2/250...         0        True   

              published_from               published_to  score  scored_by  \
0  1994-12-05T00:00:00+00:00  2001-12-20T00:00:00+00:00   9.14      83708   
1  19

In [3]:
# Load the latest version of genre data
genre_file_path = "manga_genre.csv"
# Load the latest version
genre_df = kagglehub.load_dataset(
  KaggleDatasetAdapter.PANDAS,
  "joshjms/kawaii",
  genre_file_path,
)

print("First 5 records:", genre_df.head())

recommend_file_path = "recommendations.csv"

recommend_df = kagglehub.load_dataset(
  KaggleDatasetAdapter.PANDAS,
  "joshjms/kawaii",
recommend_file_path,
)

print("First 5 records:", recommend_df.head())


# mange_df : id, title, title_en, image_url, score, scored_by
# genre_df : id, genre a, genre b,c,d,e,, >>> if this id's manga is applicable, value is 1, otherwise 0.
# recommen_df : id1, id2, votes >>> Recommendation pairs of manga (votes is number of voting)



  genre_df = kagglehub.load_dataset(


Using Colab cache for faster access to the 'kawaii' dataset.
First 5 records:    id  Action  Adventure  Avant Garde  Award Winning  Boys Love  Comedy  \
0   1       0          0            0              1          0       0   
1   2       1          1            0              1          0       0   
2   3       0          0            0              1          0       0   
3   4       0          0            0              1          0       0   
4   7       0          0            0              1          0       0   

   Drama  Fantasy  Girls Love  ...  Vampire  Video Game  Villainess  \
0      1        0           0  ...        0           0           0   
1      1        1           0  ...        0           0           0   
2      1        0           0  ...        0           0           0   
3      1        0           0  ...        0           0           0   
4      0        0           0  ...        0           0           0   

   Visual Arts  Workplace  Josei  Kids  Sein

  recommend_df = kagglehub.load_dataset(


Using Colab cache for faster access to the 'kawaii' dataset.
First 5 records:    id1     id2  votes
0    1       3     12
1    1      21     11
2    1     745      9
3    1    1695      5
4    1  111213      4


In [4]:
# Phase2 : Create Json from Kaggle data


import pandas as pd
import json

# function to create nodes
def build_nodes(manga_df, genre_df):
    genre_cols = [c for c in genre_df.columns if c != "id"]

    merged = manga_df.merge(genre_df, on="id", how="left")

    nodes = []

    for _, row in merged.iterrows():
        genres = [g for g in genre_cols if row[g] == 1]

        node = {
            "id": str(row["id"]),
            "title": row["title"],
            "title_en": row.get("title_en", None),
            "image_url": row["image_url"],
            "score": float(row["score"]),
            "scored_by": int(row["scored_by"]),
            "genres": genres
        }
        nodes.append(node)

    return nodes


# generate edges
def build_edges(recommend_df):
    edges = []
    max_votes = recommend_df["votes"].max()

    for _, row in recommend_df.iterrows():
        strength = row["votes"] / max_votes if max_votes > 0 else 0

        edge = {
            "source": str(row["id1"]),
            "target": str(row["id2"]),
            "strength": strength
        }
        edges.append(edge)

    return edges

import math

# handling of Nan to avoid Json parse error
def replace_nan_with_empty(obj):
    if isinstance(obj, float) and math.isnan(obj):
        return ""
    elif isinstance(obj, dict):
        return {k: replace_nan_with_empty(v) for k, v in obj.items()}
    elif isinstance(obj, list):
        return [replace_nan_with_empty(v) for v in obj]
    else:
        return obj


# Create graph and dump json
def build_graph_json_file(manga_df, genre_df, recommend_df, out_path="manga_graph.json"):

    graph_json = {
        "nodes": build_nodes(manga_df, genre_df),
        "edges": build_edges(recommend_df)
    }

    graph_json = replace_nan_with_empty(graph_json)

    with open(out_path, "w", encoding="utf-8") as f:
        f.write(json.dumps(graph_json, ensure_ascii=False, indent=2).replace("NaN", "\"\""))

    print(f"Saved JSON to: {out_path}")


# run
build_graph_json_file(manga_df, genre_df, recommend_df, "output_graph.json")


Saved JSON to: output_graph.json


In [5]:
# Phase 3: Extracting recomendation data from Japanese web forum
#
#

In [6]:
# Cell 3-1: inital settings

!pip install requests beautifulsoup4 lxml pandas

import re
import time
import logging
from dataclasses import dataclass
from typing import List, Dict, Any

import requests
from bs4 import BeautifulSoup
import pandas as pd
import html as html_lib

# Enable logging
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s"
)

# Base URL pattern for mimizun threads
BASE_URL = "https://mimizun.com/log/2ch/csaloon/{id}/"

THREADS = [
    {"label": "1",         "mimizun_id": "1162314830"},
    {"label": "2",         "mimizun_id": "1167454314"},
    {"label": "3",         "mimizun_id": "1171079611"},
    {"label": "4",         "mimizun_id": "1175783060"},
    {"label": "5",         "mimizun_id": "1183994746"},
    {"label": "6",         "mimizun_id": "1189727381"},
    {"label": "7",         "mimizun_id": "1195739456"},
    {"label": "8",         "mimizun_id": "1203148773"},
    {"label": "9",         "mimizun_id": "1208270446"},
    {"label": "10",        "mimizun_id": "1215256679"},
    {"label": "11",        "mimizun_id": "1222099691"},
    {"label": "12",        "mimizun_id": "1227707481"},
    {"label": "13",        "mimizun_id": "1232810053"},
    {"label": "14",        "mimizun_id": "1238494219"},
    {"label": "15",        "mimizun_id": "1244915034"},
    {"label": "16",        "mimizun_id": "1252108926"},
    {"label": "17",        "mimizun_id": "1262872569"},
    {"label": "17_alt",    "mimizun_id": "1276442702"},
    {"label": "18",        "mimizun_id": "1293119158"},
    {"label": "19",        "mimizun_id": "1306247792"},
    {"label": "20",        "mimizun_id": "1318432990"},
    {"label": "21",        "mimizun_id": "1332259771"},
    {"label": "22",        "mimizun_id": "1350228432"},
    {"label": "22_5",      "mimizun_id": "1357429242"},
    {"label": "22_5_alt",  "mimizun_id": "1359363856"},
]

# Condition to see if a post contains candidates of manga titles
MIN_LINES_FOR_A = 5

# Be polite to the server
REQUEST_SLEEP_SEC = 1.0



In [7]:
# Phase 3-2: fetch HTML from mimizun (Japanese Forum)

def fetch_mimizun_page(url: str) -> str:
    # Fetch HTML from mimizun with a browser-like User-Agent.
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (X11; Linux x86_64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/122.0 Safari/537.36"
        )
    }
    logging.info(f"GET {url}")
    resp = requests.get(url, headers=headers, timeout=30)
    logging.info(f"Status code: {resp.status_code}")
    resp.raise_for_status()

    # Let requests choose encoding or fallback to Shift_JIS
    if resp.encoding is None:
        resp.encoding = "shift_jis"

    html = resp.text
    logging.info(f"Fetched {len(html)} chars")
    return html

In [8]:
# Phase 3-3: helpers (unwrap source-view HTML, parse, classify, extract candidates)
# ----------mainly generated by AI (Gemeni)------------

def unwrap_source_view_if_needed(html: str) -> str:
    """
    If the page is a 'HTML source viewer' (escaped HTML inside <td class="line-content">),
    extract the inner real HTML and return it. Otherwise, return the original HTML.
    """
    soup = BeautifulSoup(html, "lxml")

    td = soup.find("td", class_="line-content")
    if not td:
        # Not a source-view wrapper
        return html

    # The inner text is escaped HTML like &lt;html&gt;...&lt;/html&gt;
    inner_text = td.get_text()
    real_html = html_lib.unescape(inner_text)

    logging.info("[UNWRAP] Detected source-view wrapper. Extracted inner HTML.")
    logging.info(f"[UNWRAP] Inner HTML length: {len(real_html)} chars")
    return real_html


def extract_lines_from_post(body: str) -> List[str]:
    """
    Split a post body into candidate lines.
    We keep sentence-like lines as well (some manga titles look like sentences).
    """
    lines = []
    for line in body.splitlines():
        line = line.strip()
        if not line:
            continue

        # Skip pure reply anchors like ">>1" or ">>12-15"
        if re.match(r"^>>\d+([\-\,]\d+)*$", line):
            continue

        lines.append(line)
    return lines


@dataclass
class Post:
    thread_label: str
    mimizun_id: str
    thread_url: str
    post_no: int
    meta_text: str
    body_text: str
    lines: List[str]
    reply_to_numbers: List[int]
    post_type: str = "?"  # "A", "B", or "?"


def parse_thread(html: str, thread_label: str, mimizun_id: str) -> List[Post]:
    """
    Parse a mimizun thread into Post objects.

    Priority:
      1) <div id="thread"> with <div class="contributor"> + <div class="res">
      2) Fallback: <dl>/<dt>/<dd> structure (legacy logs)
    """
    logging.info(f"[PARSE] thread {thread_label} ({mimizun_id})")

    # First, unwrap if this is a "source-view" page
    html = unwrap_source_view_if_needed(html)

    soup = BeautifulSoup(html, "lxml")
    url = BASE_URL.format(id=mimizun_id)

    # Try contributor/res structure
    thread_div = soup.find("div", id="thread")
    if thread_div is not None:
        posts: List[Post] = []
        contributors = thread_div.find_all("div", class_="contributor")
        logging.info(f"[PARSE] Found {len(contributors)} <div class='contributor'> tags")

        for contrib in contributors:
            meta_text = contrib.get_text(" ", strip=True)

            # Extract post number from meta (e.g. "1 ：マロン名無しさん：...")
            m = re.match(r"\s*(\d+)", meta_text)
            if not m:
                logging.warning(f"[PARSE] Could not parse post number from contributor: {meta_text[:50]}")
                continue
            post_no = int(m.group(1))

            res_div = contrib.find_next_sibling("div", class_="res")
            if res_div is None:
                logging.warning(f"[PARSE] No <div class='res'> for post_no={post_no} in thread {thread_label}")
                continue

            body = res_div.get_text("\n", strip=True)

            reply_nums = set(int(n) for n in re.findall(r">>(\d+)", body))
            reply_nums.update(int(n) for n in re.findall(r">>(\d+)", meta_text))
            reply_nums = sorted(reply_nums)

            lines = extract_lines_from_post(body)

            posts.append(
                Post(
                    thread_label=thread_label,
                    mimizun_id=mimizun_id,
                    thread_url=url,
                    post_no=post_no,
                    meta_text=meta_text,
                    body_text=body,
                    lines=lines,
                    reply_to_numbers=reply_nums,
                )
            )

        logging.info(f"[PARSE] Parsed {len(posts)} posts (contributor/res style)")
        return posts

    # Fallback: try dl/dt/dd style
    logging.warning("[PARSE] No <div id='thread'> found. Falling back to <dl>/<dt>/<dd> parser.")
    dl = soup.find("dl", class_="thread") or soup.find("dl")
    if dl is None:
        logging.error("[PARSE] No <dl> or <div id='thread'> found. Cannot parse this thread.")
        return []

    posts: List[Post] = []
    dt_tags = dl.find_all("dt")
    logging.info(f"[PARSE][FALLBACK] Found {len(dt_tags)} <dt> tags")

    for dt in dt_tags:
        meta_text = dt.get_text(" ", strip=True)
        m = re.match(r"\s*(\d+)", meta_text)
        if not m:
            continue
        post_no = int(m.group(1))

        dd = dt.find_next_sibling("dd")
        if dd is None:
            continue

        body = dd.get_text("\n", strip=True)
        reply_nums = sorted({int(n) for n in re.findall(r">>(\d+)", body)})
        lines = extract_lines_from_post(body)

        posts.append(
            Post(
                thread_label=thread_label,
                mimizun_id=mimizun_id,
                thread_url=url,
                post_no=post_no,
                meta_text=meta_text,
                body_text=body,
                lines=lines,
                reply_to_numbers=reply_nums,
            )
        )

    logging.info(f"[PARSE][FALLBACK] Parsed {len(posts)} posts (dl/dd style)")
    return posts


def classify_posts_A_B(posts: List[Post]) -> None:
    """
    Classify posts into:
      A: posts listing favorite manga (>= MIN_LINES_FOR_A lines)
      B: posts replying to any A post (via >>N)
    """
    A_numbers = set()

    # First pass: mark A
    for p in posts:
        if len(p.lines) >= MIN_LINES_FOR_A:
            p.post_type = "A"
            A_numbers.add(p.post_no)

    # Second pass: mark B
    for p in posts:
        if p.post_type == "A":
            continue
        if any(n in A_numbers for n in p.reply_to_numbers):
            p.post_type = "B"

    logging.info(
        f"[CLASSIFY] A={sum(p.post_type=='A' for p in posts)}, "
        f"B={sum(p.post_type=='B' for p in posts)}, "
        f"Other={sum(p.post_type not in ['A','B'] for p in posts)}"
    )


def extract_candidates(posts: List[Post]) -> List[Dict[str, Any]]:
    """
    For posts of type A or B, extract each line as a manga title candidate.
    """
    output: List[Dict[str, Any]] = []
    for p in posts:
        if p.post_type not in ("A", "B"):
            continue

        for idx, line in enumerate(p.lines):
            output.append(
                {
                    "thread_label": p.thread_label,
                    "mimizun_id": p.mimizun_id,
                    "thread_url": p.thread_url,
                    "post_no": p.post_no,
                    "post_type": p.post_type,  # "A" or "B"
                    "line_index": idx,
                    "line_text": line,
                    "reply_to_post_nos": ",".join(map(str, p.reply_to_numbers)),
                    # manual annotation columns
                    "is_manga": "",
                    "canonical_id": "",
                    "note": "",
                }
            )
    return output


In [9]:
# Phase 3-4: run pipeline for all threads and save candidates CSV


all_candidates: List[Dict[str, Any]] = []

for th in THREADS:
    label = th["label"]
    mid = th["mimizun_id"]
    url = BASE_URL.format(id=mid)

    logging.info("=" * 80)
    logging.info(f"[THREAD] Processing thread label={label}, mimizun_id={mid}")

    try:
        html = fetch_mimizun_page(url)
    except Exception as e:
        logging.error(f"[ERROR] Failed to fetch {url}: {e}")
        continue

    # Save raw HTML for later debugging
    raw_name = f"raw_thread_{label}_{mid}.html"
    with open(raw_name, "w", encoding="utf-8", errors="ignore") as f:
        f.write(html)
    logging.info(f"[SAVE] Saved raw HTML to {raw_name}")

    # Parse, classify, extract
    posts = parse_thread(html, thread_label=label, mimizun_id=mid)
    logging.info(f"[THREAD] Parsed {len(posts)} posts")

    classify_posts_A_B(posts)
    candidates = extract_candidates(posts)
    logging.info(f"[CANDIDATES] Extracted {len(candidates)} raw candidates in thread {label}")

    all_candidates.extend(candidates)

    time.sleep(REQUEST_SLEEP_SEC)

# Combine all candidates into a DataFrame
df = pd.DataFrame(all_candidates)

if not df.empty:
    logging.info(f"[SUMMARY] Raw candidates (with duplicates): {len(df)}")

    # Remove exact duplicates by line_text (dictionary should not contain duplicates)
    df = df.drop_duplicates(subset=["line_text"], keep="first").reset_index(drop=True)
    logging.info(f"[SUMMARY] Candidates after exact dedup by line_text: {len(df)}")

    # Reassign candidate_id after dedup
    df.insert(0, "candidate_id", range(1, len(df) + 1))

    out_csv = "manga_candidates_mimizun.csv"
    df.to_csv(out_csv, index=False, encoding="utf-8-sig")
    logging.info(f"[OUTPUT] Saved candidate CSV (unique by line_text): {out_csv}")
else:
    logging.warning("[OUTPUT] No candidates extracted. Please check logs.")


In [10]:
# Phase 4 (totally done by manual)
# I manually inspected each line of 'manga_candidates_mimizun.csv' generated in Phase3 (1000+ line).
# This CSV have candidates of manga titles in Japanese.
# I checked them if they are manga titles, and if yes, I map them to english titles in the Kaggle Data from Phase1.

# [From Kaggle Data]export id, title, and title_en to csv file
manga_df[["id", "title", "title_en"]].to_csv("manga_title_kaggle.csv", index=False)

In [11]:
# Phase 5 : Generate Addtional Edge from Japanese form data

In [12]:
# Mannually Generated CSV file >>>>>>
# https://drive.google.com/file/d/1MilyhmmBj3bl8PJkGttF_pUJPs3MJ1BR/view?usp=sharing
file_id = "1MilyhmmBj3bl8PJkGttF_pUJPs3MJ1BR"
INPUT = f"https://drive.google.com/uc?id={file_id}"

df_dict_titles = pd.read_csv(INPUT, encoding="utf-8", on_bad_lines="skip")




In [13]:
df_dict_titles.head()
# showing the japanese titles associated with kaggle_id

Unnamed: 0,line_text,fixed_kaggle_id,note
0,バガボンド,656,
1,クピドの悪戯,12776,
2,涼宮ハルヒの憂鬱,3083,
3,ひぐらしのなく頃に,1262,
4,デスノート,21,


In [14]:
print(f"Original df shape: {df.shape}")
print(f"Number of unique line_text in df: {df['line_text'].nunique()}")

Original df shape: (32522, 12)
Number of unique line_text in df: 32522


In [15]:
# combine two dataframe to link japanese titles to english titles
merged_df = pd.merge(df, df_dict_titles, on='line_text', how='left')
filtered_df = merged_df[merged_df['fixed_kaggle_id'].notna()]
display(filtered_df.head())

Unnamed: 0,candidate_id,thread_label,mimizun_id,thread_url,post_no,post_type,line_index,line_text,reply_to_post_nos,is_manga,canonical_id,note_x,fixed_kaggle_id,note_y
1,2,1,1162314830,https://mimizun.com/log/2ch/csaloon/1162314830/,3,A,1,バガボンド,2,,,,656.0,
2,3,1,1162314830,https://mimizun.com/log/2ch/csaloon/1162314830/,3,A,2,クピドの悪戯,2,,,,12776.0,
3,4,1,1162314830,https://mimizun.com/log/2ch/csaloon/1162314830/,3,A,3,涼宮ハルヒの憂鬱,2,,,,3083.0,
4,5,1,1162314830,https://mimizun.com/log/2ch/csaloon/1162314830/,3,A,4,ひぐらしのなく頃に,2,,,,1262.0,
5,6,1,1162314830,https://mimizun.com/log/2ch/csaloon/1162314830/,3,A,5,デスノート,2,,,,21.0,


In [16]:
# Extracts titles from the forum dataset using regex and maps the corresponding IDs from the dictionary.
# ----------mainly generated by AI (Gemeni)------------

import pandas as pd
import re

# Create Regular Expression Pattern
# Sort by length in descending order (e.g., to ensure "ONE PIECE" is matched before "ONE")
titles = sorted(df_dict_titles['line_text'].dropna().unique(), key=len, reverse=True)

# Escape special characters in titles (parentheses, dots, etc.)
# Join with '|' to create a massive regex pattern "TitleA|TitleB|TitleC"
pattern = '|'.join([re.escape(t) for t in titles])

# 2. Extract title from post text
df['extracted_title'] = df['line_text'].str.extract(f'({pattern})', expand=False)

# Check
print("Count of extracted items:", df['extracted_title'].notna().sum())
display(df[['line_text', 'extracted_title']].head())

# 3. Merge IDs using the extracted title as a key
merged_df = pd.merge(
    df,
    df_dict_titles[['line_text', 'fixed_kaggle_id']],
    left_on='extracted_title',
    right_on='line_text',
    how='left'
)

# Post-merge cleanup (rename columns and remove the duplicated 'line_text' used as key)
merged_df = merged_df.rename(columns={'line_text_x': 'line_text'}).drop(columns=['line_text_y'], errors='ignore')

display(merged_df.head())

Count of extracted items: 5116


Unnamed: 0,line_text,extracted_title
0,別に被ってないよね。,
1,バガボンド,バガボンド
2,クピドの悪戯,クピドの悪戯
3,涼宮ハルヒの憂鬱,涼宮ハルヒの憂鬱
4,ひぐらしのなく頃に,ひぐらしのなく頃に


Unnamed: 0,candidate_id,thread_label,mimizun_id,thread_url,post_no,post_type,line_index,line_text,reply_to_post_nos,is_manga,canonical_id,note,extracted_title,fixed_kaggle_id
0,1,1,1162314830,https://mimizun.com/log/2ch/csaloon/1162314830/,3,A,0,別に被ってないよね。,2,,,,,
1,2,1,1162314830,https://mimizun.com/log/2ch/csaloon/1162314830/,3,A,1,バガボンド,2,,,,バガボンド,656.0
2,3,1,1162314830,https://mimizun.com/log/2ch/csaloon/1162314830/,3,A,2,クピドの悪戯,2,,,,クピドの悪戯,12776.0
3,4,1,1162314830,https://mimizun.com/log/2ch/csaloon/1162314830/,3,A,3,涼宮ハルヒの憂鬱,2,,,,涼宮ハルヒの憂鬱,3083.0
4,5,1,1162314830,https://mimizun.com/log/2ch/csaloon/1162314830/,3,A,4,ひぐらしのなく頃に,2,,,,ひぐらしのなく頃に,1262.0


In [17]:
# ----------mainly generated by AI (Gemeni)------------

import pandas as pd
import itertools
from collections import Counter

def create_manga_weighted_edge_list(df, output_path):
    # delete rows without fixed_kaggle_id and convert to int
    df = df.dropna(subset=['fixed_kaggle_id']).copy()
    df['fixed_kaggle_id'] = df['fixed_kaggle_id'].astype(int)

    # 1. generate a map to exlpore
    post_map = {}
    for _, row in df.iterrows():
        key = (row['thread_label'], row['post_no'])
        if key not in post_map:
            post_map[key] = set()
        post_map[key].add(row['fixed_kaggle_id'])

    edge_counter = Counter()

    # Pattern A: linking titles in the same post
    # (Same thread_label and post_no)
    for ids in post_map.values():
        if len(ids) > 1:
            for u, v in itertools.combinations(ids, 2):
                if u != v:
                    edge = tuple(sorted((u, v)))
                    edge_counter[edge] += 1

    # Pattern B: Linking titles recomended by other posts
    for _, row in df.iterrows():
        reply_val = row['reply_to_post_nos']

        # Skip is the post is not replying one
        if pd.isna(reply_val) or reply_val == "":
            continue

        # identify the target post replied by this post
        target_post_nos = []
        try:
            if isinstance(reply_val, str):
                target_post_nos = [int(x.strip()) for x in reply_val.split(',') if x.strip().isdigit()]
            elif isinstance(reply_val, (int, float)):
                target_post_nos = [int(reply_val)]
        except ValueError:
            continue

        if not target_post_nos:
            continue

        src_id = row['fixed_kaggle_id']
        thread_lbl = row['thread_label']

        # generate edges
        for p_no in target_post_nos:
            target_key = (thread_lbl, p_no)

            if target_key in post_map:
                target_ids = post_map[target_key]
                for tgt_id in target_ids:
                    if src_id != tgt_id:
                        edge = tuple(sorted((src_id, tgt_id)))
                        edge_counter[edge] += 1

    # Convert Counter to DataFrame
    edge_data = [{'id1': k[0], 'id2': k[1], 'votes': v} for k, v in edge_counter.items()]
    df_edges = pd.DataFrame(edge_data)

    # sort
    if not df_edges.empty:
        df_edges = df_edges.sort_values(by='votes', ascending=False).reset_index(drop=True)

    # export as CSV
    df_edges.to_csv(output_path, index=False)

    print(f"{len(df_edges)} edges are generated")
    return df_edges

In [18]:
added_reco_df = create_manga_weighted_edge_list(filtered_df, "to-be-addeded-manga_weighted_edges.csv")
added_reco_df.head()

965 edges are generated


Unnamed: 0,id1,id2,votes
0,564,3084,3
1,564,735,3
2,26,564,3
3,564,661,3
4,564,5227,3


In [19]:
# function to merge two edge lists
# mainly generated by AI
import numpy as np
def merge_weighted_edge_lists(df1, df2):
    combined = pd.concat([df1, df2], ignore_index=True)

    ids = combined[['id1', 'id2']].values
    combined['id1'] = np.minimum(ids[:, 0], ids[:, 1])
    combined['id2'] = np.maximum(ids[:, 0], ids[:, 1])

    merged_df = combined.groupby(['id1', 'id2'], as_index=False)['votes'].sum()

    merged_df = merged_df.sort_values(by='votes', ascending=False).reset_index(drop=True)

    return merged_df

In [20]:
df_merged_reco = merge_weighted_edge_lists(recommend_df, added_reco_df)
df_merged_reco.head()

Unnamed: 0,id1,id2,votes
0,2,583,88
1,2,656,68
2,113138,116778,66
3,2,642,52
4,13,598,52


In [21]:
# update the graph json file
build_graph_json_file(manga_df, genre_df, df_merged_reco, "new_output_graph.json")

Saved JSON to: new_output_graph.json
