In [2]:
import requests
import json

# AniList public GraphQL endpoint.
ENDPOINT = "https://graphql.anilist.co"

# Introspection query listing every field of the `Media` type together with
# its description and type. `ofType` is expanded only one level, so for
# doubly wrapped types (e.g. a non-null list) the innermost type name is not
# included in the response.
INTROSPECTION_QUERY = """
query {
  __type(name: "Media") {
    name
    fields {
      name
      description
      type {
        kind
        name
        ofType {
          kind
          name
        }
      }
    }
  }
}
"""

# An explicit timeout keeps the cell from hanging forever on a stalled
# connection (requests applies no timeout by default).
resp = requests.post(
    ENDPOINT,
    json={"query": INTROSPECTION_QUERY},
    headers={"Content-Type": "application/json"},
    timeout=30,
)
print(resp.text)

{"data":{"__type":{"name":"Media","fields":[{"name":"id","description":"The id of the media","type":{"kind":"NON_NULL","name":null,"ofType":{"kind":"SCALAR","name":"Int"}}},{"name":"idMal","description":"The mal id of the media","type":{"kind":"SCALAR","name":"Int","ofType":null}},{"name":"title","description":"The official titles of the media in various languages","type":{"kind":"OBJECT","name":"MediaTitle","ofType":null}},{"name":"type","description":"The type of the media; anime or manga","type":{"kind":"ENUM","name":"MediaType","ofType":null}},{"name":"format","description":"The format the media was released in","type":{"kind":"ENUM","name":"MediaFormat","ofType":null}},{"name":"status","description":"The current releasing status of the media","type":{"kind":"ENUM","name":"MediaStatus","ofType":null}},{"name":"description","description":"Short description of the media's story and characters","type":{"kind":"SCALAR","name":"String","ofType":null}},{"name":"startDate","description":"

In [2]:
import requests
import json
from pathlib import Path

# AniList public GraphQL endpoint.
ENDPOINT = "https://graphql.anilist.co"

# Paged query over anime (type: ANIME), sorted by POPULARITY_DESC and with
# adult titles excluded. It takes the `$page` / `$perPage` variables and
# selects every field listed in the `MediaFields` fragment below.
# NOTE(review): fields such as `mediaListEntry`, `isFavourite` and `modNotes`
# presumably depend on the authenticated user and come back empty for
# anonymous requests — confirm against the AniList schema docs.
QUERY = """
query ($page: Int!, $perPage: Int!) {
  Page(page: $page, perPage: $perPage) {
    media(
      type: ANIME
      sort: POPULARITY_DESC
      isAdult: false
    ) {
      ...MediaFields
    }
  }
}

fragment MediaFields on Media {
  id
  idMal
  title {
    romaji
    english
    native
    userPreferred
  }
  type
  format
  status
  description
  startDate { year month day }
  endDate { year month day }
  season
  seasonYear
  seasonInt
  episodes
  duration
  chapters
  volumes
  countryOfOrigin
  isLicensed
  source
  hashtag
  trailer {
    id
    site
    thumbnail
  }
  updatedAt
  coverImage {
    extraLarge
    large
    medium
    color
  }
  bannerImage
  genres
  synonyms
  averageScore
  meanScore
  popularity
  isLocked
  trending
  favourites
  tags {
    id
    name
    rank
    isGeneralSpoiler
    isMediaSpoiler
    isAdult
    category
  }
  relations {
    edges {
      relationType
      id
    }
    nodes {
      id
      title {
        romaji
        english
        native
      }
    }
  }
  characters {
    edges {
      role
    }
    nodes {
      id
      name { full native }
      gender
    }
  }
  staff {
    edges { role }
    nodes {
      id
      name { full native }
      primaryOccupations
    }
  }
  studios {
    edges { isMain }
    nodes {
      id
      name
      siteUrl
    }
  }
  isFavourite
  isFavouriteBlocked
  isAdult
  nextAiringEpisode {
    id
    airingAt
    episode
  }
  airingSchedule {
    edges {
      id
      node {
        id
        airingAt
        episode
      }
    }
  }
  trends {
    nodes {
      date
      trending
      averageScore
      popularity
    }
  }
  externalLinks {
    url
    site
    type
  }
  streamingEpisodes {
    title
    thumbnail
    url
    site
  }
  rankings {
    id
    rank
    type
    year
    season
    format
    context
  }
  mediaListEntry {
    id
    status
    score
  }
  reviews {
    nodes {
      id
      summary
      score
    }
  }
  recommendations {
    nodes {
      id
      rating
    }
  }
  stats {
    scoreDistribution { score amount }
    statusDistribution { status amount }
  }
  siteUrl
  autoCreateForumThread
  isRecommendationBlocked
  isReviewBlocked
  modNotes
}
"""

def dump_test_5_json(output_path="anilist_test_5.json"):
    """Fetch the first 5 media entries of QUERY and save them as a JSON file.

    Sends QUERY to ENDPOINT with ``page=1`` and ``perPage=5`` and writes the
    resulting ``data.Page.media`` list to ``output_path`` as pretty-printed
    UTF-8 JSON (non-ASCII characters are kept as-is).

    Args:
        output_path: Destination file path for the JSON dump.

    Returns:
        None. On failure (non-200 status, or a 200 response without a usable
        ``data.Page.media`` payload) the status and the raw response body are
        printed and nothing is written.

    Raises:
        requests.exceptions.RequestException: On connection problems or when
            the server does not answer within the timeout.
    """
    variables = {"page": 1, "perPage": 5}

    resp = requests.post(
        ENDPOINT,
        json={"query": QUERY, "variables": variables},
        headers={"Content-Type": "application/json", "Accept": "application/json"},
        # Without a timeout a stalled connection would block the cell forever.
        timeout=30,
    )

    if resp.status_code != 200:
        print("STATUS:", resp.status_code)
        print("ERROR:", resp.text)
        return

    data = resp.json()

    # A GraphQL server may answer HTTP 200 with `"data": null` plus an
    # `errors` list; walk the payload defensively instead of indexing it.
    page = (data.get("data") or {}).get("Page") or {}
    media_list = page.get("media")
    if media_list is None:
        print("STATUS:", resp.status_code)
        print("ERROR:", resp.text)
        return

    Path(output_path).write_text(
        json.dumps(media_list, indent=2, ensure_ascii=False),
        encoding="utf-8",
    )

    print(f"Saved test dataset → {output_path}")


if __name__ == "__main__":
    dump_test_5_json()

Saved test dataset → anilist_test_5.json


In [5]:
import json
import pandas as pd

pd.set_option("display.max_columns", None)

# Path to the uploaded JSON file
path = "anilist_test_5.json"


def _media_to_row(m):
    """Flatten one media object (a dict parsed from the JSON dump) into a row.

    Nested objects and lists may be JSON ``null`` rather than missing, so
    every nested access falls back to an empty container via ``or`` instead
    of relying on the ``dict.get`` default (which only covers missing keys).
    """
    title = m.get("title") or {}
    start = m.get("startDate") or {}
    end = m.get("endDate") or {}
    tags = m.get("tags") or []
    studio_nodes = (m.get("studios") or {}).get("nodes") or []

    return {
        "id": m.get("id"),
        "idMal": m.get("idMal"),
        "title_romaji": title.get("romaji"),
        "title_english": title.get("english"),
        "title_native": title.get("native"),
        "type": m.get("type"),
        "format": m.get("format"),
        "status": m.get("status"),
        "episodes": m.get("episodes"),
        "duration": m.get("duration"),
        "source": m.get("source"),
        "season": m.get("season"),
        "seasonYear": m.get("seasonYear"),
        "averageScore": m.get("averageScore"),
        "meanScore": m.get("meanScore"),
        "popularity": m.get("popularity"),
        "favourites": m.get("favourites"),

        # Date fields split into separate columns
        "start_year": start.get("year"),
        "start_month": start.get("month"),
        "start_day": start.get("day"),
        "end_year": end.get("year"),
        "end_month": end.get("month"),
        "end_day": end.get("day"),

        # List fields joined into comma-separated strings
        "genres": ", ".join(m.get("genres") or []),
        "synonyms": ", ".join(m.get("synonyms") or []),
        "tags": ", ".join([t["name"] for t in tags]),

        # Studios
        "studios": ", ".join([s["name"] for s in studio_nodes]),
    }


# Load the JSON. The dump is written as UTF-8 and contains non-ASCII titles,
# so the encoding is given explicitly instead of using the platform default.
with open(path, "r", encoding="utf-8") as f:
    data = json.load(f)

# data is a list of media objects (5 titles)
rows = []

for m in data:
    row = _media_to_row(m)
    rows.append(row)

df = pd.DataFrame(rows)
df.head()

Unnamed: 0,id,idMal,title_romaji,title_english,title_native,type,format,status,episodes,duration,source,season,seasonYear,averageScore,meanScore,popularity,favourites,start_year,start_month,start_day,end_year,end_month,end_day,genres,synonyms,tags,studios
0,16498,16498,Shingeki no Kyojin,Attack on Titan,進撃の巨人,ANIME,TV,FINISHED,25,24,MANGA,SPRING,2013,85,85,922903,55725,2013,4,7,2013,9,28,"Action, Drama, Fantasy, Mystery","SnK, AoT, Ataque a los Titanes, Ataque dos Tit...","Kaiju, Revenge, Military, Tragedy, Post-Apocal...","WIT STUDIO, Pony Canyon, Kodansha, Production ..."
1,101922,38000,Kimetsu no Yaiba,Demon Slayer: Kimetsu no Yaiba,鬼滅の刃,ANIME,TV,FINISHED,26,24,MANGA,SPRING,2019,82,82,876638,42035,2019,4,6,2019,9,28,"Action, Adventure, Drama, Fantasy, Supernatural","KnY, Kimetsu no Yaiba: Kyoudai no Kizuna, Demo...","Demons, Shounen, Swordplay, Male Protagonist, ...","ufotable, Aniplex, Shueisha, Aniplex of Americ..."
2,1535,1535,DEATH NOTE,Death Note,DEATH NOTE,ANIME,TV,FINISHED,37,23,MANGA,FALL,2006,84,84,840282,44144,2006,10,4,2007,6,27,"Mystery, Psychological, Supernatural, Thriller","デスノート, 死亡笔记, מחברת המוות, Notatnik śmierci, Ca...","Crime, Detective, Anti-Hero, Male Protagonist,...","MADHOUSE, VAP, Viz Media, Nippon Television Ne..."
3,113415,40748,Jujutsu Kaisen,JUJUTSU KAISEN,呪術廻戦,ANIME,TV,FINISHED,24,24,MANGA,FALL,2020,84,84,831217,49275,2020,10,3,2021,3,27,"Action, Drama, Supernatural","JJK, Sorcery Fight, 咒术回战, 주술회전, มหาเวทย์ผนึกมา...","Urban Fantasy, Shounen, Youkai, Super Power, C...","Toho, MAPPA, Shueisha, Sumzap, Mainichi Broadc..."
4,21459,31964,Boku no Hero Academia,My Hero Academia,僕のヒーローアカデミア,ANIME,TV,FINISHED,13,24,MANGA,SPRING,2016,76,76,777465,20687,2016,4,3,2016,6,26,"Action, Adventure, Comedy","BNHA, MHA, 나의 히어로 아카데미아 1기, 나히아 1기, אקדמיית הג...","Super Power, Superhero, Shounen, Primarily Tee...","bones, Dentsu, Mainichi Broadcasting System, T..."


In [1]:
import requests

# Minimal probe: request a single non-adult anime entry together with
# `pageInfo.total` to see how many entries the filter reports overall.
QUERY = """
query {
  Page(page: 1, perPage: 1) {
    pageInfo {
      total
    }
    media(type: ANIME, isAdult: false) {
      id
    }
  }
}
"""

# An explicit timeout keeps the cell from hanging forever on a stalled
# connection (requests applies no timeout by default).
resp = requests.post(
    "https://graphql.anilist.co",
    json={"query": QUERY},
    headers={"Content-Type": "application/json"},
    timeout=30,
)

print(resp.json())

{'data': {'Page': {'pageInfo': {'total': 20040}, 'media': [{'id': 1}]}}}


In [2]:
import json
import time
import random
import requests

# AniList public GraphQL endpoint.
ENDPOINT = "https://graphql.anilist.co"

# Lightweight query for the top 10,000 titles (includes studios and staff).
# Pages through non-adult anime sorted by POPULARITY_DESC, excluding the
# MUSIC format and CANCELLED status, and asks for `pageInfo.hasNextPage`
# so the caller can tell when to stop paging.
QUERY = """
query ($page: Int!, $perPage: Int!) {
  Page(page: $page, perPage: $perPage) {
    pageInfo {
      total
      hasNextPage
    }
    media(
      type: ANIME
      isAdult: false
      sort: POPULARITY_DESC
      format_not_in: [MUSIC]
      status_not_in: [CANCELLED]
    ) {
      id
      title { romaji english native }
      format
      status
      season
      seasonYear
      episodes
      duration
      averageScore
      meanScore
      popularity
      favourites
      genres
      tags { name }

      studios {
        nodes { name }
        edges { isMain }
      }
      staff {
        nodes { name { full } primaryOccupations }
        edges { role }
      }
    }
  }
}
"""


def fetch_page(page: int, per_page: int, max_retries: int = 5) -> dict:
    """
    Fetch one page of the AniList ``Page`` query and return its payload.

    Args:
        page: Page number sent as the ``$page`` GraphQL variable.
        per_page: Page size sent as the ``$perPage`` GraphQL variable.
        max_retries: Maximum number of HTTP attempts before giving up.

    Returns:
        The ``data.Page`` object of the GraphQL response.

    Raises:
        RuntimeError: If an HTTP 200 response carries no usable ``data.Page``
            object (for example ``"data": null`` together with GraphQL
            ``errors``), or if every attempt fails.

    Retry policy:
    - HTTP 429 (Too Many Requests): wait for the number of seconds given in
      the ``Retry-After`` header (4 seconds when the header is missing or
      not a plain number) plus a small random jitter, then retry.
    - Any ``requests`` exception, including the ``HTTPError`` raised for
      other 4xx/5xx statuses: wait ``2 + 1.5 * attempt`` seconds (linear
      backoff), then retry.
    """
    for attempt in range(max_retries):
        try:
            resp = requests.post(
                ENDPOINT,
                json={"query": QUERY, "variables": {"page": page, "perPage": per_page}},
                headers={"Content-Type": "application/json", "Accept": "application/json"},
                timeout=30,
            )

            # Successful HTTP response
            if resp.status_code == 200:
                data = resp.json()
                # GraphQL may report failures with HTTP 200 and `"data": null`,
                # so the key being present does not mean the payload is usable.
                page_data = None
                if isinstance(data, dict) and isinstance(data.get("data"), dict):
                    page_data = data["data"].get("Page")
                if page_data is None:
                    raise RuntimeError(f"Unexpected response format: {data}")
                return page_data

            # Rate limit exceeded
            if resp.status_code == 429:
                retry_after = resp.headers.get("Retry-After")
                try:
                    wait_time = float(retry_after)
                except (TypeError, ValueError):
                    # Header missing (None) or not a plain number of seconds.
                    wait_time = 4.0
                # Jitter so retries do not land exactly on the limit boundary.
                wait_time += random.uniform(0.1, 0.5)
                print(f"[429] Too Many Requests, {wait_time:.1f}초 대기 후 재시도...")
                time.sleep(wait_time)
                continue

            # Any other HTTP status: log the body, then raise for 4xx/5xx.
            # HTTPError is a RequestException, so it is handled (and retried)
            # by the except clause below.
            print(f"[HTTP {resp.status_code}] 응답 에러:")
            print(resp.text)
            resp.raise_for_status()

        except requests.exceptions.RequestException as e:
            # Network/HTTP failures: retry with a linearly growing wait.
            backoff = 2 + attempt * 1.5
            print(f"[오류] 요청 중 예외 발생 (시도 {attempt + 1}/{max_retries}): {e}")
            print(f"{backoff:.1f}초 대기 후 재시도...")
            time.sleep(backoff)

    raise RuntimeError("최대 재시도 횟수 초과로 실패했습니다.")


def fetch_top_json(
    out_path: str = "anilist_top10000.json",
    max_items: int = 10000,
    per_page: int = 50,
) -> None:
    """
    Collect up to ``max_items`` media entries page by page and save them as JSON.

    Pages are requested through ``fetch_page`` starting at page 1 until
    enough items have been collected, the API reports no further page, or a
    page comes back empty. The result is truncated to ``max_items`` and
    written to ``out_path`` as indented UTF-8 JSON.

    Args:
        out_path: Destination file for the JSON list.
        max_items: Maximum number of entries to keep.
        per_page: Page size requested from the API.
            NOTE(review): AniList presumably caps ``perPage`` at 50 — confirm
            before raising this value.

    Raises:
        RuntimeError: Propagated from ``fetch_page`` when a page cannot be
            fetched; nothing is written in that case.
    """
    items: list[dict] = []
    page = 1

    while len(items) < max_items:
        print(f"Fetching page {page} (현재 수집 개수: {len(items)}) ...")

        page_data = fetch_page(page, per_page)
        media_list = page_data.get("media") or []
        items.extend(media_list)

        page_info = page_data.get("pageInfo") or {}
        has_next = page_info.get("hasNextPage", False)

        # Also stop on an empty page so a response that keeps claiming
        # `hasNextPage` without returning items cannot loop forever.
        if not has_next or not media_list:
            print("더 가져올 페이지가 없어 중단합니다.")
            break

        # Enough items collected: stop before the throttling sleep.
        if len(items) >= max_items:
            break

        page += 1
        # Basic pacing between requests to stay clear of HTTP 429.
        time.sleep(0.8)

    # Trim to the requested maximum
    items = items[:max_items]
    print(f"총 {len(items)}개 작품 수집 완료. 파일로 저장합니다...")

    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(items, f, ensure_ascii=False, indent=2)

    print(f"Saved {len(items)} items to {out_path}")


if __name__ == "__main__":
    fetch_top_json()

Fetching page 1 (현재 수집 개수: 0) ...
Fetching page 2 (현재 수집 개수: 50) ...
Fetching page 3 (현재 수집 개수: 100) ...
Fetching page 4 (현재 수집 개수: 150) ...
Fetching page 5 (현재 수집 개수: 200) ...
Fetching page 6 (현재 수집 개수: 250) ...
Fetching page 7 (현재 수집 개수: 300) ...
Fetching page 8 (현재 수집 개수: 350) ...
Fetching page 9 (현재 수집 개수: 400) ...
Fetching page 10 (현재 수집 개수: 450) ...
Fetching page 11 (현재 수집 개수: 500) ...
Fetching page 12 (현재 수집 개수: 550) ...
Fetching page 13 (현재 수집 개수: 600) ...
Fetching page 14 (현재 수집 개수: 650) ...
Fetching page 15 (현재 수집 개수: 700) ...
Fetching page 16 (현재 수집 개수: 750) ...
Fetching page 17 (현재 수집 개수: 800) ...
Fetching page 18 (현재 수집 개수: 850) ...
Fetching page 19 (현재 수집 개수: 900) ...
Fetching page 20 (현재 수집 개수: 950) ...
Fetching page 21 (현재 수집 개수: 1000) ...
Fetching page 22 (현재 수집 개수: 1050) ...
Fetching page 23 (현재 수집 개수: 1100) ...
Fetching page 24 (현재 수집 개수: 1150) ...
Fetching page 25 (현재 수집 개수: 1200) ...
Fetching page 26 (현재 수집 개수: 1250) ...
Fetching page 27 (현재 수집 개수: 1300) ...
Fetchi