In [1]:
# Setup and imports
# This cell installs small deps (if needed), imports libraries and sets display options.
!pip install pandas pyarrow duckdb matplotlib seaborn
import pandas as pd
import numpy as np
import duckdb
import matplotlib.pyplot as plt
import warnings
from pathlib import Path
import os

# Suppress common warnings for cleaner notebook output
warnings.filterwarnings("ignore")

# Display options for notebooks
pd.set_option("display.max_columns", None)
pd.set_option("display.width", 120)

# Data directory
DATA_DIR = Path("..") / "data" / "Polymarket"
print(f"DATA_DIR: {DATA_DIR}")
print('Files present:', sorted([p.name for p in DATA_DIR.iterdir() if p.is_file()]))

Collecting duckdb
  Obtaining dependency information for duckdb from https://files.pythonhosted.org/packages/d3/f0/cf4241a040ec4f571859a738007ec773b642fbc27df4cbcf34b0c32ea559/duckdb-1.4.4-cp311-cp311-macosx_11_0_arm64.whl.metadata
  Downloading duckdb-1.4.4-cp311-cp311-macosx_11_0_arm64.whl.metadata (4.3 kB)
Downloading duckdb-1.4.4-cp311-cp311-macosx_11_0_arm64.whl (13.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.7/13.7 MB[0m [31m80.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: duckdb
Successfully installed duckdb-1.4.4
DATA_DIR: ../data/Polymarket
Files present: ['polymarket_soccer_analytics_schema.md', 'soccer_event_stats.parquet', 'soccer_markets.parquet', 'soccer_odds_history.parquet', 'soccer_summary.parquet', 'soccer_tokens.parquet', 'soccer_trades.parquet']


In [2]:
# --- Load ---
markets = pd.read_parquet(os.path.join(DATA_DIR, "soccer_markets.parquet"))
tokens  = pd.read_parquet(os.path.join(DATA_DIR, "soccer_tokens.parquet"))
trades  = pd.read_parquet(os.path.join(DATA_DIR, "soccer_trades.parquet"))
odds    = pd.read_parquet(os.path.join(DATA_DIR, "soccer_odds_history.parquet"))
events  = pd.read_parquet(os.path.join(DATA_DIR, "soccer_event_stats.parquet"))
summary = pd.read_parquet(os.path.join(DATA_DIR, "soccer_summary.parquet"))

print("Loaded shapes:")
print("markets:", markets.shape)
print("tokens :", tokens.shape)
print("trades :", trades.shape)
print("odds   :", odds.shape)
print("events :", events.shape)
print("summary:", summary.shape)

Loaded shapes:
markets: (8549, 10)
tokens : (17096, 3)
trades : (1138914, 9)
odds   : (666837, 4)
events : (2640, 5)
summary: (8549, 9)


In [3]:
# --- small helpers ---
def hr():
    print("-" * 80)

def report(name, bad_df=None, show=3):
    if bad_df is None:
        print(f"{name:<45} OK")
        return
    n = len(bad_df)
    if n == 0:
        print(f"{name:<45} OK")
    else:
        print(f"{name:<45} FAIL  ({n:,})")
        display(bad_df.head(show))

hr()
print("SANITY CHECKS")
hr()

# --- uniqueness quick look ---
print("rows / uniques")
print("markets:", f"{len(markets):,}", "unique market_id:", f"{markets['market_id'].nunique():,}")
print("tokens :", f"{len(tokens):,}",  "unique token_id :", f"{tokens['token_id'].nunique():,}")
print("events :", f"{len(events):,}",  "unique event_slug:", f"{events['event_slug'].nunique():,}")
hr()

# --- tokens -> markets ---
tokens_without_market = tokens.loc[~tokens["market_id"].isin(markets["market_id"])]
report("tokens reference missing markets", tokens_without_market)

markets_without_tokens = markets.loc[~markets["market_id"].isin(tokens["market_id"])][
    ["market_id", "question", "event_slug", "volume", "created_at", "end_date"]
]
report("markets missing tokens", markets_without_tokens)

hr()

# --- trades -> markets/tokens ---
trades_bad_market = trades.loc[~trades["market_id"].isin(markets["market_id"])]
report("trades reference missing markets", trades_bad_market)

trades_bad_token = trades.loc[~trades["token_id"].isin(tokens["token_id"])]
report("trades reference missing tokens", trades_bad_token)

# pair integrity (market_id, token_id) should exist in tokens table
valid_pairs = set(zip(tokens["market_id"], tokens["token_id"]))
trade_pairs = set(zip(trades["market_id"], trades["token_id"]))
bad_trade_pairs = list(trade_pairs - valid_pairs)

if len(bad_trade_pairs) == 0:
    print(f"{'trades have valid (market_id, token_id) pairs':<45} OK")
else:
    print(f"{'trades have valid (market_id, token_id) pairs':<45} FAIL  ({len(bad_trade_pairs):,})")
    display(pd.DataFrame(bad_trade_pairs, columns=["market_id", "token_id"]).head(10))

hr()

# --- odds -> markets/tokens ---
odds_bad_market = odds.loc[~odds["market_id"].isin(markets["market_id"])]
report("odds reference missing markets", odds_bad_market)

odds_bad_token = odds.loc[~odds["token_id"].isin(tokens["token_id"])]
report("odds reference missing tokens", odds_bad_token)

odds_pairs = set(zip(odds["market_id"], odds["token_id"]))
bad_odds_pairs = list(odds_pairs - valid_pairs)

if len(bad_odds_pairs) == 0:
    print(f"{'odds have valid (market_id, token_id) pairs':<45} OK")
else:
    print(f"{'odds have valid (market_id, token_id) pairs':<45} FAIL  ({len(bad_odds_pairs):,})")
    display(pd.DataFrame(bad_odds_pairs, columns=["market_id", "token_id"]).head(10))

hr()

# --- summary -> markets ---
summary_missing_market = summary.loc[~summary["market_id"].isin(markets["market_id"])]
report("summary references missing markets", summary_missing_market)

markets_missing_summary = markets.loc[~markets["market_id"].isin(summary["market_id"])][
    ["market_id", "question", "event_slug", "volume", "created_at", "end_date"]
]
report("markets missing summary rows", markets_missing_summary)

hr()

# --- events -> markets ---
events_without_markets = events.loc[~events["event_slug"].isin(markets["event_slug"])]
report("events not found in markets", events_without_markets)

# --- token_count check (summary vs actual tokens) ---
token_counts = tokens.groupby("market_id").size().reset_index(name="actual_token_count")
token_check = summary.merge(token_counts, on="market_id", how="left")

token_mismatch = token_check.loc[token_check["token_count"] != token_check["actual_token_count"]][
    ["market_id", "token_count", "actual_token_count"]
]
report("summary.token_count matches tokens table", token_mismatch)

hr()

--------------------------------------------------------------------------------
SANITY CHECKS
--------------------------------------------------------------------------------
rows / uniques
markets: 8,549 unique market_id: 8,549
tokens : 17,096 unique token_id : 17,096
events : 2,640 unique event_slug: 2,640
--------------------------------------------------------------------------------
tokens reference missing markets              OK
markets missing tokens                        FAIL  (2)


Unnamed: 0,market_id,question,event_slug,volume,created_at,end_date
8444,213639,Will Spain be the Group E winner in the Euro 2...,will-spain-be-the-group-e-winner-in-the-euro-2020,9859.61,2021-06-08 19:15:01,2021-07-01
8497,238813,Will PSG or Manchester City win their Champion...,will-psg-or-manchester-city-win-their-champion...,1127.1,2021-09-21 16:24:43,2021-09-28


--------------------------------------------------------------------------------
trades reference missing markets              OK
trades reference missing tokens               OK
trades have valid (market_id, token_id) pairs OK
--------------------------------------------------------------------------------
odds reference missing markets                OK
odds reference missing tokens                 OK
odds have valid (market_id, token_id) pairs   OK
--------------------------------------------------------------------------------
summary references missing markets            OK
markets missing summary rows                  OK
--------------------------------------------------------------------------------
events not found in markets                   OK
summary.token_count matches tokens table      FAIL  (2)


Unnamed: 0,market_id,token_count,actual_token_count
882,213639,0,
5729,238813,0,


--------------------------------------------------------------------------------


In [4]:
markets.head(3)

Unnamed: 0,market_id,question,slug,event_slug,category,volume,active,closed,created_at,end_date
0,242920,Will Ukraine qualify for the 2022 FIFA World Cup?,will-ukraine-qualify-to-the-2022-fifa-world-cup,will-ukraine-qualify-to-the-2022-fifa-world-cup,Sports,4766.88,True,True,2022-04-06 07:51:48,2022-06-30
1,244963,UEFA Europa League final: Who will win Eintrac...,uefa-europa-league-final-who-will-win-eintrach...,uefa-europa-league-final-who-will-win-eintrach...,Sports,1543.29,True,True,2022-05-18 14:16:53,2022-05-18
2,246443,Soccer: Who will win the United States vs. Uru...,soccer-who-will-win-the-united-states-vs-urugu...,soccer-who-will-win-the-united-states-vs-urugu...,Sports,1363.07,True,True,2022-06-05 12:45:16,2022-06-05


In [5]:
# ---- basic cleanup ----
markets["created_at"] = pd.to_datetime(markets["created_at"], errors="coerce")
markets["end_date"]   = pd.to_datetime(markets["end_date"],   errors="coerce")

# ---- small helper ----
def bar(x, max_x, width=25):
    if pd.isna(x) or max_x == 0:
        return ""
    n = int((x / max_x) * width)
    return "█" * n

# ---- overview ----
hr()
print("MARKETS EDA")
hr()
print("rows:", len(markets))
print("unique market_id:", markets["market_id"].nunique())
print("unique event_slug:", markets["event_slug"].nunique())
print("created_at:", markets["created_at"].min(), "->", markets["created_at"].max())
print("end_date  :", markets["end_date"].min(),   "->", markets["end_date"].max())

# ---- markets per month ----
markets["created_month"] = markets["created_at"].dt.to_period("M")
markets_per_month = (
    markets.dropna(subset=["created_month"])
    .groupby("created_month")
    .size()
    .reset_index(name="market_count")
    .sort_values("created_month")
)

last24 = markets_per_month.tail(24).copy()
max_c = last24["market_count"].max()

hr()
print("Markets per month (last 24 months))")
hr()
for _, r in last24.iterrows():
    m = str(r["created_month"])
    c = int(r["market_count"])
    print(f"{m}  {c:>5}  {bar(c, max_c, width=30)}")

# ---- markets per event ----
markets_per_event = (
    markets.groupby("event_slug")
    .size()
    .reset_index(name="market_count")
    .sort_values("market_count", ascending=False)
)

hr()
print("Markets per event (top 10)")
hr()
display(markets_per_event.head(10))

# ---- event lifetime (first market -> last end_date) ----
event_lifetime = (
    markets.groupby("event_slug")
    .agg(
        first_market=("created_at", "min"),
        last_market=("end_date", "max"),
        market_count=("market_id", "count"),
        total_volume=("volume", "sum"),
    )
    .reset_index()
)

event_lifetime["duration_days"] = (event_lifetime["last_market"] - event_lifetime["first_market"]).dt.days

hr()
print("Longest event lifetimes (top 10)")
hr()
display(event_lifetime.sort_values("duration_days", ascending=False).head(10))

# ---- top markets by volume ----
hr()
print("Top markets by volume (top 15)")
hr()
display(
    markets.sort_values("volume", ascending=False)[
        ["market_id", "question", "event_slug", "volume", "created_at", "end_date"]
    ].head(15)
)

# ---- volume by event ----
volume_by_event = (
    markets.groupby("event_slug")["volume"]
    .sum()
    .reset_index(name="total_volume")
    .sort_values("total_volume", ascending=False)
)

top15 = volume_by_event.head(15).copy()
max_v = top15["total_volume"].max()

hr()
print("Volume by event (top 15)")
hr()
for _, r in top15.iterrows():
    slug = str(r["event_slug"])
    v = float(r["total_volume"])
    print(f"{slug[:55]:55}  {v:>12,.0f}  {bar(v, max_v, width=30)}")


--------------------------------------------------------------------------------
MARKETS EDA
--------------------------------------------------------------------------------
rows: 8549
unique market_id: 8549
unique event_slug: 2640
created_at: 2021-04-12 19:50:01 -> 2025-12-09 15:31:51
end_date  : 2021-04-13 00:00:00 -> 2026-07-20 00:00:00
--------------------------------------------------------------------------------
Markets per month (last 24 months))
--------------------------------------------------------------------------------
2024-01     16  
2024-02      3  
2024-03      4  
2024-04     17  
2024-05      2  
2024-06     42  
2024-07     30  
2024-08    161  ██
2024-09    317  █████
2024-10    210  ███
2024-11    209  ███
2024-12    272  ████
2025-01    353  ██████
2025-02    304  █████
2025-03    217  ███
2025-04    226  ████
2025-05    242  ████
2025-06    570  ██████████
2025-07    549  ██████████
2025-08   1037  ██████████████████
2025-09    469  ████████
2025-10    812  ██

Unnamed: 0,event_slug,market_count
2519,which-soccer-players-will-sign-with-new-clubs,64
1020,fantasy-football-wr-points-leader-week-1,52
1014,fantasy-football-rb-points-leader-week-1,46
8,2026-fifa-world-cup-winner-595,43
2159,uefa-europa-league-winner,43
2150,uefa-champions-league-winner,39
1059,fifa-club-world-cup-golden-ball-winner,38
1103,fifa-club-world-cup-top-goalscorer,37
1484,ligue-1-top-goalscorer,37
264,champions-league-winner-2025,36


--------------------------------------------------------------------------------
Longest event lifetimes (top 10)
--------------------------------------------------------------------------------


Unnamed: 0,event_slug,first_market,last_market,market_count,total_volume,duration_days
8,2026-fifa-world-cup-winner-595,2025-07-02 16:54:40,2026-07-20 00:00:00,43,14388870.0,382.0
2543,will-bayern-munich-win-the-bundesliga-2023-24,2024-02-18 20:22:01,2024-12-31 12:00:00,1,11958.58,316.0
2150,uefa-champions-league-winner,2025-07-21 20:58:38,2026-05-31 00:00:00,39,107240500.0,313.0
1462,la-liga-winner-114,2025-07-21 21:26:38,2026-05-30 00:00:00,20,39270360.0,312.0
1378,french-ligue-1-winner,2025-07-22 23:40:48,2026-05-30 00:00:00,18,850572.5,311.0
1730,serie-a-league-winner,2025-07-22 14:29:38,2026-05-28 00:00:00,20,2102150.0,309.0
78,bundesliga-winner-527,2025-07-22 14:40:55,2026-05-28 00:00:00,18,1127435.0,309.0
314,english-premier-league-winner,2025-07-21 21:12:58,2026-05-27 00:00:00,20,129223700.0,309.0
7,2026-fifa-world-cup-which-countries-qualify,2025-06-08 02:11:19,2026-04-12 00:00:00,32,123083.9,307.0
2538,will-any-2026-fifa-world-cup-game-scheduled-in...,2025-08-18 23:07:01,2026-06-10 00:00:00,1,93.40543,295.0


--------------------------------------------------------------------------------
Top markets by volume (top 15)
--------------------------------------------------------------------------------


Unnamed: 0,market_id,question,event_slug,volume,created_at,end_date
631,507286,Will Aston Villa win the UEFA Champions League?,champions-league-winner-2025,133113800.0,2024-09-17 17:21:16,2025-05-31 12:00:00
155,506742,Nottingham Forest wins the Premier League?,premier-league-winner-24-25,101429200.0,2024-09-09 19:33:14,2025-05-25 12:00:00
156,506743,Southampton wins the Premier League?,premier-league-winner-24-25,88209350.0,2024-09-09 19:33:35,2025-05-25 12:00:00
645,507300,Will Inter Milan win the UEFA Champions League?,champions-league-winner-2025,82978660.0,2024-09-17 17:27:16,2025-05-31 12:00:00
678,507401,Will Real Betis win La Liga?,la-liga-winner,71762810.0,2024-09-18 02:39:50,2025-05-25 12:00:00
153,506740,Manchester United wins the Premier League?,premier-league-winner-24-25,71063870.0,2024-09-09 19:32:26,2025-05-25 12:00:00
655,507310,Will Red Star Belgrade win the UEFA Champions ...,champions-league-winner-2025,69281810.0,2024-09-17 17:35:44,2025-05-31 12:00:00
146,506733,Brighton & Hove Albion wins the Premier League?,premier-league-winner-24-25,67314790.0,2024-09-09 19:27:55,2025-05-25 12:00:00
650,507305,Will Paris Saint-Germain win the UEFA Champion...,champions-league-winner-2025,51320580.0,2024-09-17 17:30:16,2025-05-31 12:00:00
149,506736,Everton wins the Premier League?,premier-league-winner-24-25,47715610.0,2024-09-09 19:29:10,2025-05-25 12:00:00


--------------------------------------------------------------------------------
Volume by event (top 15)
--------------------------------------------------------------------------------
champions-league-winner-2025                             1,001,676,674  ██████████████████████████████
premier-league-winner-24-25                               808,665,619  ████████████████████████
la-liga-winner                                            277,840,555  ████████
english-premier-league-winner                             129,223,700  ███
uefa-champions-league-winner                              107,240,504  ███
la-liga-winner-114                                         39,270,361  █
serie-a-winner                                             36,006,691  █
fantasy-football-top-flex-2024                             24,466,653  
europa-league-winner-24-25                                 17,752,862  
fantasy-football-top-qb-2024                               16,692,639  
fifa-club-world-cup-wi