# Enrichment Overview and HTML Preview

In [12]:
import pandas as pd
from sqlalchemy import create_engine
from dotenv import load_dotenv
import os

# load_dotenv() is expected to populate os.environ from a local .env file, so
# the connection URL can live outside the notebook.
load_dotenv()

# Fail early with an actionable message: os.getenv returns None when the
# variable is missing, and handing None to create_engine only produces an
# opaque argument error that does not name the missing setting.
postgres_url = os.getenv("POSTGRES_URL")
if not postgres_url:
    raise RuntimeError(
        "POSTGRES_URL is not set; define it in the environment or in a .env file"
    )

# Shared SQLAlchemy engine used by every query in this notebook.
engine = create_engine(postgres_url)


## Load Tables

In [None]:
# Pull both tables into memory in full.
nodes_df = pd.read_sql("SELECT * FROM nodes", engine)
meta_df = pd.read_sql("SELECT * FROM metadata", engine)

# Left join on node_id keeps every node row, including nodes that have no
# metadata row. Metadata columns whose names collide with a node column get a
# "_meta" suffix; node columns keep their original names.
df = pd.merge(
    nodes_df,
    meta_df,
    how="left",
    on="node_id",
    suffixes=("", "_meta"),
)

## Metadata Enrichment Stats

In [None]:
# Headline counts. "Enriched" here means "has a row in the metadata table";
# the per-field counts are the non-null entries of each metadata column.
total_nodes = len(nodes_df)
total_enriched = len(meta_df)
with_images = meta_df['image_url'].notna().sum()
with_desc = meta_df['description'].notna().sum()
with_full_desc = meta_df['full_description'].notna().sum()
with_page = meta_df['wiki_page_url'].notna().sum()

# Node type breakdown: nodes without an ott_id are reported as synthetic MRCA
# nodes, and those that also carry a display_name are counted as "aliased".
is_synth = nodes_df['ott_id'].isna()
synth_nodes = is_synth.sum()
taxon_nodes = total_nodes - synth_nodes
aliased_synth = nodes_df[is_synth & nodes_df['display_name'].notna()].shape[0]

# Guard the headline percentage the same way the per-tier loop below guards
# its own: an empty nodes table must not abort the cell with ZeroDivisionError.
enriched_pct = 100 * total_enriched / total_nodes if total_nodes > 0 else 0.0

print("=== Node Breakdown ===")
print(f"Total nodes:       {total_nodes:>10,}")
print(f"  Taxon nodes:     {taxon_nodes:>10,}")
print(f"  Synthetic MRCA:  {synth_nodes:>10,} ({aliased_synth:,} aliased)")
print()
print("=== Metadata Coverage ===")
print(f"Enriched:          {total_enriched:>10,} / {total_nodes:,} ({enriched_pct:.1f}%)")
print(f"  With images:     {with_images:>10,}")
print(f"  With short desc: {with_desc:>10,}")
print(f"  With full desc:  {with_full_desc:>10,}")
print(f"  With wiki page:  {with_page:>10,}")
print()

# Coverage by node importance (num_tips buckets).
# NOTE(review): in this loop "enriched" means a non-null common_name in the
# merged frame, which is a different criterion from the metadata row count
# used for the headline figure above -- confirm that this is intended.
for threshold in [1_000_000, 100_000, 10_000, 1_000, 100]:
    big = nodes_df[nodes_df['num_tips'] >= threshold]
    enriched_big = df[(df['num_tips'] >= threshold) & df['common_name'].notna()]
    pct = 100 * len(enriched_big) / len(big) if len(big) > 0 else 0
    print(f"  num_tips >= {threshold:>10,}: {len(enriched_big):,}/{len(big):,} enriched ({pct:.0f}%)")

In [None]:
import matplotlib.pyplot as plt

# One figure, two side-by-side panels.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
coverage_ax, tier_ax = axes

# Left panel: absolute counts per metadata field, plus nodes without metadata.
counts = {
    "Images": with_images,
    "Short desc": with_desc,
    "Full desc": with_full_desc,
    "Wiki pages": with_page,
    "No metadata": total_nodes - total_enriched
}
colors = ['#2ecc71', '#3498db', '#9b59b6', '#e67e22', '#e74c3c']
coverage_ax.bar(counts.keys(), counts.values(), color=colors)
coverage_ax.set_title("Metadata Coverage by Type")
coverage_ax.set_ylabel("Count")
coverage_ax.tick_params(axis='x', rotation=30)

# Right panel: share of nodes with a common_name, per num_tips tier.
# A threshold of 0 stands for "all nodes" and skips the num_tips filter.
tier_spec = [
    ('1M+', 1_000_000),
    ('100K+', 100_000),
    ('10K+', 10_000),
    ('1K+', 1_000),
    ('100+', 100),
    ('All', 0),
]
tiers = [label for label, _ in tier_spec]
thresholds = [minimum for _, minimum in tier_spec]

has_common_name = df['common_name'].notna()
pcts = []
for t in thresholds:
    if t > 0:
        big = nodes_df[nodes_df['num_tips'] >= t]
        enriched_big = df[(df['num_tips'] >= t) & has_common_name]
    else:
        big = nodes_df
        enriched_big = df[has_common_name]
    pcts.append(100 * len(enriched_big) / len(big) if len(big) > 0 else 0)

bars = tier_ax.bar(tiers, pcts, color='#3498db')
tier_ax.set_title("Enrichment Coverage by Node Importance")
tier_ax.set_ylabel("% Enriched")
# Headroom above 100 so the percentage label over a full bar stays visible.
tier_ax.set_ylim(0, 105)

# Write each bar's percentage centred just above its top edge.
for bar, pct in zip(bars, pcts):
    label_x = bar.get_x() + bar.get_width() / 2
    label_y = bar.get_height() + 1
    tier_ax.text(label_x, label_y, f'{pct:.0f}%',
                 ha='center', va='bottom', fontsize=9)

plt.tight_layout()
plt.show()

## HTML Preview of Enriched Taxa

In [None]:
from html import escape as html_escape

from IPython.display import display, HTML


def _escape_text(value):
    """Escape HTML metacharacters in string cells; pass other values through."""
    return html_escape(value) if isinstance(value, str) else value


# Filter rows that have BOTH image and full description
preview_df = df[
    df["image_url"].notna() &
    df["full_description"].notna()
].copy()

# Text link to the wiki page (empty cell when the row has no page URL).
# URLs are escaped because they are interpolated into an HTML attribute.
preview_df["link"] = preview_df["wiki_page_url"].apply(
    lambda url: f'<a href="{html_escape(str(url))}" target="_blank">link</a>' if pd.notna(url) else ""
)

# Clickable thumbnail image that opens the full-size image in a new tab.
preview_df["image"] = preview_df["image_url"].apply(
    lambda url: (
        f'<a href="{html_escape(str(url))}" target="_blank">'
        f'<img src="{html_escape(str(url))}" width="60"></a>'
    ) if pd.notna(url) else ""
)

# Columns to display
cols = [
    "image",
    "node_id",
    "name",
    "common_name",
    "rank",
    "enriched_score",
    "full_description",
    "link"
]

# The table is rendered with escape=False so the generated <a>/<img> markup
# survives. Every other column holds database text, so escape it here;
# otherwise a stray "<" or "&" in a description would be interpreted as HTML.
markup_cols = {"image", "link"}
text_cols = [c for c in cols if c not in markup_cols]
preview_df[text_cols] = preview_df[text_cols].apply(lambda col: col.map(_escape_text))

# to_html truncates cell strings to display.max_colwidth (50 characters by
# default), which would cut the anchor/img markup and the descriptions
# mid-string. Lift the limit for this render only.
with pd.option_context("display.max_colwidth", None):
    html = preview_df[cols].sample(n=min(10, len(preview_df))).to_html(escape=False, index=False)
display(HTML(html))