<a href="https://colab.research.google.com/github/sh1nysparkly/relevance-validation/blob/main/IA_Visualizer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import json
import spacy
from collections import Counter
from google.colab import files


üìÅ Upload your CSV file:


Saving Opt Guide Export Pivot Style.csv to Opt Guide Export Pivot Style.csv
‚úÖ Loaded: Opt Guide Export Pivot Style.csv

Loading spaCy model...
Loading data from Opt Guide Export Pivot Style.csv...

Processing 140 unique pages...
Extracting entities using NLP...
  Processed 20/140 pages...
  Processed 40/140 pages...
  Processed 60/140 pages...
  Processed 80/140 pages...
  Processed 100/140 pages...
  Processed 120/140 pages...
  Processed 140/140 pages...

‚úÖ Entity extraction complete!
   Average entities per page: 2.3
   Pages with entities: 116/140


KeyError: 'keywordCount'

In [None]:
# Ask the user for the keyword export via Colab's upload widget.
print("üìÅ Upload your CSV file:")
uploaded = files.upload()

# The upload result is used as a mapping keyed by filename. Presumably it is
# empty when no file was chosen (TODO confirm against Colab docs); fail with a
# clear message instead of an opaque IndexError in that case.
if not uploaded:
    raise ValueError("No file was uploaded - re-run this cell and choose a CSV file.")

# Only the first uploaded file is used; any additional files are ignored.
csv_filename = next(iter(uploaded))
print(f"‚úÖ Loaded: {csv_filename}")

üìÅ Upload your CSV file:


Saving Opt Guide Export Pivot Style.csv to Opt Guide Export Pivot Style (3).csv
‚úÖ Loaded: Opt Guide Export Pivot Style (3).csv


In [None]:
print("\nLoading spaCy model...")
# The model has to be installed first: !python -m spacy download en_core_web_sm
nlp = spacy.load("en_core_web_sm")

print(f"Loading data from {csv_filename}...")
df = pd.read_csv(csv_filename)

# The export has one row per keyword; everything below is rolled up per URL slug.
_rows_by_slug = df.groupby('URL Slug')


def _join_keywords(keywords):
    """Space-join every non-missing keyword of one page into a single string."""
    return ' '.join(str(kw) for kw in keywords if pd.notna(kw))


# One row per page, taking the first value seen for each descriptive column
pages = _rows_by_slug.agg({
    'Page Name': 'first',
    'Section': 'first',
    'Content Type': 'first',
    'Journey Stage': 'first',
}).reset_index()

# Primary keywords per page, kept as a list (these end up in the JSON output)
_is_primary = df['Priority'] == 'Primary'
primary_kws = (
    df[_is_primary]
    .groupby('URL Slug')
    .agg({'Keyword': list})
    .reset_index()
)
primary_kws.columns = ['URL Slug', 'Primary Keywords']

# Every keyword of the page as one text blob (input for entity extraction)
all_kws = _rows_by_slug.agg({'Keyword': _join_keywords}).reset_index()
all_kws.columns = ['URL Slug', 'All Keywords Text']

# Left joins: pages without primary keywords must survive the merge
pages = (
    pages
    .merge(primary_kws, on='URL Slug', how='left')
    .merge(all_kws, on='URL Slug', how='left')
)

# Pages missing from either side of the merge get an empty list / empty string
pages['Primary Keywords'] = (
    pages['Primary Keywords']
    .fillna('')
    .apply(lambda kws: kws if isinstance(kws, list) else [])
)
pages['All Keywords Text'] = pages['All Keywords Text'].fillna('')

print(f"\nProcessing {len(pages)} unique pages...")
print("Extracting entities using NLP...")

def extract_entities(text):
    """Return the unique named entities that spaCy finds in ``text``.

    Only a fixed set of entity labels (places, organisations, products,
    events, facilities and nationalities/groups) is kept. Entities are
    de-duplicated by their lower-cased text, keeping the first occurrence.

    Args:
        text: The string to analyse. Missing (NaN/None) or empty input is
            accepted.

    Returns:
        A list of ``{'text': ..., 'type': ...}`` dicts in order of first
        appearance; an empty list for missing or empty input.
    """
    if pd.isna(text) or text == '':
        return []

    # Entity labels considered relevant for travel/SEO
    relevant_labels = (
        'GPE',      # Geopolitical entity (countries, cities)
        'LOC',      # Non-GPE locations
        'ORG',      # Organizations
        'PRODUCT',  # Products
        'EVENT',    # Events
        'FAC',      # Facilities
        'NORP',     # Nationalities, religious/political groups
    )

    # Dicts keep insertion order, so the first spelling of each entity wins
    first_seen = {}
    for span in nlp(text).ents:
        if span.label_ not in relevant_labels:
            continue
        key = span.text.lower()
        if key not in first_seen:
            first_seen[key] = {'text': span.text, 'type': span.label_}

    return list(first_seen.values())

# Run entity extraction once per page and collect the results in parallel
# lists, so the new columns can be attached in one go afterwards.
entities_list = []
entity_counts = []
entity_types_list = []

for idx, page_name, keyword_text in zip(
    pages.index, pages['Page Name'], pages['All Keywords Text']
):
    # The page title and all of its keywords form the text to analyse
    text = f"{page_name} {keyword_text}"
    entities = extract_entities(text)

    entities_list.append(entities)
    entity_counts.append(len(entities))
    entity_types_list.append(list({ent['type'] for ent in entities}))

    # Progress line every 20 pages
    done = idx + 1
    if done % 20 == 0:
        print(f"  Processed {done}/{len(pages)} pages...")

# Attach the collected results as new columns
for _column, _values in (
    ('entities', entities_list),
    ('entity_count', entity_counts),
    ('entity_types', entity_types_list),
):
    pages[_column] = _values


Loading spaCy model...
Loading data from Opt Guide Export Pivot Style (3).csv...

Processing 143 unique pages...
Extracting entities using NLP...
  Processed 20/143 pages...
  Processed 40/143 pages...
  Processed 60/143 pages...
  Processed 80/143 pages...
  Processed 100/143 pages...
  Processed 120/143 pages...
  Processed 140/143 pages...


In [None]:
# Number of primary keywords per page; this column is read again when the
# output metadata is assembled.
pages['keywordCount'] = pages['Primary Keywords'].map(len)

_mean_entities = pages['entity_count'].mean()
_pages_with_entities = (pages['entity_count'] > 0).sum()

print("\n‚úÖ Entity extraction complete!")
print(f"   Average entities per page: {_mean_entities:.1f}")
print(f"   Pages with entities: {_pages_with_entities}/{len(pages)}")

# Build hierarchy with both keywords AND entities
def build_hierarchy(pages_df):
    """Assemble the nested site tree (keywords AND entities) from the page frame.

    Pages are processed shallowest-first so that a parent node normally exists
    before its children are attached. A page whose slug is "/" replaces the
    synthetic root in the lookup table, so top-level pages processed after it
    nest under that page rather than directly under the root.

    Args:
        pages_df: DataFrame with one row per page. Reads the columns
            'URL Slug', 'Page Name', 'Section', 'Content Type',
            'Primary Keywords', 'entities', 'entity_types' and 'entity_count'.
            An optional 'Parent' column, when present and non-empty, overrides
            the parent derived from the slug. The frame is NOT modified.

    Returns:
        The root node: a dict with 'title', 'slug', 'entityDensity',
        'keywordCount' and 'children'. Each page node additionally carries
        'section', 'contentType', 'primaryKeywords' (at most five),
        'entities' (entity texts) and 'entityTypes'.
    """

    def _path_parts(slug):
        # Non-empty path segments; leading/trailing/double slashes are ignored
        return [part for part in str(slug).split('/') if part]

    def _json_safe(value):
        # json.dump would emit the non-standard token NaN for missing cells
        return value if pd.notna(value) else None

    root = {
        "title": "AMA Travel",
        "slug": "/",
        "entityDensity": 0,
        "keywordCount": 0,
        "children": []
    }

    node_map = {"/": root}

    # Order by number of path segments using a STABLE sort. This keeps "/"
    # (zero segments) strictly ahead of "/x" pages and makes the result
    # independent of the sort algorithm. The depth is kept in a local Series
    # so the caller's frame does not silently gain a 'depth' column.
    depths = pages_df['URL Slug'].map(lambda s: len(_path_parts(s)))
    order = depths.to_numpy(dtype=int).argsort(kind='stable')
    pages_sorted = pages_df.iloc[order]

    for _, page in pages_sorted.iterrows():
        slug = page['URL Slug']

        # Use the Parent column if available, otherwise parse from the slug
        parent_override = page['Parent'] if 'Parent' in page else None
        if pd.notna(parent_override) and parent_override != '':
            parent_slug = parent_override
        else:
            parts = _path_parts(slug)
            parent_slug = '/' if len(parts) <= 1 else '/' + '/'.join(parts[:-1])

        # Defensive defaults: anything that is not a list counts as empty
        entities_list = page['entities'] if isinstance(page['entities'], list) else []
        entity_types = page['entity_types'] if isinstance(page['entity_types'], list) else []
        primary_kws = page['Primary Keywords'] if isinstance(page['Primary Keywords'], list) else []

        # Create node with BOTH entities and keywords
        node = {
            "title": _json_safe(page['Page Name']),
            "slug": slug,
            "section": _json_safe(page['Section']),
            "contentType": _json_safe(page['Content Type']),
            "primaryKeywords": primary_kws[:5],  # Top 5 for readability
            "keywordCount": len(primary_kws),
            "entities": [e['text'] for e in entities_list],  # Just the entity names
            "entityTypes": entity_types,
            "entityDensity": int(page['entity_count']) if pd.notna(page['entity_count']) else 0,
            "children": []
        }

        # Attach to the parent when it is known; orphans go under the root
        if parent_slug in node_map:
            node_map[parent_slug]['children'].append(node)
        else:
            root['children'].append(node)

        node_map[slug] = node

    return root

# Build the nested site tree from the per-page frame; the result is a plain
# dict whose 'children' lists hold the page nodes.
hierarchy = build_hierarchy(pages)


‚úÖ Entity extraction complete!
   Average entities per page: 2.3
   Pages with entities: 118/143


In [None]:
# Calculate branch-level metrics
def calc_branch_metrics(node):
    """Annotate ``node`` and all of its descendants with cumulative branch totals.

    A branch total is the node's own value plus the totals of every
    descendant. The totals are written onto each node in place as
    'branchEntityDensity' and 'branchKeywordCount'. Leaf nodes are annotated
    too (their totals equal their own values), so every node exposes the same
    keys and a childless section is not reported as 0 by consumers that read
    the branch keys.

    Args:
        node: A tree node dict. Reads 'entityDensity', 'keywordCount' and
            'children'; each is optional and defaults to 0 / no children.

    Returns:
        ``{'entities': <branch entity total>, 'keywords': <branch keyword total>}``
        for ``node``.
    """
    entity_total = node.get('entityDensity', 0)
    keyword_total = node.get('keywordCount', 0)

    # A missing or empty 'children' entry simply means there is nothing to add
    for child in node.get('children') or []:
        child_metrics = calc_branch_metrics(child)
        entity_total += child_metrics['entities']
        keyword_total += child_metrics['keywords']

    node['branchEntityDensity'] = entity_total
    node['branchKeywordCount'] = keyword_total

    return {
        'entities': entity_total,
        'keywords': keyword_total
    }

# Annotate the tree in place; the returned totals are not needed here
calc_branch_metrics(hierarchy)

# Tally entity types across pages. Each page lists a type at most once, so a
# count is the number of pages on which that type appears.
all_entity_types = set()
entity_type_counts = Counter()
for _page_types in pages['entity_types']:
    if not isinstance(_page_types, list):
        continue
    all_entity_types.update(_page_types)
    entity_type_counts.update(_page_types)

# Site-wide figures stored next to the tree in the exported JSON
_entity_count_col = pages['entity_count']
_keyword_lengths = pages['Primary Keywords'].apply(len)

metadata = {
    "totalPages": len(pages),
    "pagesWithKeywords": int((pages['keywordCount'] > 0).sum()),
    "pagesWithEntities": int((_entity_count_col > 0).sum()),
    "avgEntitiesPerPage": float(_entity_count_col.mean().round(1)),
    "avgKeywordsPerPage": float(_keyword_lengths.mean().round(1)),
    "entityTypes": list(all_entity_types),
    "entityTypeBreakdown": dict(entity_type_counts),
}

output = {
    "AMA Travel Site Architecture": hierarchy,
    "metadata": metadata,
}

In [None]:
# Write the tree plus metadata to disk, then hand the file to the browser
output_filename = 'ama_travel_hierarchy.json'
with open(output_filename, 'w') as f:
    json.dump(output, f, indent=2)

print(f"\nüéØ Created {output_filename}")
print("\n‚¨áÔ∏è Downloading JSON file...")
files.download(output_filename)

# Site-wide summary, computed from the per-page frame
print(f"\nüìä Summary Statistics:")
print(f"   Total pages: {len(pages)}")
print(f"   Pages with keywords: {(pages['Primary Keywords'].apply(len) > 0).sum()}")
print(f"   Pages with entities: {(pages['entity_count'] > 0).sum()}")
print(f"   Avg keywords/page: {pages['Primary Keywords'].apply(len).mean():.1f}")
print(f"   Avg entities/page: {pages['entity_count'].mean():.1f}")

# Five top-level sections with the highest cumulative entity count
print(f"\nüèÜ Top sections by entity coverage:")
for section in sorted(hierarchy['children'],
                      key=lambda x: x.get('branchEntityDensity', 0),
                      reverse=True)[:5]:
    section_entities = section.get('branchEntityDensity', 0)
    section_keywords = section.get('branchKeywordCount', 0)
    # Direct children, plus the section page itself when it has entities.
    # Deliberately NOT named `pages`: rebinding that name would replace the
    # DataFrame with an int and break every earlier cell on re-run.
    page_count = len(section.get('children', [])) + (1 if section.get('entityDensity', 0) > 0 else 0)
    print(f"   {section['title']:30} {section_entities:3} entities, {section_keywords:3} keywords, {page_count:2} pages")

# Entity types, most frequent first
print(f"\nüîç Entity types found:")
for etype, count in entity_type_counts.most_common():
    print(f"   {etype:10} {count:3} occurrences")

print("\nüí° Entity type legend:")
print("   GPE     = Geographic/Political entities (countries, cities, states)")
print("   LOC     = Non-political locations (mountains, rivers, regions)")
print("   ORG     = Organizations (companies, agencies)")
print("   PRODUCT = Products and services")
print("   FAC     = Facilities (airports, hotels, landmarks)")
print("   EVENT   = Named events")
print("   NORP    = Nationalities, religious/political groups")

print("\n‚ú® Load ama_travel_hierarchy.json into todiagram.com!")
print("   Each node shows:")
print("   ‚Ä¢ Primary keywords (for search targeting)")
print("   ‚Ä¢ Extracted entities (for semantic/AEO analysis)")
print("   ‚Ä¢ Entity density (unique entities per page)")
print("   ‚Ä¢ Branch metrics (cumulative coverage per section)")


üéØ Created ama_travel_hierarchy.json

‚¨áÔ∏è Downloading JSON file...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


üìä Summary Statistics:
   Total pages: 143
   Pages with keywords: 141
   Pages with entities: 118
   Avg keywords/page: 7.6
   Avg entities/page: 2.3

üèÜ Top sections by entity coverage:
   Homepage                       328 entities, 1088 keywords, 24 pages

üîç Entity types found:
   ORG         88 occurrences
   GPE         75 occurrences
   NORP        16 occurrences
   LOC         14 occurrences
   FAC          4 occurrences
   PRODUCT      2 occurrences
   EVENT        1 occurrences

üí° Entity type legend:
   GPE     = Geographic/Political entities (countries, cities, states)
   LOC     = Non-political locations (mountains, rivers, regions)
   ORG     = Organizations (companies, agencies)
   PRODUCT = Products and services
   FAC     = Facilities (airports, hotels, landmarks)
   EVENT   = Named events
   NORP    = Nationalities, religious/political groups

‚ú® Load ama_travel_hierarchy.json into todiagram.com!
   Each node shows:
   ‚Ä¢ Primary keywords (for search targe