<a href="https://colab.research.google.com/github/sh1nysparkly/relevance-validation/blob/main/JSON_Output_NLP_and_IA_V2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import json
from google.colab import files
from google.cloud import language_v1
import time

# Ask for the keyword export through Colab's upload widget and remember
# the name of the first uploaded file; later cells read the CSV by that name.
print("üìÅ Upload your CSV file:")
uploaded = files.upload()
uploaded_names = list(uploaded)
csv_filename = uploaded_names[0]
print(f"‚úÖ Loaded: {csv_filename}")

üìÅ Upload your CSV file:


Saving Opt Guide Export Pivot Style.csv to Opt Guide Export Pivot Style.csv
‚úÖ Loaded: Opt Guide Export Pivot Style.csv


In [None]:
import os

# Ask for the service-account key through Colab's upload widget and keep
# the name of the first uploaded file.
print("\nüîë Upload your Google Cloud service account JSON key:")
key_uploaded = files.upload()
uploaded_key_names = list(key_uploaded)
key_filename = uploaded_key_names[0]
print(f"‚úÖ Loaded credentials: {key_filename}")

# Authenticate with Google Cloud: expose the key file path through the
# environment variable that is set here for the client created later.
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = key_filename


üîë Upload your Google Cloud service account JSON key:


Saving nlp-entity-detection-79a294e928f3.json to nlp-entity-detection-79a294e928f3.json
‚úÖ Loaded credentials: nlp-entity-detection-79a294e928f3.json


In [None]:
# Initialize the client
client = language_v1.LanguageServiceClient()

print(f"\nLoading data from {csv_filename}...")
df = pd.read_csv(csv_filename)


def _clean_keywords(keywords):
    """Return one page's keywords as a list of strings, skipping missing cells.

    Used for both keyword lists so that a NaN keyword can neither inflate
    the per-page keyword count nor end up as a bare NaN in the exported JSON.
    """
    return [str(keyword) for keyword in keywords if pd.notna(keyword)]


# Group by page to get unique pages (one row per URL slug; the descriptive
# columns take the value of the first row of each page)
pages = df.groupby('URL Slug').agg({
    'Page Name': 'first',
    'Section': 'first',
    'Content Type': 'first',
    'Journey Stage': 'first',
    'Parent': 'first'  # Include Parent column
}).reset_index()

# Get primary keywords per page (rows whose Priority is exactly 'Primary'),
# cleaned the same way as the full keyword list below
primary_kws = df[df['Priority'] == 'Primary'].groupby('URL Slug').agg({
    'Keyword': _clean_keywords
}).reset_index()
primary_kws.columns = ['URL Slug', 'Primary Keywords']

# Get ALL keywords for entity extraction
all_kws = df.groupby('URL Slug').agg({
    'Keyword': _clean_keywords
}).reset_index()
all_kws.columns = ['URL Slug', 'All Keywords List']

# Merge everything; left joins keep pages that have no primary keywords
# (their 'Primary Keywords' cell is NaN after the merge)
pages = pages.merge(primary_kws, on='URL Slug', how='left')
pages = pages.merge(all_kws, on='URL Slug', how='left')


Loading data from Opt Guide Export Pivot Style.csv...


In [None]:
# Fill NaN values: after this loop every cell of both keyword columns is a
# list (cells that are not already a list become an empty list).
for kw_column in ('Primary Keywords', 'All Keywords List'):
    pages[kw_column] = pages[kw_column].fillna('').apply(
        lambda cell: cell if isinstance(cell, list) else []
    )

print(f"\nProcessing {len(pages)} unique pages with Google NLP API...")
print("‚ö†Ô∏è  Note: This will make API calls and may take a few minutes\n")

def analyze_with_google_nlp(text, use_all_features=True):
    """
    Analyze text with Google NLP API
    Returns categories with confidence and entities with salience

    Args:
        text: Plain text to analyze. Empty, whitespace-only or None input
            short-circuits without calling the API.
        use_all_features: When true (default) both entity extraction and
            content classification are requested. When false only entities
            are requested and 'categories' is always empty.

    Returns:
        dict with three keys:
          'categories': list of {'name', 'confidence'} sorted by confidence
              (highest first), confidence rounded to 4 decimals.
          'entities': up to 3 {'name', 'type', 'salience'} dicts sorted by
              salience (highest first), salience rounded to 4 decimals.
          'error': None on full success, otherwise the message of the first
              exception raised by the API call.

    If the combined request fails (for example when the API refuses to
    classify a text), the call is retried with entity extraction only, so
    one failing feature does not discard the other. In that case 'entities'
    is filled, 'categories' stays empty and 'error' still carries the
    original failure message. Exceptions are never propagated.
    """
    if not text or text.strip() == '':
        return {
            'categories': [],
            'entities': [],
            'error': None
        }

    want_categories = bool(use_all_features)
    error = None

    try:
        document = language_v1.Document(
            content=text,
            type_=language_v1.Document.Type.PLAIN_TEXT
        )

        try:
            response = client.annotate_text(
                document=document,
                features={
                    'extract_entities': True,
                    'classify_text': want_categories
                }
            )
        except Exception as first_failure:
            if not want_categories:
                # Nothing left to drop from the request: report the failure.
                raise
            # Keep the original message, then retry without classification
            # so the entities of this text are not lost.
            error = str(first_failure)
            response = client.annotate_text(
                document=document,
                features={
                    'extract_entities': True,
                    'classify_text': False
                }
            )

        # Extract categories with confidence, highest confidence first
        categories = [
            {
                'name': category.name,
                'confidence': round(category.confidence, 4)
            }
            for category in response.categories
        ]
        categories.sort(key=lambda x: x['confidence'], reverse=True)

        # Extract entities with salience, keep the 3 most salient
        entities = [
            {
                'name': entity.name,
                'type': language_v1.Entity.Type(entity.type_).name,
                'salience': round(entity.salience, 4)
            }
            for entity in response.entities
        ]
        entities.sort(key=lambda x: x['salience'], reverse=True)
        entities = entities[:3]

        return {
            'categories': categories,
            'entities': entities,
            'error': error
        }

    except Exception as e:
        # Prefer the message of the first failure when the retry failed too.
        return {
            'categories': [],
            'entities': [],
            'error': error or str(e)
        }

# Process each page: one NLP call per row, results collected in parallel
# lists that become dataframe columns afterwards.
categories_list = []
entities_list = []
top_category_list = []
top_category_confidence_list = []
entity_count_list = []
errors_list = []

for idx, row in pages.iterrows():
    # The analysed text is the page name followed by ALL keywords of the page
    page_keywords = row['All Keywords List']
    keywords_text = ' '.join(page_keywords) if page_keywords else ''
    text = f"{row['Page Name']} {keywords_text}"

    result = analyze_with_google_nlp(text)
    page_categories = result['categories']
    page_entities = result['entities']

    # The first returned category is recorded as the page's top category;
    # pages without any category get None for both name and confidence.
    best_category = page_categories[0] if page_categories else None

    categories_list.append(page_categories)
    entities_list.append(page_entities)
    top_category_list.append(None if best_category is None else best_category['name'])
    top_category_confidence_list.append(None if best_category is None else best_category['confidence'])
    entity_count_list.append(len(page_entities))
    errors_list.append(result['error'])

    # Progress report and short pause after every tenth page
    if (idx + 1) % 10 != 0:
        continue
    print(f"  Processed {idx + 1}/{len(pages)} pages...")
    time.sleep(0.5)  # Be nice to the API

# Assign to dataframe (same column order as the lists above)
for column_name, column_values in (
    ('categories', categories_list),
    ('entities', entities_list),
    ('top_category', top_category_list),
    ('top_category_confidence', top_category_confidence_list),
    ('entity_count', entity_count_list),
    ('nlp_error', errors_list),
):
    pages[column_name] = column_values

# Add keywordCount: number of primary keywords per page
pages['keywordCount'] = pages['Primary Keywords'].map(len)

print(f"\n‚úÖ Google NLP analysis complete!")
print(f"   Pages analyzed: {len(pages)}")
print(f"   Pages with categories: {(pages['top_category'].notna()).sum()}")
print(f"   Pages with entities: {(pages['entity_count'] > 0).sum()}")
print(f"   Average entities per page: {pages['entity_count'].mean():.1f}")
has_nlp_errors = pages['nlp_error'].notna().any()
if has_nlp_errors:
    print(f"   ‚ö†Ô∏è  Pages with errors: {pages['nlp_error'].notna().sum()}")

# Build hierarchy
def build_hierarchy(pages_df, other_pages=None, site_title="AMA Travel"):
    """Build the nested site tree (dicts of dicts) from the per-page table.

    Args:
        pages_df: DataFrame with one row per page. Columns read here:
            'URL Slug', 'Page Name', 'Section', 'Content Type',
            'Primary Keywords', 'categories', 'entities', 'top_category',
            'top_category_confidence', 'entity_count' and, optionally,
            'Parent'. The frame is not modified.
        other_pages: Slugs of root-level pages to group under the synthetic
            "Other Pages" node. Defaults to the built-in utility-page list.
        site_title: Title of the synthetic root node.

    Returns:
        The root node. Every page node sits in its parent's 'children' dict
        under a key derived from its slug ('/a/b' -> 'a_b', '/' ->
        'homepage'). The "Other Pages" node is added to the root only when
        it received at least one child.

    Notes:
        * The parent is the 'Parent' cell when it is present and non-empty,
          otherwise the slug with its last path segment removed.
        * A page whose own slug is '/' replaces the synthetic root in the
          slug lookup, so pages processed after it with parent '/' are
          attached to that page rather than to the root.
        * Pages are processed shallowest first (stable order). A page whose
          parent is processed later is attached in a second pass; a page
          whose parent slug never occurs is left out of the tree.
        * Missing title/section/content-type cells become None so that the
          exported JSON never contains a bare NaN token.
    """
    # Define which root-level pages to group under "Other"
    if other_pages is None:
        other_pages = [
            '/escape-planner',
            '/luggage-and-accessories',
            '/best-price-guarantee',
            '/trip-planner',
            '/help-centre',
            '/member-perks',
            '/currency-exchange',
            '/about-us'
        ]
    other_pages = set(other_pages)

    def _none_if_missing(value):
        """Map a missing scalar cell (NaN/None) to None, pass others through."""
        return None if pd.isna(value) else value

    def _attach(parent_node, key, child):
        """Store child under key in parent_node['children'], creating the dict if needed."""
        if not isinstance(parent_node.get('children'), dict):
            parent_node['children'] = {}
        parent_node['children'][key] = child

    root = {
        "title": site_title,
        "slug": "/",
        "entityDensity": 0,
        "keywordCount": 0
    }

    # Create "Other" bucket
    other_bucket = {
        "title": "Other Pages",
        "slug": "/other",
        "section": "Other",
        "contentType": "Utility Pages",
        "primaryKeywords": [],
        "keywordCount": 0,
        "topCategory": None,
        "categoryConfidence": None,
        "allCategories": [],
        "entities": [],
        "entitiesWithSalience": [],
        "entityDensity": 0,
        "children": {}
    }

    node_map = {"/": root, "/other": other_bucket}

    # Sort by depth on a copy (the caller's frame gains no 'depth' column).
    # mergesort is stable, so pages of equal depth keep their input order.
    depth = pages_df['URL Slug'].str.count('/')
    pages_sorted = pages_df.assign(depth=depth).sort_values('depth', kind='mergesort')

    # (parent_slug, node_key, node) for pages seen before their parent
    deferred = []

    for _, page in pages_sorted.iterrows():
        slug = page['URL Slug']

        # Use the Parent column if available
        if 'Parent' in page and pd.notna(page['Parent']) and page['Parent'] != '':
            parent_slug = page['Parent']
        else:
            parts = [p for p in slug.split('/') if p]
            parent_slug = '/' if len(parts) <= 1 else '/' + '/'.join(parts[:-1])

        # Get data
        primary_kws = page['Primary Keywords'] if isinstance(page['Primary Keywords'], list) else []
        categories = page['categories'] if isinstance(page['categories'], list) else []
        entities = page['entities'] if isinstance(page['entities'], list) else []

        # Create node
        node = {
            "title": _none_if_missing(page['Page Name']),
            "slug": slug,
            "section": _none_if_missing(page['Section']),
            "contentType": _none_if_missing(page['Content Type']),
            "primaryKeywords": primary_kws[:5],
            "keywordCount": len(primary_kws),
            "childCount": 0,  # Will be calculated after hierarchy is built
            "topCategory": page['top_category'] if pd.notna(page['top_category']) else None,
            "categoryConfidence": float(page['top_category_confidence']) if pd.notna(page['top_category_confidence']) else None,
            "allCategories": categories,
            "entities": [e['name'] for e in entities],
            "entitiesWithSalience": entities,
            "entityDensity": int(page['entity_count'])
        }

        # Create a clean key name for this node
        # Use the slug without leading slash, replace slashes with underscores
        node_key = slug.strip('/').replace('/', '_') if slug != '/' else 'homepage'

        # Check if this is a root-level page that should go in "Other"
        if parent_slug == '/' and slug in other_pages:
            parent_slug = '/other'

        # Add to parent, or remember the page until its parent is known
        if parent_slug in node_map:
            _attach(node_map[parent_slug], node_key, node)
        elif parent_slug != slug:
            deferred.append((parent_slug, node_key, node))

        node_map[slug] = node

    # Second pass: every page is registered now, so parents that were
    # processed after their children can finally be resolved.
    for parent_slug, node_key, node in deferred:
        parent_node = node_map.get(parent_slug)
        if parent_node is not None and parent_node is not node:
            _attach(parent_node, node_key, node)

    # Add "Other" bucket to root if it has children
    if other_bucket.get('children'):
        root['children'] = root.get('children', {})
        root['children']['other'] = other_bucket

    return root

# Build the nested page tree from the analysed pages table; `hierarchy`
# is the root node returned by build_hierarchy.
hierarchy = build_hierarchy(pages)

# Add child counts to all nodes
def add_child_counts(node):
    """Set node['childCount'] on this node and on every node below it.

    The count is the number of entries in the node's 'children' dict; a
    node without 'children' gets 0. A 'children' value that is not a dict
    also yields 0 and is not descended into. The tree is modified in place
    and nothing is returned.
    """
    kids = node.get('children', {})

    # Guard: anything other than a dict is treated as "no children"
    if not isinstance(kids, dict):
        node['childCount'] = 0
        return

    node['childCount'] = len(kids)
    for kid in kids.values():
        add_child_counts(kid)

# Fill in childCount for the root and, recursively, for every node below it
add_child_counts(hierarchy)

# Calculate branch metrics
def calc_branch_metrics(node):
    """Sum entity and keyword counts over a node and all of its descendants.

    Returns a dict {'entities': ..., 'keywords': ...} holding the node's own
    'entityDensity' / 'keywordCount' (0 when absent) plus the totals of all
    nodes below it. Nodes that have a non-empty 'children' dict additionally
    receive 'branchEntityDensity' and 'branchKeywordCount'; nodes without
    children are left unchanged.
    """
    totals = {
        'entities': node.get('entityDensity', 0),
        'keywords': node.get('keywordCount', 0)
    }

    kids = node.get('children')
    if not isinstance(kids, dict) or not kids:
        # Leaf (or unusable 'children' value): only the node's own counts
        return totals

    for kid in kids.values():
        kid_totals = calc_branch_metrics(kid)
        totals['entities'] += kid_totals['entities']
        totals['keywords'] += kid_totals['keywords']

    node['branchEntityDensity'] = totals['entities']
    node['branchKeywordCount'] = totals['keywords']
    return totals

# Annotate every node that has children with its branch totals
calc_branch_metrics(hierarchy)

# Get category statistics: how many pages each category name was returned for
all_categories = []
for cats in pages['categories']:
    if isinstance(cats, list):
        all_categories.extend([c['name'] for c in cats])

from collections import Counter
category_counts = Counter(all_categories)

# Create output: the tree plus run-level metadata
output = {
    "AMA Travel Site Architecture": hierarchy,
    "metadata": {
        "totalPages": len(pages),
        "pagesWithKeywords": int((pages['keywordCount'] > 0).sum()),
        "pagesWithEntities": int((pages['entity_count'] > 0).sum()),
        "pagesWithCategories": int((pages['top_category'].notna()).sum()),
        "avgEntitiesPerPage": float(pages['entity_count'].mean().round(1)),
        "avgKeywordsPerPage": float(pages['keywordCount'].mean().round(1)),
        "avgCategoryConfidence": float(pages['top_category_confidence'].mean().round(3)) if pages['top_category_confidence'].notna().any() else None,
        "topCategories": dict(category_counts.most_common(10))
    }
}

# Save JSON
output_filename = 'ama_travel_hierarchy_google_nlp.json'
with open(output_filename, 'w') as f:
    json.dump(output, f, indent=2)

print(f"\nüéØ Created {output_filename}")
print(f"\nüìä Summary Statistics:")
print(f"   Total pages: {len(pages)}")
print(f"   Pages with categories: {(pages['top_category'].notna()).sum()}")
print(f"   Pages with entities: {(pages['entity_count'] > 0).sum()}")
print(f"   Avg category confidence: {pages['top_category_confidence'].mean():.2%}" if pages['top_category_confidence'].notna().any() else "   No categories detected")

print(f"\nüèÜ Top sections by entity coverage:")
# Get root children safely
root_children = hierarchy.get('children', {})
if isinstance(root_children, dict):
    # calc_branch_metrics writes the branch* keys only on nodes that have
    # children, so a childless section falls back to its own counts instead
    # of being ranked and printed as 0.
    def _section_entities(section):
        return section.get('branchEntityDensity', section.get('entityDensity', 0))

    def _section_keywords(section):
        return section.get('branchKeywordCount', section.get('keywordCount', 0))

    sorted_sections = sorted(root_children.items(),
                            key=lambda x: _section_entities(x[1]),
                            reverse=True)[:5]
    for key, section in sorted_sections:
        entities = _section_entities(section)
        keywords = _section_keywords(section)
        # "pages" here is the number of direct children of the section
        num_children = len(section.get('children', {})) if isinstance(section.get('children'), dict) else 0
        print(f"   {section.get('title', key):30} {entities:3} entities, {keywords:3} keywords, {num_children:2} pages")

print(f"\nüìÇ Top 10 Google NLP Categories detected:")
for category, count in category_counts.most_common(10):
    print(f"   {category}: {count} pages")

print("\n‚¨áÔ∏è Downloading JSON file...")
files.download(output_filename)

print("\n‚ú® Load into todiagram.com to visualize!")
print("   Each node includes:")
print("   ‚Ä¢ Google NLP category with confidence score")
print("   ‚Ä¢ Top 3 entities with salience scores")
print("   ‚Ä¢ Primary keywords for SEO context")


Processing 143 unique pages with Google NLP API...
‚ö†Ô∏è  Note: This will make API calls and may take a few minutes

  Processed 10/143 pages...
  Processed 20/143 pages...
  Processed 30/143 pages...
  Processed 40/143 pages...
  Processed 50/143 pages...
  Processed 60/143 pages...
  Processed 70/143 pages...
  Processed 80/143 pages...
  Processed 90/143 pages...
  Processed 100/143 pages...
  Processed 110/143 pages...
  Processed 120/143 pages...
  Processed 130/143 pages...
  Processed 140/143 pages...

‚úÖ Google NLP analysis complete!
   Pages analyzed: 143
   Pages with categories: 132
   Pages with entities: 134
   Average entities per page: 2.8
   ‚ö†Ô∏è  Pages with errors: 9

üéØ Created ama_travel_hierarchy_google_nlp.json

üìä Summary Statistics:
   Total pages: 143
   Pages with categories: 132
   Pages with entities: 134
   Avg category confidence: 91.98%

üèÜ Top sections by entity coverage:
   Homepage                       365 entities, 997 keywords, 16 pages
  

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


‚ú® Load into todiagram.com to visualize!
   Each node includes:
   ‚Ä¢ Google NLP category with confidence score
   ‚Ä¢ Top 3 entities with salience scores
   ‚Ä¢ Primary keywords for SEO context
