<a href="https://colab.research.google.com/github/sh1nysparkly/relevance-validation/blob/main/NLP_Keyword_Order_Anaysis_v1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
"""
KEYWORD ORDER OPTIMIZER V3 - USING API V2
==========================================
Now using the v2 API like the demo does!

CSV FORMAT:
- URL Slug (or Slug)
- Page Name
- Keyword
- Search Volume (optional)
"""

# ============================================================================
# SETUP
# ============================================================================

!pip install google-cloud-language pandas -q

from google.cloud import language_v2  # V2!!!
import pandas as pd
from google.colab import files
import os
import random
import time
import warnings
warnings.filterwarnings('ignore')

print("‚úÖ Libraries installed (using API v2)")

# ============================================================================
# AUTHENTICATE
# ============================================================================

# Point GOOGLE_APPLICATION_CREDENTIALS at a service-account key file.
# First choice: a path stored in the Colab secret GOOGLE_CLOUD_KEY_PATH.
# Fallback: ask the user to upload the JSON key and use the uploaded file.
from google.colab import userdata
try:
    key_path = userdata.get('GOOGLE_CLOUD_KEY_PATH')
    if not key_path:
        # An empty/None secret is as useless as a missing one; take the
        # upload fallback instead of exporting an empty credentials path.
        raise ValueError("GOOGLE_CLOUD_KEY_PATH secret is empty")
    os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = key_path
    print("‚úÖ Authenticated via Colab secrets")
except Exception:
    # `except Exception` rather than a bare `except:` so that
    # KeyboardInterrupt / SystemExit are not mistaken for a missing secret.
    print("\nüì§ Upload your Google Cloud service account JSON:")
    uploaded_auth = files.upload()
    if uploaded_auth:
        # Only the first uploaded file is used as the key file.
        auth_file = list(uploaded_auth.keys())[0]
        os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = auth_file
        print(f"‚úÖ Authenticated")

# The client is created after the environment variable has been set above.
client = language_v2.LanguageServiceClient()

# ============================================================================
# UPLOAD CSV
# ============================================================================

print("\n" + "="*80)
print("üì§ UPLOAD YOUR KEYWORDS CSV")
print("="*80)

uploaded = files.upload()
if not uploaded:
    raise Exception("Please upload a CSV")

# Only the first uploaded file is read.
filename = list(uploaded.keys())[0]
df = pd.read_csv(filename)

print(f"\n‚úÖ Loaded {len(df)} rows")
print(f"Columns: {list(df.columns)}")

# Normalize column names: accepted header spellings (case-insensitive,
# surrounding whitespace ignored) are mapped to canonical names.
col_mapping = {}
for col in df.columns:
    # str() so a non-string header (e.g. a numeric one) cannot crash .lower()
    col_lower = str(col).lower().strip()
    if col_lower in ['slug', 'url slug', 'url_slug', 'page']:
        col_mapping[col] = 'slug'
    elif col_lower in ['page name', 'page_name', 'title']:
        col_mapping[col] = 'page_name'
    elif col_lower in ['keyword', 'keywords']:
        col_mapping[col] = 'keyword'
    elif col_lower in ['volume', 'search volume', 'search_volume']:
        col_mapping[col] = 'volume'

df = df.rename(columns=col_mapping)

# Two headers can normalize to the same name (e.g. "Page" and "URL Slug").
# Keep the left-most one so the groupby below sees one column per name.
df = df.loc[:, ~df.columns.duplicated()]

# Fail early with a readable message instead of a KeyError from groupby.
missing_cols = [c for c in ('slug', 'page_name', 'keyword') if c not in df.columns]
if missing_cols:
    raise ValueError(
        f"CSV is missing required column(s): {missing_cols}. "
        f"Found columns: {list(df.columns)}"
    )

# Rows without a slug or a keyword cannot be tested. Dropping them here keeps
# NaN out of the keyword lists, which are later sorted by len() and joined.
df = df.dropna(subset=['slug', 'keyword']).copy()
if df.empty:
    raise ValueError("CSV has no rows with both a slug and a keyword")
df['keyword'] = df['keyword'].astype(str)

if 'volume' in df.columns:
    # Coerce volumes to numbers: thousands separators are removed, and
    # anything still unparseable (blank cells, text) counts as 0.
    df['volume'] = pd.to_numeric(
        df['volume'].astype(str).str.replace(',', '', regex=False),
        errors='coerce',
    ).fillna(0)

# Get unique pages: one row per slug with its first page name and the list
# of its keywords in file order.
pages = df.groupby('slug').agg({
    'page_name': 'first',
    'keyword': lambda x: list(x)
}).reset_index()
pages.columns = ['slug', 'page_name', 'keywords']

if 'volume' in df.columns:
    # Per-slug volume lists line up with the keyword lists because both come
    # from the same rows grouped by the same key.
    volumes = df.groupby('slug')['volume'].apply(list).reset_index()
    volumes.columns = ['slug', 'volumes']
    pages = pages.merge(volumes, on='slug')
else:
    # No volume column: give every keyword a volume of 0.
    pages['volumes'] = pages['keywords'].apply(lambda x: [0] * len(x))

print(f"\n‚úÖ Found {len(pages)} unique pages")

# ============================================================================
# SELECT PAGE
# ============================================================================

print("\n" + "="*80)
print("üéØ SELECT PAGE")
print("="*80)

if len(pages) == 0:
    raise ValueError("No pages available to select")

# Number the menu by position (not by index label) because the choice is
# resolved with .iloc below.
for position, (_, row) in enumerate(pages.iterrows(), 1):
    print(f"{position}. {row['page_name']} - {row['slug']} ({len(row['keywords'])} keywords)")

# Keep asking until the answer is an integer inside 1..len(pages).
# Without the range check, "0" or a negative number would become a negative
# .iloc index and silently select a page counted from the end.
while True:
    raw_choice = input(f"\nEnter page number (1-{len(pages)}): ")
    try:
        page_idx = int(raw_choice) - 1
    except ValueError:
        print(f"Please enter a whole number between 1 and {len(pages)}")
        continue
    if 0 <= page_idx < len(pages):
        break
    print(f"Please enter a whole number between 1 and {len(pages)}")

selected = pages.iloc[page_idx]

page_name = selected['page_name']
page_slug = selected['slug']
keywords = selected['keywords']
volumes = selected['volumes']

print(f"\n‚úÖ Selected: {page_name}")
print(f"   Keywords: {len(keywords)}")

# Pair each keyword with its volume; zip stops at the shorter list.
keyword_list = [{'keyword': k, 'volume': v} for k, v in zip(keywords, volumes)]

# ============================================================================
# V2 ANALYSIS FUNCTION
# ============================================================================

def analyze_with_v2_api(text):
    """
    Classify `text` with the v2 `classify_text` call on the module-level client.

    Returns:
        None when `text` is empty or whitespace-only.
        Otherwise a dict with two keys:
          'categories': list of {'name', 'confidence'} dicts, confidence
                        rounded to 4 decimals, highest confidence first
          'error':      None on success; on any exception the exception
                        message, with 'categories' set to an empty list
    """
    if not text:
        return None
    if text.strip() == '':
        return None

    try:
        doc = language_v2.Document(
            content=text,
            type_=language_v2.Document.Type.PLAIN_TEXT,
        )

        # Classify using v2
        api_response = client.classify_text(request={'document': doc})

        # Extract categories, best match first
        ranked = sorted(
            (
                {'name': cat.name, 'confidence': round(cat.confidence, 4)}
                for cat in api_response.categories
            ),
            key=lambda entry: entry['confidence'],
            reverse=True,
        )
    except Exception as exc:
        # Failures are reported to the caller in the result, not raised.
        return {'categories': [], 'error': str(exc)}

    return {'categories': ranked, 'error': None}

# ============================================================================
# BUILD TEST ORDERINGS
# ============================================================================

print("\n" + "="*80)
print("üß™ BUILDING TEST ORDERINGS")
print("="*80)


def _numeric_volume(value):
    """Return `value` as a float; missing or non-numeric volumes count as 0."""
    try:
        number = float(value)
    except (TypeError, ValueError):
        return 0.0
    # NaN is the only float that is not equal to itself.
    return 0.0 if number != number else number


# Each ordering is a dict with a display 'name', the reordered 'keywords'
# list and (except for the original order) the 'lead_kw' placed first.
orderings = []

# Original order
orderings.append({
    'name': 'Original Order',
    'keywords': keywords.copy()
})

# Lead with each of the top-volume keywords (up to 4), only when at least one
# keyword has a positive volume.
if any(_numeric_volume(v) > 0 for v in volumes):
    sorted_kws = sorted(
        keyword_list,
        key=lambda x: _numeric_volume(x['volume']),
        reverse=True,
    )
    seen_leads = set()
    for kw_obj in sorted_kws:
        lead = kw_obj['keyword']
        # A keyword listed twice would produce the same ordering twice and
        # waste an API call, so each lead keyword is used once.
        if lead in seen_leads:
            continue
        seen_leads.add(lead)

        reordered = [lead] + [k for k in keywords if k != lead]
        # Only add the ellipsis when the label was actually cut.
        label = lead if len(lead) <= 35 else f"{lead[:35]}..."
        orderings.append({
            'name': f"Lead: {label}",
            'keywords': reordered,
            'lead_kw': lead
        })
        if len(seen_leads) == 4:
            break

# Longest first
by_length = sorted(keywords, key=len, reverse=True)
orderings.append({
    'name': 'Longest First',
    'keywords': by_length,
    'lead_kw': by_length[0]
})

# Shortest first
by_length_asc = sorted(keywords, key=len)
orderings.append({
    'name': 'Shortest First',
    'keywords': by_length_asc,
    'lead_kw': by_length_asc[0]
})

# Three random shuffles (unseeded, so they differ from run to run)
for i in range(3):
    shuffled = keywords.copy()
    random.shuffle(shuffled)
    orderings.append({
        'name': f'Random #{i+1}',
        'keywords': shuffled,
        'lead_kw': shuffled[0]
    })

print(f"Testing {len(orderings)} orderings\n")

# ============================================================================
# RUN TESTS WITH BOTH FORMATS
# ============================================================================

print("="*80)
print(f"üî¨ TESTING: {page_name}")
print("="*80)
print("\nWe'll test with NEWLINES (like the demo uses)\n")

# One entry per ordering that returned at least one category.
results = []

for i, test in enumerate(orderings, 1):
    # Be nice to API: pause before every request after the first. Doing it
    # here (not at the end of the loop body) means the pause also happens
    # after a failed request, which the `continue` below used to skip.
    if i > 1:
        time.sleep(0.3)

    print(f"Test {i}/{len(orderings)}: {test['name']}")

    # Use NEWLINES like the demo (not spaces!): page name on the first line,
    # then one keyword per line.
    keywords_text = '\n'.join(test['keywords'])
    text = f"{page_name}\n{keywords_text}"

    print(f"  Length: {len(text)} chars, {len(text.split())} words")
    # chr(10) is "\n"; newlines are shown as " | " to keep the preview on one line.
    print(f"  First 80 chars: {text[:80].replace(chr(10), ' | ')}...")

    # Analyze with v2
    result = analyze_with_v2_api(text)

    if not result or not result['categories']:
        print(f"  ‚ö†Ô∏è  No categories detected")
        if result and result.get('error'):
            print(f"  Error: {result['error']}")
        print()
        continue

    # Store result. Categories arrive sorted by confidence, so index 0 is the
    # best match; depth is the number of "/" separators in its name.
    top_cat = result['categories'][0]
    depth = top_cat['name'].count('/')

    result_data = {
        'ordering': test['name'],
        # 'Original Order' has no explicit lead keyword; use its first keyword.
        'lead_keyword': test.get('lead_kw', test['keywords'][0]),
        'top_category': top_cat['name'],
        'confidence': top_cat['confidence'],
        'depth': depth,
        'all_categories': result['categories'][:5]
    }
    results.append(result_data)

    print(f"  ‚Üí {top_cat['name']}")
    print(f"  Confidence: {top_cat['confidence']:.1%}, Depth: {depth}")
    print()

# ============================================================================
# ANALYZE RESULTS
# ============================================================================

# Summarise the collected results, recommend a lead keyword, and export a CSV.
if results:
    rule = "="*80

    print("\n" + rule)
    print("üìä RESULTS SUMMARY")
    print(rule)

    # One table row per successful ordering, with long fields clipped.
    summary_rows = []
    for r in results:
        summary_rows.append({
            'Ordering': r['ordering'][:40],
            'Lead KW': r['lead_keyword'][:30],
            'Depth': r['depth'],
            'Conf': f"{r['confidence']:.1%}",
            'Category': r['top_category'][:55],
        })
    summary_df = pd.DataFrame(summary_rows)

    print("\n" + summary_df.to_string(index=False))

    # Most specific = deepest category; confidence breaks ties.
    best = max(results, key=lambda x: (x['depth'], x['confidence']))
    best_lead = best['lead_keyword']

    print("\n" + rule)
    print("üèÜ MOST SPECIFIC CATEGORY")
    print(rule)
    print(f"Ordering: {best['ordering']}")
    print(f"Lead Keyword: {best_lead}")
    print(f"\nTop Category: {best['top_category']}")
    print(f"Confidence: {best['confidence']:.1%}")
    print(f"Depth: {best['depth']} levels")

    print(f"\nAll categories:")
    for cat in best['all_categories']:
        # Indent each category by two spaces per "/" in its name.
        depth_indent = '  ' * cat['name'].count('/')
        print(f"{depth_indent}‚Ä¢ {cat['name']} ({cat['confidence']:.1%})")

    print("\n" + rule)
    print("üí° RECOMMENDATION")
    print(rule)
    print(f"\nUse '{best_lead}' as PRIMARY keyword:")
    print(f"  ‚Ä¢ Page Title: {best_lead} | Brand")
    print(f"  ‚Ä¢ H1: {best_lead}")
    print(f"  ‚Ä¢ First mention in content")

    # Check variance: did different orderings land in different top categories?
    unique_cats = set(r['top_category'] for r in results)
    if len(unique_cats) <= 1:
        print("\n‚úÖ Same category for all orderings - order doesn't matter much")
    else:
        print(f"\n‚ö†Ô∏è  {len(unique_cats)} different categories detected:")
        by_depth = sorted(unique_cats, key=lambda x: x.count('/'), reverse=True)
        for cat in by_depth:
            print(f"  ‚Ä¢ {cat}")
        print("\n‚Üí Keyword order MATTERS!")

    # Export every result row; "/" in the slug is replaced so the slug can be
    # used inside a file name.
    export_df = pd.DataFrame(results)
    export_file = f'v2_keyword_test_{page_slug.replace("/", "_")}.csv'
    export_df.to_csv(export_file, index=False)
    print(f"\n‚úÖ Exported to {export_file}")
    files.download(export_file)
else:
    print("\n‚ùå No results generated")

print("\n" + "="*80)
print("‚úÖ COMPLETE!")
print("="*80)

✅ Libraries installed (using API v2)

📤 Upload your Google Cloud service account JSON:


Saving nlp-entity-detection-79a294e928f3.json to nlp-entity-detection-79a294e928f3.json
✅ Authenticated

📤 UPLOAD YOUR KEYWORDS CSV


Saving Testing another kw nlp prioritizer.csv to Testing another kw nlp prioritizer.csv

✅ Loaded 57 rows
Columns: ['#', 'Guide Status', 'Page ID', 'Section', 'Page Name', 'Supplier', 'URL Slug', 'Parent', 'Intent Type', 'Content Type', 'Journey Stage', 'Keyword', 'Search Volume', 'Priority', 'Page Copy']

✅ Found 3 unique pages

🎯 SELECT PAGE
1. Walking & Hiking Tours - /things-to-do/walking-tours (26 keywords)
2. Wine & Food Tours - /things-to-do/wine-food (13 keywords)
3. Luxury Vacation Packages - /vacation-packages/luxury (18 keywords)

Enter page number (1-3): 2

✅ Selected: Wine & Food Tours
   Keywords: 13

🧪 BUILDING TEST ORDERINGS
Testing 10 orderings

🔬 TESTING: Wine & Food Tours

We'll test with NEWLINES (like the demo uses)

Test 1/10: Original Order
  Length: 203 chars, 34 words
  First 80 chars: Wine & Food Tours | cooking class | wine tastings | brewery tour | sake tasting | food tou...
  → /Travel & Transportation/Specialty Travel/Other
  Confidence: 4

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


✅ COMPLETE!
