<a href="https://colab.research.google.com/github/sh1nysparkly/relevance-validation/blob/main/NLP_KW_Prio_Validator_v2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
"""
KEYWORD ARRANGEMENT OPTIMIZER
==============================
Upload a CSV with your keywords and test different arrangements to find
which produces the strongest topical signal from Google's NLP API.

YOUR CSV FORMAT:
- Slug (page name)
- Keyword
- Search Volume
- Priority (Primary, Secondary, or Tertiary)

STEPS:
1. Run the setup cell
2. Upload your CSV when prompted
3. Enter your target category when prompted
4. Review results!
"""

# ============================================================================
# STEP 1: INSTALL & IMPORT
# ============================================================================

# Install the Google Cloud Natural Language client and pandas (-q keeps pip quiet).
# The leading "!" is notebook shell syntax, so this line only works in a notebook kernel.
!pip install google-cloud-language pandas -q

from google.cloud import language_v1  # Natural Language API client used by the analysis function
import pandas as pd
from google.colab import files  # upload helper used by the credential and CSV upload cells
import os
import warnings
# Suppresses every Python warning for the rest of the session, not only noisy ones.
warnings.filterwarnings('ignore')

print("‚úÖ Libraries installed")

‚úÖ Libraries installed


In [None]:
# ============================================================================
# STEP 2: AUTHENTICATE WITH GOOGLE CLOUD
# ============================================================================

# Upload a service account JSON key file and expose it to the Google client
# libraries through the GOOGLE_APPLICATION_CREDENTIALS environment variable.
from google.colab import files
uploaded = files.upload()

# The upload can come back empty (for example when the dialog is dismissed).
# Fail with a clear message instead of an IndexError on the lookup below.
if not uploaded:
    print("‚ùå No file uploaded - stopping here")
    raise ValueError("Please upload a service account JSON key file")

# Only the first uploaded file is used. Store an absolute path so the key is
# still found if the working directory changes later in the session.
credentials_file = next(iter(uploaded))
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = os.path.abspath(credentials_file)

print("‚úÖ Libraries loaded and authenticated")

Saving nlp-entity-detection-79a294e928f3.json to nlp-entity-detection-79a294e928f3.json
‚úÖ Libraries loaded and authenticated


In [None]:
# ============================================================================
# STEP 3: UPLOAD YOUR KEYWORDS CSV
# ============================================================================

print("\n" + "="*80)
print("üì§ UPLOAD YOUR KEYWORDS CSV")
print("="*80)
print("""
Your CSV should have columns:
- Slug (page identifier)
- Keyword (the actual keyword)
- Search Volume (monthly volume)
- Priority (Primary, Secondary, or Tertiary)

Upload your file now:
""")

uploaded = files.upload()

if not uploaded:
    print("‚ùå No file uploaded - stopping here")
    raise ValueError("Please upload a CSV file")

# Load the CSV (only the first uploaded file is used)
filename = next(iter(uploaded))
df = pd.read_csv(filename)

print(f"\n‚úÖ Loaded {len(df)} keywords from {filename}")
print(f"\nColumns found: {list(df.columns)}")
print("\nFirst 3 rows:")
print(df.head(3).to_string(index=False))

# Accepted header spellings (lower-case) for each standard column name
required_cols = {
    'slug': ['slug', 'page', 'url', 'page_name'],
    'keyword': ['keyword', 'keywords', 'term'],
    'volume': ['search volume', 'volume', 'search_volume', 'sv'],
    'priority': ['priority', 'tier', 'recommendation', 'level']
}

# 'priority' is mandatory as well: the page-selection step reads it directly,
# so a missing column would otherwise surface later as a bare KeyError.
# 'volume' stays optional.
mandatory_cols = ('slug', 'keyword', 'priority')

# Map user's columns to standard names
col_mapping = {}
for standard_name, possible_names in required_cols.items():
    for col in df.columns:
        # Compare on a trimmed, lower-cased header so "Keyword " or " SLUG" still match
        if str(col).strip().lower() in possible_names:
            col_mapping[standard_name] = col
            break
    else:
        # No header matched this standard name
        if standard_name in mandatory_cols:
            print(f"‚ùå Required column '{standard_name}' not found")
            print(f"   Expected one of: {', '.join(possible_names)}")
            raise ValueError(f"Missing required column: {standard_name}")

# Rename columns to standard names
df_renamed = df.rename(columns={v: k for k, v in col_mapping.items()})

# Rows without a slug or keyword cannot be assigned to a page or turned into
# content, so drop them here and report how many were removed.
incomplete_rows = df_renamed[['slug', 'keyword']].isna().any(axis=1)
if incomplete_rows.any():
    print(f"\nSkipping {int(incomplete_rows.sum())} row(s) with a blank slug or keyword")
    df_renamed = df_renamed[~incomplete_rows].reset_index(drop=True)

# Get unique pages (in order of first appearance) and count keywords once
unique_pages = df_renamed['slug'].unique()
page_counts = df_renamed['slug'].value_counts()

print(f"\n‚úÖ Found {len(unique_pages)} unique pages:")
for i, page in enumerate(unique_pages[:10], 1):
    print(f"   {i}. {page} ({page_counts.loc[page]} keywords)")
if len(unique_pages) > 10:
    print(f"   ... and {len(unique_pages) - 10} more")


üì§ UPLOAD YOUR KEYWORDS CSV

Your CSV should have columns:
- Slug (page identifier)
- Keyword (the actual keyword)
- Search Volume (monthly volume)
- Priority (Primary, Secondary, or Tertiary)

Upload your file now:



Saving family vacation packages test.csv to family vacation packages test (1).csv

‚úÖ Loaded 394 keywords from family vacation packages test (1).csv

Columns found: ['Slug', 'Keyword', 'Search Volume', 'Priority']

First 3 rows:
                               Slug                   Keyword  Search Volume Priority
/vacation-packages/corporate-travel travel incentive programs           20.0  Primary
/vacation-packages/corporate-travel incentive travel programs           10.0  Primary
/vacation-packages/corporate-travel          incentive travel          150.0  Primary

‚úÖ Found 19 unique pages:
   1. /vacation-packages/corporate-travel (13 keywords)
   2. /vacation-packages/destination-weddings (42 keywords)
   3. /vacation-packages/honeymoons (38 keywords)
   4. /vacation-packages/adventure (34 keywords)
   5. /vacation-packages/long-stay (31 keywords)
   6. /vacation-packages/beach (29 keywords)
   7. /vacation-packages/theme-parks (17 keywords)
   8. /vacation-packages/family (32 ke

In [None]:
# ============================================================================
# STEP 4: SELECT PAGE TO TEST
# ============================================================================

print("\n" + "="*80)
print("üéØ SELECT PAGE TO TEST")
print("="*80)

if len(unique_pages) == 0:
    raise ValueError("No pages found in the uploaded CSV")

# Show numbered list (keyword counts computed once rather than per page)
page_sizes = df_renamed['slug'].value_counts()
print("\nAvailable pages:")
for i, page in enumerate(unique_pages, 1):
    print(f"{i}. {page} ({page_sizes.get(page, 0)} keywords)")

# Get user selection
page_number = input(f"\nEnter page number to test (1-{len(unique_pages)}): ")
try:
    page_idx = int(page_number) - 1
    # Reject 0 and negative numbers explicitly: Python would otherwise read
    # them as indices counted from the end and silently pick the wrong page.
    if not 0 <= page_idx < len(unique_pages):
        raise IndexError(page_number)
    selected_page = unique_pages[page_idx]
    print(f"\n‚úÖ Selected: {selected_page}")
except (ValueError, IndexError):
    # Non-numeric or out-of-range input falls back to the first page
    print("‚ùå Invalid selection, using first page")
    selected_page = unique_pages[0]

# Filter to selected page
page_keywords = df_renamed[df_renamed['slug'] == selected_page].copy()

print(f"\nüìä Keywords for {selected_page}:")
if 'priority' in page_keywords.columns:
    priority_counts = page_keywords['priority'].value_counts()
    print(priority_counts.to_string())
else:
    print("(no priority column found)")


üéØ SELECT PAGE TO TEST

Available pages:
1. /vacation-packages/corporate-travel (13 keywords)
2. /vacation-packages/destination-weddings (42 keywords)
3. /vacation-packages/honeymoons (38 keywords)
4. /vacation-packages/adventure (34 keywords)
5. /vacation-packages/long-stay (31 keywords)
6. /vacation-packages/beach (29 keywords)
7. /vacation-packages/theme-parks (17 keywords)
8. /vacation-packages/family (32 keywords)
9. /things-to-do/family (17 keywords)
10. /things-to-do/friends (23 keywords)
11. /things-to-do/couples (11 keywords)
12. /vacation-packages/couples (21 keywords)
13. /vacation-packages/food-wine (7 keywords)
14. /things-to-do/wine-food (9 keywords)
15. /vacation-packages/golf-vacations-sports (22 keywords)
16. /things-to-do/golf-sports (9 keywords)
17. /vacation-packages/pet-friendly-travel (8 keywords)
18. /things-to-do/pet-friendly (15 keywords)
19. /things-to-do/accessible (16 keywords)

Enter page number to test (1-19): 8

‚úÖ Selected: /vacation-packages/family


In [None]:
# ============================================================================
# STEP 5: ENTER TARGET CATEGORY
# ============================================================================

# NOTE(review): the recorded run of this notebook got "/Travel" back from the
# API for a target under "/Travel & Transportation", so some names listed below
# may not match the taxonomy the API call returns - confirm against the
# category documentation linked in the prompt.
print("\n" + "="*80)
print("üéØ TARGET CATEGORY")
print("="*80)
print("""
Enter the category you want this page to be detected as.

Examples:
- Travel
- Business
- Food & Drink
- Sports
- Arts & Entertainment

Common Google NLP categories:
- /Arts & Entertainment
- /Autos & Vehicles
- /Beauty & Fitness
- /Business & Industrial
- /Computers & Electronics
- /Finance
- /Food & Drink
- /Games
- /Health
- /Hobbies & Leisure
- /Home & Garden
- /Internet & Telecom
- /Jobs & Education
- /Law & Government
- /News
- /Online Communities
- /People & Society
- /Pets & Animals
- /Real Estate
- /Reference
- /Science
- /Shopping
- /Sports
- /Travel & Transportation

Tip: Use broad terms like "Travel" - they'll match any subcategory
Full list: https://cloud.google.com/natural-language/docs/categories
""")

# Trim surrounding whitespace. The target is matched as a substring of the
# returned category names, so padded input would never match and
# whitespace-only input would match any category name containing a space.
target_category = input("Enter your target category: ").strip()
if not target_category:
    print("\nNo target category entered - arrangements will be ranked by overall confidence only")
print(f"\n‚úÖ Target: {target_category}")


üéØ TARGET CATEGORY

Enter the category you want this page to be detected as.

Examples:
- Travel
- Business
- Food & Drink
- Sports
- Arts & Entertainment

Common Google NLP categories:
- /Arts & Entertainment
- /Autos & Vehicles
- /Beauty & Fitness
- /Business & Industrial
- /Computers & Electronics
- /Finance
- /Food & Drink
- /Games
- /Health
- /Hobbies & Leisure
- /Home & Garden
- /Internet & Telecom
- /Jobs & Education
- /Law & Government
- /News
- /Online Communities
- /People & Society
- /Pets & Animals
- /Real Estate
- /Reference
- /Science
- /Shopping
- /Sports
- /Travel & Transportation

Tip: Use broad terms like "Travel" - they'll match any subcategory
Full list: https://cloud.google.com/natural-language/docs/categories

Enter your target category: /Travel & Transportation/Specialty Travel/Family Travel

‚úÖ Target: /Travel & Transportation/Specialty Travel/Family Travel


In [None]:
# ============================================================================
# STEP 6: CONTENT SIMULATION FUNCTION
# ============================================================================

def simulate_page_content(primary_kws, secondary_kws, tertiary_kws, brand_name="Your Brand"):
    """
    Build the text that is sent to the NLP API.

    The output is deliberately unstructured: every usable keyword joined by a
    single space, primary tier first, then secondary, then tertiary.

    Args:
        primary_kws: Primary-tier keywords. If none of them is usable the
            function returns "" regardless of the other tiers.
        secondary_kws: Secondary-tier keywords (may be empty or None).
        tertiary_kws: Tertiary-tier keywords (may be empty or None).
        brand_name: Kept for interface compatibility; it does not affect the
            returned text.

    Returns:
        str: The space-separated keywords, or "" when there is no usable
        primary keyword.
    """
    def _usable(keywords):
        """Return the keywords as stripped strings, skipping None, NaN and blanks."""
        cleaned = []
        for kw in keywords or []:
            # pandas represents a blank CSV cell as float NaN; NaN is the only
            # float that is not equal to itself.
            if kw is None or (isinstance(kw, float) and kw != kw):
                continue
            text = str(kw).strip()
            if text:
                cleaned.append(text)
        return cleaned

    primary = _usable(primary_kws)
    if not primary:
        return ""

    # Combine ALL keywords with spaces, preserving tier order
    return " ".join(primary + _usable(secondary_kws) + _usable(tertiary_kws))


In [None]:
# ============================================================================
# STEP 7: GOOGLE NLP ANALYSIS FUNCTION
# ============================================================================

def _entity_type_name(entity):
    """Return a readable name for an entity's type, falling back to its string form."""
    try:
        if hasattr(entity.type_, 'name'):
            return entity.type_.name
        return language_v1.Entity.Type(entity.type_).name
    except Exception:
        # Best effort only: an unrecognised type value must not abort the analysis
        return str(entity.type_)


def _find_target_match(categories, target_category):
    """Return the first category whose name contains the target (case-insensitive), else None."""
    needle = (target_category or "").strip().lower()
    if not needle:
        # A blank target would otherwise be a substring of every category name
        return None
    for cat in categories:
        if needle in cat.name.lower():
            return cat
    return None


def analyze_with_google_nlp(content_text, target_category=None):
    """
    Analyze content with the Google NLP API using annotate_text.

    Requests entity extraction and text classification in a single call.

    Args:
        content_text: Plain text to analyze. Blank or missing text returns
            None without contacting the API.
        target_category: Optional category name or fragment. It is compared
            case-insensitively, after trimming whitespace, as a substring of
            each returned category name.

    Returns:
        None if the content is blank or the API call fails (the error is
        printed). Otherwise a dict with:
            top_category / top_confidence: first returned category, or
                None / 0 when no category was returned
            all_categories: up to five (name, confidence) tuples
            top_entities: up to five (name, salience, type name) tuples,
                also reported when no category was returned
            matches_target / target_confidence: result of the target match
            matched_category: present only when the target matched
    """
    if not content_text or not content_text.strip():
        print("‚ö†Ô∏è  NLP API error: no content to analyze")
        return None

    client = language_v1.LanguageServiceClient()

    document = language_v1.Document(
        content=content_text,
        type_=language_v1.Document.Type.PLAIN_TEXT
    )

    try:
        response = client.annotate_text(
            document=document,
            features={
                'extract_entities': True,
                'classify_text': True
            }
        )

        categories = response.categories
        entities = response.entities

    except Exception as e:
        # Callers treat None as "skip this arrangement", so report and carry on
        print(f"‚ö†Ô∏è  NLP API error: {e}")
        return None

    results = {
        'top_category': categories[0].name if categories else None,
        'top_confidence': categories[0].confidence if categories else 0,
        'all_categories': [(cat.name, cat.confidence) for cat in categories[:5]],
        'top_entities': [
            (ent.name, ent.salience, _entity_type_name(ent)) for ent in entities[:5]
        ],
        'matches_target': False,
        'target_confidence': 0
    }

    matched = _find_target_match(categories, target_category)
    if matched is not None:
        results['matches_target'] = True
        results['target_confidence'] = matched.confidence
        results['matched_category'] = matched.name

    return results

In [None]:
# ============================================================================
# STEP 8: RUN THE TESTS
# ============================================================================

print("\n" + "="*80)
print(f"üß™ TESTING: {selected_page}")
print("="*80)


def _numeric_volume(raw_volume):
    """Return the search volume as a number; blank or non-numeric values become 0."""
    if isinstance(raw_volume, str):
        # Tolerate thousands separators such as "1,000"
        raw_volume = raw_volume.replace(',', '').strip()
    volume = pd.to_numeric(raw_volume, errors='coerce')
    return 0 if pd.isna(volume) else volume


# Prepare keyword list
keyword_list = []
for _, row in page_keywords.iterrows():
    raw_tier = row.get('priority')
    keyword_list.append({
        'keyword': row['keyword'],
        # Normalised so the volume sort below never compares NaN or strings
        'volume': _numeric_volume(row.get('volume', 0)),
        # A blank priority is NaN (a float with no .lower()); such rows get an
        # empty tier and therefore fall into none of the buckets below.
        'tier': '' if pd.isna(raw_tier) else str(raw_tier).strip().lower()
    })

# Organize by tier (tiers are already lower-cased above)
primary = [kw for kw in keyword_list if 'primary' in kw['tier']]
secondary = [kw for kw in keyword_list if 'secondary' in kw['tier']]
tertiary = [kw for kw in keyword_list if 'tertiary' in kw['tier']]

print(f"\nKeyword Distribution:")
print(f"  Primary: {len(primary)}")
print(f"  Secondary: {len(secondary)}")
print(f"  Tertiary: {len(tertiary)}")

# Define test arrangements
arrangements = [
    {
        'name': 'Baseline (Current)',
        'description': 'Your current keyword arrangement',
        'primary': [kw['keyword'] for kw in primary],
        'secondary': [kw['keyword'] for kw in secondary],
        'tertiary': [kw['keyword'] for kw in tertiary]
    },
    {
        'name': 'Primary Focus',
        'description': 'Only primary keywords',
        'primary': [kw['keyword'] for kw in primary[:2]],
        'secondary': [],
        'tertiary': []
    },
    {
        'name': 'Balanced Top Terms',
        'description': 'Top 2 primary + top 3 secondary',
        'primary': [kw['keyword'] for kw in primary[:2]],
        'secondary': [kw['keyword'] for kw in secondary[:3]],
        'tertiary': []
    },
]

# Add swap test if applicable
if len(primary) > 0 and len(secondary) > 0:
    arrangements.append({
        'name': 'Swap Primary/Secondary',
        'description': 'Test if secondary term is stronger',
        'primary': [secondary[0]['keyword']],
        'secondary': [primary[0]['keyword']] + [kw['keyword'] for kw in secondary[1:3]],
        'tertiary': []
    })

# Add high-volume test (only possible when at least one keyword has a tier)
all_kws = primary + secondary + tertiary
sorted_by_vol = sorted(all_kws, key=lambda x: x['volume'], reverse=True)
if sorted_by_vol:
    arrangements.append({
        'name': 'High-Volume Focus',
        'description': 'Prioritize highest volume',
        'primary': [sorted_by_vol[0]['keyword']],
        'secondary': [kw['keyword'] for kw in sorted_by_vol[1:4]],
        'tertiary': []
    })
else:
    print("\nNo keywords with a Primary/Secondary/Tertiary priority were found for this page")

# NOTE: this arrangement currently uses the same keyword lists as the baseline,
# so both are expected to produce the same content.
arrangements.append({
    'name': 'All-In (Full Coverage)',
    'description': 'Include all keywords',
    'primary': [kw['keyword'] for kw in primary],
    'secondary': [kw['keyword'] for kw in secondary],
    'tertiary': [kw['keyword'] for kw in tertiary]  # every tertiary keyword, no cap
})


üß™ TESTING: /vacation-packages/family

Keyword Distribution:
  Primary: 4
  Secondary: 9
  Tertiary: 19


In [None]:
# Dry-run preview of the content generated for each arrangement.
# No API request is made here: the analysis loop in the following cell performs
# the real calls, so repeating them in this cell would only duplicate billable
# requests whose results are thrown away.

# Google documents a minimum of twenty tokens for content classification, and
# the recorded runs of this notebook were rejected at 7 and 16 words.
MIN_CLASSIFY_WORDS = 20

for i, arrangement in enumerate(arrangements, 1):
    print(f"\nüî¨ Test {i}/{len(arrangements)}: {arrangement['name']}")

    content = simulate_page_content(
        arrangement['primary'],
        arrangement['secondary'],
        arrangement['tertiary']
    )

    word_count = len(content.split())
    print(f"   DEBUG - Content length: {len(content)} chars, {word_count} words")
    print(f"   DEBUG - First 100 chars: {content[:100]}")

    if not content:
        continue

    # Word count is only an approximation of the API's token count, so this is
    # a warning rather than a hard stop.
    if word_count < MIN_CLASSIFY_WORDS:
        print(f"   DEBUG - Fewer than {MIN_CLASSIFY_WORDS} words: classification may be rejected as too short")


üî¨ Test 1/6: Baseline (Current)
   DEBUG - Content length: 990 chars, 128 words
   DEBUG - First 100 chars: family vacations family vacation specials all inclusive all inclusive family vacation packages famil

üî¨ Test 2/6: Primary Focus
   DEBUG - Content length: 55 chars, 7 words
   DEBUG - First 100 chars: family vacations family vacation specials all inclusive
‚ö†Ô∏è  NLP API error: 400 Invalid text content: too few tokens (words) to process. [field_violations {
  field: "document"
  description: "Invalid text content: too few tokens (words) to process."
}
]

üî¨ Test 3/6: Balanced Top Terms
   DEBUG - Content length: 120 chars, 16 words
   DEBUG - First 100 chars: family vacations family vacation specials all inclusive family vacation packages family resorts all 
‚ö†Ô∏è  NLP API error: 400 Invalid text content: too few tokens (words) to process. [field_violations {
  field: "document"
  description: "Invalid text content: too few tokens (words) to process."
}
]

üî¨ Test 4/6

In [None]:
# Send every arrangement through the NLP API and keep one summary row per
# arrangement that produced a result.
results = []
print(f"\n{'='*80}")
print("Running NLP Analysis...")
print(f"{'='*80}")

for i, arrangement in enumerate(arrangements, 1):
    print(f"\nüî¨ Test {i}/{len(arrangements)}: {arrangement['name']}")

    primary_terms = arrangement['primary']
    secondary_terms = arrangement['secondary']
    tertiary_terms = arrangement['tertiary']

    content = simulate_page_content(primary_terms, secondary_terms, tertiary_terms)

    # Arrangements with no content are skipped without an API call
    if content:
        nlp_result = analyze_with_google_nlp(content, target_category)

        # A falsy result means the API call failed; nothing is recorded
        if nlp_result:
            result = {
                'arrangement': arrangement['name'],
                'description': arrangement['description'],
                'primary_kws': primary_terms[:3],
                'num_primary': len(primary_terms),
                'num_secondary': len(secondary_terms),
                'num_tertiary': len(tertiary_terms),
                'detected_category': nlp_result['top_category'],
                'confidence': nlp_result['top_confidence'],
                'matches_target': nlp_result['matches_target'],
                'target_confidence': nlp_result['target_confidence'],
                'matched_category': nlp_result.get('matched_category', 'N/A'),
                'all_categories': nlp_result['all_categories'][:3],
                'top_entities': nlp_result['top_entities'][:3]
            }
            results.append(result)

            print(f"   Primary: {', '.join(result['primary_kws'])}")
            print(f"   Detected: {result['detected_category']}")
            print(f"   Confidence: {result['confidence']:.1%}")
            if not result['matches_target']:
                print(f"   ‚ùå Does NOT match target")
            else:
                print(f"   ‚úÖ MATCHES target! ({result['target_confidence']:.1%})")


Running NLP Analysis...

üî¨ Test 1/6: Baseline (Current)
   Primary: family vacations, family vacation specials all inclusive, all inclusive family vacation packages
   Detected: /Travel
   Confidence: 99.0%
   ‚ùå Does NOT match target

üî¨ Test 2/6: Primary Focus
‚ö†Ô∏è  NLP API error: 400 Invalid text content: too few tokens (words) to process. [field_violations {
  field: "document"
  description: "Invalid text content: too few tokens (words) to process."
}
]

üî¨ Test 3/6: Balanced Top Terms
‚ö†Ô∏è  NLP API error: 400 Invalid text content: too few tokens (words) to process. [field_violations {
  field: "document"
  description: "Invalid text content: too few tokens (words) to process."
}
]

üî¨ Test 4/6: Swap Primary/Secondary
‚ö†Ô∏è  NLP API error: 400 Invalid text content: too few tokens (words) to process. [field_violations {
  field: "document"
  description: "Invalid text content: too few tokens (words) to process."
}
]

üî¨ Test 5/6: High-Volume Focus
   Primary: grea

In [None]:
# ============================================================================
# STEP 9: SHOW RESULTS
# ============================================================================

if not results:
    print("\n‚ùå No results generated")
else:
    # Rank by confidence in the target category first, overall confidence second
    best = max(results, key=lambda x: (x['target_confidence'], x['confidence']))
    any_target_match = any(r['matches_target'] for r in results)

    print(f"\n{'='*80}")
    print("üìä RESULTS SUMMARY")
    print(f"{'='*80}")

    results_df = pd.DataFrame([{
        'Arrangement': r['arrangement'],
        'Match': '‚úÖ' if r['matches_target'] else '‚ùå',
        'Target': f"{r['target_confidence']:.1%}",
        'Overall': f"{r['confidence']:.1%}",
        'KWs': f"{r['num_primary']}/{r['num_secondary']}/{r['num_tertiary']}"
    } for r in results])

    print("\n" + results_df.to_string(index=False))

    print(f"\n{'='*80}")
    print(f"üèÜ RECOMMENDED: {best['arrangement']}")
    print(f"{'='*80}")
    print(f"{best['description']}")
    print(f"\nPrimary ({best['num_primary']}): {', '.join(best['primary_kws'])}")
    print(f"Secondary: {best['num_secondary']}")
    print(f"Tertiary: {best['num_tertiary']}")
    print(f"\nDetected: {best['detected_category']}")
    print(f"Confidence: {best['confidence']:.1%}")
    if best['matches_target']:
        print(f"‚úÖ Target Match: {best['matched_category']} ({best['target_confidence']:.1%})")
    elif target_category and not any_target_match:
        # Without this the "recommended" arrangement reads like a success even
        # though it was chosen on overall confidence alone.
        print(f"NOTE: no arrangement matched the target '{target_category}'; "
              "the recommendation is based on overall confidence only")

    print(f"\nTop Entities:")
    for entity, salience, ent_type in best['top_entities']:
        print(f"  ‚Ä¢ {entity} (salience: {salience:.2f}, type: {ent_type})")

    # Export (filename built once; str() covers slugs that were parsed as numbers)
    export_name = f'arrangement_results_{str(selected_page).replace("/", "_")}.csv'
    results_export = pd.DataFrame(results)
    results_export.to_csv(export_name, index=False)
    print(f"\n‚úÖ Exported results to {export_name}")

print("\n" + "="*80)
print("‚úÖ COMPLETE!")
print("="*80)


üìä RESULTS SUMMARY

           Arrangement Match Target Overall    KWs
    Baseline (Current)     ‚ùå   0.0%   99.0% 4/9/19
    Balanced Top Terms     ‚ùå   0.0%   99.0%  2/3/0
     High-Volume Focus     ‚ùå   0.0%   99.0%  1/3/0
All-In (Full Coverage)     ‚ùå   0.0%   99.0% 4/9/19

üèÜ RECOMMENDED: Baseline (Current)
Your current keyword arrangement

Primary (4): family vacations, family vacation specials all inclusive, all inclusive family vacation packages
Secondary: 9
Tertiary: 19

Detected: /Travel
Confidence: 99.0%

Top Entities:
  ‚Ä¢ family vacations (salience: 0.34, type: EVENT)
  ‚Ä¢ family vacations (salience: 0.18, type: EVENT)
  ‚Ä¢ family vacation specials (salience: 0.11, type: OTHER)

‚úÖ Exported results to arrangement_results__vacation-packages_family.csv

‚úÖ COMPLETE!
