<a href="https://colab.research.google.com/github/sh1nysparkly/relevance-validation/blob/main/NLP_Page_Title_Meta_Content_Block.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# -*- coding: utf-8 -*-
"""
Content Draft NLP Analyzer (v2)

This script takes a CSV of draft content, runs it through the Google NLP API,
and generates four output files for detailed analysis:
1. pages_summary.csv: A high-level overview of each page.
2. entities.csv: A detailed, one-row-per-entity breakdown.
3. categories.csv: A detailed, one-row-per-category breakdown.
4. content_draft_analysis.json: The full raw data in a nested format.
"""

import pandas as pd
import json
from google.colab import files
from google.cloud import language_v1
import time
import os

In [None]:
# --- 1. File Uploads ---
# Ask for the service-account key. The saved filename is taken from the first
# key of the mapping returned by files.upload().
print("\nüîë Upload your Google Cloud service account JSON key:")
key_uploaded = files.upload()

# A cancelled/empty upload would otherwise surface as a bare IndexError on the
# indexing below; stop with a message that says what to do instead.
if not key_uploaded:
    raise RuntimeError(
        "No file was uploaded. Re-run this cell and select your service account JSON key."
    )

key_filename = list(key_uploaded.keys())[0]
print(f"‚úÖ Loaded credentials: {key_filename}")


üîë Upload your Google Cloud service account JSON key:


Saving nlp-entity-detection-79a294e928f3.json to nlp-entity-detection-79a294e928f3 (8).json
‚úÖ Loaded credentials: nlp-entity-detection-79a294e928f3 (8).json


In [None]:
print("üìÅ Upload your CSV file with content drafts:")
print("   (It needs columns: 'page_identifier', 'page_title', 'meta_description', 'body_copy')")
uploaded = files.upload()
csv_filename = list(uploaded.keys())[0]
print(f"‚úÖ Loaded: {csv_filename}")


üìÅ Upload your CSV file with content drafts:
   (It needs columns: 'page_identifier', 'page_title', 'meta_description', 'body_copy')


Saving copy_input_templated.csv to copy_input_templated (6).csv
‚úÖ Loaded: copy_input_templated (6).csv


In [None]:
# --- 2 Authentication ---
# Relies on 'key_filename' and 'csv_filename' set by the two upload cells.
# Point the Google client library at the uploaded key, then build the client
# that the analysis function uses.
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = key_filename
client = language_v1.LanguageServiceClient()

print(f"\nLoading data from {csv_filename}...")
try:
    # 'utf-8-sig' decodes UTF-8 and drops a leading byte-order mark (BOM), so
    # the first header is not read with stray BOM characters glued to it.
    content_df = pd.read_csv(csv_filename, encoding='utf-8-sig')
    print("‚úÖ Successfully loaded CSV and handled BOM.")

    print("\nüïµÔ∏è  Headers as read from file:")
    print(list(content_df.columns))

except Exception as e:
    # Report the problem, then re-raise so the notebook stops here.
    print(f"‚ùå Failed to load CSV. Error: {e}")
    raise

# Fail fast if a column the processing loop indexes by name is absent;
# otherwise the problem only shows up later as a KeyError mid-run.
required_columns = ['page_identifier', 'page_title', 'meta_description', 'body_copy']
missing_columns = [name for name in required_columns if name not in content_df.columns]
if missing_columns:
    raise ValueError(
        f"CSV is missing required column(s): {missing_columns}. "
        f"Columns found: {list(content_df.columns)}"
    )

# Ensure we handle empty cells gracefully: blank cells become '' instead of NaN
if not content_df.empty:
    content_df = content_df.fillna('')


Loading data from copy_input_templated (6).csv...
‚úÖ Successfully loaded CSV and handled BOM.

üïµÔ∏è  Headers as read from file:
['page_identifier', 'page_title', 'meta_description', 'body_copy']


In [None]:
# --- 3. The Core NLP Analysis Function (Upgraded for Total Mentions Count) ---
def analyze_text_content(text):
    """Run entity extraction and text classification on one HTML document.

    The text is submitted as an HTML document through the module-level
    ``client`` in a single ``annotate_text`` call.

    Args:
        text: The document content. ``None``, empty, or whitespace-only input
            is rejected without calling the API.

    Returns:
        A dict with three keys:
          * ``'categories'``: list of ``{'name', 'confidence'}`` dicts, highest
            confidence first.
          * ``'entities'``: list of dicts with ``'name'``, ``'type'``,
            ``'salience'``, ``'wikipedia_url'``, ``'mid'``,
            ``'unique_mentions_list'`` and ``'total_mentions_count'``, highest
            salience first.
          * ``'error'``: ``None`` on success, otherwise a message string (the
            two lists are then empty).

    Never raises for API failures: any exception from the request or from
    reading the response is caught and reported through ``'error'``.
    """
    if not text or text.strip() == '':
        return { 'categories': [], 'entities': [], 'error': 'Input text was empty.' }

    try:
        document = language_v1.Document(
            content=text,
            type_=language_v1.Document.Type.HTML
        )
        features = {'extract_entities': True, 'classify_text': True}
        response = client.annotate_text(
            document=document,
            features=features,
            encoding_type=language_v1.EncodingType.UTF8
        )

        categories = [{
            'name': category.name,
            'confidence': round(category.confidence, 4)
        } for category in response.categories]
        categories.sort(key=lambda x: x['confidence'], reverse=True)

        entities = []
        for entity in response.entities:
            # Surface text of every mention of this entity, in response order.
            all_mentions_text = [mention.text.content for mention in entity.mentions]

            # dict.fromkeys() de-duplicates while keeping first-seen order.
            # A set() would reorder the strings differently from run to run,
            # making the exported files non-reproducible.
            unique_mentions_list = list(dict.fromkeys(all_mentions_text))
            total_mentions_count = len(all_mentions_text)

            entities.append({
                'name': entity.name,
                'type': language_v1.Entity.Type(entity.type_).name,
                'salience': round(entity.salience, 4),
                'wikipedia_url': entity.metadata.get('wikipedia_url', ''),
                'mid': entity.metadata.get('mid', ''),
                'unique_mentions_list': unique_mentions_list,
                'total_mentions_count': total_mentions_count
            })
        entities.sort(key=lambda x: x['salience'], reverse=True)

        return { 'categories': categories, 'entities': entities, 'error': None }
    except Exception as e:
        # Best-effort by design: one failing page must not abort the batch.
        return { 'categories': [], 'entities': [], 'error': str(e) }


In [None]:
# --- 4. Main Processing Loop ---
# Analyze every draft in turn and collect one result record per page.
print(f"\nüöÄ Processing {len(content_df)} content drafts...")
analysis_results = []
failed_pages = 0

# enumerate() drives the progress counter so it stays correct even when the
# DataFrame's index labels are not 0..n-1.
for position, (_, row) in enumerate(content_df.iterrows(), start=1):
    identifier = row['page_identifier']
    print(f"  Analyzing: {identifier} ({position}/{len(content_df)})...")

    # Title and meta description are wrapped in tags; body_copy is inserted
    # as-is (presumably already HTML -- confirm against the input template).
    combined_text = f"<h1>{row['page_title']}</h1> <p>{row['meta_description']}</p> {row['body_copy']}"
    result = analyze_text_content(combined_text)

    # analyze_text_content reports failures through result['error'] instead of
    # raising, so show them here rather than letting them pass unnoticed.
    if result['error']:
        failed_pages += 1
        print(f"    WARNING: analysis failed for {identifier}: {result['error']}")

    analysis_results.append({
        'page_identifier': identifier,
        'char_count': len(combined_text),  # length of the combined markup sent to the API
        'detected_categories': result['categories'],
        'detected_entities': result['entities'],
        'error': result['error']
    })
    time.sleep(0.5)  # pause between requests

print("\n‚úÖ NLP analysis complete!")
if failed_pages:
    print(f"  WARNING: {failed_pages} of {len(content_df)} page(s) returned an error; see the 'error' field in the results.")


üöÄ Processing 1 content drafts...
  Analyzing: /articles/ultimate-5-day-toronto-travel-guide (1/1)...

‚úÖ NLP analysis complete!


In [None]:
# --- 5. Data Structuring for CSV Outputs (with two mention columns) ---
# Flatten the nested per-page results into three flat row lists:
#   pages_summary_data -> one row per page
#   entities_data      -> one row per (page, entity)
#   categories_data    -> one row per (page, category)
print("\nüìä Preparing data for CSV export...")
pages_summary_data, entities_data, categories_data = [], [], []

for result in analysis_results:
    page_id = result['page_identifier']
    entities = result['detected_entities']
    categories = result['detected_categories']

    # Both lists are taken in their stored order: first five entity names and
    # first three category names.
    leading_entity_names = [e['name'] for e in entities[:5]]
    leading_category_names = [c['name'] for c in categories[:3]]

    summary_row = {
        'page_identifier': page_id,
        'char_count': result['char_count'],
        'entity_count': len(entities),
        'category_count': len(categories),
        'top_entities': ', '.join(leading_entity_names),
        'top_categories': ', '.join(leading_category_names)
    }
    pages_summary_data.append(summary_row)

    # One row per entity; the unique mention strings are collapsed into a
    # single ' | '-separated cell next to the total mention count.
    entities_data.extend(
        {
            'page_identifier': page_id,
            'entity': item['name'],
            'type': item['type'],
            'salience': item['salience'],
            'unique_mentions': ' | '.join(item['unique_mentions_list']),
            'total_mentions': item['total_mentions_count'],
            'wikipedia_url': item['wikipedia_url'],
            'mid': item['mid']
        }
        for item in entities
    )

    # One row per category.
    categories_data.extend(
        {
            'page_identifier': page_id,
            'category': item['name'],
            'confidence': item['confidence']
        }
        for item in categories
    )



üìä Preparing data for CSV export...


In [None]:
# --- 6. Save All Output Files ---
# Turn the three row lists into DataFrames, write them as CSV, write the full
# nested results as JSON, then hand all four files to the browser.
pages_summary_df = pd.DataFrame(pages_summary_data)
entities_df = pd.DataFrame(entities_data)
categories_df = pd.DataFrame(categories_data)

summary_filename = 'pages_summary.csv'
entities_filename = 'entities.csv'
categories_filename = 'categories.csv'
json_filename = 'content_draft_analysis.json'

# Write each table without the DataFrame index column.
for table, table_path in (
    (pages_summary_df, summary_filename),
    (entities_df, entities_filename),
    (categories_df, categories_filename),
):
    table.to_csv(table_path, index=False)
    print(f"  - Saved {table_path}")

# The JSON keeps the raw nested per-page results plus a little run metadata.
run_metadata = { "total_documents_analyzed": len(content_df), "source_file": csv_filename }
output_json = { "metadata": run_metadata, "analysis": analysis_results }
with open(json_filename, 'w') as json_handle:
    json.dump(output_json, json_handle, indent=2)
print(f"  - Saved {json_filename}")

# --- 7. Download All Files ---
print("\n‚¨áÔ∏è Downloading your analysis files...")
for finished_path in (summary_filename, entities_filename, categories_filename, json_filename):
    files.download(finished_path)

print("\n‚ú® All done! Check your downloads folder.")

  - Saved pages_summary.csv
  - Saved entities.csv
  - Saved categories.csv
  - Saved content_draft_analysis.json

‚¨áÔ∏è Downloading your analysis files...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


‚ú® All done! Check your downloads folder.


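**Note:** The cells below appear to be an earlier, plain-text version of steps 2–6, kept for reference. Running them rebinds `client`, `content_df`, `analyze_text_content`, and `analysis_results` from the cells above.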

In [None]:
# --- 2. Authentication & Setup ---

# Relies on 'key_filename' and 'csv_filename' set by the upload cells.
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = key_filename
client = language_v1.LanguageServiceClient()

print(f"\nLoading data from {csv_filename}...")

# Try encodings from strictest to most permissive. 'latin1' can decode any
# byte sequence, so it must come last: tried first, it would never fail, would
# silently mis-decode UTF-8 text, and would leave a BOM glued to the first
# header. 'utf-8-sig' decodes UTF-8 and drops a leading BOM.
content_df = pd.DataFrame()  # stays empty if nothing below succeeds
for candidate_encoding in ('utf-8-sig', 'windows-1252', 'latin1'):
    try:
        content_df = pd.read_csv(csv_filename, encoding=candidate_encoding)
        print(f"‚úÖ Successfully loaded CSV with '{candidate_encoding}' encoding.")
        break
    except UnicodeDecodeError as e:
        # Wrong guess for this file's encoding; move on to the next candidate.
        print(f"Failed with '{candidate_encoding}', trying the next encoding... Error: {e}")
    except Exception as e:
        # Not an encoding problem (e.g. missing or malformed file), so trying
        # another encoding cannot help. Keep the empty DataFrame so later
        # cells still run, as before.
        print(f"‚ùå Failed to load CSV with common encodings. Please check the file. Error: {e}")
        break

# Ensure we handle empty cells gracefully: blank cells become '' instead of NaN
if not content_df.empty:
    content_df = content_df.fillna('')


Loading data from copy_input_templated.csv...


In [None]:
# --- 3. The Core NLP Analysis Function ---
def analyze_text_content(text):
    """
    Send one plain-text document to the Google NLP API and summarize the reply.

    NOTE: this definition rebinds ``analyze_text_content`` from the earlier
    cell; unlike that version it submits PLAIN_TEXT and returns no mention
    data for entities.

    Returns a dict with:
      'categories' - every category as {'name', 'confidence'}, highest
                     confidence first
      'entities'   - every entity as {'name', 'type', 'salience'}, highest
                     salience first (no limit on how many)
      'error'      - None on success, otherwise the error message; both lists
                     are empty in that case
    """
    # Nothing to analyze for missing or whitespace-only input.
    if not text or not text.strip():
        return { 'categories': [], 'entities': [], 'error': 'Input text was empty.' }

    try:
        response = client.annotate_text(
            document=language_v1.Document(
                content=text,
                type_=language_v1.Document.Type.PLAIN_TEXT
            ),
            features={ 'extract_entities': True, 'classify_text': True }
        )

        # Categories, ranked by confidence (descending).
        categories = sorted(
            [
                { 'name': found.name, 'confidence': round(found.confidence, 4) }
                for found in response.categories
            ],
            key=lambda item: item['confidence'],
            reverse=True
        )

        # Entities, ranked by salience (descending); none are dropped.
        entities = []
        for found in response.entities:
            entities.append({
                'name': found.name,
                'type': language_v1.Entity.Type(found.type_).name,
                'salience': round(found.salience, 4)
            })
        entities.sort(key=lambda item: item['salience'], reverse=True)

    except Exception as e:
        # Any failure is reported through 'error' rather than raised.
        return { 'categories': [], 'entities': [], 'error': str(e) }

    return { 'categories': categories, 'entities': entities, 'error': None }

# --- 4. Main Processing Loop ---
# Analyze each draft in turn and collect one result record per page.
print(f"\nProcessing {len(content_df)} content drafts...")
analysis_results = []

for index, row in content_df.iterrows():
    identifier = row['page_identifier']
    print(f"  Analyzing: {identifier} ({index + 1}/{len(content_df)})...")

    # Join title, meta description and body with ". " so the API receives the
    # whole page as a single document.
    title_part = row['page_title']
    meta_part = row['meta_description']
    body_part = row['body_copy']
    combined_text = f"{title_part}. {meta_part}. {body_part}"

    result = analyze_text_content(combined_text)

    # Keep the page identifier alongside whatever the analysis returned,
    # including any error message.
    page_record = {
        'page_identifier': identifier,
        'detected_categories': result['categories'],
        'detected_entities': result['entities'],
        'error': result['error']
    }
    analysis_results.append(page_record)

    # Pause between requests to go easy on the API.
    time.sleep(0.5)

print("\n‚úÖ NLP analysis complete!")

# --- 5. Output Generation ---
# Bundle run metadata with the per-page results and write them to one JSON file.
output_filename = 'content_draft_analysis.json'

output = {
    "metadata": {
        "total_documents_analyzed": len(content_df),
        "source_file": csv_filename
    },
    "analysis": analysis_results
}

# indent=2 keeps the file readable when opened by hand.
with open(output_filename, 'w') as results_file:
    json.dump(output, results_file, indent=2)

print(f"\nüéØ Created detailed results file: {output_filename}")

# On-screen recap: for each page show either its error or its leading
# categories and entities.
print("\nüìä Quick Summary:")
for result in analysis_results:
    print(f"\n--- Page: {result['page_identifier']} ---")

    if not result['error']:
        # First three categories in stored order.
        top_cats = result['detected_categories'][:3]
        if not top_cats:
            print("  No categories detected.")
        else:
            print("  Top Categories:")
            for cat in top_cats:
                print(f"    - {cat['name']} (Confidence: {cat['confidence']:.2%})")

        # First five entities in stored order.
        top_ents = result['detected_entities'][:5]
        if not top_ents:
            print("  No entities detected.")
        else:
            print("  Top Entities:")
            for ent in top_ents:
                print(f"    - {ent['name']} (Salience: {ent['salience']:.3f}, Type: {ent['type']})")
    else:
        # A failed page has nothing else to report.
        print(f"  ‚ö†Ô∏è Error: {result['error']}")

# --- 6. Download the File ---
# Hand the JSON written above to the browser.
print(f"\n‚¨áÔ∏è Downloading {output_filename}...")
files.download(output_filename)

print("\n‚ú® All done!")


Processing 1 content drafts...
  Analyzing: /articles/ultimate-5-day-toronto-travel-guide (1/1)...

‚úÖ NLP analysis complete!

üéØ Created detailed results file: content_draft_analysis.json

üìä Quick Summary:

--- Page: /articles/ultimate-5-day-toronto-travel-guide ---
  Top Categories:
    - /Travel (Confidence: 97.00%)
  Top Entities:
    - FC Barcelona (Salience: 0.294, Type: LOCATION)
    - AMA (Salience: 0.070, Type: ORGANIZATION)
    - Barcelona -- Architecture & Art Awaits Nestled (Salience: 0.045, Type: ORGANIZATION)
    - Things (Salience: 0.027, Type: OTHER)
    - AMA Travel (Salience: 0.024, Type: OTHER)

‚¨áÔ∏è Downloading content_draft_analysis.json...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


‚ú® All done!
