In [1]:
import os
import time
from urllib.parse import unquote, urlparse

import psycopg2
import requests
from dotenv import load_dotenv

# Load environment variables
load_dotenv()
api_key = os.getenv("HF_API_KEY")
db_url = os.getenv("DB_CONN")  # Railway PostgreSQL connection URL

# Fail fast when the database URL is missing or empty
if not db_url:
    raise ValueError("Database connection URL not found in environment variables")


def _decode_url_part(value):
    """Percent-decode one URL component, passing ``None`` through unchanged."""
    return unquote(value) if value is not None else None


# Split the connection URL into the keyword arguments handed to
# psycopg2.connect(). urlparse() leaves percent-escapes in place, so a
# password such as "p%40ss" (for "p@ss") has to be decoded explicitly or the
# database would receive the escaped text instead of the real credential.
result = urlparse(db_url)
conn_params = {
    'dbname': _decode_url_part(result.path[1:]),  # drop the leading "/"
    'user': _decode_url_part(result.username),
    'password': _decode_url_part(result.password),
    'host': result.hostname,
    'port': result.port
}

# API endpoint for categorization
category_model_url = "https://api-inference.huggingface.co/models/facebook/bart-large-mnli"

# A missing key would otherwise be formatted into the header as the literal
# text "Bearer None", which is not a usable credential. Stop here, before the
# database connection is opened, in the same way a missing DB URL is handled.
if not api_key:
    raise ValueError("Hugging Face API key not found in environment variables")
headers = {"Authorization": f"Bearer {api_key}"}

# Connect to PostgreSQL using Railway connection URL
conn = psycopg2.connect(**conn_params)
cursor = conn.cursor()

# Candidate labels offered to the zero-shot classifier, refined to match
# the topics seen in the actual article content
categories = [
    "Google Updates",
    "Paid Search/PPC",
    "SEO Strategy",
    "Search Console",
    "Local SEO",
    "Social Search",
    "Chrome Tools",
    "Google Ads",
]

# Pull a batch of at most 50 rows whose category is still NULL or empty;
# each row is an (id, title, url) tuple
unclassified_articles_sql = """
    SELECT id, title, url 
    FROM search_engine_land_articles 
    WHERE category IS NULL OR category = ''
    LIMIT 50
"""
cursor.execute(unclassified_articles_sql)
articles = cursor.fetchall()

# Process each article: classify its title, store the result, report progress
MAX_ATTEMPTS = 3
REQUEST_TIMEOUT_SECONDS = 30  # without a timeout a stalled request blocks forever


def _classify_title(title):
    """Return ``(label, score)`` for the top-ranked category of *title*.

    Posts the title and the candidate ``categories`` to the zero-shot
    classification endpoint, retrying up to ``MAX_ATTEMPTS`` times on network
    errors and non-200 responses.

    Returns ``None`` when every attempt fails or when a 200 response does not
    contain the expected ``labels``/``scores`` lists.
    """
    payload = {
        "inputs": title,
        "parameters": {"candidate_labels": categories}
    }

    for attempt in range(1, MAX_ATTEMPTS + 1):
        try:
            response = requests.post(
                category_model_url,
                headers=headers,
                json=payload,
                timeout=REQUEST_TIMEOUT_SECONDS,
            )
        except requests.RequestException as e:
            print(f"Error on attempt {attempt}: {e}")
            delay = 3
        else:
            if response.status_code == 200:
                try:
                    result = response.json()
                    return result["labels"][0], result["scores"][0]
                except (ValueError, KeyError, IndexError, TypeError) as e:
                    # A 200 with an unexpected body will not improve on retry.
                    print(f"Unexpected API response: {e!r}")
                    return None
            # Any non-200 status lands here, not only rate limiting.
            print(f"API returned status {response.status_code}, waiting (attempt {attempt})")
            delay = 2 * attempt  # linear backoff: 2s, then 4s

        # Sleeping after the final attempt would only delay the next article.
        if attempt < MAX_ATTEMPTS:
            time.sleep(delay)

    return None


categorized_count = 0
try:
    for article_id, title, _url in articles:
        # A NULL or empty title cannot be sliced for display or classified.
        if not title:
            print(f"Skipping article {article_id}: no title to classify")
            continue

        print(f"Processing article {article_id}: {title[:50]}...")

        classification = _classify_title(title)
        if classification is not None:
            top_category, category_score = classification

            # Update database with category
            cursor.execute(
                """
                UPDATE search_engine_land_articles 
                SET category = %s, category_confidence = %s
                WHERE id = %s
                """,
                (top_category, category_score, article_id)
            )
            # Commit per article so earlier results survive a later failure.
            conn.commit()
            categorized_count += 1
            print(f"✓ Classified: {top_category} ({category_score:.2f})")
        else:
            print(f"✗ Could not classify article {article_id}")

        # Respect API rate limits
        time.sleep(1)

    # Report what was actually written, not merely how many rows were fetched.
    print(f"Processing complete! Categorized {categorized_count} of {len(articles)} articles.")
finally:
    # Release database resources even if the loop raised.
    cursor.close()
    conn.close()

Processing article 46: How to integrate GEO with SEO...
✓ Classified: Local SEO (0.26)
Processing article 96: How to use ChatGPT Tasks for SEO...
✓ Classified: SEO Strategy (0.65)
Processing article 48: Microsoft to enforce consent compliance for advert...
✓ Classified: Paid Search/PPC (0.35)
Processing article 49: Detailed demographics in Google Ads: Targeting wit...
✓ Classified: Google Ads (0.87)
Processing article 50: How to optimize for ROAS in Google Ads using LTV i...
✓ Classified: Google Ads (0.92)
Processing article 51: 7 power moves to accelerate your PPC career...
✓ Classified: Paid Search/PPC (0.39)
Processing article 52: How Deming’s 14 principles provide the foundation ...
✓ Classified: Social Search (0.20)
Processing article 53: Google Ads rolls out channel control for Demand Ge...
✓ Classified: Google Ads (0.86)
Processing article 54: Bing pushes ad-heavy search results with 7+ sponso...
✓ Classified: Paid Search/PPC (0.80)
Processing article 55: Google AI Overviews cau