In [36]:
import os
import json
import re
from typing import List, Dict, Any, Optional
from url2md4ai import Config, ContentExtractor
from urllib.parse import urljoin, urlparse
from bs4 import BeautifulSoup
from openai import OpenAI
from pydantic import BaseModel
from collections import Counter
from datetime import datetime
import hashlib

# Structured description of a page's repeating content pattern.
# Used as the ``response_format`` for the LLM call in
# ``analyze_patterns_with_llm`` and persisted as JSON by the pattern cache.
class PatternAnalysis(BaseModel):
    # Label for the kind of content; the prompt suggests values such as
    # "blog_posts", "job_listings", "articles". The code in this file also
    # uses "unknown" (LLM fallback) and "error" (empty result).
    pattern_type: str
    # CSS selectors tried first by ``extract_using_pattern``.
    primary_selectors: List[str]
    # CSS selectors tried after the primary ones.
    fallback_selectors: List[str]
    # URL regex requested from the LLM.
    # NOTE(review): not consulted by the matching code visible in this file.
    url_pattern_regex: str
    # Text indicators requested from the LLM ("content vs navigation").
    # NOTE(review): not consulted by the matching code visible in this file.
    content_indicators: List[str]
    # Regexes searched case-insensitively in a link's href and text; a hit
    # rejects the link (see ``_matches_pattern``).
    skip_patterns: List[str]
    # The prompt asks for a value in 0.0-1.0; the range is not validated here.
    confidence_score: float
    # Free text; ``_empty_result`` stores the failure reason in this field.
    pattern_description: str
    # NOTE(review): not read anywhere in the visible code.
    estimated_items: int
    # CSS selectors handed to ``_extract_publication_date``.
    date_selectors: List[str]

# Result of one extraction run, as built by
# ``UniversalPatternExtractor.extract_pattern_links`` / ``_empty_result``.
class ExtractedPattern(BaseModel):
    # One dict per extracted link with keys 'url', 'title', 'selector_used'
    # and, when a date was found, 'publication_date'.
    links: List[Dict[str, Any]]
    # Set to len(links) by the code that builds this model.
    total_found: int
    # The pattern's ``pattern_type``, or "failed" for an empty result.
    pattern_used: str
    # Copied from the pattern's ``confidence_score`` (0.0 for an empty result).
    confidence: float
    # The URL that was passed to the extractor.
    base_url: str
    # The full pattern the links were extracted with.
    pattern_analysis: PatternAnalysis

class UniversalPatternExtractor:
    def __init__(self, openai_api_key: Optional[str] = None, cache_dir: str = "pattern_cache"):
        """Create the OpenAI client, the HTML extractor and the cache directory.

        Args:
            openai_api_key: API key for OpenAI. When omitted (or empty) the
                ``OPENAI_API_KEY`` environment variable is used instead.
            cache_dir: Directory in which per-domain pattern files are kept.
                It is created when it does not exist yet.
        """
        self.openai_api_key = openai_api_key or os.getenv("OPENAI_API_KEY")
        self.client = OpenAI(api_key=self.openai_api_key)
        self.extractor = ContentExtractor(Config.from_env())
        self.cache_dir = cache_dir

        # exist_ok=True removes the window between a separate existence check
        # and the creation, in which another process could create the folder.
        os.makedirs(self.cache_dir, exist_ok=True)
    
    def _get_domain_hash(self, url: str) -> str:
        """Generate a hash for the domain to use as cache key."""
        domain = urlparse(url).netloc
        return hashlib.md5(domain.encode()).hexdigest()[:8]
    
    def _get_cache_file_path(self, url: str) -> str:
        """Get the cache file path for a domain."""
        domain_hash = self._get_domain_hash(url)
        domain = urlparse(url).netloc
        return os.path.join(self.cache_dir, f"pattern_{domain}_{domain_hash}.json")
    
    def _load_cached_pattern(self, url: str) -> Optional[PatternAnalysis]:
        """Load cached pattern analysis for a domain."""
        cache_file = self._get_cache_file_path(url)
        
        if os.path.exists(cache_file):
            try:
                with open(cache_file, 'r', encoding='utf-8') as f:
                    data = json.load(f)
                    print(f"📦 Using cached pattern from: {cache_file}")
                    return PatternAnalysis(**data)
            except Exception as e:
                print(f"⚠️ Failed to load cache: {e}")
                return None
        return None
    
    def _save_cached_pattern(self, url: str, pattern: PatternAnalysis):
        """Save pattern analysis to cache."""
        cache_file = self._get_cache_file_path(url)
        try:
            with open(cache_file, 'w', encoding='utf-8') as f:
                json.dump(pattern.model_dump(), f, indent=2, ensure_ascii=False)
                print(f"💾 Pattern cached to: {cache_file}")
        except Exception as e:
            print(f"⚠️ Failed to save cache: {e}")
    
    def analyze_html_structure(self, soup: BeautifulSoup, base_url: str) -> Dict[str, Any]:
        """Summarise the anchors of a page so repeating patterns can be spotted.

        Only anchors with an ``href`` and at least three characters of text
        are analysed.

        Args:
            soup: The parsed page.
            base_url: URL used to resolve relative hrefs.

        Returns:
            A dict with the keys ``total_links`` (all anchors with an href),
            ``analyzed_links``, ``link_contexts`` (one dict per analysed
            anchor), ``parent_patterns`` and ``url_patterns`` (the ten most
            common parent tag/class combinations and first path segments,
            with counts) and ``avg_text_length``.
        """
        all_links = soup.find_all('a', href=True)

        link_contexts = []
        parent_patterns = Counter()
        url_patterns = Counter()

        for link in all_links:
            href = link.get('href')
            text = link.get_text(strip=True)

            if not href or not text or len(text) < 3:
                continue

            parent = link.parent
            parent_tag = parent.name if parent else 'unknown'
            parent_class = ' '.join(parent.get('class', [])) if parent else ''

            # Split only the path component. Splitting the raw href made
            # absolute links report 'https:' as their first "URL part" and
            # leaked query strings/fragments into the segments.
            url_parts = [segment for segment in urlparse(href).path.split('/') if segment]

            link_contexts.append({
                'text': text,
                'href': href,
                'full_url': urljoin(base_url, href),
                'text_length': len(text),
                'parent_tag': parent_tag,
                'parent_class': parent_class,
                'url_parts': url_parts,
                'url_depth': len(url_parts)
            })

            # Count the parent signature: "tag.class list" or just the tag.
            parent_key = f"{parent_tag}.{parent_class}" if parent_class else parent_tag
            parent_patterns[parent_key] += 1

            # Only links nested at least two segments deep contribute a
            # section prefix (e.g. "posts" for /posts/my-article).
            if len(url_parts) >= 2:
                url_patterns[url_parts[0]] += 1

        total_text_length = sum(ctx['text_length'] for ctx in link_contexts)

        return {
            'total_links': len(all_links),
            'analyzed_links': len(link_contexts),
            'link_contexts': link_contexts,
            'parent_patterns': dict(parent_patterns.most_common(10)),
            'url_patterns': dict(url_patterns.most_common(10)),
            'avg_text_length': total_text_length / len(link_contexts) if link_contexts else 0
        }
    
    async def analyze_patterns_with_llm(self, base_url: str, html_content: str, structure_analysis: Dict, force_regenerate: bool = False) -> PatternAnalysis:
        """Use LLM to identify content patterns."""
        
        # Try cache first unless forced to regenerate
        if not force_regenerate:
            cached_pattern = self._load_cached_pattern(base_url)
            if cached_pattern:
                return cached_pattern
        
        print("🧠 Analyzing patterns with LLM...")
        
        system_prompt = """You are an expert at identifying repeating content patterns on web pages. Find lists of similar content items like blog posts, articles, job listings, products, news items, etc.

Focus on identifying the PRIMARY pattern that represents the main content list on the page.

Also identify CSS selectors for publication dates:
- <time> elements with datetime attributes
- Elements with classes containing 'date', 'time', 'published', 'created'
- Date patterns like "2024-01-15", "Jan 15, 2024", "15 January 2024"
- Date information in parent/sibling elements of the main content links"""

        # Prepare context
        context_summary = {
            'url': base_url,
            'total_links': structure_analysis['total_links'],
            'top_parent_patterns': list(structure_analysis['parent_patterns'].keys())[:5],
            'top_url_patterns': list(structure_analysis['url_patterns'].keys())[:5],
            'sample_links': [
                {
                    'text': ctx['text'][:50],
                    'href': ctx['href'],
                    'parent': f"{ctx['parent_tag']}.{ctx['parent_class']}" if ctx['parent_class'] else ctx['parent_tag']
                }
                for ctx in structure_analysis['link_contexts'][:10]
            ]
        }
        
        user_prompt = f"""Analyze this webpage to identify the main repeating content pattern.

URL: {base_url}

STRUCTURE ANALYSIS:
- Total links: {context_summary['total_links']}
- Most common parent patterns: {context_summary['top_parent_patterns']}
- Most common URL patterns: {context_summary['top_url_patterns']}

SAMPLE LINKS:
{json.dumps(context_summary['sample_links'], indent=2)}

HTML CONTENT (first 6000 chars):
{html_content[:6000]}

Identify:
1. Content pattern type (blog_posts, job_listings, articles, etc.)
2. CSS selectors for main content links
3. URL regex pattern for content
4. Text indicators for content vs navigation
5. CSS selectors for publication dates
6. Confidence score (0.0-1.0)"""

        try:
            response = self.client.chat.completions.parse(
                model="gpt-4o-2024-08-06",
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_prompt}
                ],
                response_format=PatternAnalysis
            )
            
            result = response.choices[0].message.parsed
            print(f"✅ Pattern identified: {result.pattern_type} (confidence: {result.confidence_score})")
            
            # Save to cache
            self._save_cached_pattern(base_url, result)
            return result
            
        except Exception as e:
            print(f"❌ LLM analysis failed: {e}")
            return PatternAnalysis(
                pattern_type="unknown",
                primary_selectors=["a[href]"],
                fallback_selectors=["article a", ".post a", ".item a"],
                url_pattern_regex=".*",
                content_indicators=[".*"],
                skip_patterns=["about", "contact", "home"],
                confidence_score=0.1,
                pattern_description="Fallback pattern - all links",
                estimated_items=0,
                date_selectors=["time", ".date", ".published", "[datetime]"]
            )
    
    def extract_using_pattern(self, soup: BeautifulSoup, base_url: str, pattern: PatternAnalysis) -> List[Dict[str, Any]]:
        """Extract the links selected by the pattern, without duplicates.

        Primary selectors are tried first, then the fallback selectors. The
        search stops after the first primary selector that has produced at
        least one link.

        Args:
            soup: The parsed page.
            base_url: URL used to resolve relative hrefs.
            pattern: Selectors, skip patterns and date selectors to apply.

        Returns:
            One dict per unique URL (first occurrence wins) with the keys
            ``url``, ``title``, ``selector_used`` and, when a date could be
            found, ``publication_date``.
        """
        extracted_links = []
        seen_urls = set()

        for selector in pattern.primary_selectors + pattern.fallback_selectors:
            try:
                for element in soup.select(selector):
                    # A selector may hit the anchor itself or an element
                    # wrapping it; dates are looked up in the wrapper.
                    if element.name == 'a':
                        link = element
                        container = element.parent
                    else:
                        link = element.find('a', href=True)
                        if not link:
                            continue
                        container = element

                    href = link.get('href')
                    text = link.get_text(strip=True)
                    if not href or not text:
                        continue

                    full_url = urljoin(base_url, href)
                    if full_url in seen_urls:
                        # Already collected; skip the date lookup as well.
                        continue

                    if not self._matches_pattern(href, text, full_url, pattern):
                        continue

                    link_data = {
                        'url': full_url,
                        'title': text,
                        'selector_used': selector
                    }

                    pub_date = self._extract_publication_date(container, pattern.date_selectors)
                    if pub_date:
                        link_data['publication_date'] = pub_date

                    seen_urls.add(full_url)
                    extracted_links.append(link_data)

                # If primary selectors worked, use them
                if extracted_links and selector in pattern.primary_selectors:
                    break

            except Exception as e:
                # Selectors come from the LLM or the cache and may be invalid.
                # Report the failure instead of hiding it, then try the next.
                print(f"⚠️ Selector failed ({selector}): {e}")
                continue

        return extracted_links
    
    def _matches_pattern(self, href: str, text: str, full_url: str, pattern: PatternAnalysis) -> bool:
        """Check if a link matches the identified pattern."""
        
        # Check skip patterns
        for skip_pattern in pattern.skip_patterns:
            if re.search(skip_pattern, href, re.IGNORECASE) or re.search(skip_pattern, text, re.IGNORECASE):
                return False
        
        # Basic quality checks
        if len(text.strip()) < 3:
            return False
            
        # Skip obvious navigation
        nav_patterns = [
            r'^(home|about|contact|privacy|terms)$',
            r'^(next|prev|previous|more)$',
            r'^\d+$',
            r'^(←|→|>>|<<)$'
        ]
        
        for nav_pattern in nav_patterns:
            if re.search(nav_pattern, text, re.IGNORECASE):
                return False
        
        return True
    
    def _extract_publication_date(self, container: BeautifulSoup, date_selectors: List[str]) -> Optional[str]:
        """Extract publication date from container element."""
        if not container or not date_selectors:
            return None
        
        # Try each date selector
        for selector in date_selectors:
            try:
                date_elements = container.select(selector)
                
                if not date_elements:
                    # Try fallback selectors
                    general_selectors = ['time', '[datetime]', '.date', '.published', '.created']
                    for gen_sel in general_selectors:
                        date_elements = container.select(gen_sel)
                        if date_elements:
                            break
                
                for date_element in date_elements:
                    # Check datetime attribute
                    datetime_attr = date_element.get('datetime')
                    if datetime_attr:
                        return self._standardize_date(datetime_attr)
                    
                    # Check element text
                    date_text = date_element.get_text(strip=True)
                    if date_text and self._is_valid_date_text(date_text):
                        return self._standardize_date(date_text)
                    
                    # For time elements
                    if date_element.name == 'time' and date_text:
                        return self._standardize_date(date_text)
                        
            except Exception:
                continue
        
        # Fallback: search container text for date patterns
        return self._extract_date_from_text(container.get_text())
    
    def _extract_date_from_text(self, text: str) -> Optional[str]:
        """Extract date from text using regex patterns."""
        if not text:
            return None
        
        date_patterns = [
            r'\b\d{4}-\d{2}-\d{2}\b',  # 2024-01-15
            r'\b\d{2}-\d{2}\b',  # 06-17
            r'\b\d{1,2}-\d{1,2}\b',  # 6-17
            r'\b\d{2}/\d{2}/\d{4}\b',  # 01/15/2024
            r'\b\d{1,2}/\d{1,2}/\d{4}\b',  # 1/15/2024
            r'\b\d{1,2}\s+\w+\s+\d{4}\b',  # 15 January 2024
            r'\b\w+\s+\d{1,2},\s+\d{4}\b',  # January 15, 2024
            r'\b\w{3}\s+\d{1,2},\s+\d{4}\b',  # Jan 15, 2024
        ]
        
        for pattern in date_patterns:
            match = re.search(pattern, text)
            if match:
                found_date = match.group(0)
                return self._standardize_date(found_date)
        
        return None
    
    def _is_valid_date_text(self, text: str) -> bool:
        """Check if text looks like a valid date."""
        if not text or len(text) < 2:
            return False
        
        date_patterns = [
            r'\b\d{4}-\d{2}-\d{2}\b',
            r'\b\d{2}-\d{2}\b',
            r'\b\d{1,2}-\d{1,2}\b',
            r'\b\d{2}/\d{2}/\d{4}\b',
            r'\b\d{1,2}/\d{1,2}/\d{4}\b',
            r'\b\d{1,2}\s+\w+\s+\d{4}\b',
            r'\b\w+\s+\d{1,2},\s+\d{4}\b',
            r'\b\w{3}\s+\d{1,2},\s+\d{4}\b',
            r'\b\d{4}\b',
        ]
        
        return any(re.search(pattern, text) for pattern in date_patterns)
    
    def _standardize_date(self, date_str: str) -> Optional[str]:
        """Standardize date to YYYY-MM-DD format."""
        if not date_str:
            return None
        
        current_year = datetime.now().year
        
        try:
            # Already ISO format
            if re.match(r'\b\d{4}-\d{2}-\d{2}\b', date_str):
                return date_str
            
            # MM-DD format
            if re.match(r'\b\d{2}-\d{2}\b', date_str):
                month, day = date_str.split('-')
                return f"{current_year}-{month}-{day}"
            
            # M-DD format
            if re.match(r'\b\d{1,2}-\d{1,2}\b', date_str):
                month, day = date_str.split('-')
                return f"{current_year}-{month.zfill(2)}-{day.zfill(2)}"
            
            # MM/DD/YYYY format
            if re.match(r'\b\d{1,2}/\d{1,2}/\d{4}\b', date_str):
                parts = date_str.split('/')
                month, day, year = parts[0], parts[1], parts[2]
                return f"{year}-{month.zfill(2)}-{day.zfill(2)}"
            
            # Month DD, YYYY format
            month_match = re.match(r'\b(\w+)\s+(\d{1,2}),\s+(\d{4})\b', date_str)
            if month_match:
                month_name, day, year = month_match.groups()
                month_num = self._month_name_to_number(month_name)
                if month_num:
                    return f"{year}-{month_num.zfill(2)}-{day.zfill(2)}"
            
            # DD Month YYYY format
            day_month_match = re.match(r'\b(\d{1,2})\s+(\w+)\s+(\d{4})\b', date_str)
            if day_month_match:
                day, month_name, year = day_month_match.groups()
                month_num = self._month_name_to_number(month_name)
                if month_num:
                    return f"{year}-{month_num.zfill(2)}-{day.zfill(2)}"
            
            # Mon DD, YYYY format
            short_month_match = re.match(r'\b(\w{3})\s+(\d{1,2}),\s+(\d{4})\b', date_str)
            if short_month_match:
                month_abbr, day, year = short_month_match.groups()
                month_num = self._month_abbr_to_number(month_abbr)
                if month_num:
                    return f"{year}-{month_num.zfill(2)}-{day.zfill(2)}"
            
            return date_str
            
        except Exception:
            return date_str
    
    def _month_name_to_number(self, month_name: str) -> Optional[str]:
        """Convert full month name to number."""
        months = {
            'january': '01', 'february': '02', 'march': '03', 'april': '04',
            'may': '05', 'june': '06', 'july': '07', 'august': '08',
            'september': '09', 'october': '10', 'november': '11', 'december': '12'
        }
        return months.get(month_name.lower())
    
    def _month_abbr_to_number(self, month_abbr: str) -> Optional[str]:
        """Convert abbreviated month name to number."""
        months = {
            'jan': '01', 'feb': '02', 'mar': '03', 'apr': '04',
            'may': '05', 'jun': '06', 'jul': '07', 'aug': '08',
            'sep': '09', 'oct': '10', 'nov': '11', 'dec': '12'
        }
        return months.get(month_abbr.lower())
    
    async def extract_pattern_links(self, url: str, force_regenerate: bool = False) -> ExtractedPattern:
        """Fetch a page and extract its main list of content links.

        Runs the whole pipeline: fetch the HTML, analyse its structure,
        obtain a pattern (cache or LLM) and apply it. A failure in any stage
        is reported through an empty result whose
        ``pattern_analysis.pattern_description`` carries the reason.

        Args:
            url: Page to extract from.
            force_regenerate: When true the cached pattern is ignored.

        Returns:
            The extracted links together with the pattern that produced them.
        """
        print(f"🌐 Extracting from: {url}")

        # Extract HTML. Guarded like the later stages so that an exception
        # raised while fetching also ends in an empty result, not a crash.
        try:
            html_result = await self.extractor.extract_html(url)
        except Exception as e:
            return self._empty_result(url, f"HTML extraction failed: {e}")
        if not html_result:
            return self._empty_result(url, "HTML extraction failed")

        soup = BeautifulSoup(html_result, 'html.parser')
        title = soup.find('title')
        print(f"📄 Page: {title.get_text() if title else 'No title'}")

        # Analyze structure
        structure_analysis = self.analyze_html_structure(soup, url)

        # Get patterns (cached or LLM)
        try:
            pattern = await self.analyze_patterns_with_llm(url, html_result, structure_analysis, force_regenerate)
        except Exception as e:
            return self._empty_result(url, f"Pattern analysis failed: {e}")

        # Extract using pattern
        try:
            links = self.extract_using_pattern(soup, url, pattern)
        except Exception as e:
            return self._empty_result(url, f"Extraction failed: {e}")

        result = ExtractedPattern(
            links=links,
            total_found=len(links),
            pattern_used=pattern.pattern_type,
            confidence=pattern.confidence_score,
            base_url=url,
            pattern_analysis=pattern
        )

        print(f"✅ Found {len(links)} {pattern.pattern_type} items")
        return result
    
    def _empty_result(self, url: str, reason: str) -> ExtractedPattern:
        """Build the result returned when a stage of the extraction fails.

        The result has no links, ``pattern_used="failed"`` and a placeholder
        pattern of type ``"error"`` whose description holds ``reason``.
        """
        placeholder = PatternAnalysis(
            pattern_type="error",
            primary_selectors=[],
            fallback_selectors=[],
            url_pattern_regex="",
            content_indicators=[],
            skip_patterns=[],
            confidence_score=0.0,
            pattern_description=reason,
            estimated_items=0,
            date_selectors=[],
        )
        return ExtractedPattern(
            links=[],
            total_found=0,
            pattern_used="failed",
            confidence=0.0,
            base_url=url,
            pattern_analysis=placeholder,
        )
    
    def to_json(self, result: ExtractedPattern) -> Dict[str, Any]:
        """Convert result to JSON format with url, title, and publication_date."""
        clean_items = []
        for item in result.links:
            clean_item = {
                "url": item["url"],
                "title": item["title"],
                "publication_date": item.get("publication_date", "")
            }
            clean_items.append(clean_item)
        
        return {
            "extraction_info": {
                "base_url": result.base_url,
                "total_items_found": result.total_found,
                "pattern_type": result.pattern_used,
                "confidence_score": result.confidence
            },
            "links": clean_items
        }

# Simple extraction function
async def extract_pattern_links(url: str, force_regenerate: bool = False) -> Dict[str, Any]:
    """Run a one-off extraction for ``url`` and return the JSON-ready dict.

    A fresh ``UniversalPatternExtractor`` with default settings is created
    for every call.
    """
    pattern_extractor = UniversalPatternExtractor()
    extraction = await pattern_extractor.extract_pattern_links(url, force_regenerate)
    return pattern_extractor.to_json(extraction)

# Example usage
# Two sample index pages; only url2 is fetched below.
url1 = "https://www.ssp.sh/posts/"
url2 = "https://www.joanwestenberg.com/"

# Top-level ``await`` needs an environment that permits it (this looks like
# a notebook cell). force_regenerate=True skips the cached pattern, so the
# LLM analysis runs again.
json_result1 = await extract_pattern_links(url2, force_regenerate=True)

# Pretty-print the extraction result.
print(json.dumps(json_result1, indent=2))

🌐 Extracting from: https://www.joanwestenberg.com/
📄 Page: Westenberg
🧠 Analyzing patterns with LLM...
✅ Pattern identified: articles (confidence: 0.85)
💾 Pattern cached to: pattern_cache/pattern_www.joanwestenberg.com_be32fae3.json
✅ Found 9 articles items
{
  "extraction_info": {
    "base_url": "https://www.joanwestenberg.com/",
    "total_items_found": 9,
    "pattern_type": "articles",
    "confidence_score": 0.85
  },
  "links": [
    {
      "url": "https://www.joanwestenberg.com/p/the-cannae-problem",
      "title": "May 2, 2025\u202216 min readThe Cannae Problem",
      "publication_date": "2025-05-02"
    },
    {
      "url": "https://www.joanwestenberg.com/p/the-cult-of-hard-mode-why-simplicity-offends-tech-elites",
      "title": "Jun 12, 2025\u20228 min readThe Cult of Hard Mode",
      "publication_date": "2025-06-12"
    },
    {
      "url": "https://www.joanwestenberg.com/p/i-miss-the-internet",
      "title": "Nov 12, 2024\u202210 min readI Miss the Internet",
      