In [1]:
import html
import json
import re
import time
import unicodedata
from typing import List, Dict, Optional

import pandas as pd
import requests
from bs4 import BeautifulSoup

class EDHRECScraper:
    """Scrape commander pages on EDHREC and collect per-card statistics.

    One ``requests.Session`` is reused for every request, and a configurable
    pause is inserted between commander pages when scraping in bulk.
    """

    # Tag names treated as section headers when locating a card's section.
    HEADER_TAGS = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']

    # Section key -> lowercase substrings that identify it.  Matching is by
    # substring and the first key (in this order) that matches wins, so the
    # order of entries is significant.
    SECTION_PATTERNS = {
        'new_cards': ['new cards'],
        'high_synergy_cards': ['high synergy cards', 'high synergy'],
        'top_cards': ['top cards'],
        'game_changers': ['game changers'],
        'battles': ['battles'],
        'planeswalkers': ['planeswalkers'],
        'creatures': ['creatures'],
        'instants': ['instants'],
        'sorceries': ['sorceries'],
        'artifacts': ['artifacts'],
        'enchantments': ['enchantments'],
        'lands': ['lands'],
        'mana_rocks': ['mana rocks'],
        'ramp': ['ramp'],
        'card_draw': ['card draw'],
        'removal': ['removal'],
        'board_wipes': ['board wipes'],
        'counterspells': ['counterspells'],
        'protection': ['protection'],
        'recursion': ['recursion'],
        'utility': ['utility'],
        'win_conditions': ['win conditions', 'wincons'],
        'budget_cuts': ['budget cuts'],
        'expensive_cuts': ['expensive cuts']
    }

    def __init__(self, delay: float = 1.0, timeout: float = 30.0):
        """Create a scraper.

        Args:
            delay: Seconds to sleep between commander pages in
                ``scrape_multiple_commanders``.
            timeout: Seconds to wait for each HTTP response.  Without a
                timeout ``requests`` waits indefinitely, so one stalled
                connection would hang the whole scrape.
        """
        self.base_url = "https://edhrec.com"
        self.delay = delay
        self.timeout = timeout
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        })

    def get_commanders_list(self, limit: Optional[int] = None) -> List[Dict[str, str]]:
        """Fetch the commanders index page and return the commanders listed on it.

        Args:
            limit: Maximum number of commanders to return; ``None`` means no
                limit.  ``0`` returns an empty list.

        Returns:
            A list of dicts with ``name``, ``slug`` and ``url`` keys, with
            duplicate names removed (first occurrence kept).  Returns an
            empty list if the page cannot be fetched or parsed.
        """
        print("Fetching commanders list...")

        url = f"{self.base_url}/commanders"

        try:
            response = self.session.get(url, timeout=self.timeout)
            response.raise_for_status()

            soup = BeautifulSoup(response.content, 'html.parser')

            seen = set()
            unique_commanders = []
            for element in soup.find_all('span', class_='Card_name__Mpa7S'):
                commander_name = html.unescape(element.get_text().strip())

                # Skip blank names and names already collected.
                if not commander_name or commander_name in seen:
                    continue
                seen.add(commander_name)

                slug = self.name_to_slug(commander_name)
                unique_commanders.append({
                    'name': commander_name,
                    'slug': slug,
                    'url': f"{self.base_url}/commanders/{slug}"
                })

            # Compare against None so that limit=0 is honoured instead of
            # being treated as "no limit".
            if limit is not None:
                unique_commanders = unique_commanders[:limit]

            print(f"Found {len(unique_commanders)} unique commanders")
            return unique_commanders

        except Exception as e:
            # Best-effort boundary: report the failure and return nothing.
            print(f"Error fetching commanders list: {e}")
            return []

    def get_commanders_from_slugs(self, slugs: List[str]) -> List[Dict[str, str]]:
        """Build commander dicts (``name``, ``slug``, ``url``) from URL slugs.

        The name is only an approximation derived from the slug (hyphens
        become spaces, words are title-cased); punctuation that was dropped
        when the slug was created is not restored.
        """
        commanders = [
            {
                'name': slug.replace('-', ' ').title(),
                'slug': slug,
                'url': f"{self.base_url}/commanders/{slug}"
            }
            for slug in slugs
        ]

        print(f"Created {len(commanders)} commanders from slugs")
        return commanders

    def name_to_slug(self, name: str) -> str:
        """Convert a commander name to the slug used in commander URLs.

        The name is lower-cased, accents are stripped, ``//`` becomes a
        hyphen, ``&`` becomes ``and``, remaining punctuation is removed and
        runs of whitespace/hyphens collapse to a single hyphen.
        """
        # Strip diacritics: decompose each character and drop the combining
        # marks ("Sméagol" -> "smeagol").  `\w` below is Unicode-aware, so
        # accented letters would otherwise survive into the slug.
        # NOTE(review): EDHREC slugs appear to be ASCII-only - confirm.
        decomposed = unicodedata.normalize('NFD', name.lower())
        slug = ''.join(ch for ch in decomposed if not unicodedata.combining(ch))

        slug = slug.replace("//", "-")
        slug = slug.replace("&", "and")
        # Removes apostrophes, commas and any other punctuation.
        slug = re.sub(r'[^\w\s-]', '', slug)
        slug = re.sub(r'\s+', '-', slug)
        slug = re.sub(r'-+', '-', slug)
        return slug.strip('-')

    def get_section_from_text(self, text: str) -> Optional[str]:
        """Map header-like text to a section key from ``SECTION_PATTERNS``.

        Returns:
            The first section key (in ``SECTION_PATTERNS`` order) one of
            whose patterns occurs in the stripped, lower-cased text, or
            ``None`` if the text is empty, longer than 50 characters, or
            matches nothing.
        """
        text = text.strip().lower()

        # Skip empty or very long text
        if not text or len(text) > 50:
            return None

        for section_key, patterns in self.SECTION_PATTERNS.items():
            if any(pattern in text for pattern in patterns):
                return section_key

        return None

    def find_current_section(self, card_container) -> Optional[str]:
        """Determine which page section a card container belongs to.

        Args:
            card_container: The BeautifulSoup tag wrapping one card.

        Returns:
            A section key, or ``'unknown'`` when no section can be found.
        """
        # Strategy 1: the nearest header before the card in document order
        # whose text names a known section.  find_all_previous walks the
        # parsed tree backwards from the card, which covers previous siblings
        # of the card and of each of its ancestors.  It avoids serialising
        # the page for every card and matching on HTML prefixes, which is
        # slow and can match the wrong card.
        for header in card_container.find_all_previous(self.HEADER_TAGS):
            section = self.get_section_from_text(header.get_text())
            if section:
                return section

        # Strategy 2: look for a section name in the id or class attributes
        # of the card and its closest ancestors.
        current = card_container
        for _ in range(10):
            if current is None:
                break

            element_id = current.get('id')
            if element_id:
                section = self.get_section_from_text(element_id)
                if section:
                    return section

            classes = current.get('class')
            if classes:
                section = self.get_section_from_text(' '.join(classes))
                if section:
                    return section

            current = current.parent

        return 'unknown'

    def scrape_commander_page(self, commander_url: str) -> Optional[Dict]:
        """Scrape all card data from a single commander page.

        Args:
            commander_url: Full URL of the commander page; the text after
                ``/commanders/`` is used as the slug.

        Returns:
            A dict with ``commander_name``, ``commander_slug``, ``url`` and
            ``cards`` (a list of dicts with ``name``, ``section``,
            ``inclusion_percentage``, ``total_decks`` and
            ``synergy_percentage``), or ``None`` if the page could not be
            fetched or parsed.
        """
        try:
            response = self.session.get(commander_url, timeout=self.timeout)
            response.raise_for_status()

            soup = BeautifulSoup(response.content, 'html.parser')

            # Derive a display name from the slug in the URL.
            slug = commander_url.split('/commanders/')[-1]
            commander_name = slug.replace('-', ' ').title()

            commander_data = {
                'commander_name': commander_name,
                'commander_slug': slug,
                'url': commander_url,
                'cards': []
            }

            card_containers = soup.find_all('div', class_='Card_container__Ng56K')
            print(f"    Found {len(card_containers)} card containers")

            for container in card_containers:
                name_elem = container.find('span', class_='Card_name__Mpa7S')
                if not name_elem:
                    continue

                card_name = html.unescape(name_elem.get_text().strip())

                label_elem = container.find('div', class_='CardLabel_label__iAM7T')
                if not label_elem:
                    continue

                label_text = label_elem.get_text()

                # Parse percentage: "63% of 36026 decks".  Cards whose label
                # does not have this shape are skipped.
                pct_match = re.search(r'(\d+)% of (\d+) decks', label_text)
                if not pct_match:
                    continue

                inclusion_pct = int(pct_match.group(1))
                total_decks = int(pct_match.group(2))

                # Parse synergy: "+25% synergy" or "-5% synergy"; a missing
                # synergy figure is recorded as 0.
                syn_match = re.search(r'([+-]?\d+)%[^%]*synergy', label_text)
                synergy_pct = int(syn_match.group(1)) if syn_match else 0

                commander_data['cards'].append({
                    'name': card_name,
                    'section': self.find_current_section(container),
                    'inclusion_percentage': inclusion_pct,
                    'total_decks': total_decks,
                    'synergy_percentage': synergy_pct
                })

            # Show section breakdown
            sections = {}
            for card in commander_data['cards']:
                section = card['section']
                sections[section] = sections.get(section, 0) + 1

            print(f"    Cards by section: {dict(sorted(sections.items()))}")
            print(f"    Total cards extracted: {len(commander_data['cards'])}")

            return commander_data

        except Exception as e:
            # Best-effort boundary: one bad page must not abort a bulk run.
            print(f"Error scraping {commander_url}: {e}")
            return None

    def scrape_multiple_commanders(self, commanders: List[Dict[str, str]],
                                   backup_every: int = 250) -> List[Dict]:
        """Scrape every commander in ``commanders`` and return the results.

        Args:
            commanders: Dicts with at least ``name`` and ``url`` keys.
            backup_every: Write ``edhrec_backup_<i>.json`` after every this
                many commanders; a value of 0 or less disables backups.

        Returns:
            The page data for each commander that was scraped successfully,
            in input order.
        """
        all_data = []
        total = len(commanders)

        for i, commander in enumerate(commanders, 1):
            print(f"Scraping {i}/{total}: {commander['name']}")

            data = self.scrape_commander_page(commander['url'])
            if data:
                all_data.append(data)

                cards_count = len(data['cards'])
                print(f"  ✓ Found {cards_count} cards total")

                # Show top 3 synergy cards as sample
                if data['cards']:
                    sorted_cards = sorted(data['cards'], key=lambda x: x['synergy_percentage'], reverse=True)
                    print("    Top synergy cards:")
                    for card in sorted_cards[:3]:
                        print(f"      - {card['name']} ({card['section']}): {card['synergy_percentage']:+d}% synergy")

            else:
                print(f"  ✗ Failed to scrape {commander['name']}")

            # Be respectful with delays (no need to wait after the last page)
            if i < total:
                time.sleep(self.delay)

            # Save progress every `backup_every` commanders
            if backup_every > 0 and i % backup_every == 0:
                self.save_data(all_data, f'edhrec_backup_{i}.json')
                print(f"  💾 Saved backup at {i} commanders")

                percentage = (i / total) * 100
                print(f"  📊 Progress: {percentage:.1f}% complete")

        return all_data

    def save_data(self, data: List[Dict], filename: str):
        """Write ``data`` to ``filename`` as indented UTF-8 JSON.

        Raises:
            TypeError: If ``data`` contains values JSON cannot represent.
            OSError: If the file cannot be written.
        """
        # Serialise before opening the file: opening in 'w' mode truncates,
        # so a serialisation error would otherwise destroy an existing file.
        payload = json.dumps(data, indent=2, ensure_ascii=False)
        with open(filename, 'w', encoding='utf-8') as f:
            f.write(payload)

    def to_dataframe(self, data: List[Dict]) -> pd.DataFrame:
        """Flatten scraped commander data into one DataFrame row per card.

        Columns: ``commander``, ``commander_slug``, ``card_name``,
        ``card_section``, ``synergy_percentage``, ``inclusion_percentage``,
        ``total_decks``.  Commanders with no ``cards`` contribute no rows.
        """
        rows = [
            {
                'commander': commander_data['commander_name'],
                'commander_slug': commander_data['commander_slug'],
                'card_name': card['name'],
                'card_section': card['section'],
                'synergy_percentage': card['synergy_percentage'],
                'inclusion_percentage': card['inclusion_percentage'],
                'total_decks': card['total_decks']
            }
            for commander_data in data
            for card in commander_data.get('cards', [])
        ]

        return pd.DataFrame(rows)

def run_complete_scrape_from_slugs(slugs: List[str]):
    """Scrape every card for the given commander slugs and report on the result.

    The raw data is written to a JSON file and, when at least one card was
    extracted, a flattened table is written to a CSV file.  Summary
    statistics are printed along the way.

    Args:
        slugs: Commander URL slugs, e.g. ``'the-ur-dragon'``.

    Returns:
        The list of per-commander dicts that were scraped, or ``None`` when
        there was nothing to process or nothing could be scraped.
    """
    # Output paths are defined once so that the status messages always name
    # the files that are actually written.
    json_path = 'TEST edhrec_complete_from_slugs.json'
    csv_path = 'TEST edhrec_complete_from_slugs.csv'

    print("=" * 60)
    print("EDHREC COMPLETE SCRAPER - ALL CARDS FROM SLUGS")
    print("=" * 60)

    scraper = EDHRECScraper(delay=1.0)  # 1 second delay to be respectful

    commanders = scraper.get_commanders_from_slugs(slugs)

    if not commanders:
        print("No commanders to process.")
        return None

    print(f"\n🎯 Will scrape ALL CARDS for {len(commanders)} commanders")
    # Rough estimate; presumably budgets ~2 seconds per commander
    # (1 s delay plus the request itself).
    print(f"⏱️  Estimated time: {len(commanders) * 2 / 60:.1f} minutes")
    # The scraper writes a backup every 250 commanders.
    print("💾 Progress will be saved every 250 commanders")

    # Show first 5 commanders as sample
    print("\n📋 First 5 commanders to scrape:")
    for i, commander in enumerate(commanders[:5], 1):
        print(f"  {i}. {commander['name']} ({commander['slug']})")

    if len(commanders) > 5:
        print(f"  ... and {len(commanders) - 5} more")

    print(f"\n{'='*40}")
    print("SCRAPING ALL CARDS")
    print(f"{'='*40}")

    data = scraper.scrape_multiple_commanders(commanders)

    if not data:
        print("No data scraped.")
        return None

    print(f"\n{'='*40}")
    print("PROCESSING RESULTS")
    print(f"{'='*40}")

    scraper.save_data(data, json_path)
    print(f"✓ Saved raw data to {json_path}")

    df = scraper.to_dataframe(data)
    csv_written = len(df) > 0
    if csv_written:
        df.to_csv(csv_path, index=False)
        print(f"✓ Saved DataFrame to {csv_path}")

        print("\n📊 FINAL STATISTICS:")
        print(f"  • Commanders scraped: {len(data)}")
        print(f"  • Total card entries: {len(df):,}")
        print(f"  • Unique cards: {df['card_name'].nunique():,}")
        print(f"  • Unique sections: {df['card_section'].nunique()}")
        print(f"  • Average synergy: {df['synergy_percentage'].mean():.1f}%")
        print(f"  • Highest synergy: {df['synergy_percentage'].max()}%")
        print(f"  • Lowest synergy: {df['synergy_percentage'].min()}%")

        print("\n📂 CARDS BY SECTION:")
        section_counts = df['card_section'].value_counts()
        for section, count in section_counts.items():
            print(f"  {section:<20}: {count:,} cards")

        print("\n🔥 TOP 15 SYNERGY CARDS:")
        top_cards = df.nlargest(15, 'synergy_percentage')
        for i, (_, row) in enumerate(top_cards.iterrows(), 1):
            print(f"  {i:2d}. {row['card_name']:<30} ({row['card_section']:<15}) - {row['commander']:<20}: {row['synergy_percentage']:+3d}%")

        print("\n📈 MOST POPULAR CARDS (HIGHEST INCLUSION):")
        popular_cards = df.nlargest(10, 'inclusion_percentage')
        for i, (_, row) in enumerate(popular_cards.iterrows(), 1):
            print(f"  {i:2d}. {row['card_name']:<30} ({row['card_section']:<15}): {row['inclusion_percentage']:3d}%")

        print("\n👑 COMMANDERS WITH MOST CARDS:")
        commander_counts = df['commander'].value_counts().head(10)
        for commander, count in commander_counts.items():
            print(f"  {commander:<35}: {count:3d} cards")

        # Cards recommended for more than one commander, most widespread first.
        print("\n🔄 MOST VERSATILE CARDS (appear in multiple commanders):")
        card_commanders = df.groupby('card_name')['commander'].nunique().sort_values(ascending=False)
        versatile_cards = card_commanders.head(10)
        for card, commander_count in versatile_cards.items():
            if commander_count > 1:
                # Filter once and reuse the rows for both statistics.
                card_rows = df[df['card_name'] == card]
                avg_synergy = card_rows['synergy_percentage'].mean()
                most_common_section = card_rows['card_section'].mode().iloc[0]
                print(f"  {card:<30}: {commander_count:2d} commanders ({most_common_section:<12}) avg synergy: {avg_synergy:+4.1f}%")

        # Highest-synergy card within each card-type section.
        print("\n🏆 TOP CARDS BY SECTION:")
        for section in ['creatures', 'instants', 'sorceries', 'artifacts', 'enchantments', 'lands']:
            section_cards = df[df['card_section'] == section]
            if len(section_cards) > 0:
                top_card = section_cards.nlargest(1, 'synergy_percentage').iloc[0]
                print(f"  {section.capitalize():<15}: {top_card['card_name']:<25} ({top_card['synergy_percentage']:+3d}% synergy)")

    else:
        print("No card data found.")

    print("\n✅ Complete scraping finished successfully!")
    print("📁 Check these files for your data:")
    # Only list the CSV when it was actually written.
    if csv_written:
        print(f"   • {csv_path} (spreadsheet format)")
    print(f"   • {json_path} (raw data)")

    return data

def load_slugs_from_file(filename: str) -> List[str]:
    """Read commander slugs from a UTF-8 text file, one slug per line.

    Surrounding whitespace is stripped from each line and blank lines are
    ignored.  If the file is missing or cannot be read, an error message is
    printed and an empty list is returned.
    """
    try:
        collected = []
        with open(filename, 'r', encoding='utf-8') as handle:
            for raw_line in handle:
                cleaned = raw_line.strip()
                if cleaned:
                    collected.append(cleaned)
        print(f"Loaded {len(collected)} slugs from {filename}")
        return collected
    except FileNotFoundError:
        print(f"Error: File {filename} not found")
    except Exception as e:
        print(f"Error loading slugs from file: {e}")
    # Reached only when one of the handlers above reported a failure.
    return []

if __name__ == "__main__":
    # Script entry point.
    #
    # Slugs are read from 'commander_slugs.txt' (one slug per line).  When
    # that file is missing or yields no slugs, a small built-in list of
    # example commanders is scraped instead.
    slugs_from_file = load_slugs_from_file('commander_slugs.txt')

    if not slugs_from_file:
        # Fallback: example slugs used when no slug file is available.
        example_slugs = [
            'the-ur-dragon',
            'atraxa-praetors-voice',
            'edgar-markov',
            'krenko-mob-boss',
            'sauron-the-dark-lord'
        ]

        print("No slugs file found. Running with example slugs...")
        run_complete_scrape_from_slugs(example_slugs)
    else:
        run_complete_scrape_from_slugs(slugs_from_file)

Error: File TEST commander_slugs_copy.txt not found
No slugs file found. Running with example slugs...
EDHREC COMPLETE SCRAPER - ALL CARDS FROM SLUGS
Created 5 commanders from slugs

🎯 Will scrape ALL CARDS for 5 commanders
⏱️  Estimated time: 0.2 minutes
💾 Progress will be saved every 50 commanders

📋 First 5 commanders to scrape:
  1. The Ur Dragon (the-ur-dragon)
  2. Atraxa Praetors Voice (atraxa-praetors-voice)
  3. Edgar Markov (edgar-markov)
  4. Krenko Mob Boss (krenko-mob-boss)
  5. Sauron The Dark Lord (sauron-the-dark-lord)

SCRAPING ALL CARDS
Scraping 1/5: The Ur Dragon
    Found 258 card containers
    Cards by section: {'artifacts': 32, 'battles': 1, 'creatures': 50, 'enchantments': 29, 'instants': 27, 'lands': 59, 'planeswalkers': 7, 'sorceries': 17, 'unknown': 35}
    Total cards extracted: 257
  ✓ Found 257 cards total
    Top synergy cards:
      - Miirym, Sentinel Wyrm (unknown): +69% synergy
      - Dragon Tempest (unknown): +68% synergy
      - Crux of Fate (unknow