In [9]:
import pandas as pd
import numpy as np
from collections import Counter
import re
from sklearn.feature_extraction.text import CountVectorizer

# Load our data
df_clean = pd.read_csv('data/mtg_cards_clean.csv', low_memory=False)

In [2]:
# Let's create a much more comprehensive keyword list combining what we found + manual additions
def create_comprehensive_keyword_list():
    """Create comprehensive 1-3 word MTG keyword list"""
    
    # Start with our discovered high-frequency phrases
    discovered_phrases = [
        'creature', 'this creature', 'you control', 'enters', 'damage', 'end of turn',
        'until end of turn', 'target creature', 'spell', 'counter', 'flying', 'graveyard',
        'artifact', 'damage to', 'sacrifice', 'token', 'when this creature', 'draw',
        'mana', 'creature enters', 'this creature enters', 'exile', 'your graveyard',
        'counter on', 'creature gets', 'your library', 'creature you', 'battlefield',
        'the battlefield', 'deals damage', 'beginning', 'beginning of', 'at the beginning',
        'draw card', 'your hand', 'this turn', 'return', 'creature you control',
        'deals damage to', 'card from', 'this spell', 'this card', 'cast this', 'combat'
    ]
    
    # Add essential combat abilities
    combat_abilities = [
        'flying', 'trample', 'vigilance', 'deathtouch', 'lifelink', 'haste',
        'first strike', 'double strike', 'hexproof', 'indestructible', 'defender',
        'reach', 'menace', 'protection', 'prowess', 'flash', 'crew'
    ]
    
    # Add important multi-word combat phrases
    combat_phrases = [
        'first strike', 'double strike', 'combat damage', 'attacking creature',
        'blocking creature', 'deals combat damage', 'when attacks', 'when blocks',
        'whenever attacks', 'whenever blocks', 'attack alone', 'can attack',
        'must attack', 'cannot attack', 'can block', 'cannot block'
    ]
    
    # Add triggered ability patterns
    triggered_abilities = [
        'whenever', 'when enters', 'when dies', 'when attacks', 'when blocks',
        'at the beginning', 'at end of', 'whenever you cast', 'whenever you draw',
        'whenever deals damage', 'whenever takes damage', 'when you gain life',
        'when you lose life', 'whenever discards', 'whenever sacrifices'
    ]
    
    # Add important card type combinations
    card_types = [
        'artifact creature', 'enchantment creature', 'legendary creature',
        'legendary artifact', 'legendary enchantment', 'legendary planeswalker',
        'basic land', 'nonbasic land', 'creature token', 'artifact token',
        'instant spell', 'sorcery spell', 'creature spell', 'noncreature spell'
    ]
    
    # Add mana and cost related phrases
    mana_phrases = [
        'mana cost', 'mana value', 'converted mana', 'additional cost',
        'without paying', 'costs less', 'costs more', 'add mana',
        'pay mana', 'spend mana', 'colorless mana', 'colored mana'
    ]
    
    # Add +1/+1 counter and proliferate synergies  
    counter_phrases = [
        'counter', 'counters', '+1/+1 counter', '-1/-1 counter', 'loyalty counter',
        'charge counter', 'time counter', 'proliferate', 'put counter',
        'remove counter', 'counter on it', 'with counters', 'number of counters'
    ]
    
    # Add tribal synergies
    tribal_keywords = [
        'human', 'elf', 'goblin', 'zombie', 'vampire', 'dragon', 'angel', 'demon',
        'beast', 'spirit', 'elemental', 'wizard', 'warrior', 'soldier', 'knight',
        'rogue', 'cleric', 'shaman', 'merfolk', 'insect', 'cat', 'dog', 'bird'
    ]
    
    # Add zone and library manipulation
    zone_phrases = [
        'from graveyard', 'to graveyard', 'from exile', 'to exile',
        'from your hand', 'to your hand', 'from library', 'search library',
        'shuffle library', 'top of library', 'bottom of library',
        'enters battlefield', 'leaves battlefield', 'return to battlefield'
    ]
    
    # Add sacrifice and destruction themes
    sacrifice_phrases = [
        'sacrifice', 'destroy', 'destroy target', 'sacrifice creature',
        'sacrifice artifact', 'sacrifice enchantment', 'sacrifice land',
        'when sacrificed', 'when destroyed', 'destroy all', 'sacrifice all'
    ]
    
    # Add draw and card advantage
    card_advantage = [
        'draw card', 'draw cards', 'draw additional', 'discard card',
        'discard cards', 'reveal card', 'reveal cards', 'look at',
        'search for', 'return to hand', 'mill cards', 'exile cards'
    ]
    
    # Combine all lists
    comprehensive_keywords = (
        discovered_phrases + combat_abilities + combat_phrases + 
        triggered_abilities + card_types + mana_phrases + 
        counter_phrases + tribal_keywords + zone_phrases + 
        sacrifice_phrases + card_advantage
    )
    
    # Remove duplicates and sort
    comprehensive_keywords = sorted(list(set(comprehensive_keywords)))
    
    return comprehensive_keywords

# Create the comprehensive keyword list
comprehensive_keywords = create_comprehensive_keyword_list()
print(f"Comprehensive keyword list: {len(comprehensive_keywords)} total keywords")

# Show categories
print("\n📝 Sample keywords by category:")
print("Combat:", [k for k in comprehensive_keywords if any(word in k for word in ['strike', 'combat', 'attack', 'block', 'flying', 'trample'])][:10])
print("Triggered:", [k for k in comprehensive_keywords if any(word in k for word in ['when', 'whenever', 'beginning', 'end'])][:8])
print("Counters:", [k for k in comprehensive_keywords if 'counter' in k][:8])
print("Tribal:", [k for k in comprehensive_keywords if k in ['human', 'elf', 'goblin', 'zombie', 'vampire', 'dragon', 'angel']])

print(f"\n🎯 Total comprehensive keywords: {len(comprehensive_keywords)}")

Comprehensive keyword list: 182 total keywords

📝 Sample keywords by category:
Combat: ['attack alone', 'attacking creature', 'blocking creature', 'can attack', 'can block', 'cannot attack', 'cannot block', 'combat', 'combat damage', 'deals combat damage']
Triggered: ['at end of', 'at the beginning', 'beginning', 'beginning of', 'defender', 'end of turn', 'legendary artifact', 'legendary creature']
Counters: ['+1/+1 counter', '-1/-1 counter', 'charge counter', 'counter', 'counter on', 'counter on it', 'counters', 'loyalty counter']
Tribal: ['angel', 'dragon', 'elf', 'goblin', 'human', 'vampire', 'zombie']

🎯 Total comprehensive keywords: 182


In [3]:
import pandas as pd
import numpy as np
from collections import Counter
import re
from sklearn.feature_extraction.text import CountVectorizer

# Load our data
df_clean = pd.read_csv('data/mtg_cards_clean.csv', low_memory=False)

def extract_mtg_ngrams(df, min_frequency=20):
    """Extract 1-3 word MTG phrases that appear frequently"""
    
    # Prepare documents (each card's oracle text as separate document)
    documents = []
    for oracle_text in df['oracle_text'].fillna(''):
        # Clean text for better phrase extraction
        cleaned = re.sub(r'\{[^}]*\}', '', oracle_text)  # Remove mana symbols
        cleaned = re.sub(r'[(),]', ' ', cleaned)         # Replace punctuation
        cleaned = re.sub(r'\s+', ' ', cleaned.strip())   # Normalize whitespace
        if cleaned:  # Only add non-empty documents
            documents.append(cleaned)
    
    print(f"Processing {len(documents)} card texts...")
    
    all_phrases = {}
    
    # Extract n-grams of different sizes
    for n in [1, 2, 3]:
        print(f"Extracting {n}-grams...")
        
        try:
            vectorizer = CountVectorizer(
                ngram_range=(n, n),
                lowercase=True,
                min_df=min_frequency,      # Must appear in at least this many cards
                max_df=0.8,                # But not in more than 80% of cards (too common)
                max_features=300           # Top 300 for each n-gram size
            )
            
            X = vectorizer.fit_transform(documents)
            feature_names = vectorizer.get_feature_names_out()
            
            # Sum frequencies across all documents
            frequencies = X.toarray().sum(axis=0)
            
            for phrase, freq in zip(feature_names, frequencies):
                if freq >= min_frequency:
                    all_phrases[phrase] = freq
                    
            print(f"  Found {len([f for f in frequencies if f >= min_frequency])} {n}-grams")
            
        except Exception as e:
            print(f"  Error extracting {n}-grams: {e}")
    
    return all_phrases

def filter_meaningful_phrases(phrases):
    """Filter for meaningful MTG phrases"""
    
    # Generic words/phrases to exclude
    exclude_words = {
        'the', 'and', 'or', 'of', 'to', 'in', 'on', 'at', 'by', 'for', 'with', 
        'from', 'up', 'about', 'into', 'through', 'a', 'an', 'this', 'that', 
        'these', 'those', 'it', 'its', 'you', 'your', 'each', 'all', 'any',
        'when', 'where', 'why', 'how', 'if', 'then', 'else', 'until', 'while',
        'may', 'can', 'could', 'would', 'should', 'must', 'will', 'shall',
        'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine', 'ten',
        'put', 'get', 'add', 'has', 'have', 'had', 'been', 'being', 'do', 'does', 'did'
    }
    
    # MTG-relevant keywords that make phrases valuable
    mtg_keywords = {
        'creature', 'spell', 'mana', 'damage', 'combat', 'token', 'artifact', 
        'enchantment', 'battlefield', 'graveyard', 'exile', 'flying', 'trample', 
        'vigilance', 'deathtouch', 'lifelink', 'counter', 'destroy', 'sacrifice', 
        'draw', 'search', 'return', 'enters', 'dies', 'attacks', 'blocks', 'tap',
        'untap', 'equipped', 'legendary', 'planeswalker', 'instant', 'sorcery',
        'flash', 'reach', 'menace', 'indestructible', 'hexproof', 'proliferate'
    }
    
    filtered = {}
    
    for phrase, freq in phrases.items():
        words = phrase.split()
        
        # Skip if phrase is just generic words
        if all(word in exclude_words for word in words):
            continue
        
        # Keep if contains MTG keywords or is multi-word (usually more specific)
        has_mtg_content = (
            any(word in mtg_keywords for word in words) or
            len(words) > 1 or
            any(word.endswith('ing') for word in words)  # Actions like "attacking"
        )
        
        if has_mtg_content and len(phrase) > 2:
            filtered[phrase] = freq
    
    return filtered

# Extract phrases
print("Extracting MTG phrases (1-3 words)...")
all_phrases = extract_mtg_ngrams(df_clean, min_frequency=15)  # Lower threshold
print(f"Found {len(all_phrases)} total phrases")

# Filter for meaningful ones
meaningful_phrases = filter_meaningful_phrases(all_phrases)
print(f"Filtered to {len(meaningful_phrases)} meaningful MTG phrases")

# Show top phrases
if meaningful_phrases:
    sorted_phrases = sorted(meaningful_phrases.items(), key=lambda x: x[1], reverse=True)
    print(f"\nTop 50 MTG phrases:")
    print("="*70)
    for i, (phrase, freq) in enumerate(sorted_phrases[:50]):
        print(f"{i+1:2d}. {phrase:40} (appears {freq:4d} times)")
else:
    print("No meaningful phrases found - may need to adjust parameters")

Extracting MTG phrases (1-3 words)...
Processing 27290 card texts...
Extracting 1-grams...
  Found 300 1-grams
Extracting 2-grams...
  Found 300 2-grams
Extracting 3-grams...
  Found 300 3-grams
Found 900 total phrases
Filtered to 581 meaningful MTG phrases

Top 50 MTG phrases:
 1. creature                                 (appears 33052 times)
 2. this creature                            (appears 12952 times)
 3. you control                              (appears 7934 times)
 4. enters                                   (appears 6445 times)
 5. damage                                   (appears 5785 times)
 6. end of                                   (appears 5489 times)
 7. until end                                (appears 5284 times)
 8. until end of                             (appears 5284 times)
 9. of turn                                  (appears 5277 times)
10. end of turn                              (appears 5276 times)
11. target creature                          (appears 5275 

In [4]:
"""
Integration guide for enhancing your MTG Commander recommender with expanded keywords
"""

import re
import json
from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

class EnhancedMTGFeatureExtractor:
    def __init__(self):
        # Load your existing keywords and add the new comprehensive set
        self.load_keyword_database()
        
    def load_keyword_database(self):
        """Load comprehensive keyword database with weights"""
        
        # Your existing keywords + new comprehensive set
        self.keyword_weights = {
            # Evergreen abilities
            'flying': 1.2, 'trample': 1.3, 'hexproof': 1.8, 'indestructible': 2.0,
            'deathtouch': 1.4, 'lifelink': 1.2, 'first strike': 1.2, 'double strike': 1.7,
            'vigilance': 1.1, 'haste': 1.1, 'reach': 1.0, 'menace': 1.2, 'ward': 1.5,
            
            # High-impact mechanics (Commander-focused)
            'partner': 3.0, 'eminence': 2.8, 'proliferate': 2.5, 'cascade': 2.5,
            'storm': 2.8, 'convoke': 1.8, 'delve': 1.6, 'affinity': 1.9,
            
            # ETB effects (very common in Commander)
            'enters the battlefield': 1.8, 'when this enters the battlefield': 1.8,
            'when enters the battlefield': 1.8, 'whenever a creature enters': 1.6,
            'enters the battlefield tapped': 1.2, 'enters the battlefield with': 1.5,
            
            # Combat damage triggers
            'deals combat damage': 1.6, 'combat damage to a player': 1.7,
            'combat damage to an opponent': 1.7, 'deals damage equal to its power': 1.5,
            'when this creature deals combat damage': 1.6,
            
            # Death and sacrifice synergies
            'when this creature dies': 1.6, 'whenever a creature dies': 1.7,
            'sacrifice a creature': 1.5, 'sacrifice an artifact': 1.3,
            'dies or is put into exile': 1.4, 'when dies': 1.6,
            
            # Counter synergies (huge in Commander)
            '+1/+1 counter': 2.0, '+1/+1 counters': 2.0, 'proliferate': 2.5,
            'loyalty counter': 1.8, 'experience counter': 2.2, 'poison counter': 2.0,
            'charge counter': 1.4, 'time counter': 1.3, 'energy counter': 1.5,
            
            # Spell synergies
            'whenever you cast a spell': 1.8, 'instant or sorcery': 1.6,
            'noncreature spell': 1.5, 'prowess': 1.6, 'magecraft': 1.7,
            'storm': 2.8, 'flashback': 1.5, 'buyback': 1.6,
            
            # Token synergies
            'create a token': 1.5, 'token creature': 1.4, 'populate': 1.8,
            'create a treasure token': 1.6, 'create a clue token': 1.3,
            
            # Mana and ramp
            'add mana': 1.4, 'mana of any color': 1.5, 'search for a basic land': 1.6,
            'landfall': 1.7, 'additional land drop': 1.8, 'ramp': 1.6,
            
            # Card advantage
            'draw a card': 1.7, 'draw two cards': 1.8, 'draw cards': 1.6,
            'scry': 1.2, 'surveil': 1.3, 'look at the top': 1.1,
            
            # Protection and evasion
            'can\'t be targeted': 1.8, 'can\'t be blocked': 1.6, 'can\'t be countered': 1.7,
            'protection from': 1.6, 'unblockable': 1.5, 'shadow': 1.4,
            
            # Graveyard synergies
            'return from your graveyard': 1.6, 'cast from your graveyard': 1.7,
            'graveyard to the battlefield': 1.8, 'mill': 1.4, 'dredge': 1.6,
            
            # Win conditions
            'you win the game': 3.0, 'commander damage': 2.5, 'poison counters': 2.2,
            'laboratory maniac': 2.5, 'thassa\'s oracle': 2.5,
            
            # Tribal support
            'choose a creature type': 1.8, 'shares a creature type': 1.6,
            'other': 1.2,  # for "other elves", "other vampires", etc.
            
            # Equipment and auras
            'equip': 1.4, 'equipped creature': 1.3, 'enchant creature': 1.3,
            'aura': 1.4, 'equipment': 1.4, 'attach': 1.2,
            
            # Planeswalker synergies
            'loyalty ability': 1.6, 'planeswalker': 1.5, 'loyalty counter': 1.8,
            
            # Common creature types (tribal)
            'human': 1.3, 'elf': 1.4, 'goblin': 1.4, 'zombie': 1.5, 'vampire': 1.4,
            'angel': 1.3, 'demon': 1.3, 'dragon': 1.5, 'wizard': 1.3, 'soldier': 1.2,
            
            # Activated abilities
            'tap:': 1.1, 'sacrifice:': 1.3, 'pay': 1.0, 'discard a card:': 1.2,
            
            # Turn timing
            'at the beginning of your upkeep': 1.4, 'at the beginning of each upkeep': 1.5,
            'at the beginning of your end step': 1.3, 'once each turn': 1.6,
            
            # Common effects (lower weight but still important)
            'target': 0.8, 'destroy': 1.2, 'exile': 1.3, 'counter': 1.4,
            'prevent': 1.1, 'redirect': 1.0, 'choose': 0.9
        }
        
        # Regex patterns for complex phrase matching
        self.regex_patterns = {
            'etb_creature': r'\bwhenever (?:a|another) creature enters the battlefield\b',
            'etb_self': r'\bwhen (?:this )?(?:creature )?enters the battlefield\b',
            'combat_damage': r'\bdeals combat damage to (?:a player|an opponent)\b',
            'creature_dies': r'\bwhenever (?:a|another) creature dies\b',
            'cast_spell': r'\bwhenever you cast (?:a |an )?(?:\w+ )?spell\b',
            'beginning_upkeep': r'\bat the beginning of (?:your|each) upkeep\b',
            'counter_type': r'\b(?:\+1/\+1|loyalty|charge|time|energy|experience|poison) counters?\b',
            'token_creation': r'\bcreate (?:a |an |two |three )?\w+ (?:\w+ )?tokens?\b',
            'tribal_other': r'\bother \w+s?\b',  # "other elves", "other goblins"
            'mana_cost_reduction': r'\bcosts? \{?\w+\}? less to cast\b',
            'protection_from': r'\bprotection from \w+\b'
        }
    
    def extract_features_with_regex(self, oracle_text: str) -> dict:
        """Extract features using both keyword matching and regex patterns"""
        if not oracle_text:
            return {}
        
        text_lower = oracle_text.lower()
        features = {}
        
        # Extract simple keyword matches
        for keyword, weight in self.keyword_weights.items():
            # Use word boundaries to avoid partial matches
            pattern = r'\b' + re.escape(keyword.lower()) + r'\b'
            matches = len(re.findall(pattern, text_lower))
            if matches > 0:
                features[keyword] = matches * weight
        
        # Extract complex patterns with regex
        for pattern_name, pattern in self.regex_patterns.items():
            matches = len(re.findall(pattern, text_lower))
            if matches > 0:
                # Use pattern name as feature with default weight
                features[pattern_name] = matches * 1.5
        
        return features
    
    def create_enhanced_feature_vector(self, cards_df: pd.DataFrame) -> pd.DataFrame:
        """Create feature vectors for all cards with enhanced keyword extraction"""
        
        # Extract features for all cards
        all_features = []
        for idx, row in cards_df.iterrows():
            card_features = self.extract_features_with_regex(row.get('oracle_text', ''))
            card_features['card_name'] = row['name']
            all_features.append(card_features)
        
        # Convert to DataFrame
        features_df = pd.DataFrame(all_features).fillna(0)
        
        # Set card_name as index
        features_df = features_df.set_index('card_name')
        
        return features_df
    
    def calculate_commander_synergy_score(self, commander_features: dict, card_features: dict) -> float:
        """Calculate synergy score between commander and a potential card"""
        
        # Base cosine similarity
        commander_vector = list(commander_features.values())
        card_vector = [card_features.get(key, 0) for key in commander_features.keys()]
        
        if not commander_vector or not any(card_vector):
            return 0.0
        
        # Calculate cosine similarity
        dot_product = sum(a * b for a, b in zip(commander_vector, card_vector))
        magnitude_commander = sum(a * a for a in commander_vector) ** 0.5
        magnitude_card = sum(b * b for b in card_vector) ** 0.5
        
        if magnitude_commander == 0 or magnitude_card == 0:
            return 0.0
        
        base_similarity = dot_product / (magnitude_commander * magnitude_card)
        
        # Apply Commander-specific bonuses
        synergy_bonus = 0.0
        
        # Bonus for shared high-value mechanics
        high_value_shared = set(commander_features.keys()) & set(card_features.keys())
        for shared_keyword in high_value_shared:
            weight = self.keyword_weights.get(shared_keyword, 1.0)
            if weight >= 1.5:  # High-value mechanics
                synergy_bonus += 0.1 * weight
        
        # Bonus for complementary strategies
        if 'proliferate' in commander_features and any('+1/+1 counter' in k for k in card_features.keys()):
            synergy_bonus += 0.2
        
        if 'sacrifice' in str(commander_features) and 'dies' in str(card_features):
            synergy_bonus += 0.15
        
        if 'token' in str(commander_features) and 'create' in str(card_features):
            synergy_bonus += 0.1
        
        return min(1.0, base_similarity + synergy_bonus)
    
    def get_recommendations_for_commander(self, commander_name: str, commander_text: str, 
                                        cards_df: pd.DataFrame, num_recommendations: int = 10) -> list:
        """Get card recommendations for a specific commander"""
        
        # Extract commander features
        commander_features = self.extract_features_with_regex(commander_text)
        
        if not commander_features:
            return []
        
        # Calculate synergy scores for all cards
        recommendations = []
        
        for idx, row in cards_df.iterrows():
            if row['name'].lower() == commander_name.lower():
                continue  # Skip the commander itself
            
            card_features = self.extract_features_with_regex(row.get('oracle_text', ''))
            synergy_score = self.calculate_commander_synergy_score(commander_features, card_features)
            
            if synergy_score > 0:
                recommendations.append({
                    'name': row['name'],
                    'synergy_score': synergy_score,
                    'shared_keywords': list(set(commander_features.keys()) & set(card_features.keys())),
                    'price': row.get('price', 0),
                    'type_line': row.get('type_line', ''),
                    'mana_cost': row.get('mana_cost', '')
                })
        
        # Sort by synergy score and return top recommendations
        recommendations.sort(key=lambda x: x['synergy_score'], reverse=True)
        return recommendations[:num_recommendations]
    
    def analyze_deck_composition(self, deck_cards: list, cards_df: pd.DataFrame) -> dict:
        """Analyze the current composition of a deck and suggest improvements"""
        
        deck_analysis = {
            'themes': defaultdict(int),
            'mana_curve': defaultdict(int),
            'card_types': defaultdict(int),
            'missing_categories': []
        }
        
        # Analyze existing cards
        for card_name in deck_cards:
            card_data = cards_df[cards_df['name'] == card_name]
            if not card_data.empty:
                card_features = self.extract_features_with_regex(card_data.iloc[0].get('oracle_text', ''))
                
                # Categorize themes
                for feature in card_features.keys():
                    if 'counter' in feature:
                        deck_analysis['themes']['counters'] += 1
                    elif 'token' in feature:
                        deck_analysis['themes']['tokens'] += 1
                    elif 'sacrifice' in feature or 'dies' in feature:
                        deck_analysis['themes']['aristocrats'] += 1
                    elif 'spell' in feature or 'prowess' in feature:
                        deck_analysis['themes']['spellslinger'] += 1
        
        # Suggest missing elements
        essential_categories = ['ramp', 'removal', 'card_draw', 'win_conditions']
        # Implementation for detecting missing categories would go here
        
        return deck_analysis


def integrate_with_existing_system():
    """
    Integration example for your existing Flask app
    """
    
    # Initialize the enhanced extractor
    extractor = EnhancedMTGFeatureExtractor()
    
    # Example integration with your app.py
    example_integration = """
    # In your app.py, replace your current feature extraction:
    
    from enhanced_keyword_extractor import EnhancedMTGFeatureExtractor
    
    # Initialize once when app starts
    feature_extractor = EnhancedMTGFeatureExtractor()
    
    @app.route('/recommend', methods=['POST'])
    def recommend_cards():
        commander_name = request.json.get('commander')
        commander_data = get_commander_data(commander_name)  # Your existing function
        
        # Use enhanced feature extraction
        recommendations = feature_extractor.get_recommendations_for_commander(
            commander_name=commander_name,
            commander_text=commander_data['oracle_text'],
            cards_df=your_cards_dataframe,
            num_recommendations=20
        )
        
        return jsonify(recommendations)
    """
    
    return example_integration

# Example usage
if __name__ == "__main__":
    extractor = EnhancedMTGFeatureExtractor()
    
    # Test with some example commander text
    atraxa_text = "Flying, vigilance, deathtouch, lifelink. At the beginning of your end step, proliferate."
    atraxa_features = extractor.extract_features_with_regex(atraxa_text)
    
    print("Atraxa features:", atraxa_features)
    
    # Test card for synergy
    test_card = "When this creature enters the battlefield, put a +1/+1 counter on target creature."
    test_features = extractor.extract_features_with_regex(test_card)
    
    synergy_score = extractor.calculate_commander_synergy_score(atraxa_features, test_features)
    print(f"Synergy score: {synergy_score}")

Atraxa features: {'flying': 1.2, 'deathtouch': 1.4, 'lifelink': 1.2, 'vigilance': 1.1, 'proliferate': 2.5, 'at the beginning of your end step': 1.3}
Synergy score: 0.0


In [14]:
# Let's create a much more comprehensive keyword list combining what we found + manual additions
def create_comprehensive_keyword_list():
    """Create comprehensive 1-3 word MTG keyword list"""
    
    # Start with your discovered high-frequency phrases
    discovered_phrases = [
        'creature', 'this creature', 'you control', 'enters', 'damage', 'end of turn',
        'until end of turn', 'target creature', 'spell', 'counter', 'flying', 'graveyard',
        'artifact', 'damage to', 'sacrifice', 'token', 'when this creature', 'draw',
        'mana', 'creature enters', 'this creature enters', 'exile', 'your graveyard',
        'counter on', 'creature gets', 'your library', 'creature you', 'battlefield',
        'the battlefield', 'deals damage', 'beginning', 'beginning of', 'at the beginning',
        'draw card', 'your hand', 'this turn', 'return', 'creature you control',
        'deals damage to', 'card from', 'this spell', 'this card', 'cast this', 'combat'
    ]
    
    # Add essential combat abilities
    combat_abilities = [
        'flying', 'trample', 'vigilance', 'deathtouch', 'lifelink', 'haste',
        'first strike', 'double strike', 'hexproof', 'indestructible', 'defender',
        'reach', 'menace', 'protection', 'prowess', 'flash', 'crew', 'ward', 'attacking'
    ]
    
    # Add important multi-word combat phrases
    combat_phrases = [
        'first strike', 'double strike', 'combat damage', 'attacking creature',
        'blocking creature', 'deals combat damage', 'when attacks', 'when blocks',
        'whenever attacks', 'whenever blocks', 'attack alone', 'can attack',
        'must attack', 'cannot attack', 'can block', 'cannot block',
        'when this creature attacks', 'whenever this creature attacks',
        'when this creature becomes blocked', 'whenever this creature becomes blocked',
        'when this creature deals combat damage', 'whenever this creature deals combat damage',
        'combat damage to a player', 'combat damage to an opponent',
        'unblocked creature', 'attacking or blocking', 'deals damage to a creature',
        'deals damage to a player', 'deals damage equal to its power', 'can\'t be blocked',
        'must be blocked', 'blocks or becomes blocked', 'first strike damage', 'double strike damage'
    ]
    
    # Add triggered ability patterns
    triggered_abilities = [
        'whenever', 'when enters', 'when dies', 'when attacks', 'when blocks',
        'at the beginning', 'at end of', 'whenever you cast', 'whenever you draw',
        'whenever deals damage', 'whenever takes damage', 'when you gain life',
        'when you lose life', 'whenever discards', 'whenever sacrifices',
        'when enters', 'when this enters', 'when enters the battlefield',
        'whenever enters', 'whenever this enters', 'whenever enters the battlefield',
        'when leaves', 'when this leaves', 'when leaves the battlefield',
        'whenever leaves', 'whenever this leaves', 'whenever leaves the battlefield',
        'when dies', 'when this dies', 'whenever dies', 'whenever this dies',
        'at the beginning of', 'at the end of', 'during each',
        'when you cast', 'whenever you cast', 'when you draw',
        'whenever you draw', 'when you gain life', 'whenever you gain life',
        'when you lose life', 'whenever you lose life',
        'when becomes tapped', 'whenever becomes tapped',
        'when becomes untapped', 'whenever becomes untapped'
    ]
    
    # Add important card type combinations
    card_types = [
        'artifact creature', 'enchantment creature', 'legendary creature',
        'legendary artifact', 'legendary enchantment', 'legendary planeswalker',
        'basic land', 'nonbasic land', 'creature token', 'artifact token',
        'instant spell', 'sorcery spell', 'creature spell', 'noncreature spell',
        'instant', 'sorcery', 'enchantment', 'artifact', 'planeswalker',
        'land', 'creature', 'tribal', 'legendary', 'basic', 'snow',
        'legendary sorcery'
    ]
    
    # Add mana and cost related phrases
    mana_phrases = [
        'mana cost', 'mana value', 'converted mana', 'additional cost',
        'without paying', 'costs less', 'costs more', 'add mana',
        'pay mana', 'spend mana', 'colorless mana', 'colored mana',
        'add mana', 'mana of any color', 'treasure token',
        'food token', 'clue token', 'blood token', 'gold token',
        'draw a card', 'draw cards', 'draw two cards', 'draw three cards',
        'discard a card', 'discard cards', 'exile a card', 'exile cards',
        'return to hand', 'return from graveyard', 'search your library',
        'costs less to cast', 'reduce the cost', 'without paying',
        'mana cost', 'converted mana cost', 'mana value'
    ]
    
    # Add +1/+1 counter and proliferate synergies  
    counter_phrases = [
        'counter', 'counters', '+1/+1 counter', '-1/-1 counter', 'loyalty counter',
        'charge counter', 'time counter', 'proliferate', 'put counter',
        'remove counter', 'counter on it', 'with counters', 'number of counters',
        '+1/+1 counters', 'charge counters', 'loyalty counters', 'time counters',
        'age counter', 'age counters', 'poison counter', 'poison counters',
        'energy counter', 'energy counters', 'experience counter', 'experience counters',
        'oil counter', 'oil counters', 'shield counter', 'shield counters',
        'stun counter', 'stun counters', 'remove a counter', 'place a counter',
        'put a counter', 'with counters', 'counter on it', 'counters on it',
        'doubling season', 'hardened scales'
    ]
    
    # Add tribal synergies
    tribal_keywords = [
        'human', 'elf', 'goblin', 'zombie', 'vampire', 'dragon', 'angel', 'demon',
        'beast', 'spirit', 'elemental', 'wizard', 'warrior', 'soldier', 'knight',
        'rogue', 'cleric', 'shaman', 'merfolk', 'insect', 'cat', 'dog', 'bird',
        'treefolk', 'avatar', 'horror', 'eldrazi', 'phyrexian', 'sliver', 'ally',
        'fish', 'spider', 'snake', 'dinosaur', 'giant', 'faerie'
    ]
    
    # Add zone and library manipulation
    zone_phrases = [
        'from graveyard', 'to graveyard', 'from exile', 'to exile',
        'from your hand', 'to your hand', 'from library', 'search library',
        'shuffle library', 'top of library', 'bottom of library',
        'enters battlefield', 'leaves battlefield', 'return to battlefield',
        'enters from graveyard', 'return from graveyard', 'cast from graveyard',
        'graveyard to battlefield', 'mill', 'self-mill', 'dredge',
        'delve', 'escape', 'flashback', 'unearth', 'reanimate',
        'graveyard matters', 'threshold', 'delirium', 'undergrowth'
    ]
    
    # Add sacrifice and destruction themes
    sacrifice_phrases = [
        'sacrifice', 'destroy', 'destroy target', 'sacrifice creature',
        'sacrifice artifact', 'sacrifice enchantment', 'sacrifice land',
        'when sacrificed', 'when destroyed', 'destroy all', 'sacrifice all'
    ]
    
    # Add draw and card advantage
    card_advantage = [
        'draw card', 'draw cards', 'draw additional', 'discard card',
        'discard cards', 'reveal card', 'reveal cards', 'look at',
        'search for', 'return to hand', 'mill cards', 'exile cards',
        'scry', 'surveil', 'look at the top card', 'look at the top X cards'
    ]
    
    # Add evergreen keywords
    evergreen_keywords = [
        'deathtouch', 'defender', 'double strike', 'enchant', 'equip', 
        'first strike', 'flash', 'flying', 'haste', 'hexproof', 'indestructible',
        'lifelink', 'menace', 'protection', 'reach', 'trample', 'vigilance', 'ward'
    ]
    
    # Add ability words (italicized, set themes)
    ability_words = [
        'addendum', 'alliance', 'battalion', 'bloodrush', 'channel', 'chroma',
        'cohort', 'constellation', 'converge', 'council\'s dilemma', 'delirium',
        'domain', 'eminence', 'enrage', 'fateful hour', 'ferocious', 'formidable',
        'grandeur', 'hellbent', 'heroic', 'imprint', 'inspired', 'join forces',
        'kinship', 'landfall', 'lieutenant', 'metalcraft', 'morbid', 'parley',
        'radiance', 'raid', 'rally', 'revolt', 'spell mastery', 'strive',
        'sweep', 'tempting offer', 'threshold', 'undergrowth', 'will of the council'
    ]
    
    # Add keyword abilities (rules text)
    keyword_abilities = [
        'absorb', 'affinity', 'amplify', 'annihilator', 'ascend', 'assist', 'awaken',
        'backup', 'banding', 'basic landcycling', 'bestow', 'bloodthirst', 'bushido',
        'buyback', 'cascade', 'champion', 'changeling', 'cipher', 'clash', 'cleave',
        'conspire', 'convoke', 'crew', 'cumulative upkeep', 'cycling', 'dash',
        'daybound', 'delve', 'demonstrate', 'dethrone', 'devoid', 'devour', 'disturb',
        'dredge', 'echo', 'emerge', 'embalm', 'encore', 'entwine', 'epic', 'escape',
        'eternalize', 'evoke', 'evolve', 'exalted', 'exploit', 'explore', 'extort',
        'fabricate', 'fading', 'flanking', 'flashback', 'forecast', 'fortify',
        'frenzy', 'graft', 'gravestorm', 'habitat', 'hideaway', 'horsemanship',
        'improvise', 'infect', 'intimidate', 'investigate', 'jump-start', 'kicker',
        'learn', 'level up', 'living weapon', 'madness', 'melee', 'mentor', 'miracle',
        'modular', 'morph', 'multikicker', 'mutate', 'nightbound', 'ninjutsu',
        'offering', 'outlast', 'overload', 'partner', 'persist', 'phasing', 'plainscycling',
        'poisonous', 'proliferate', 'provoke', 'prowess', 'rampage', 'rebound',
        'reconfigure', 'recover', 'reinforce', 'renown', 'replicate', 'retrace',
        'riot', 'ripple', 'scavenge', 'shadow', 'shroud', 'soulbond', 'soulshift',
        'splice', 'split second', 'storm', 'sunburst', 'suspend', 'totem armor',
        'training', 'transform', 'transmute', 'tribute', 'undaunted', 'undying',
        'unearth', 'unleash', 'vanishing', 'venture', 'wither'
    ]
    
    # Add activated ability patterns
    activated_abilities = [
        'tap:', 'untap:', 'sacrifice', 'pay', 'discard', 'exile',
        'return to hand', 'put into graveyard', 'search your library',
        'shuffle your library', 'reveal', 'look at', 'choose',
        'target', 'destroy', 'counter', 'prevent', 'redirect'
    ]
    
    # Add win conditions and powerful effects
    wincon_phrases = [
        'you win the game', 'target player loses the game',
        'loses the game', 'wins the game', 'can\'t lose the game',
        'can\'t win the game', 'alternate win condition',
        'infinite combo', 'goes infinite', 'commander damage',
        'poison counters', 'mill', 'laboratory maniac',
        'thassa\'s oracle', 'approach of the second sun'
    ]
    
    # Add protection and defensive phrases
    protection_phrases = [
        'protection from', 'can\'t be targeted', 'can\'t be blocked',
        'can\'t be countered', 'can\'t be destroyed', 'prevent all damage',
        'prevent damage', 'redirect damage', 'damage prevention',
        'shroud', 'hexproof', 'ward', 'indestructible', 'regenerate'
    ]
    
    # Add artifact synergies
    artifact_synergies = [
        'artifact', 'artifacts', 'equipment', 'vehicle', 'treasure',
        'affinity for artifacts', 'artifact creature', 'artifact token',
        'metalcraft', 'improvise', 'crew', 'equip', 'attach',
        'artifact enters', 'artifact dies', 'noncreature artifact'
    ]
    
    # Add enchantment synergies
    enchantment_synergies = [
        'enchantment', 'enchantments', 'aura', 'saga', 'curse',
        'enchant', 'enchanted', 'constellation', 'enchantment creature',
        'enchantment token', 'bestow', 'totem armor'
    ]
    
    # Add planeswalker synergies
    planeswalker_synergies = [
        'planeswalker', 'planeswalkers', 'loyalty', 'loyalty counter',
        'activate loyalty abilities', 'superfriends', 'spark double',
        'doubling season', 'proliferate', 'plus ability', 'minus ability',
        'ultimate ability', 'loyalty ability'
    ]
    
    # Add land synergies
    land_synergies = [
        'land', 'lands', 'basic land', 'nonbasic land', 'landfall',
        'enters tapped', 'sacrifice a land', 'search for a land',
        'land enters', 'land drops', 'additional land', 'extra land',
        'ramp', 'mana dork', 'mana rock', 'land ramp'
    ]
    
    # Add spellslinger themes
    spellslinger_phrases = [
        'instant or sorcery', 'noncreature spell', 'spell', 'spells',
        'cast a spell', 'whenever you cast', 'prowess', 'storm',
        'spell mastery', 'magecraft', 'copy spell', 'fork',
        'flashback', 'buyback', 'rebound', 'cipher'
    ]
    
    # Add ETB (Enter the Battlefield) variations
    etb_variations = [
        'enters the battlefield', 'when this enters the battlefield',
        'whenever a creature enters the battlefield', 'when another creature enters the battlefield',
        'enters the battlefield tapped', 'enters the battlefield with',
        'enters the battlefield under your control'
    ]
    
    # Add beginning/end step triggers
    step_triggers = [
        'at the beginning of your upkeep', 'at the beginning of each upkeep',
        'at the beginning of your end step', 'at the beginning of each end step',
        'at the beginning of combat', 'at the beginning of your draw step',
        'at the beginning of each player\'s upkeep', 'during each player\'s turn',
        'once each turn', 'during your turn', 'on your turn'
    ]
    
    # Add spell casting triggers
    spell_triggers = [
        'whenever you cast a spell', 'whenever you cast an instant or sorcery spell',
        'whenever you cast a creature spell', 'whenever you cast a noncreature spell',
        'whenever you cast your first spell', 'whenever you cast your second spell'
    ]
    
    # Add token creation
    token_creation = [
        'create a token', 'create a creature token', 'create a treasure token',
        'create a food token', 'create a clue token', 'create a blood token',
        'token creature', 'populate'
    ]
    
    # Add evasion abilities
    evasion_abilities = [
        'unblockable', 'can\'t be blocked', 'must be blocked',
        'shadow', 'fear', 'intimidate', 'skulk'
    ]
    
    # Add tribal support phrases
    tribal_support = [
        'other', 'choose a creature type', 'of the chosen type',
        'shares a creature type', 'tribal spell'
    ]
    
    # Add Commander-specific keywords
    commander_specific = [
        'partner', 'partner with', 'background', 'choose a background',
        'eminence', 'command zone', 'commander tax'
    ]
    
    # Add multiplayer politics
    multiplayer_politics = [
        'each opponent', 'each player', 'target opponent',
        'choose a player', 'vote', 'council\'s dilemma'
    ]
    
    # Combine all lists
    comprehensive_keywords = (
        discovered_phrases + combat_abilities + combat_phrases + 
        triggered_abilities + card_types + mana_phrases + 
        counter_phrases + tribal_keywords + zone_phrases + 
        sacrifice_phrases + card_advantage + evergreen_keywords +
        ability_words + keyword_abilities + activated_abilities +
        wincon_phrases + protection_phrases + artifact_synergies +
        enchantment_synergies + planeswalker_synergies + land_synergies +
        spellslinger_phrases + etb_variations + step_triggers +
        spell_triggers + token_creation + evasion_abilities +
        tribal_support + commander_specific + multiplayer_politics
    )
    
    # Remove duplicates and sort
    comprehensive_keywords = sorted(list(set(comprehensive_keywords)))
    
    return comprehensive_keywords

# Create the comprehensive keyword list
comprehensive_keywords = create_comprehensive_keyword_list()
print(f"Comprehensive keyword list: {len(comprehensive_keywords)} total keywords")

# Show categories
print("\n📝 Sample keywords by category:")
print("Combat:", [k for k in comprehensive_keywords if any(word in k for word in ['strike', 'combat', 'attack', 'block', 'flying', 'trample'])][:10])
print("Triggered:", [k for k in comprehensive_keywords if any(word in k for word in ['when', 'whenever', 'beginning', 'end'])][:8])
print("Counters:", [k for k in comprehensive_keywords if 'counter' in k][:8])
print("Tribal:", [k for k in comprehensive_keywords if k in ['human', 'elf', 'goblin', 'zombie', 'vampire', 'dragon', 'angel']])
print("ETB:", [k for k in comprehensive_keywords if 'enters' in k][:5])
print("Spellslinger:", [k for k in comprehensive_keywords if any(word in k for word in ['spell', 'cast', 'instant', 'sorcery'])][:5])
print("Win Conditions:", [k for k in comprehensive_keywords if any(word in k for word in ['win', 'lose', 'damage', 'poison'])][:5])

print(f"\n🎯 Total comprehensive keywords: {len(comprehensive_keywords)}")

# Optional: Save to file for easy import
def save_keywords_to_file(keywords, filename="mtg_comprehensive_keywords 20250610.txt"):
    """Save keywords to a text file for easy loading"""
    with open(filename, 'w') as f:
        for keyword in keywords:
            f.write(f"{keyword}\n")
    save_keywords_to_file(comprehensive_keywords)
    print(f"💾 Keywords saved to {filename}")

Comprehensive keyword list: 587 total keywords

📝 Sample keywords by category:
Combat: ['at the beginning of combat', 'attack alone', 'attacking', 'attacking creature', 'attacking or blocking', 'blocking creature', 'blocks or becomes blocked', 'can attack', 'can block', "can't be blocked"]
Triggered: ['addendum', 'ascend', 'at end of', 'at the beginning', 'at the beginning of', 'at the beginning of combat', 'at the beginning of each end step', "at the beginning of each player's upkeep"]
Counters: ['+1/+1 counter', '+1/+1 counters', '-1/-1 counter', 'age counter', 'age counters', "can't be countered", 'charge counter', 'charge counters']
Tribal: ['angel', 'dragon', 'elf', 'goblin', 'human', 'vampire', 'zombie']
ETB: ['artifact enters', 'creature enters', 'enters', 'enters battlefield', 'enters from graveyard']
Spellslinger: ['cast a spell', 'cast from graveyard', 'cast this', 'copy spell', 'costs less to cast']
Win Conditions: ['alternate win condition', "can't lose the game", "can't wi

In [15]:
# Let's create a much more comprehensive keyword list combining what we found + manual additions
def create_comprehensive_keyword_list():
    """Create comprehensive 1-3 word MTG keyword list"""
    
    # Start with your discovered high-frequency phrases
    discovered_phrases = [
        'creature', 'this creature', 'you control', 'enters', 'damage', 'end of turn',
        'until end of turn', 'target creature', 'spell', 'counter', 'flying', 'graveyard',
        'artifact', 'damage to', 'sacrifice', 'token', 'when this creature', 'draw',
        'mana', 'creature enters', 'this creature enters', 'exile', 'your graveyard',
        'counter on', 'creature gets', 'your library', 'creature you', 'battlefield',
        'the battlefield', 'deals damage', 'beginning', 'beginning of', 'at the beginning',
        'draw card', 'your hand', 'this turn', 'return', 'creature you control',
        'deals damage to', 'card from', 'this spell', 'this card', 'cast this', 'combat'
    ]
    
    # Add essential combat abilities
    combat_abilities = [
        'flying', 'trample', 'vigilance', 'deathtouch', 'lifelink', 'haste',
        'first strike', 'double strike', 'hexproof', 'indestructible', 'defender',
        'reach', 'menace', 'protection', 'prowess', 'flash', 'crew', 'ward'
    ]
    
    # Add important multi-word combat phrases
    combat_phrases = [
        'first strike', 'double strike', 'combat damage', 'attacking creature',
        'blocking creature', 'deals combat damage', 'when attacks', 'when blocks',
        'whenever attacks', 'whenever blocks', 'attack alone', 'can attack',
        'must attack', 'cannot attack', 'can block', 'cannot block',
        'when this creature attacks', 'whenever this creature attacks',
        'when this creature becomes blocked', 'whenever this creature becomes blocked',
        'when this creature deals combat damage', 'whenever this creature deals combat damage',
        'combat damage to a player', 'combat damage to an opponent',
        'unblocked creature', 'attacking or blocking', 'deals damage to a creature',
        'deals damage to a player', 'deals damage equal to its power', 'can\'t be blocked',
        'must be blocked', 'blocks or becomes blocked', 'first strike damage', 'double strike damage',
        'attacking'
    ]
    
    # Add triggered ability patterns
    triggered_abilities = [
        'whenever', 'when enters', 'when dies', 'when attacks', 'when blocks',
        'at the beginning', 'at end of', 'whenever you cast', 'whenever you draw',
        'whenever deals damage', 'whenever takes damage', 'when you gain life',
        'when you lose life', 'whenever discards', 'whenever sacrifices',
        'when enters', 'when this enters', 'when enters the battlefield',
        'whenever enters', 'whenever this enters', 'whenever enters the battlefield',
        'when leaves', 'when this leaves', 'when leaves the battlefield',
        'whenever leaves', 'whenever this leaves', 'whenever leaves the battlefield',
        'when dies', 'when this dies', 'whenever dies', 'whenever this dies',
        'at the beginning of', 'at the end of', 'during each',
        'when you cast', 'whenever you cast', 'when you draw',
        'whenever you draw', 'when you gain life', 'whenever you gain life',
        'when you lose life', 'whenever you lose life',
        'when becomes tapped', 'whenever becomes tapped',
        'when becomes untapped', 'whenever becomes untapped'
    ]
    
    # Add important card type combinations
    card_types = [
        'artifact creature', 'enchantment creature', 'legendary creature',
        'legendary artifact', 'legendary enchantment', 'legendary planeswalker',
        'basic land', 'nonbasic land', 'creature token', 'artifact token',
        'instant spell', 'sorcery spell', 'creature spell', 'noncreature spell',
        'instant', 'sorcery', 'enchantment', 'artifact', 'planeswalker',
        'land', 'creature', 'tribal', 'legendary', 'basic', 'snow',
        'legendary sorcery'
    ]
    
    # Add mana and cost related phrases
    mana_phrases = [
        'mana cost', 'mana value', 'converted mana', 'additional cost',
        'without paying', 'costs less', 'costs more', 'add mana',
        'pay mana', 'spend mana', 'colorless mana', 'colored mana',
        'add mana', 'mana of any color', 'treasure token',
        'food token', 'clue token', 'blood token', 'gold token',
        'draw a card', 'draw cards', 'draw two cards', 'draw three cards',
        'discard a card', 'discard cards', 'exile a card', 'exile cards',
        'return to hand', 'return from graveyard', 'search your library',
        'costs less to cast', 'reduce the cost', 'without paying',
        'mana cost', 'converted mana cost', 'mana value'
    ]
    
    # Add +1/+1 counter and proliferate synergies  
    counter_phrases = [
        'counter', 'counters', '+1/+1 counter', '-1/-1 counter', 'loyalty counter',
        'charge counter', 'time counter', 'proliferate', 'put counter',
        'remove counter', 'counter on it', 'with counters', 'number of counters',
        '+1/+1 counters', 'charge counters', 'loyalty counters', 'time counters',
        'age counter', 'age counters', 'poison counter', 'poison counters',
        'energy counter', 'energy counters', 'experience counter', 'experience counters',
        'oil counter', 'oil counters', 'shield counter', 'shield counters',
        'stun counter', 'stun counters', 'remove a counter', 'place a counter',
        'put a counter', 'with counters', 'counter on it', 'counters on it',
        'doubling season', 'hardened scales'
    ]
    
    # Add tribal synergies
    tribal_keywords = [
        'human', 'elf', 'goblin', 'zombie', 'vampire', 'dragon', 'angel', 'demon',
        'beast', 'spirit', 'elemental', 'wizard', 'warrior', 'soldier', 'knight',
        'rogue', 'cleric', 'shaman', 'merfolk', 'insect', 'cat', 'dog', 'bird',
        'treefolk', 'avatar', 'horror', 'eldrazi', 'phyrexian', 'sliver', 'ally',
        'fish', 'spider', 'snake', 'dinosaur', 'giant', 'faerie'
    ]
    
    # Add zone and library manipulation
    zone_phrases = [
        'from graveyard', 'to graveyard', 'from exile', 'to exile',
        'from your hand', 'to your hand', 'from library', 'search library',
        'shuffle library', 'top of library', 'bottom of library',
        'enters battlefield', 'leaves battlefield', 'return to battlefield',
        'enters from graveyard', 'return from graveyard', 'cast from graveyard',
        'graveyard to battlefield', 'mill', 'self-mill', 'dredge',
        'delve', 'escape', 'flashback', 'unearth', 'reanimate',
        'graveyard matters', 'threshold', 'delirium', 'undergrowth'
    ]
    
    # Add sacrifice and destruction themes
    sacrifice_phrases = [
        'sacrifice', 'destroy', 'destroy target', 'sacrifice creature',
        'sacrifice artifact', 'sacrifice enchantment', 'sacrifice land',
        'when sacrificed', 'when destroyed', 'destroy all', 'sacrifice all'
    ]
    
    # Add draw and card advantage
    card_advantage = [
        'draw card', 'draw cards', 'draw additional', 'discard card',
        'discard cards', 'reveal card', 'reveal cards', 'look at',
        'search for', 'return to hand', 'mill cards', 'exile cards',
        'scry', 'surveil', 'look at the top card', 'look at the top X cards'
    ]
    
    # Add evergreen keywords
    evergreen_keywords = [
        'deathtouch', 'defender', 'double strike', 'enchant', 'equip', 
        'first strike', 'flash', 'flying', 'haste', 'hexproof', 'indestructible',
        'lifelink', 'menace', 'protection', 'reach', 'trample', 'vigilance', 'ward'
    ]
    
    # Add ability words (italicized, set themes)
    ability_words = [
        'addendum', 'alliance', 'battalion', 'bloodrush', 'channel', 'chroma',
        'cohort', 'constellation', 'converge', 'council\'s dilemma', 'delirium',
        'domain', 'eminence', 'enrage', 'fateful hour', 'ferocious', 'formidable',
        'grandeur', 'hellbent', 'heroic', 'imprint', 'inspired', 'join forces',
        'kinship', 'landfall', 'lieutenant', 'metalcraft', 'morbid', 'parley',
        'radiance', 'raid', 'rally', 'revolt', 'spell mastery', 'strive',
        'sweep', 'tempting offer', 'threshold', 'undergrowth', 'will of the council'
    ]
    
    # Add keyword abilities (rules text)
    keyword_abilities = [
        'absorb', 'affinity', 'amplify', 'annihilator', 'ascend', 'assist', 'awaken',
        'backup', 'banding', 'basic landcycling', 'bestow', 'bloodthirst', 'bushido',
        'buyback', 'cascade', 'champion', 'changeling', 'cipher', 'clash', 'cleave',
        'conspire', 'convoke', 'crew', 'cumulative upkeep', 'cycling', 'dash',
        'daybound', 'delve', 'demonstrate', 'dethrone', 'devoid', 'devour', 'disturb',
        'dredge', 'echo', 'emerge', 'embalm', 'encore', 'entwine', 'epic', 'escape',
        'eternalize', 'evoke', 'evolve', 'exalted', 'exploit', 'explore', 'extort',
        'fabricate', 'fading', 'flanking', 'flashback', 'forecast', 'fortify',
        'frenzy', 'graft', 'gravestorm', 'habitat', 'hideaway', 'horsemanship',
        'improvise', 'infect', 'intimidate', 'investigate', 'jump-start', 'kicker',
        'learn', 'level up', 'living weapon', 'madness', 'melee', 'mentor', 'miracle',
        'modular', 'morph', 'multikicker', 'mutate', 'nightbound', 'ninjutsu',
        'offering', 'outlast', 'overload', 'partner', 'persist', 'phasing', 'plainscycling',
        'poisonous', 'proliferate', 'provoke', 'prowess', 'rampage', 'rebound',
        'reconfigure', 'recover', 'reinforce', 'renown', 'replicate', 'retrace',
        'riot', 'ripple', 'scavenge', 'shadow', 'shroud', 'soulbond', 'soulshift',
        'splice', 'split second', 'storm', 'sunburst', 'suspend', 'totem armor',
        'training', 'transform', 'transmute', 'tribute', 'undaunted', 'undying',
        'unearth', 'unleash', 'vanishing', 'venture', 'wither'
    ]
    
    # Add activated ability patterns
    activated_abilities = [
        'tap:', 'untap:', 'sacrifice', 'pay', 'discard', 'exile',
        'return to hand', 'put into graveyard', 'search your library',
        'shuffle your library', 'reveal', 'look at', 'choose',
        'target', 'destroy', 'counter', 'prevent', 'redirect'
    ]
    
    # Add win conditions and powerful effects
    wincon_phrases = [
        'you win the game', 'target player loses the game',
        'loses the game', 'wins the game', 'can\'t lose the game',
        'can\'t win the game', 'alternate win condition',
        'infinite combo', 'goes infinite', 'commander damage',
        'poison counters', 'mill', 'laboratory maniac',
        'thassa\'s oracle', 'approach of the second sun'
    ]
    
    # Add protection and defensive phrases
    protection_phrases = [
        'protection from', 'can\'t be targeted', 'can\'t be blocked',
        'can\'t be countered', 'can\'t be destroyed', 'prevent all damage',
        'prevent damage', 'redirect damage', 'damage prevention',
        'shroud', 'hexproof', 'ward', 'indestructible', 'regenerate'
    ]
    
    # Add artifact synergies
    artifact_synergies = [
        'artifact', 'artifacts', 'equipment', 'vehicle', 'treasure',
        'affinity for artifacts', 'artifact creature', 'artifact token',
        'metalcraft', 'improvise', 'crew', 'equip', 'attach',
        'artifact enters', 'artifact dies', 'noncreature artifact'
    ]
    
    # Add enchantment synergies
    enchantment_synergies = [
        'enchantment', 'enchantments', 'aura', 'saga', 'curse',
        'enchant', 'enchanted', 'constellation', 'enchantment creature',
        'enchantment token', 'bestow', 'totem armor'
    ]
    
    # Add planeswalker synergies
    planeswalker_synergies = [
        'planeswalker', 'planeswalkers', 'loyalty', 'loyalty counter',
        'activate loyalty abilities', 'superfriends', 'spark double',
        'doubling season', 'proliferate', 'plus ability', 'minus ability',
        'ultimate ability', 'loyalty ability'
    ]
    
    # Add land synergies
    land_synergies = [
        'land', 'lands', 'basic land', 'nonbasic land', 'landfall',
        'enters tapped', 'sacrifice a land', 'search for a land',
        'land enters', 'land drops', 'additional land', 'extra land',
        'ramp', 'mana dork', 'mana rock', 'land ramp'
    ]
    
    # Add spellslinger themes
    spellslinger_phrases = [
        'instant or sorcery', 'noncreature spell', 'spell', 'spells',
        'cast a spell', 'whenever you cast', 'prowess', 'storm',
        'spell mastery', 'magecraft', 'copy spell', 'fork',
        'flashback', 'buyback', 'rebound', 'cipher'
    ]
    
    # Add ETB (Enter the Battlefield) variations
    etb_variations = [
        'enters the battlefield', 'when this enters the battlefield',
        'whenever a creature enters the battlefield', 'when another creature enters the battlefield',
        'enters the battlefield tapped', 'enters the battlefield with',
        'enters the battlefield under your control'
    ]
    
    # Add beginning/end step triggers
    step_triggers = [
        'at the beginning of your upkeep', 'at the beginning of each upkeep',
        'at the beginning of your end step', 'at the beginning of each end step',
        'at the beginning of combat', 'at the beginning of your draw step',
        'at the beginning of each player\'s upkeep', 'during each player\'s turn',
        'once each turn', 'during your turn', 'on your turn'
    ]
    
    # Add spell casting triggers
    spell_triggers = [
        'whenever you cast a spell', 'whenever you cast an instant or sorcery spell',
        'whenever you cast a creature spell', 'whenever you cast a noncreature spell',
        'whenever you cast your first spell', 'whenever you cast your second spell'
    ]
    
    # Add token creation
    token_creation = [
        'create a token', 'create a creature token', 'create a treasure token',
        'create a food token', 'create a clue token', 'create a blood token',
        'token creature', 'populate'
    ]
    
    # Add evasion abilities
    evasion_abilities = [
        'unblockable', 'can\'t be blocked', 'must be blocked',
        'shadow', 'fear', 'intimidate', 'skulk'
    ]
    
    # Add tribal support phrases
    tribal_support = [
        'other', 'choose a creature type', 'of the chosen type',
        'shares a creature type', 'tribal spell'
    ]
    
    # Add Commander-specific keywords
    commander_specific = [
        'partner', 'partner with', 'background', 'choose a background',
        'eminence', 'command zone', 'commander tax'
    ]
    
    # Add multiplayer politics
    multiplayer_politics = [
        'each opponent', 'each player', 'target opponent',
        'choose a player', 'vote', 'council\'s dilemma'
    ]
    
    # Combine all lists
    comprehensive_keywords = (
        discovered_phrases + combat_abilities + combat_phrases + 
        triggered_abilities + card_types + mana_phrases + 
        counter_phrases + tribal_keywords + zone_phrases + 
        sacrifice_phrases + card_advantage + evergreen_keywords +
        ability_words + keyword_abilities + activated_abilities +
        wincon_phrases + protection_phrases + artifact_synergies +
        enchantment_synergies + planeswalker_synergies + land_synergies +
        spellslinger_phrases + etb_variations + step_triggers +
        spell_triggers + token_creation + evasion_abilities +
        tribal_support + commander_specific + multiplayer_politics
    )
    
    # Remove duplicates and sort
    comprehensive_keywords = sorted(list(set(comprehensive_keywords)))
    
    return comprehensive_keywords

# Create the comprehensive keyword list
comprehensive_keywords = create_comprehensive_keyword_list()
print(f"Comprehensive keyword list: {len(comprehensive_keywords)} total keywords")

# Show categories
print("\n📝 Sample keywords by category:")
print("Combat:", [k for k in comprehensive_keywords if any(word in k for word in ['strike', 'combat', 'attack', 'block', 'flying', 'trample'])][:10])
print("Triggered:", [k for k in comprehensive_keywords if any(word in k for word in ['when', 'whenever', 'beginning', 'end'])][:8])
print("Counters:", [k for k in comprehensive_keywords if 'counter' in k][:8])
print("Tribal:", [k for k in comprehensive_keywords if k in ['human', 'elf', 'goblin', 'zombie', 'vampire', 'dragon', 'angel']])
print("ETB:", [k for k in comprehensive_keywords if 'enters' in k][:5])
print("Spellslinger:", [k for k in comprehensive_keywords if any(word in k for word in ['spell', 'cast', 'instant', 'sorcery'])][:5])
print("Win Conditions:", [k for k in comprehensive_keywords if any(word in k for word in ['win', 'lose', 'damage', 'poison'])][:5])

print(f"\n🎯 Total comprehensive keywords: {len(comprehensive_keywords)}")

# Optional: Save to file for easy import
def save_keywords_to_file(keywords, filename="mtg_keywords 20250610.txt"):
    """Save keywords to a text file for easy loading"""
    with open(filename, 'w') as f:
        for keyword in keywords:
            f.write(f"{keyword}\n")
    print(f"💾 Keywords saved to {filename}")

# Uncomment to save:
save_keywords_to_file(comprehensive_keywords)

Comprehensive keyword list: 587 total keywords

📝 Sample keywords by category:
Combat: ['at the beginning of combat', 'attack alone', 'attacking', 'attacking creature', 'attacking or blocking', 'blocking creature', 'blocks or becomes blocked', 'can attack', 'can block', "can't be blocked"]
Triggered: ['addendum', 'ascend', 'at end of', 'at the beginning', 'at the beginning of', 'at the beginning of combat', 'at the beginning of each end step', "at the beginning of each player's upkeep"]
Counters: ['+1/+1 counter', '+1/+1 counters', '-1/-1 counter', 'age counter', 'age counters', "can't be countered", 'charge counter', 'charge counters']
Tribal: ['angel', 'dragon', 'elf', 'goblin', 'human', 'vampire', 'zombie']
ETB: ['artifact enters', 'creature enters', 'enters', 'enters battlefield', 'enters from graveyard']
Spellslinger: ['cast a spell', 'cast from graveyard', 'cast this', 'copy spell', 'costs less to cast']
Win Conditions: ['alternate win condition', "can't lose the game", "can't wi

In [16]:
def create_comprehensive_keyword_features(df, keywords):
    """Create keyword features with comprehensive multi-word list"""
    keyword_matrix = []
    
    print(f"Creating features with {len(keywords)} keywords...")
    
    for idx, row in df.iterrows():
        if idx % 5000 == 0:
            print(f"  Processing card {idx}/{len(df)}")
            
        text = row['combined_text'].lower()
        features = []
        
        for keyword in keywords:
            # Count occurrences (cap at 3)
            count = text.count(keyword.lower())
            features.append(min(count, 3))
            
        keyword_matrix.append(features)
    
    return np.array(keyword_matrix), keywords

# Create comprehensive keyword matrix
print("Creating comprehensive keyword matrix...")
comprehensive_matrix, final_keywords = create_comprehensive_keyword_features(df_clean, comprehensive_keywords)
print(f"Matrix shape: {comprehensive_matrix.shape}")

# Test with Isshin and other commanders
test_commanders = ['Isshin']

for commander_name in test_commanders:
    matches = df_clean[df_clean['name'].str.contains(commander_name, case=False, na=False)]
    if len(matches) > 0:
        idx = matches.index[0]
        features = comprehensive_matrix[idx]
        matched_keywords = [final_keywords[i] for i in range(len(final_keywords)) if features[i] > 0]
        
        print(f"\n⚔️ {matches.iloc[0]['name']}:")
        print(f"   Matched keywords ({len(matched_keywords)}): {matched_keywords[:15]}...")
        print(f"   Total feature count: {sum(features)}")
    else:
        print(f"\n❌ {commander_name} not found")

Creating comprehensive keyword matrix...
Creating features with 587 keywords...
  Processing card 0/27290
  Processing card 5000/27290
  Processing card 10000/27290
  Processing card 15000/27290
  Processing card 20000/27290
  Processing card 25000/27290
Matrix shape: (27290, 587)

⚔️ Isshin, Two Heavens as One:
   Matched keywords (6): ['attacking', 'creature', 'human', 'legendary', 'legendary creature', 'you control']...
   Total feature count: 7


In [20]:
import re
from typing import Set, List, Dict

def create_simplified_keyword_system():
    """Create a clean keyword system using base words + automatic variations"""
    
    # Base single words (we'll handle plurals automatically)
    base_words = {
        # Card types
        'creature', 'artifact', 'enchantment', 'planeswalker', 'land', 'instant', 'sorcery',
        'spell', 'token', 'counter', 'damage', 'mana',
        
        # Zones & actions
        'graveyard', 'battlefield', 'exile', 'library', 'hand',
        'sacrifice', 'destroy', 'draw', 'discard', 'cast', 'target',
        'enter', 'leave', 'attack', 'block', 'die', 'tap', 'untap',
        'search', 'reveal', 'return', 'mill', 'ramp',
        
        # Combat & abilities
        'combat', 'flying', 'trample', 'vigilance', 'deathtouch', 'lifelink',
        'haste', 'hexproof', 'indestructible', 'defender', 'reach', 'menace',
        'prowess', 'flash', 'ward',
        
        # Creature types (tribal)
        'human', 'elf', 'goblin', 'zombie', 'vampire', 'dragon', 'angel', 'demon',
        'beast', 'spirit', 'elemental', 'wizard', 'warrior', 'soldier', 'knight',
        'merfolk', 'insect', 'cat', 'dog', 'bird',
        
        # Mechanics
        'proliferate', 'convoke', 'delve', 'cascade', 'storm', 'flashback',
        'buyback', 'rebound', 'cipher', 'dredge', 'evolve', 'explore',
        'fabricate', 'improvise', 'investigate', 'mentor', 'riot', 'adapt',
        'scavenge', 'unleash', 'undying', 'persist', 'modular', 'graft',
        
        # Equipment & auras
        'equip', 'equipment', 'aura', 'enchant', 'attach',
        
        # Counters
        'loyalty', 'charge', 'time', 'energy', 'experience', 'poison',
        'age', 'oil', 'shield', 'stun',
        
        # Keywords
        'partner', 'eminence', 'landfall', 'metalcraft', 'threshold',
        'delirium', 'undergrowth', 'constellation', 'battalion'
    }
    
    # Multi-word phrases that should stay exact
    exact_phrases = {
        # Evergreen abilities
        'first strike', 'double strike', 'split second', 'living weapon',
        'totem armor', 'basic landcycling', 'cumulative upkeep', 'level up',
        
        # Complex triggers
        'enters the battlefield', 'leaves the battlefield',
        'deals combat damage', 'combat damage to a player', 'combat damage to an opponent',
        'when this creature attacks', 'whenever this creature attacks',
        'when this creature dies', 'whenever this creature dies',
        'at the beginning of your upkeep', 'at the beginning of each upkeep',
        'at the beginning of your end step', 'at the beginning of combat',
        'whenever you cast a spell', 'whenever you cast an instant or sorcery',
        'whenever you cast a creature spell', 'whenever you cast a noncreature spell',
        
        # Counter types with +/-
        '+1/+1 counter', '-1/-1 counter', 'loyalty counter', 'charge counter',
        'time counter', 'energy counter', 'experience counter', 'poison counter',
        
        # Card advantage
        'draw a card', 'draw cards', 'discard a card', 'exile a card',
        'return to hand', 'return from graveyard', 'search your library',
        
        # Mana & costs
        'mana cost', 'mana value', 'converted mana cost', 'without paying',
        'costs less to cast', 'add mana', 'mana of any color',
        
        # Token creation
        'create a token', 'creature token', 'artifact token', 'treasure token',
        'food token', 'clue token', 'blood token',
        
        # Card types
        'artifact creature', 'enchantment creature', 'legendary creature',
        'legendary artifact', 'legendary enchantment', 'legendary planeswalker',
        'basic land', 'nonbasic land', 'instant or sorcery', 'noncreature spell',
        
        # Protection & evasion
        'protection from', 'can\'t be targeted', 'can\'t be blocked',
        'can\'t be countered', 'can\'t be destroyed', 'prevent damage',
        
        # Win conditions
        'you win the game', 'target player loses the game', 'commander damage',
        'laboratory maniac', 'thassa\'s oracle',
        
        # Commander specific
        'partner with', 'choose a background', 'command zone', 'commander tax',
        
        # Multiplayer
        'each opponent', 'each player', 'target opponent', 'choose a player',
        
        # Common phrases
        'you control', 'target creature', 'this creature', 'end of turn',
        'until end of turn', 'this turn', 'creature you control'
    }
    
    # Tribal phrases (using "other" pattern)
    tribal_phrases = {f'other {creature}' for creature in [
        'human', 'elf', 'goblin', 'zombie', 'vampire', 'dragon', 'angel', 'demon',
        'beast', 'spirit', 'elemental', 'wizard', 'warrior', 'soldier', 'knight'
    ]}
    
    return base_words, exact_phrases | tribal_phrases

def generate_word_variations(base_word: str) -> List[str]:
    """Generate common variations of a base word"""
    variations = [base_word]
    
    # Handle plurals
    if base_word.endswith('y'):
        variations.append(base_word[:-1] + 'ies')  # enemy -> enemies
    elif base_word.endswith(('s', 'sh', 'ch', 'x', 'z')):
        variations.append(base_word + 'es')  # glass -> glasses
    elif base_word.endswith('f'):
        variations.append(base_word[:-1] + 'ves')  # elf -> elves
    elif base_word.endswith('fe'):
        variations.append(base_word[:-2] + 'ves')  # knife -> knives
    else:
        variations.append(base_word + 's')  # card -> cards
    
    # Handle verb forms for action words
    action_words = {
        'attack': ['attacks', 'attacking', 'attacked'],
        'block': ['blocks', 'blocking', 'blocked'], 
        'cast': ['casts', 'casting'],
        'die': ['dies', 'died', 'dying'],
        'enter': ['enters', 'entering', 'entered'],
        'leave': ['leaves', 'leaving', 'left'],
        'sacrifice': ['sacrifices', 'sacrificed', 'sacrificing'],
        'destroy': ['destroys', 'destroyed', 'destroying'],
        'draw': ['draws', 'drawing', 'drew'],
        'discard': ['discards', 'discarded', 'discarding'],
        'tap': ['taps', 'tapped', 'tapping'],
        'untap': ['untaps', 'untapped', 'untapping'],
        'deal': ['deals', 'dealing', 'dealt'],
        'take': ['takes', 'taking', 'took'],
        'gain': ['gains', 'gained', 'gaining'],
        'lose': ['loses', 'lost', 'losing']
    }
    
    if base_word in action_words:
        variations.extend(action_words[base_word])
    
    return variations

def create_regex_patterns(base_words: Set[str], exact_phrases: Set[str]) -> Dict[str, str]:
    """Create regex patterns for flexible matching"""
    
    patterns = {}
    
    # Create patterns for base words with variations
    for word in base_words:
        variations = generate_word_variations(word)
        # Create regex pattern that matches any variation
        pattern = r'\b(?:' + '|'.join(re.escape(var) for var in variations) + r')\b'
        patterns[word] = pattern
    
    # Create exact match patterns for phrases
    for phrase in exact_phrases:
        patterns[phrase] = r'\b' + re.escape(phrase) + r'\b'
    
    return patterns

def extract_keywords_from_text(text: str, patterns: Dict[str, str]) -> Dict[str, int]:
    """Extract keywords from text using the patterns"""
    text_lower = text.lower()
    found_keywords = {}
    
    for keyword, pattern in patterns.items():
        matches = len(re.findall(pattern, text_lower, re.IGNORECASE))
        if matches > 0:
            found_keywords[keyword] = matches
    
    return found_keywords

# Main function to create the simplified keyword system
def create_comprehensive_keyword_list():
    """Create comprehensive but clean keyword list"""
    
    base_words, exact_phrases = create_simplified_keyword_system()
    patterns = create_regex_patterns(base_words, exact_phrases)
    
    # Return both the word lists and patterns for use in matching
    return {
        'base_words': base_words,
        'exact_phrases': exact_phrases,
        'patterns': patterns,
        'total_concepts': len(base_words) + len(exact_phrases)
    }

# Test the system
if __name__ == "__main__":
    keyword_system = create_comprehensive_keyword_list()
    
    print(f"📊 Keyword System Summary:")
    print(f"Base words: {len(keyword_system['base_words'])}")
    print(f"Exact phrases: {len(keyword_system['exact_phrases'])}")
    print(f"Total concepts: {keyword_system['total_concepts']}")
    
    # Test with sample text
    test_text = "Flying, vigilance. When this creature enters the battlefield, create two 1/1 white Soldier creature tokens."
    
    found = extract_keywords_from_text(test_text, keyword_system['patterns'])
    print(f"\n🧪 Test text: {test_text}")
    print(f"Found keywords: {found}")
    
    # Show some examples of base words vs exact phrases
    print(f"\n📝 Sample base words: {sorted(list(keyword_system['base_words']))[:15]}")
    print(f"📝 Sample exact phrases: {sorted([p for p in keyword_system['exact_phrases'] if 'when' in p])[:10]}")

# Helper function for your existing code integration
def get_keyword_patterns():
    """Get the patterns dictionary for use in your existing system"""
    keyword_system = create_comprehensive_keyword_list()
    return keyword_system['patterns']

def save_keywords_to_file(filename="mtg_keywords_20250610v3.txt"):
    """Save the simplified keyword system to file"""
    keyword_system = create_comprehensive_keyword_list()
    
    with open(filename, 'w') as f:
        f.write("# MTG Simplified Keyword System\n\n")
        f.write("## Base Words (auto-generates plurals/variations):\n")
        for word in sorted(keyword_system['base_words']):
            f.write(f"{word}\n")
        
        f.write("\n## Exact Phrases:\n")
        for phrase in sorted(keyword_system['exact_phrases']):
            f.write(f"{phrase}\n")
    
    print(f"💾 Simplified keywords saved to {filename}")
    print(f"📊 Reduced from ~400 to {keyword_system['total_concepts']} core concepts!")

# Uncomment to save:
save_keywords_to_file()

📊 Keyword System Summary:
Base words: 118
Exact phrases: 105
Total concepts: 223

🧪 Test text: Flying, vigilance. When this creature enters the battlefield, create two 1/1 white Soldier creature tokens.
Found keywords: {'token': 1, 'enter': 1, 'soldier': 1, 'battlefield': 1, 'flying': 1, 'creature': 2, 'vigilance': 1, 'this creature': 1, 'enters the battlefield': 1}

📝 Sample base words: ['adapt', 'age', 'angel', 'artifact', 'attach', 'attack', 'aura', 'battalion', 'battlefield', 'beast', 'bird', 'block', 'buyback', 'cascade', 'cast']
📝 Sample exact phrases: ['when this creature attacks', 'when this creature dies', 'whenever this creature attacks', 'whenever this creature dies', 'whenever you cast a creature spell', 'whenever you cast a noncreature spell', 'whenever you cast a spell', 'whenever you cast an instant or sorcery']
💾 Simplified keywords saved to mtg_keywords_20250610v3.txt
📊 Reduced from ~400 to 223 core concepts!


In [21]:
import re
from typing import Set, List, Dict

def create_simplified_keyword_system():
    """Create a clean keyword system using base words + automatic variations"""
    
    # Base single words (we'll handle plurals automatically)
    base_words = {
        # Card types
        'creature', 'artifact', 'enchantment', 'planeswalker', 'land', 'instant', 'sorcery',
        'spell', 'token', 'counter', 'damage', 'mana',
        
        # Zones & actions
        'graveyard', 'battlefield', 'exile', 'library', 'hand',
        'sacrifice', 'destroy', 'draw', 'discard', 'cast', 'target',
        'enter', 'leave', 'attack', 'block', 'die', 'tap', 'untap',
        'search', 'reveal', 'return', 'mill', 'ramp',
        
        # Combat & abilities
        'combat', 'flying', 'trample', 'vigilance', 'deathtouch', 'lifelink',
        'haste', 'hexproof', 'indestructible', 'defender', 'reach', 'menace',
        'prowess', 'flash', 'ward',
        
        # Creature types (tribal)
        'human', 'elf', 'goblin', 'zombie', 'vampire', 'dragon', 'angel', 'demon',
        'beast', 'spirit', 'elemental', 'wizard', 'warrior', 'soldier', 'knight',
        'merfolk', 'insect', 'cat', 'dog', 'bird',
        
        # Mechanics
        'proliferate', 'convoke', 'delve', 'cascade', 'storm', 'flashback',
        'buyback', 'rebound', 'cipher', 'dredge', 'evolve', 'explore',
        'fabricate', 'improvise', 'investigate', 'mentor', 'riot', 'adapt',
        'scavenge', 'unleash', 'undying', 'persist', 'modular', 'graft',
        
        # Equipment & auras
        'equip', 'equipment', 'aura', 'enchant', 'attach',
        
        # Counters
        'loyalty', 'charge', 'time', 'energy', 'experience', 'poison',
        'age', 'oil', 'shield', 'stun',
        
        # Keywords
        'partner', 'eminence', 'landfall', 'metalcraft', 'threshold',
        'delirium', 'undergrowth', 'constellation', 'battalion'
    }
    
    # Multi-word phrases that should stay exact
    exact_phrases = {
        # Evergreen abilities
        'first strike', 'double strike', 'split second', 'living weapon',
        'totem armor', 'basic landcycling', 'cumulative upkeep', 'level up',
        
        # Complex triggers
        'enters the battlefield', 'leaves the battlefield',
        'deals combat damage', 'combat damage to a player', 'combat damage to an opponent',
        'when this creature attacks', 'whenever this creature attacks',
        'when this creature dies', 'whenever this creature dies',
        'at the beginning of your upkeep', 'at the beginning of each upkeep',
        'at the beginning of your end step', 'at the beginning of combat',
        'whenever you cast a spell', 'whenever you cast an instant or sorcery',
        'whenever you cast a creature spell', 'whenever you cast a noncreature spell',
        
        # Counter types with +/-
        '+1/+1 counter', '-1/-1 counter', 'loyalty counter', 'charge counter',
        'time counter', 'energy counter', 'experience counter', 'poison counter',
        
        # Card advantage
        'draw a card', 'draw cards', 'discard a card', 'exile a card',
        'return to hand', 'return from graveyard', 'search your library',
        
        # Mana & costs
        'mana cost', 'mana value', 'converted mana cost', 'without paying',
        'costs less to cast', 'add mana', 'mana of any color',
        
        # Token creation
        'create a token', 'creature token', 'artifact token', 'treasure token',
        'food token', 'clue token', 'blood token',
        
        # Card types
        'artifact creature', 'enchantment creature', 'legendary creature',
        'legendary artifact', 'legendary enchantment', 'legendary planeswalker',
        'basic land', 'nonbasic land', 'instant or sorcery', 'noncreature spell',
        
        # Protection & evasion
        'protection from', 'can\'t be targeted', 'can\'t be blocked',
        'can\'t be countered', 'can\'t be destroyed', 'prevent damage',
        
        # Win conditions
        'you win the game', 'target player loses the game', 'commander damage',
        'laboratory maniac', 'thassa\'s oracle',
        
        # Commander specific
        'partner with', 'choose a background', 'command zone', 'commander tax',
        
        # Multiplayer
        'each opponent', 'each player', 'target opponent', 'choose a player',
        
        # Common phrases
        'you control', 'target creature', 'this creature', 'end of turn',
        'until end of turn', 'this turn', 'creature you control'
    }
    
    # Tribal phrases (using "other" pattern)
    tribal_phrases = {f'other {creature}' for creature in [
        'human', 'elf', 'goblin', 'zombie', 'vampire', 'dragon', 'angel', 'demon',
        'beast', 'spirit', 'elemental', 'wizard', 'warrior', 'soldier', 'knight'
    ]}
    
    return base_words, exact_phrases | tribal_phrases

def generate_word_variations(base_word: str) -> List[str]:
    """Generate common variations of a base word"""
    variations = [base_word]
    
    # Handle plurals
    if base_word.endswith('y'):
        variations.append(base_word[:-1] + 'ies')  # enemy -> enemies
    elif base_word.endswith(('s', 'sh', 'ch', 'x', 'z')):
        variations.append(base_word + 'es')  # glass -> glasses
    elif base_word.endswith('f'):
        variations.append(base_word[:-1] + 'ves')  # elf -> elves
    elif base_word.endswith('fe'):
        variations.append(base_word[:-2] + 'ves')  # knife -> knives
    else:
        variations.append(base_word + 's')  # card -> cards
    
    # Handle verb forms for action words
    action_words = {
        'attack': ['attacks', 'attacking', 'attacked'],
        'block': ['blocks', 'blocking', 'blocked'], 
        'cast': ['casts', 'casting'],
        'die': ['dies', 'died', 'dying'],
        'enter': ['enters', 'entering', 'entered'],
        'leave': ['leaves', 'leaving', 'left'],
        'sacrifice': ['sacrifices', 'sacrificed', 'sacrificing'],
        'destroy': ['destroys', 'destroyed', 'destroying'],
        'draw': ['draws', 'drawing', 'drew'],
        'discard': ['discards', 'discarded', 'discarding'],
        'tap': ['taps', 'tapped', 'tapping'],
        'untap': ['untaps', 'untapped', 'untapping'],
        'deal': ['deals', 'dealing', 'dealt'],
        'take': ['takes', 'taking', 'took'],
        'gain': ['gains', 'gained', 'gaining'],
        'lose': ['loses', 'lost', 'losing']
    }
    
    if base_word in action_words:
        variations.extend(action_words[base_word])
    
    return variations

def create_regex_patterns(base_words: Set[str], exact_phrases: Set[str]) -> Dict[str, str]:
    """Create regex patterns for flexible matching"""
    
    patterns = {}
    
    # Create patterns for base words with variations
    for word in base_words:
        variations = generate_word_variations(word)
        # Create regex pattern that matches any variation
        pattern = r'\b(?:' + '|'.join(re.escape(var) for var in variations) + r')\b'
        patterns[word] = pattern
    
    # Create exact match patterns for phrases
    for phrase in exact_phrases:
        patterns[phrase] = r'\b' + re.escape(phrase) + r'\b'
    
    return patterns

def extract_keywords_from_text(text: str, patterns: Dict[str, str]) -> Dict[str, int]:
    """Extract keywords from text using the patterns"""
    text_lower = text.lower()
    found_keywords = {}
    
    for keyword, pattern in patterns.items():
        matches = len(re.findall(pattern, text_lower, re.IGNORECASE))
        if matches > 0:
            found_keywords[keyword] = matches
    
    return found_keywords

# Main function to create the simplified keyword system
def create_comprehensive_keyword_list():
    """Create comprehensive but clean keyword list"""
    
    base_words, exact_phrases = create_simplified_keyword_system()
    patterns = create_regex_patterns(base_words, exact_phrases)
    
    # Return both the word lists and patterns for use in matching
    return {
        'base_words': base_words,
        'exact_phrases': exact_phrases,
        'patterns': patterns,
        'total_concepts': len(base_words) + len(exact_phrases)
    }

# Test the system
if __name__ == "__main__":
    keyword_system = create_comprehensive_keyword_list()
    
    print(f"📊 Keyword System Summary:")
    print(f"Base words: {len(keyword_system['base_words'])}")
    print(f"Exact phrases: {len(keyword_system['exact_phrases'])}")
    print(f"Total concepts: {keyword_system['total_concepts']}")
    
    # Test with sample text
    test_text = "Flying, vigilance. When this creature enters the battlefield, create two 1/1 white Soldier creature tokens."
    
    found = extract_keywords_from_text(test_text, keyword_system['patterns'])
    print(f"\n🧪 Test text: {test_text}")
    print(f"Found keywords: {found}")
    
    # Show some examples of base words vs exact phrases
    print(f"\n📝 Sample base words: {sorted(list(keyword_system['base_words']))[:15]}")
    print(f"📝 Sample exact phrases: {sorted([p for p in keyword_system['exact_phrases'] if 'when' in p])[:10]}")

# Helper function for your existing code integration
def get_keyword_patterns():
    """Get the patterns dictionary for use in your existing system"""
    keyword_system = create_comprehensive_keyword_list()
    return keyword_system['patterns']

def save_keywords_to_file(filename="mtg_simplified_keywords20250610.txt"):
    """Save the simplified keyword system to file"""
    keyword_system = create_comprehensive_keyword_list()
    
    with open(filename, 'w') as f:
        f.write("# MTG Simplified Keyword System\n\n")
        f.write("## Base Words (auto-generates plurals/variations):\n")
        for word in sorted(keyword_system['base_words']):
            f.write(f"{word}\n")
        
        f.write("\n## Exact Phrases:\n")
        for phrase in sorted(keyword_system['exact_phrases']):
            f.write(f"{phrase}\n")
    
    print(f"💾 Simplified keywords saved to {filename}")
    print(f"📊 Reduced from ~400 to {keyword_system['total_concepts']} core concepts!")

# Uncomment to save:
save_keywords_to_file()

📊 Keyword System Summary:
Base words: 118
Exact phrases: 105
Total concepts: 223

🧪 Test text: Flying, vigilance. When this creature enters the battlefield, create two 1/1 white Soldier creature tokens.
Found keywords: {'token': 1, 'enter': 1, 'soldier': 1, 'battlefield': 1, 'flying': 1, 'creature': 2, 'vigilance': 1, 'this creature': 1, 'enters the battlefield': 1}

📝 Sample base words: ['adapt', 'age', 'angel', 'artifact', 'attach', 'attack', 'aura', 'battalion', 'battlefield', 'beast', 'bird', 'block', 'buyback', 'cascade', 'cast']
📝 Sample exact phrases: ['when this creature attacks', 'when this creature dies', 'whenever this creature attacks', 'whenever this creature dies', 'whenever you cast a creature spell', 'whenever you cast a noncreature spell', 'whenever you cast a spell', 'whenever you cast an instant or sorcery']
💾 Simplified keywords saved to mtg_simplified_keywords20250610.txt
📊 Reduced from ~400 to 223 core concepts!


In [22]:
# Replace your create_comprehensive_keyword_list() with:
keyword_system = create_comprehensive_keyword_list()
patterns = keyword_system['patterns']

# Use in your feature extraction:
def extract_features(oracle_text):
    return extract_keywords_from_text(oracle_text, patterns)

In [23]:
import re
from typing import Set, List, Dict

# Copy your existing functions here (create_simplified_keyword_system, generate_word_variations, etc.)
def create_simplified_keyword_system():
    """Create a clean keyword system using base words + automatic variations"""
    
    # Base single words (we'll handle plurals automatically)
    base_words = {
        # Card types
        'creature', 'artifact', 'enchantment', 'planeswalker', 'land', 'instant', 'sorcery',
        'spell', 'token', 'counter', 'damage', 'mana',
        
        # Zones & actions
        'graveyard', 'battlefield', 'exile', 'library', 'hand',
        'sacrifice', 'destroy', 'draw', 'discard', 'cast', 'target',
        'enter', 'leave', 'attack', 'block', 'die', 'tap', 'untap',
        'search', 'reveal', 'return', 'mill', 'ramp',
        
        # Combat & abilities
        'combat', 'flying', 'trample', 'vigilance', 'deathtouch', 'lifelink',
        'haste', 'hexproof', 'indestructible', 'defender', 'reach', 'menace',
        'prowess', 'flash', 'ward',
        
        # Creature types (tribal)
        'human', 'elf', 'goblin', 'zombie', 'vampire', 'dragon', 'angel', 'demon',
        'beast', 'spirit', 'elemental', 'wizard', 'warrior', 'soldier', 'knight',
        'merfolk', 'insect', 'cat', 'dog', 'bird',
        
        # Mechanics
        'proliferate', 'convoke', 'delve', 'cascade', 'storm', 'flashback',
        'buyback', 'rebound', 'cipher', 'dredge', 'evolve', 'explore',
        'fabricate', 'improvise', 'investigate', 'mentor', 'riot', 'adapt',
        'scavenge', 'unleash', 'undying', 'persist', 'modular', 'graft',
        
        # Equipment & auras
        'equip', 'equipment', 'aura', 'enchant', 'attach',
        
        # Counters
        'loyalty', 'charge', 'time', 'energy', 'experience', 'poison',
        'age', 'oil', 'shield', 'stun',
        
        # Keywords
        'partner', 'eminence', 'landfall', 'metalcraft', 'threshold',
        'delirium', 'undergrowth', 'constellation', 'battalion'
    }
    
    # Multi-word phrases that should stay exact
    exact_phrases = {
        # Evergreen abilities
        'first strike', 'double strike', 'split second', 'living weapon',
        'totem armor', 'basic landcycling', 'cumulative upkeep', 'level up',
        
        # Complex triggers
        'enters the battlefield', 'leaves the battlefield',
        'deals combat damage', 'combat damage to a player', 'combat damage to an opponent',
        'when this creature attacks', 'whenever this creature attacks',
        'when this creature dies', 'whenever this creature dies',
        'at the beginning of your upkeep', 'at the beginning of each upkeep',
        'at the beginning of your end step', 'at the beginning of combat',
        'whenever you cast a spell', 'whenever you cast an instant or sorcery',
        'whenever you cast a creature spell', 'whenever you cast a noncreature spell',
        
        # Counter types with +/-
        '+1/+1 counter', '-1/-1 counter', 'loyalty counter', 'charge counter',
        'time counter', 'energy counter', 'experience counter', 'poison counter',
        
        # Card advantage
        'draw a card', 'draw cards', 'discard a card', 'exile a card',
        'return to hand', 'return from graveyard', 'search your library',
        
        # Mana & costs
        'mana cost', 'mana value', 'converted mana cost', 'without paying',
        'costs less to cast', 'add mana', 'mana of any color',
        
        # Token creation
        'create a token', 'creature token', 'artifact token', 'treasure token',
        'food token', 'clue token', 'blood token',
        
        # Card types
        'artifact creature', 'enchantment creature', 'legendary creature',
        'legendary artifact', 'legendary enchantment', 'legendary planeswalker',
        'basic land', 'nonbasic land', 'instant or sorcery', 'noncreature spell',
        
        # Protection & evasion
        'protection from', 'can\'t be targeted', 'can\'t be blocked',
        'can\'t be countered', 'can\'t be destroyed', 'prevent damage',
        
        # Win conditions
        'you win the game', 'target player loses the game', 'commander damage',
        'laboratory maniac', 'thassa\'s oracle',
        
        # Commander specific
        'partner with', 'choose a background', 'command zone', 'commander tax',
        
        # Multiplayer
        'each opponent', 'each player', 'target opponent', 'choose a player',
        
        # Common phrases
        'you control', 'target creature', 'this creature', 'end of turn',
        'until end of turn', 'this turn', 'creature you control'
    }
    
    # Tribal phrases (using "other" pattern)
    tribal_phrases = {f'other {creature}' for creature in [
        'human', 'elf', 'goblin', 'zombie', 'vampire', 'dragon', 'angel', 'demon',
        'beast', 'spirit', 'elemental', 'wizard', 'warrior', 'soldier', 'knight'
    ]}
    
    return base_words, exact_phrases | tribal_phrases

def generate_word_variations(base_word: str) -> List[str]:
    """Generate common variations of a base word"""
    variations = [base_word]
    
    # Handle plurals
    if base_word.endswith('y'):
        variations.append(base_word[:-1] + 'ies')  # enemy -> enemies
    elif base_word.endswith(('s', 'sh', 'ch', 'x', 'z')):
        variations.append(base_word + 'es')  # glass -> glasses
    elif base_word.endswith('f'):
        variations.append(base_word[:-1] + 'ves')  # elf -> elves
    elif base_word.endswith('fe'):
        variations.append(base_word[:-2] + 'ves')  # knife -> knives
    else:
        variations.append(base_word + 's')  # card -> cards
    
    # Handle verb forms for action words
    action_words = {
        'attack': ['attacks', 'attacking', 'attacked'],
        'block': ['blocks', 'blocking', 'blocked'], 
        'cast': ['casts', 'casting'],
        'die': ['dies', 'died', 'dying'],
        'enter': ['enters', 'entering', 'entered'],
        'leave': ['leaves', 'leaving', 'left'],
        'sacrifice': ['sacrifices', 'sacrificed', 'sacrificing'],
        'destroy': ['destroys', 'destroyed', 'destroying'],
        'draw': ['draws', 'drawing', 'drew'],
        'discard': ['discards', 'discarded', 'discarding'],
        'tap': ['taps', 'tapped', 'tapping'],
        'untap': ['untaps', 'untapped', 'untapping'],
        'deal': ['deals', 'dealing', 'dealt'],
        'take': ['takes', 'taking', 'took'],
        'gain': ['gains', 'gained', 'gaining'],
        'lose': ['loses', 'lost', 'losing']
    }
    
    if base_word in action_words:
        variations.extend(action_words[base_word])
    
    return variations

def show_all_keyword_variations():
    """Show all keyword variations that will be generated"""
    
    base_words, exact_phrases = create_simplified_keyword_system()
    
    print("=" * 80)
    print("🔍 ALL KEYWORD VARIATIONS")
    print("=" * 80)
    
    # Show base words with their variations
    print(f"\n📝 BASE WORDS WITH VARIATIONS ({len(base_words)} base words):")
    print("-" * 60)
    
    all_variations = []
    variation_count = 0
    
    for base_word in sorted(base_words):
        variations = generate_word_variations(base_word)
        variation_count += len(variations)
        all_variations.extend(variations)
        
        # Show variations for each word
        if len(variations) > 1:
            print(f"{base_word:15} → {', '.join(variations[1:])}")
        else:
            print(f"{base_word:15} → (no variations)")
    
    # Show exact phrases (no variations)
    print(f"\n📝 EXACT PHRASES ({len(exact_phrases)} phrases):")
    print("-" * 60)
    
    for phrase in sorted(exact_phrases):
        print(f"'{phrase}'")
    
    # Summary statistics
    print(f"\n📊 SUMMARY:")
    print(f"Base words: {len(base_words)}")
    print(f"Total word variations: {variation_count}")
    print(f"Exact phrases: {len(exact_phrases)}")
    print(f"Total searchable terms: {variation_count + len(exact_phrases)}")
    
    return sorted(set(all_variations)), sorted(exact_phrases)

def show_variations_by_category():
    """Show variations organized by category"""
    
    base_words, exact_phrases = create_simplified_keyword_system()
    
    print("=" * 80)
    print("📂 KEYWORD VARIATIONS BY CATEGORY")
    print("=" * 80)
    
    # Define categories
    categories = {
        'Card Types': ['creature', 'artifact', 'enchantment', 'planeswalker', 'land', 'instant', 'sorcery', 'spell', 'token'],
        'Actions': ['sacrifice', 'destroy', 'draw', 'discard', 'cast', 'target', 'enter', 'leave', 'attack', 'block', 'die', 'tap', 'untap'],
        'Combat Abilities': ['flying', 'trample', 'vigilance', 'deathtouch', 'lifelink', 'haste', 'hexproof', 'indestructible', 'defender'],
        'Creature Types': ['human', 'elf', 'goblin', 'zombie', 'vampire', 'dragon', 'angel', 'demon', 'beast', 'spirit'],
        'Mechanics': ['proliferate', 'convoke', 'delve', 'cascade', 'storm', 'flashback', 'buyback', 'rebound'],
        'Counter Types': ['loyalty', 'charge', 'time', 'energy', 'experience', 'poison', 'age', 'oil']
    }
    
    for category, words in categories.items():
        print(f"\n🏷️  {category.upper()}:")
        print("-" * 40)
        
        for word in words:
            if word in base_words:
                variations = generate_word_variations(word)
                if len(variations) > 1:
                    print(f"  {word:12} → {', '.join(variations)}")
                else:
                    print(f"  {word:12} → {word}")

def save_all_variations_to_file(filename="mtg_all_keyword_variations.txt"):
    """Save all keyword variations to a file"""
    
    all_word_variations, exact_phrases = show_all_keyword_variations()
    
    with open(filename, 'w') as f:
        f.write("# MTG All Keyword Variations\n")
        f.write("# Generated automatically from base words + exact phrases\n\n")
        
        f.write("## All Word Variations:\n")
        for variation in all_word_variations:
            f.write(f"{variation}\n")
        
        f.write(f"\n## Exact Phrases:\n")
        for phrase in exact_phrases:
            f.write(f"{phrase}\n")
        
        f.write(f"\n## Summary:\n")
        f.write(f"# Total word variations: {len(all_word_variations)}\n")
        f.write(f"# Total exact phrases: {len(exact_phrases)}\n")
        f.write(f"# Total searchable terms: {len(all_word_variations) + len(exact_phrases)}\n")
    
    print(f"\n💾 All variations saved to {filename}")

def test_keyword_matching():
    """Test the keyword matching with sample MTG text"""
    
    # Test cases
    test_cases = [
        "Flying, vigilance. When this creature enters the battlefield, create two 1/1 white Soldier creature tokens.",
        "Sacrifice a creature: Draw a card.",
        "Whenever you cast a spell, proliferate.",
        "Other elves you control get +1/+1.",
        "At the beginning of your upkeep, put a +1/+1 counter on target creature."
    ]
    
    base_words, exact_phrases = create_simplified_keyword_system()
    all_variations, _ = show_all_keyword_variations()
    
    print("\n🧪 TESTING KEYWORD MATCHING:")
    print("=" * 60)
    
    for i, test_text in enumerate(test_cases, 1):
        print(f"\nTest {i}: {test_text}")
        print("Matches found:")
        
        # Check word variations
        found_words = []
        for word in all_variations:
            if re.search(r'\b' + re.escape(word) + r'\b', test_text.lower()):
                found_words.append(word)
        
        # Check exact phrases
        found_phrases = []
        for phrase in exact_phrases:
            if re.search(r'\b' + re.escape(phrase) + r'\b', test_text.lower()):
                found_phrases.append(phrase)
        
        if found_words:
            print(f"  Words: {', '.join(found_words)}")
        if found_phrases:
            print(f"  Phrases: {', '.join(found_phrases)}")
        if not found_words and not found_phrases:
            print("  No matches found")

if __name__ == "__main__":
    # Show all variations
    show_all_keyword_variations()
    
    print("\n" + "="*80)
    
    # Show by category
    show_variations_by_category()
    
    print("\n" + "="*80)
    
    # Test matching
    test_keyword_matching()
    
    # Uncomment to save all variations to file
save_all_variations_to_file()

🔍 ALL KEYWORD VARIATIONS

📝 BASE WORDS WITH VARIATIONS (118 base words):
------------------------------------------------------------
adapt           → adapts
age             → ages
angel           → angels
artifact        → artifacts
attach          → attaches
attack          → attacks, attacks, attacking, attacked
aura            → auras
battalion       → battalions
battlefield     → battlefields
beast           → beasts
bird            → birds
block           → blocks, blocks, blocking, blocked
buyback         → buybacks
cascade         → cascades
cast            → casts, casts, casting
cat             → cats
charge          → charges
cipher          → ciphers
combat          → combats
constellation   → constellations
convoke         → convokes
counter         → counters
creature        → creatures
damage          → damages
deathtouch      → deathtouches
defender        → defenders
delirium        → deliriums
delve           → delves
demon           → demons
destroy         → destro

In [25]:
test_keyword_matching()

🔍 ALL KEYWORD VARIATIONS

📝 BASE WORDS WITH VARIATIONS (118 base words):
------------------------------------------------------------
adapt           → adapts
age             → ages
angel           → angels
artifact        → artifacts
attach          → attaches
attack          → attacks, attacks, attacking, attacked
aura            → auras
battalion       → battalions
battlefield     → battlefields
beast           → beasts
bird            → birds
block           → blocks, blocks, blocking, blocked
buyback         → buybacks
cascade         → cascades
cast            → casts, casts, casting
cat             → cats
charge          → charges
cipher          → ciphers
combat          → combats
constellation   → constellations
convoke         → convokes
counter         → counters
creature        → creatures
damage          → damages
deathtouch      → deathtouches
defender        → defenders
delirium        → deliriums
delve           → delves
demon           → demons
destroy         → destro

In [34]:
# update keywords here
import re
from typing import Set, List, Dict

def create_simplified_keyword_system():
    """Create a clean keyword system using base words + automatic variations"""
    
    # Base single words (we'll handle plurals automatically)
    base_words = {
        # Card types
        'creature', 'artifact', 'enchantment', 'planeswalker', 'land', 'instant', 'sorcery',
        'spell', 'token', 'counter', 'damage', 'mana',
        
        # Zones & actions
        'graveyard', 'battlefield', 'exile', 'library', 'hand',
        'sacrifice', 'destroy', 'draw', 'discard', 'cast', 'target',
        'enter', 'leave', 'attack', 'block', 'die', 'tap', 'untap',
        'search', 'reveal', 'return', 'mill', 'ramp',
        
        # Combat & abilities
        'combat', 'flying', 'trample', 'vigilance', 'deathtouch', 'lifelink',
        'haste', 'hexproof', 'indestructible', 'defender', 'reach', 'menace',
        'prowess', 'flash', 'ward',
        
        # Creature types (tribal)
        'human', 'elf', 'goblin', 'zombie', 'vampire', 'dragon', 'angel', 'demon',
        'beast', 'spirit', 'elemental', 'wizard', 'warrior', 'soldier', 'knight',
        'merfolk', 'insect', 'cat', 'dog', 'bird', 'wall',
        
        # Mechanics
        'proliferate', 'convoke', 'delve', 'cascade', 'storm', 'flashback',
        'buyback', 'rebound', 'cipher', 'dredge', 'evolve', 'explore',
        'fabricate', 'improvise', 'investigate', 'mentor', 'riot', 'adapt',
        'scavenge', 'unleash', 'undying', 'persist', 'modular', 'graft', 
        'toxic', 'skulk', 'shadow',
        
        # Equipment & auras
        'equip', 'equipment', 'aura', 'enchant', 'attach', 'unattach',
        
        # Counters
        'loyalty', 'charge', 'time', 'energy', 'experience', 'poison',
        'age', 'oil', 'shield', 'stun',
        
        # Keywords
        'partner', 'eminence', 'landfall', 'metalcraft', 'threshold',
        'delirium', 'undergrowth', 'constellation', 'battalion',

        # Manual
        'toughness', 'power'
    }
    
    # Multi-word phrases that should stay exact
    exact_phrases = {
        # Evergreen abilities
        'first strike', 'double strike', 'split second', 'living weapon',
        'totem armor', 'basic landcycling', 'cumulative upkeep', 'level up',
        
        # Complex triggers
        'enters the battlefield', 'leaves the battlefield',
        'deals combat damage', 'combat damage to a player', 'combat damage to an opponent',
        'when this creature attacks', 'whenever this creature attacks',
        'when this creature dies', 'whenever this creature dies',
        'at the beginning of your upkeep', 'at the beginning of each upkeep',
        'at the beginning of your end step', 'at the beginning of combat',
        'whenever you cast a spell', 'whenever you cast an instant or sorcery',
        'whenever you cast a creature spell', 'whenever you cast a noncreature spell',
        
        # Counter types with +/-
        '+1/+1 counter', '-1/-1 counter', 'loyalty counter', 'charge counter',
        'time counter', 'energy counter', 'experience counter', 'poison counter',
        
        # Card advantage
        'draw a card', 'draw cards', 'discard a card', 'exile a card',
        'return to hand', 'return from graveyard', 'search your library',
        
        # Mana & costs
        'mana cost', 'mana value', 'converted mana cost', 'without paying',
        'costs less to cast', 'add mana', 'mana of any color',
        
        # Token creation
        'create a token', 'creature token', 'artifact token', 'treasure token',
        'food token', 'clue token', 'blood token',
        
        # Card types
        'artifact creature', 'enchantment creature', 'legendary creature',
        'legendary artifact', 'legendary enchantment', 'legendary planeswalker',
        'basic land', 'nonbasic land', 'instant or sorcery', 'noncreature spell',
        
        # Protection & evasion
        'protection from', 'can\'t be targeted', 'can\'t be blocked',
        'can\'t be countered', 'can\'t be destroyed', 'prevent damage',
        
        # Win conditions
        'you win the game', 'target player loses the game', 'commander damage',
        'laboratory maniac', 'thassa\'s oracle',
        
        # Commander specific
        'partner with', 'choose a background', 'command zone', 'commander tax',
        
        # Multiplayer
        'each opponent', 'each player', 'target opponent', 'choose a player',
        
        # Common phrases
        'you control', 'target creature', 'this creature', 'end of turn',
        'until end of turn', 'this turn', 'creature you control'
    }
    
    # Tribal phrases (using "other" pattern)
    tribal_phrases = {f'other {creature}' for creature in [
        'human', 'elf', 'goblin', 'zombie', 'vampire', 'dragon', 'angel', 'demon',
        'beast', 'spirit', 'elemental', 'wizard', 'warrior', 'soldier', 'knight'
    ]}
    
    return base_words, exact_phrases | tribal_phrases

def generate_word_variations(base_word: str) -> List[str]:
    """Generate common variations of a base word"""
    variations = [base_word]
    
    # Handle plurals
    if base_word.endswith('y'):
        variations.append(base_word[:-1] + 'ies')  # enemy -> enemies
    elif base_word.endswith(('s', 'sh', 'ch', 'x', 'z')):
        variations.append(base_word + 'es')  # glass -> glasses
    elif base_word.endswith('f'):
        variations.append(base_word[:-1] + 'ves')  # elf -> elves
    elif base_word.endswith('fe'):
        variations.append(base_word[:-2] + 'ves')  # knife -> knives
    else:
        variations.append(base_word + 's')  # card -> cards
    
    # Handle verb forms for action words
    action_words = {
        'attack': ['attacks', 'attacking', 'attacked'],
        'block': ['blocks', 'blocking', 'blocked'], 
        'cast': ['casts', 'casting'],
        'die': ['dies', 'died', 'dying'],
        'enter': ['enters', 'entering', 'entered'],
        'leave': ['leaves', 'leaving', 'left'],
        'sacrifice': ['sacrifices', 'sacrificed', 'sacrificing'],
        'destroy': ['destroys', 'destroyed', 'destroying'],
        'draw': ['draws', 'drawing', 'drew'],
        'discard': ['discards', 'discarded', 'discarding'],
        'tap': ['taps', 'tapped', 'tapping'],
        'untap': ['untaps', 'untapped', 'untapping'],
        'deal': ['deals', 'dealing', 'dealt'],
        'take': ['takes', 'taking', 'took'],
        'gain': ['gains', 'gained', 'gaining'],
        'lose': ['loses', 'lost', 'losing']
    }
    
    if base_word in action_words:
        variations.extend(action_words[base_word])
    
    return variations

def get_all_keywords_as_arrays():
    """
    Generate all keywords and return them as organized arrays
    """
    base_words, exact_phrases = create_simplified_keyword_system()
    
    # Convert base words to list and generate all variations
    all_single_words = []
    for word in base_words:
        variations = generate_word_variations(word)
        all_single_words.extend(variations)
    
    # Convert exact phrases to list
    all_phrases = list(exact_phrases)
    
    # Combine everything into one master list
    all_keywords = all_single_words + all_phrases
    
    # Remove duplicates and sort
    all_keywords = sorted(list(set(all_keywords)))
    all_single_words = sorted(list(set(all_single_words)))
    all_phrases = sorted(all_phrases)
    
    return {
        'all_keywords': all_keywords,           # Everything combined
        'single_words': all_single_words,       # Just single words with variations
        'phrases': all_phrases,                 # Just multi-word phrases
        'base_words': sorted(list(base_words))  # Original base words only
    }

In [36]:
# update keywords here NEW
import re
from typing import Set, List, Dict

def create_simplified_keyword_system():
    """Create a clean keyword system using base words + automatic variations"""
    
    # Base single words (we'll handle plurals automatically)
    base_words = {
        # Card types
        'creature', 'artifact', 'enchantment', 'planeswalker', 'land', 'instant', 'sorcery',
        'spell', 'token', 'counter', 'damage', 'mana', 'tribal', 'legendary', 'basic', 'snow',
        
        # Zones & actions
        'graveyard', 'battlefield', 'exile', 'library', 'hand',
        'sacrifice', 'destroy', 'draw', 'discard', 'cast', 'target',
        'enter', 'leave', 'attack', 'block', 'die', 'tap', 'untap',
        'search', 'reveal', 'return', 'mill', 'ramp', 'scry', 'surveil',
        'look', 'reveal', 'shuffle', 'choose', 'prevent', 'redirect',
        
        # Combat & abilities
        'combat', 'flying', 'trample', 'vigilance', 'deathtouch', 'lifelink',
        'haste', 'hexproof', 'indestructible', 'defender', 'reach', 'menace',
        'prowess', 'flash', 'ward', 'shroud', 'fear', 'intimidate', 'unblockable',
        'regenerate', 'crew', 'equip', 'enchant', 'attach', 'protection',
        
        # Creature types (tribal)
        'human', 'elf', 'goblin', 'zombie', 'vampire', 'dragon', 'angel', 'demon',
        'beast', 'spirit', 'elemental', 'wizard', 'warrior', 'soldier', 'knight',
        'merfolk', 'insect', 'cat', 'dog', 'bird', 'wall', 'rogue', 'cleric', 
        'shaman', 'treefolk', 'avatar', 'horror', 'eldrazi', 'phyrexian', 'sliver', 
        'ally', 'fish', 'spider', 'snake', 'dinosaur', 'giant', 'faerie',
        
        # Mechanics - Original
        'proliferate', 'convoke', 'delve', 'cascade', 'storm', 'flashback',
        'buyback', 'rebound', 'cipher', 'dredge', 'evolve', 'explore',
        'fabricate', 'improvise', 'investigate', 'mentor', 'riot', 'adapt',
        'scavenge', 'unleash', 'undying', 'persist', 'modular', 'graft', 
        'toxic', 'skulk', 'shadow',
        
        # Mechanics - Added from V2
        'absorb', 'affinity', 'amplify', 'annihilator', 'ascend', 'assist', 'awaken',
        'backup', 'banding', 'bestow', 'bloodthirst', 'bushido', 'champion', 'changeling',
        'clash', 'cleave', 'conspire', 'crew', 'cycling', 'dash', 'daybound', 'demonstrate',
        'dethrone', 'devoid', 'devour', 'disturb', 'echo', 'emerge', 'embalm', 'encore',
        'entwine', 'epic', 'escape', 'eternalize', 'evoke', 'exalted', 'exploit',
        'extort', 'fading', 'flanking', 'forecast', 'fortify', 'frenzy', 'gravestorm',
        'habitat', 'hideaway', 'horsemanship', 'infect', 'kicker', 'learn', 'madness',
        'melee', 'miracle', 'morph', 'multikicker', 'mutate', 'nightbound', 'ninjutsu',
        'offering', 'outlast', 'overload', 'partner', 'phasing', 'poisonous', 'provoke',
        'rampage', 'reconfigure', 'recover', 'reinforce', 'renown', 'replicate', 'retrace',
        'ripple', 'soulbond', 'soulshift', 'splice', 'sunburst', 'suspend', 'training',
        'transform', 'transmute', 'tribute', 'undaunted', 'unearth', 'vanishing', 'venture',
        'wither', 'populate', 'magecraft', 'fork', 'vote',
        
        # Equipment & auras
        'equip', 'equipment', 'aura', 'enchant', 'attach', 'unattach', 'vehicle', 'treasure',
        'saga', 'curse', 'enchanted',
        
        # Counters
        'loyalty', 'charge', 'time', 'energy', 'experience', 'poison',
        'age', 'oil', 'shield', 'stun',
        
        # Keywords & Ability Words - Original
        'partner', 'eminence', 'landfall', 'metalcraft', 'threshold',
        'delirium', 'undergrowth', 'constellation', 'battalion',
        
        # Ability Words - Added from V2
        'addendum', 'alliance', 'bloodrush', 'channel', 'chroma', 'cohort', 'converge',
        'domain', 'enrage', 'ferocious', 'formidable', 'grandeur', 'hellbent', 'heroic',
        'imprint', 'inspired', 'kinship', 'lieutenant', 'morbid', 'parley', 'radiance',
        'raid', 'rally', 'revolt', 'strive', 'sweep',

        # Manual
        'toughness', 'power', 'colorless', 'colored', 'food', 'clue', 'blood', 'gold',
        'reanimate', 'infinite', 'combo', 'superfriends', 'spellslinger', 'ramp'
    }
    
    # Multi-word phrases that should stay exact
    exact_phrases = {
        # Evergreen abilities
        'first strike', 'double strike', 'split second', 'living weapon',
        'totem armor', 'basic landcycling', 'cumulative upkeep', 'level up',
        'plainscycling',
        
        # Complex triggers
        'enters the battlefield', 'leaves the battlefield', 'when this enters the battlefield',
        'whenever a creature enters the battlefield', 'when another creature enters the battlefield',
        'enters the battlefield tapped', 'enters the battlefield with',
        'enters the battlefield under your control',
        'deals combat damage', 'combat damage to a player', 'combat damage to an opponent',
        'when this creature attacks', 'whenever this creature attacks',
        'when this creature dies', 'whenever this creature dies',
        'when this creature becomes blocked', 'whenever this creature becomes blocked',
        'when this creature deals combat damage', 'whenever this creature deals combat damage',
        'at the beginning of your upkeep', 'at the beginning of each upkeep',
        'at the beginning of your end step', 'at the beginning of each end step',
        'at the beginning of combat', 'at the beginning of your draw step',
        'at the beginning of each player\'s upkeep', 'during each player\'s turn',
        'whenever you cast a spell', 'whenever you cast an instant or sorcery',
        'whenever you cast a creature spell', 'whenever you cast a noncreature spell',
        'whenever you cast your first spell', 'whenever you cast your second spell',
        'once each turn', 'during your turn', 'on your turn',
        
        # Combat phrases from V2
        'attacking creature', 'blocking creature', 'deals combat damage', 'when attacks', 'when blocks',
        'whenever attacks', 'whenever blocks', 'attack alone', 'can attack', 'must attack',
        'cannot attack', 'can block', 'cannot block', 'unblocked creature', 'attacking or blocking',
        'deals damage to a creature', 'deals damage to a player', 'deals damage equal to its power',
        'can\'t be blocked', 'must be blocked', 'blocks or becomes blocked', 'first strike damage',
        'double strike damage',
        
        # Counter types with +/-
        '+1/+1 counter', '-1/-1 counter', 'loyalty counter', 'charge counter',
        'time counter', 'energy counter', 'experience counter', 'poison counter',
        'age counter', 'oil counter', 'shield counter', 'stun counter',
        '+1/+1 counters', 'charge counters', 'loyalty counters', 'time counters',
        'age counters', 'poison counters', 'energy counters', 'experience counters',
        'oil counters', 'shield counters', 'stun counters', 'remove a counter',
        'place a counter', 'put a counter', 'with counters', 'counter on it',
        'counters on it', 'number of counters',
        
        # Card advantage
        'draw a card', 'draw cards', 'draw two cards', 'draw three cards', 'draw additional',
        'discard a card', 'discard cards', 'exile a card', 'exile cards',
        'return to hand', 'return from graveyard', 'search your library',
        'look at the top card', 'look at the top X cards',
        
        # Mana & costs
        'mana cost', 'mana value', 'converted mana cost', 'without paying',
        'costs less to cast', 'costs less', 'costs more', 'reduce the cost',
        'add mana', 'mana of any color', 'pay mana', 'spend mana',
        'colorless mana', 'colored mana', 'additional cost',
        
        # Token creation
        'create a token', 'create a creature token', 'create a treasure token',
        'create a food token', 'create a clue token', 'create a blood token',
        'creature token', 'artifact token', 'treasure token',
        'food token', 'clue token', 'blood token', 'gold token', 'token creature',
        
        # Card types
        'artifact creature', 'enchantment creature', 'legendary creature',
        'legendary artifact', 'legendary enchantment', 'legendary planeswalker',
        'legendary sorcery', 'basic land', 'nonbasic land', 'instant or sorcery', 
        'noncreature spell', 'instant spell', 'sorcery spell', 'creature spell',
        'noncreature artifact', 'enchantment token', 'artifact token',
        
        # Protection & evasion
        'protection from', 'can\'t be targeted', 'can\'t be blocked',
        'can\'t be countered', 'can\'t be destroyed', 'prevent damage',
        'prevent all damage', 'redirect damage', 'damage prevention',
        
        # Win conditions
        'you win the game', 'target player loses the game', 'commander damage',
        'laboratory maniac', 'thassa\'s oracle', 'loses the game', 'wins the game',
        'can\'t lose the game', 'can\'t win the game', 'alternate win condition',
        'infinite combo', 'goes infinite', 'poison counters', 'approach of the second sun',
        
        # Commander specific
        'partner with', 'choose a background', 'command zone', 'commander tax',
        'choose a creature type', 'of the chosen type', 'shares a creature type',
        'tribal spell', 'background',
        
        # Multiplayer
        'each opponent', 'each player', 'target opponent', 'choose a player',
        'council\'s dilemma', 'tempting offer', 'will of the council', 'join forces',
        
        # Common phrases
        'you control', 'target creature', 'this creature', 'end of turn',
        'until end of turn', 'this turn', 'creature you control', 'this spell',
        'this card', 'cast this', 'when enters', 'when dies', 'when attacks',
        'when blocks', 'whenever', 'at end of', 'whenever you draw', 'whenever deals damage',
        'whenever takes damage', 'when you gain life', 'when you lose life',
        'whenever discards', 'whenever sacrifices', 'when enters', 'when this enters',
        'whenever enters', 'whenever this enters', 'when leaves', 'when this leaves',
        'whenever leaves', 'whenever this leaves', 'during each',
        'when you cast', 'whenever you cast', 'when you draw', 'whenever you draw',
        'when you gain life', 'whenever you gain life', 'when you lose life',
        'whenever you lose life', 'when becomes tapped', 'whenever becomes tapped',
        'when becomes untapped', 'whenever becomes untapped',
        
        # Zone manipulation
        'from graveyard', 'to graveyard', 'from exile', 'to exile',
        'from your hand', 'to your hand', 'from library', 'search library',
        'shuffle library', 'top of library', 'bottom of library', 'return to battlefield',
        'enters from graveyard', 'return from graveyard', 'cast from graveyard',
        'graveyard to battlefield', 'self-mill', 'graveyard matters',
        
        # Sacrifice and destruction
        'destroy target', 'sacrifice creature', 'sacrifice artifact',
        'sacrifice enchantment', 'sacrifice land', 'when sacrificed',
        'when destroyed', 'destroy all', 'sacrifice all',
        
        # Triggered abilities
        'whenever you cast', 'whenever you draw', 'whenever deals damage',
        'whenever takes damage', 'when you gain life', 'when you lose life',
        'whenever discards', 'whenever sacrifices',
        
        # Artifact/Equipment synergies
        'affinity for artifacts', 'artifact enters', 'artifact dies',
        'metalcraft', 'improvise', 'crew', 'attach',
        
        # Enchantment synergies  
        'constellation', 'enchantment enters', 'bestow', 'totem armor',
        
        # Planeswalker synergies
        'loyalty', 'loyalty ability', 'activate loyalty abilities',
        'plus ability', 'minus ability', 'ultimate ability', 'doubling season',
        'spark double',
        
        # Land synergies
        'landfall', 'enters tapped', 'sacrifice a land', 'search for a land',
        'land enters', 'land drops', 'additional land', 'extra land',
        'mana dork', 'mana rock', 'land ramp',
        
        # Evasion
        'unblockable', 'must be blocked', 'shadow', 'fear', 'intimidate', 'skulk',
        
        # Activated abilities
        'tap:', 'untap:', 'put into graveyard'
    }
    
    # Tribal phrases (using "other" pattern) - expanded
    tribal_creatures = [
        'human', 'elf', 'goblin', 'zombie', 'vampire', 'dragon', 'angel', 'demon',
        'beast', 'spirit', 'elemental', 'wizard', 'warrior', 'soldier', 'knight',
        'rogue', 'cleric', 'shaman', 'treefolk', 'avatar', 'horror', 'eldrazi', 
        'phyrexian', 'sliver', 'ally', 'fish', 'spider', 'snake', 'dinosaur', 
        'giant', 'faerie'
    ]
    
    tribal_phrases = {f'other {creature}' for creature in tribal_creatures}
    
    return base_words, exact_phrases | tribal_phrases

def generate_word_variations(base_word: str) -> List[str]:
    """Generate common variations of a base word"""
    variations = [base_word]
    
    # Handle plurals
    if base_word.endswith('y'):
        variations.append(base_word[:-1] + 'ies')  # enemy -> enemies
    elif base_word.endswith(('s', 'sh', 'ch', 'x', 'z')):
        variations.append(base_word + 'es')  # glass -> glasses
    elif base_word.endswith('f'):
        variations.append(base_word[:-1] + 'ves')  # elf -> elves
    elif base_word.endswith('fe'):
        variations.append(base_word[:-2] + 'ves')  # knife -> knives
    else:
        variations.append(base_word + 's')  # card -> cards
    
    # Handle verb forms for action words
    action_words = {
        'attack': ['attacks', 'attacking', 'attacked'],
        'block': ['blocks', 'blocking', 'blocked'], 
        'cast': ['casts', 'casting'],
        'die': ['dies', 'died', 'dying'],
        'enter': ['enters', 'entering', 'entered'],
        'leave': ['leaves', 'leaving', 'left'],
        'sacrifice': ['sacrifices', 'sacrificed', 'sacrificing'],
        'destroy': ['destroys', 'destroyed', 'destroying'],
        'draw': ['draws', 'drawing', 'drew'],
        'discard': ['discards', 'discarded', 'discarding'],
        'tap': ['taps', 'tapped', 'tapping'],
        'untap': ['untaps', 'untapped', 'untapping'],
        'deal': ['deals', 'dealing', 'dealt'],
        'take': ['takes', 'taking', 'took'],
        'gain': ['gains', 'gained', 'gaining'],
        'lose': ['loses', 'lost', 'losing'],
        'look': ['looks', 'looking', 'looked'],
        'reveal': ['reveals', 'revealing', 'revealed'],
        'search': ['searches', 'searching', 'searched'],
        'shuffle': ['shuffles', 'shuffling', 'shuffled'],
        'choose': ['chooses', 'choosing', 'chose'],
        'prevent': ['prevents', 'preventing', 'prevented'],
        'redirect': ['redirects', 'redirecting', 'redirected']
    }
    
    if base_word in action_words:
        variations.extend(action_words[base_word])
    
    return variations

def get_all_keywords_as_arrays():
    """
    Generate all keywords and return them as organized arrays
    """
    base_words, exact_phrases = create_simplified_keyword_system()
    
    # Convert base words to list and generate all variations
    all_single_words = []
    for word in base_words:
        variations = generate_word_variations(word)
        all_single_words.extend(variations)
    
    # Convert exact phrases to list
    all_phrases = list(exact_phrases)
    
    # Combine everything into one master list
    all_keywords = all_single_words + all_phrases
    
    # Remove duplicates and sort
    all_keywords = sorted(list(set(all_keywords)))
    all_single_words = sorted(list(set(all_single_words)))
    all_phrases = sorted(all_phrases)
    
    return {
        'all_keywords': all_keywords,           # Everything combined
        'single_words': all_single_words,       # Just single words with variations
        'phrases': all_phrases,                 # Just multi-word phrases
        'base_words': sorted(list(base_words))  # Original base words only
    }

In [40]:
keyword_arrays = get_all_keywords_as_arrays()
print("All keywords:", keyword_arrays['all_keywords'])

All keywords: ['+1/+1 counter', '+1/+1 counters', '-1/-1 counter', 'absorb', 'absorbs', 'activate loyalty abilities', 'adapt', 'adapts', 'add mana', 'addendum', 'addendums', 'additional cost', 'additional land', 'affinities', 'affinity', 'affinity for artifacts', 'age', 'age counter', 'age counters', 'ages', 'alliance', 'alliances', 'allies', 'ally', 'alternate win condition', 'amplifies', 'amplify', 'angel', 'angels', 'annihilator', 'annihilators', 'approach of the second sun', 'artifact', 'artifact creature', 'artifact dies', 'artifact enters', 'artifact token', 'artifacts', 'ascend', 'ascends', 'assist', 'assists', 'at end of', 'at the beginning of combat', 'at the beginning of each end step', "at the beginning of each player's upkeep", 'at the beginning of each upkeep', 'at the beginning of your draw step', 'at the beginning of your end step', 'at the beginning of your upkeep', 'attach', 'attaches', 'attack', 'attack alone', 'attacked', 'attacking', 'attacking creature', 'attacking

In [41]:
comprehensive_keywords = keyword_arrays['all_keywords']

In [42]:
def create_comprehensive_keyword_features(df, keywords):
    """Create keyword features with comprehensive multi-word list"""
    keyword_matrix = []
    
    print(f"Creating features with {len(keywords)} keywords...")
    
    for idx, row in df.iterrows():
        if idx % 5000 == 0:
            print(f"  Processing card {idx}/{len(df)}")
            
        text = row['combined_text'].lower()
        features = []
        
        for keyword in keywords:
            # Count occurrences (cap at 3)
            count = text.count(keyword.lower())
            features.append(min(count, 3))
            
        keyword_matrix.append(features)
    
    return np.array(keyword_matrix), keywords

# Create comprehensive keyword matrix
print("Creating comprehensive keyword matrix...")
comprehensive_matrix, final_keywords = create_comprehensive_keyword_features(df_clean, comprehensive_keywords)
print(f"Matrix shape: {comprehensive_matrix.shape}")

# Test with Isshin and other commanders
test_commanders = ['Isshin']

for commander_name in test_commanders:
    matches = df_clean[df_clean['name'].str.contains(commander_name, case=False, na=False)]
    if len(matches) > 0:
        idx = matches.index[0]
        features = comprehensive_matrix[idx]
        matched_keywords = [final_keywords[i] for i in range(len(final_keywords)) if features[i] > 0]
        
        print(f"\n⚔️ {matches.iloc[0]['name']}:")
        print(f"   Matched keywords ({len(matched_keywords)}): {matched_keywords[:15]}...")
        print(f"   Total feature count: {sum(features)}")
    else:
        print(f"\n❌ {commander_name} not found")

Creating comprehensive keyword matrix...
Creating features with 931 keywords...
  Processing card 0/27290
  Processing card 5000/27290
  Processing card 10000/27290
  Processing card 15000/27290
  Processing card 20000/27290
  Processing card 25000/27290
Matrix shape: (27290, 931)

⚔️ Isshin, Two Heavens as One:
   Matched keywords (8): ['attack', 'attacking', 'creature', 'human', 'legendary', 'legendary creature', 'time', 'you control']...
   Total feature count: 9


In [44]:
# Save the enhanced model with comprehensive keywords
import pickle

print("Saving enhanced model with comprehensive keywords...")

# Prepare model data
enhanced_model_data = {
    'df_clean': df_clean,
    'keyword_matrix': comprehensive_matrix,
    'keywords': final_keywords,
    'model_version': 'v2_comprehensive_keywords',
    'total_keywords': len(final_keywords),
    'total_cards': len(df_clean)
}

# Save to file
with open('data/mtg_model_enhanced.pkl', 'wb') as f:
    pickle.dump(enhanced_model_data, f)

print(f"✅ Enhanced model saved!")
print(f"   - {len(df_clean)} cards")
print(f"   - {len(final_keywords)} keywords")
print(f"   - Matrix size: {comprehensive_matrix.shape}")

# Save keyword list as text file for reference
with open('data/comprehensive_keywords.txt', 'w') as f:
    for i, keyword in enumerate(final_keywords):
        f.write(f"{i+1:3d}. {keyword}\n")

print("✅ Keyword list saved to comprehensive_keywords.txt")

Saving enhanced model with comprehensive keywords...
✅ Enhanced model saved!
   - 27290 cards
   - 931 keywords
   - Matrix size: (27290, 931)
✅ Keyword list saved to comprehensive_keywords.txt
