In [24]:
import pandas as pd
import numpy as np
from collections import Counter
import re
from sklearn.feature_extraction.text import CountVectorizer
from typing import Set, List, Dict

df_clean = pd.read_csv('data/mtg_cards_clean.csv', low_memory=False)

In [25]:
# update keywords here

def create_keyword_system():
    """Create a keyword system using base words + automatic variations, phrases, and tribal"""
    
    # Base single words
    base_words = {
        # Card types
        'creature', 'artifact', 'enchantment', 'planeswalker', 'land', 'instant', 'sorcery',
        'spell', 'token', 'counter', 'damage', 'mana', 'tribal', 'legendary', 'basic', 'snow',
        
        # Zones & actions
        'graveyard', 'battlefield', 'exile', 'library', 'hand',
        'sacrifice', 'destroy', 'draw', 'discard', 'cast', 'target',
        'enter', 'leave', 'attack', 'block', 'die', 'tap', 'untap',
        'search', 'reveal', 'return', 'mill', 'ramp', 'scry', 'surveil',
        'look', 'reveal', 'shuffle', 'choose', 'prevent', 'redirect',
        
        # Combat & abilities
        'combat', 'flying', 'trample', 'vigilance', 'deathtouch', 'lifelink',
        'haste', 'hexproof', 'indestructible', 'defender', 'reach', 'menace',
        'prowess', 'flash', 'ward', 'shroud', 'fear', 'intimidate', 'unblockable',
        'regenerate', 'crew', 'equip', 'enchant', 'attach', 'protection',
        
        # Creature types (tribal)
        'human', 'elf', 'goblin', 'zombie', 'vampire', 'dragon', 'angel', 'demon',
        'beast', 'spirit', 'elemental', 'wizard', 'warrior', 'soldier', 'knight',
        'merfolk', 'insect', 'cat', 'dog', 'bird', 'wall', 'rogue', 'cleric', 
        'shaman', 'treefolk', 'avatar', 'horror', 'eldrazi', 'phyrexian', 'sliver', 
        'ally', 'fish', 'spider', 'snake', 'dinosaur', 'giant', 'faerie',
        
        # Mechanics
        'proliferate', 'convoke', 'delve', 'cascade', 'storm', 'flashback',
        'buyback', 'rebound', 'cipher', 'dredge', 'evolve', 'explore',
        'fabricate', 'improvise', 'investigate', 'mentor', 'riot', 'adapt',
        'scavenge', 'unleash', 'undying', 'persist', 'modular', 'graft', 
        'toxic', 'skulk', 'shadow', 'wither', 'populate', 'magecraft', 'fork', 'vote',
        'absorb', 'affinity', 'amplify', 'annihilator', 'ascend', 'assist', 'awaken',
        'backup', 'banding', 'bestow', 'bloodthirst', 'bushido', 'champion', 'changeling',
        'clash', 'cleave', 'conspire', 'crew', 'cycling', 'dash', 'daybound', 'demonstrate',
        'dethrone', 'devoid', 'devour', 'disturb', 'echo', 'emerge', 'embalm', 'encore',
        'entwine', 'epic', 'escape', 'eternalize', 'evoke', 'exalted', 'exploit',
        'extort', 'fading', 'flanking', 'forecast', 'fortify', 'frenzy', 'gravestorm',
        'habitat', 'hideaway', 'horsemanship', 'infect', 'kicker', 'learn', 'madness',
        'melee', 'miracle', 'morph', 'multikicker', 'mutate', 'nightbound', 'ninjutsu',
        'offering', 'outlast', 'overload', 'partner', 'phasing', 'poisonous', 'provoke',
        'rampage', 'reconfigure', 'recover', 'reinforce', 'renown', 'replicate', 'retrace',
        'ripple', 'soulbond', 'soulshift', 'splice', 'sunburst', 'suspend', 'training',
        'transform', 'transmute', 'tribute', 'undaunted', 'unearth', 'vanishing', 'venture',
        'partner', 'eminence', 'landfall', 'metalcraft', 'threshold', 'revolt',
        'delirium', 'undergrowth', 'constellation', 'battalion', 'strive', 'sweep',
        'addendum', 'alliance', 'bloodrush', 'channel', 'chroma', 'cohort', 'converge',
        'domain', 'enrage', 'ferocious', 'formidable', 'grandeur', 'hellbent', 'heroic',
        'imprint', 'inspired', 'kinship', 'lieutenant', 'morbid', 'parley', 'radiance',
        'raid', 'rally', 'saddle',
        
        # Equipment & auras
        'equip', 'equipment', 'aura', 'enchant', 'attach', 'unattach', 'vehicle', 'treasure',
        'saga', 'curse', 'enchanted',
        
        # Counters
        'loyalty', 'charge', 'time', 'energy', 'experience', 'poison',
        'age', 'oil', 'shield', 'stun',
        
        # Colors
        'colorless', 'colored', ' red', 'green', 'blue', 'black', 'white',

        # Other
        'toughness', 'power', 'reanimate', 'infinite', 'combo', 'superfriends', 'spellslinger', 'ramp'
    }
    
    # Multi-word phrases
    exact_phrases = {
        # Evergreen abilities
        'first strike', 'double strike', 'split second', 'living weapon',
        'totem armor', 'basic landcycling', 'cumulative upkeep', 'level up',
        'plainscycling',
        
        # Complex triggers
        'enters the battlefield', 'leaves the battlefield', 'when this enters the battlefield',
        'whenever a creature enters the battlefield', 'when another creature enters the battlefield',
        'enters the battlefield tapped', 'enters the battlefield with',
        'enters the battlefield under your control',
        'deals combat damage', 'combat damage to a player', 'combat damage to an opponent',
        'when this creature attacks', 'whenever this creature attacks',
        'when this creature dies', 'whenever this creature dies',
        'when this creature becomes blocked', 'whenever this creature becomes blocked',
        'when this creature deals combat damage', 'whenever this creature deals combat damage',
        'at the beginning of your upkeep', 'at the beginning of each upkeep',
        'at the beginning of your end step', 'at the beginning of each end step',
        'at the beginning of combat', 'at the beginning of your draw step',
        'at the beginning of each player\'s upkeep', 'during each player\'s turn',
        'whenever you cast a spell', 'whenever you cast an instant or sorcery',
        'whenever you cast a creature spell', 'whenever you cast a noncreature spell',
        'whenever you cast your first spell', 'whenever you cast your second spell',
        'once each turn', 'during your turn', 'on your turn', 'their first', 'their second',
        'your first', 'your second',
        
        # Combat phrases from V2
        'attacking creature', 'blocking creature', 'deals combat damage', 'when attacks', 'when blocks',
        'whenever attacks', 'whenever blocks', 'attack alone', 'can attack', 'must attack',
        'cannot attack', 'can block', 'cannot block', 'unblocked creature', 'attacking or blocking',
        'deals damage to a creature', 'deals damage to a player', 'deals damage equal to its power',
        'can\'t be blocked', 'must be blocked', 'blocks or becomes blocked', 'first strike damage',
        'double strike damage', 'attacking causes', 'dying causes',
        
        # Counter types with +/-
        '+1/+1 counter', '-1/-1 counter', 'loyalty counter', 'charge counter',
        'time counter', 'energy counter', 'experience counter', 'poison counter',
        'age counter', 'oil counter', 'shield counter', 'stun counter',
        '+1/+1 counters', 'charge counters', 'loyalty counters', 'time counters',
        'age counters', 'poison counters', 'energy counters', 'experience counters',
        'oil counters', 'shield counters', 'stun counters', 'remove a counter',
        'place a counter', 'put a counter', 'with counters', 'counter on it',
        'counters on it', 'number of counters',
        
        # Card advantage
        'draw a card', 'draw cards', 'draw two cards', 'draw three cards', 'draw additional',
        'discard a card', 'discard cards', 'exile a card', 'exile cards',
        'return to hand', 'return from graveyard', 'search your library',
        'look at the top card', 'look at the top X cards',
        
        # Mana & costs
        'mana cost', 'mana value', 'converted mana cost', 'without paying',
        'costs less to cast', 'costs less', 'costs more', 'reduce the cost',
        'add mana', 'mana of any color', 'pay mana', 'spend mana',
        'colorless mana', 'colored mana', 'additional cost',
        
        # Token creation
        'create a token', 'create a creature token', 'create a treasure token',
        'create a food token', 'create a clue token', 'create a blood token',
        'creature token', 'artifact token', 'treasure token',
        'food token', 'clue token', 'blood token', 'gold token', 'token creature',
        
        # Card types
        'artifact creature', 'enchantment creature', 'legendary creature',
        'legendary artifact', 'legendary enchantment', 'legendary planeswalker',
        'legendary sorcery', 'basic land', 'nonbasic land', 'instant or sorcery', 
        'noncreature spell', 'instant spell', 'sorcery spell', 'creature spell',
        'noncreature artifact', 'enchantment token', 'artifact token',
        
        # Protection & evasion
        'protection from', 'can\'t be targeted', 'can\'t be blocked',
        'can\'t be countered', 'can\'t be destroyed', 'prevent damage',
        'prevent all damage', 'redirect damage', 'damage prevention',
        
        # Win conditions
        'you win the game', 'target player loses the game', 'commander damage',
        'laboratory maniac', 'thassa\'s oracle', 'loses the game', 'wins the game',
        'can\'t lose the game', 'can\'t win the game', 'alternate win condition',
        'infinite combo', 'goes infinite', 'poison counters', 'approach of the second sun',
        
        # Commander specific
        'partner with', 'choose a background', 'command zone', 'commander tax',
        'choose a creature type', 'of the chosen type', 'shares a creature type',
        'tribal spell', 'background',
        
        # Multiplayer
        'each opponent', 'each player', 'target opponent', 'choose a player',
        'council\'s dilemma', 'tempting offer', 'will of the council', 'join forces',
        
        # Common phrases
        'you control', 'target creature', 'this creature', 'end of turn',
        'until end of turn', 'this turn', 'creature you control', 'this spell',
        'this card', 'cast this', 'when enters', 'when dies', 'when attacks',
        'when blocks', 'whenever', 'at end of', 'whenever you draw', 'whenever deals damage',
        'whenever takes damage', 'when you gain life', 'when you lose life',
        'whenever discards', 'whenever sacrifices', 'when enters', 'when this enters',
        'whenever enters', 'whenever this enters', 'when leaves', 'when this leaves',
        'whenever leaves', 'whenever this leaves', 'during each',
        'when you cast', 'whenever you cast', 'when you draw', 'whenever you draw',
        'when you gain life', 'whenever you gain life', 'when you lose life',
        'whenever you lose life', 'when becomes tapped', 'whenever becomes tapped',
        'when becomes untapped', 'whenever becomes untapped',
        
        # Zone manipulation
        'from graveyard', 'to graveyard', 'from exile', 'to exile',
        'from your hand', 'to your hand', 'from library', 'search library',
        'shuffle library', 'top of library', 'bottom of library', 'return to battlefield',
        'enters from graveyard', 'return from graveyard', 'cast from graveyard',
        'graveyard to battlefield', 'self-mill', 'graveyard matters',
        
        # Sacrifice and destruction
        'destroy target', 'sacrifice creature', 'sacrifice artifact',
        'sacrifice enchantment', 'sacrifice land', 'when sacrificed',
        'when destroyed', 'destroy all', 'sacrifice all',
        
        # Triggered abilities
        'whenever you cast', 'whenever you draw', 'whenever deals damage',
        'whenever takes damage', 'when you gain life', 'when you lose life',
        'whenever discards', 'whenever sacrifices',
        
        # Artifact/Equipment synergies
        'affinity for artifacts', 'artifact enters', 'artifact dies',
        'metalcraft', 'improvise', 'crew', 'attach', 
        
        # Enchantment synergies  
        'constellation', 'enchantment enters', 'bestow', 'totem armor',
        
        # Planeswalker synergies
        'loyalty', 'loyalty ability', 'activate loyalty abilities',
        'plus ability', 'minus ability', 'ultimate ability', 'doubling season',
        'spark double',
        
        # Land synergies
        'landfall', 'enters tapped', 'sacrifice a land', 'search for a land',
        'land enters', 'land drops', 'additional land', 'extra land',
        'mana dork', 'mana rock', 'land ramp',
        
        # Evasion
        'unblockable', 'must be blocked', 'shadow', 'fear', 'intimidate', 'skulk',
        
        # Activated abilities
        'tap:', 'untap:', 'put into graveyard'
    }
    
    # Tribal phrases (using "other" pattern)
    tribal_creatures = [
        'human', 'elf', 'goblin', 'zombie', 'vampire', 'dragon', 'angel', 'demon',
        'beast', 'spirit', 'elemental', 'wizard', 'warrior', 'soldier', 'knight',
        'rogue', 'cleric', 'shaman', 'treefolk', 'avatar', 'horror', 'eldrazi', 
        'phyrexian', 'sliver', 'ally', 'fish', 'spider', 'snake', 'dinosaur', 
        'giant', 'faerie'
    ]
    
    tribal_phrases = {f'other {creature}' for creature in tribal_creatures}
    
    return base_words, exact_phrases | tribal_phrases

def generate_word_variations(base_word: str) -> List[str]:
    """Generate common variations of a base word"""
    variations = [base_word]
    
    # Handle plurals
    if base_word.endswith('y'):
        variations.append(base_word[:-1] + 'ies')  # enemy -> enemies
    elif base_word.endswith(('s', 'sh', 'ch', 'x', 'z')):
        variations.append(base_word + 'es')  # glass -> glasses
    elif base_word.endswith('f'):
        variations.append(base_word[:-1] + 'ves')  # elf -> elves
    elif base_word.endswith('fe'):
        variations.append(base_word[:-2] + 'ves')  # knife -> knives
    else:
        variations.append(base_word + 's')  # card -> cards
    
    # Handle verb forms for action words
    action_words = {
        'attack': ['attacks', 'attacking', 'attacked'],
        'block': ['blocks', 'blocking', 'blocked'], 
        'cast': ['casts', 'casting'],
        'die': ['dies', 'died', 'dying'],
        'enter': ['enters', 'entering', 'entered'],
        'leave': ['leaves', 'leaving', 'left'],
        'sacrifice': ['sacrifices', 'sacrificed', 'sacrificing'],
        'destroy': ['destroys', 'destroyed', 'destroying'],
        'draw': ['draws', 'drawing', 'drew'],
        'discard': ['discards', 'discarded', 'discarding'],
        'tap': ['taps', 'tapped', 'tapping'],
        'untap': ['untaps', 'untapped', 'untapping'],
        'deal': ['deals', 'dealing', 'dealt'],
        'take': ['takes', 'taking', 'took'],
        'gain': ['gains', 'gained', 'gaining'],
        'lose': ['loses', 'lost', 'losing'],
        'look': ['looks', 'looking', 'looked'],
        'reveal': ['reveals', 'revealing', 'revealed'],
        'search': ['searches', 'searching', 'searched'],
        'shuffle': ['shuffles', 'shuffling', 'shuffled'],
        'choose': ['chooses', 'choosing', 'chose'],
        'prevent': ['prevents', 'preventing', 'prevented'],
        'redirect': ['redirects', 'redirecting', 'redirected']
    }
    
    if base_word in action_words:
        variations.extend(action_words[base_word])
    
    return variations

def get_keyword_arrays():
    """
    Generate all keywords and return them as organized arrays
    """
    base_words, exact_phrases = create_keyword_system()
    
    # Convert base words to list and generate all variations
    all_single_words = []
    for word in base_words:
        variations = generate_word_variations(word)
        all_single_words.extend(variations)
    
    # Convert exact phrases to list
    all_phrases = list(exact_phrases)
    
    # Combine everything into one master list
    all_keywords = all_single_words + all_phrases
    
    # Remove duplicates and sort
    all_keywords = sorted(list(set(all_keywords)))
    all_single_words = sorted(list(set(all_single_words)))
    all_phrases = sorted(all_phrases)
    
    return {
        'all_keywords': all_keywords,           # Everything combined
        'single_words': all_single_words,       # Just single words with variations
        'phrases': all_phrases,                 # Just multi-word phrases
        'base_words': sorted(list(base_words))  # Original base words only
    }

In [26]:
print("All keywords:", get_keyword_arrays()['all_keywords'])

All keywords: [' red', ' reds', '+1/+1 counter', '+1/+1 counters', '-1/-1 counter', 'absorb', 'absorbs', 'activate loyalty abilities', 'adapt', 'adapts', 'add mana', 'addendum', 'addendums', 'additional cost', 'additional land', 'affinities', 'affinity', 'affinity for artifacts', 'age', 'age counter', 'age counters', 'ages', 'alliance', 'alliances', 'allies', 'ally', 'alternate win condition', 'amplifies', 'amplify', 'angel', 'angels', 'annihilator', 'annihilators', 'approach of the second sun', 'artifact', 'artifact creature', 'artifact dies', 'artifact enters', 'artifact token', 'artifacts', 'ascend', 'ascends', 'assist', 'assists', 'at end of', 'at the beginning of combat', 'at the beginning of each end step', "at the beginning of each player's upkeep", 'at the beginning of each upkeep', 'at the beginning of your draw step', 'at the beginning of your end step', 'at the beginning of your upkeep', 'attach', 'attaches', 'attack', 'attack alone', 'attacked', 'attacking', 'attacking caus

In [27]:
def create_keyword_features(df, keywords):
    """Create keyword features with list"""
    keyword_matrix = []
    
    print(f"Creating features with {len(keywords)} keywords...")
    
    for idx, row in df.iterrows():
        if idx % 5000 == 0:
            print(f"  Processing card {idx}/{len(df)}")
            
        text = row['combined_text'].lower()
        features = []
        
        for keyword in keywords:
            # Count occurrences (cap at 3)
            count = text.count(keyword.lower())
            features.append(min(count, 3))
            
        keyword_matrix.append(features)
    
    return np.array(keyword_matrix), keywords

# Create keyword matrix
print("Creating keyword matrix...")
keyword_matrix, keyword_list = create_keyword_features(df_clean, get_keyword_arrays()['all_keywords'])
print(f"Matrix shape: {keyword_matrix.shape}")

# Test with Isshin
test_commanders = ['Isshin']

for commander_name in test_commanders:
    matches = df_clean[df_clean['name'].str.contains(commander_name, case=False, na=False)]
    if len(matches) > 0:
        idx = matches.index[0]
        features = keyword_matrix[idx]
        matched_keywords = [keyword_list[i] for i in range(len(keyword_list)) if features[i] > 0]
        
        print(f"\n⚔️ {matches.iloc[0]['name']}:")
        print(f"   Matched keywords ({len(matched_keywords)}): {matched_keywords[:15]}...")
        print(f"   Total feature count: {sum(features)}")
    else:
        print(f"\n❌ {commander_name} not found")

Creating keyword matrix...
Creating features with 941 keywords...
  Processing card 0/25790
  Processing card 5000/25790
  Processing card 10000/25790
  Processing card 15000/25790
  Processing card 20000/25790
  Processing card 25000/25790
Matrix shape: (25790, 941)

⚔️ Isshin, Two Heavens as One:
   Matched keywords (9): ['attack', 'attacking', 'attacking causes', 'creature', 'human', 'legendary', 'legendary creature', 'time', 'you control']...
   Total feature count: 10


In [28]:
# Save the model
import pickle

print("Saving model...")

# Prepare model data
model_data = {
    'df_clean': df_clean,
    'keyword_matrix': keyword_matrix,
    'keywords': keyword_list,
    'model_version': '20250611',
    'total_keywords': len(keyword_list),
    'total_cards': len(df_clean)
}

# Save to file
with open('data/mtg_match_model.pkl', 'wb') as f:
    pickle.dump(model_data, f)

print(f"✅ Model saved!")
print(f"   - {len(df_clean)} cards")
print(f"   - {len(keyword_list)} keywords")
print(f"   - Matrix size: {keyword_matrix.shape}")

# Save keyword list as text file for reference
with open('data/keyword_list.txt', 'w') as f:
    for i, keyword in enumerate(keyword_list):
        f.write(f"{i+1:3d}. {keyword}\n")

print("✅ Keyword list saved to keyword_list.txt")

Saving model...
✅ Model saved!
   - 25790 cards
   - 941 keywords
   - Matrix size: (25790, 941)
✅ Keyword list saved to keyword_list.txt
