In [26]:
# Import all our libraries
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import tensorflow as tf

print("All libraries imported successfully!")

dtype_dict = {
    'loyalty': str,
    'attraction_lights': str,
    'flavor_name': str,
    'printed_name': str,
    'printed_type_line': str,
    'printed_text': str,
}
df_clean = pd.read_csv('data/mtg_cards_clean.csv', dtype=dtype_dict)
print(f"Loaded {len(df_clean)} cards for ML model")

All libraries imported successfully!
Loaded 27623 cards for ML model


In [28]:
# Create TF-IDF vectors from card text
print("Creating TF-IDF vectors...")
tfidf = TfidfVectorizer(
    max_features=5000,      # Top 5000 most important words
    stop_words='english',   # Remove common words like 'the', 'and'
    ngram_range=(1, 2),     # Use single words and word pairs
    min_df=5                # Word must appear in at least 2 cards
)

# Transform card text into numerical vectors
tfidf_matrix = tfidf.fit_transform(df_clean['combined_text'])
print(f"TF-IDF matrix shape: {tfidf_matrix.shape}")
print(f"Each card is now a vector of {tfidf_matrix.shape[1]} numbers!")

# Let's see what words the model found most important
feature_names = tfidf.get_feature_names_out()
print(f"\nSample important words: {feature_names[:100]}")

Creating TF-IDF vectors...
TF-IDF matrix shape: (27623, 5000)
Each card is now a vector of 5000 numbers!

Sample important words: ['10' '10 10' '10 19' '10 damage' '10 life' '12' '13' '13 life' '19' '20'
 'abilities' 'abilities activate' 'abilities activated' 'abilities aren'
 'abilities artifacts' 'abilities creature' 'abilities creatures'
 'abilities enchantment' 'abilities opponents' 'abilities pays'
 'abilities targeted' 'abilities types' 'ability' 'ability artifact'
 'ability choose' 'ability control' 'ability costs' 'ability creature'
 'ability end' 'ability legendary' 'ability opponent' 'ability permanent'
 'ability resolved' 'ability sacrifice' 'ability triggers' 'able'
 'able attack' 'able attacks' 'able block' 'able creature'
 'able enchantment' 'able gain' 'able legendary' 'able sorcery' 'activate'
 'activate abilities' 'activate ability' 'activate control'
 'activate creature' 'activate exhaust' 'activate loyalty'
 'activate opponent' 'activate seven' 'activate sorcery' 'ac

In [30]:
# def find_similar_cards(card_name, top_n=10):
#     """
#     Find cards similar to the input card name
#     """
#     # Find the card in our dataset
#     card_matches = df_clean[df_clean['name'].str.contains(card_name, case=False, na=False)]
    
#     if len(card_matches) == 0:
#         return f"Card '{card_name}' not found in dataset"
    
#     # Get the first match
#     card_idx = card_matches.index[0]
#     card_vector = tfidf_matrix[card_idx]
    
#     # Calculate similarity with all other cards
#     similarities = cosine_similarity(card_vector, tfidf_matrix).flatten()
    
#     # Get top similar cards (excluding the card itself)
#     similar_indices = similarities.argsort()[-top_n-1:-1][::-1]
    
#     # Return results
#     results = []
#     for idx in similar_indices:
#         if idx != card_idx:  # Don't include the original card
#             results.append({
#                 'name': df_clean.iloc[idx]['name'],
#                 'similarity': similarities[idx],
#                 'type': df_clean.iloc[idx]['type_line'],
#                 'text': df_clean.iloc[idx]['oracle_text'][:100] + "..."
#             })
    
#     return results[:top_n]

# # Test the recommendation system!
# print("Testing with MacCready, Lamplight Mayor...")
# recommendations = find_similar_cards("MacCready, Lamplight Mayor")
# for i, card in enumerate(recommendations):
#     print(f"{i+1}. {card['name']} (similarity: {card['similarity']:.3f})")
#     print(f"   Type: {card['type']}")
#     print(f"   Text: {card['text']}")
#     print()

Testing with MacCready, Lamplight Mayor...
1. Gregor, Shrewd Magistrate (similarity: 0.504)
   Type: Legendary Creature — Human Advisor
   Text: Skulk (This creature can't be blocked by creatures with greater power.)
Whenever Gregor, Shrewd Magi...

2. Furtive Homunculus (similarity: 0.462)
   Type: Creature — Homunculus
   Text: Skulk (This creature can't be blocked by creatures with greater power.)...

3. Behind the Scenes (similarity: 0.450)
   Type: Enchantment
   Text: Creatures you control have skulk. (They can't be blocked by creatures with greater power.)
{4}{W}: C...

4. Pale Rider of Trostad (similarity: 0.449)
   Type: Creature — Spirit
   Text: Skulk (This creature can't be blocked by creatures with greater power.)
When this creature enters, d...

5. The Master, Mesmerist (similarity: 0.413)
   Type: Legendary Creature — Time Lord Rogue
   Text: {T}: Target creature an opponent controls with power less than or equal to The Master's power gains ...

6. Aysen Bureaucrats (sim

In [4]:
# # Let's see what the model thinks is "similar" about Atraxa
# atraxa_idx = df_clean[df_clean['name'].str.contains('Atraxa', case=False)].index[0]
# atraxa_text = df_clean.iloc[atraxa_idx]['combined_text']
# print("Atraxa's combined text:")
# print(atraxa_text)
# print("\n" + "="*50)

# # Let's see what Broken Wings text looks like
# broken_wings_idx = df_clean[df_clean['name'].str.contains('Broken Wings', case=False)].index[0]
# broken_wings_text = df_clean.iloc[broken_wings_idx]['combined_text']
# print("Broken Wings combined text:")
# print(broken_wings_text)

Atraxa's combined text:
Destroy target artifact, battle, enchantment, or creature with flying. Sorcery []

Broken Wings combined text:
Destroy target artifact, enchantment, or creature with flying. Instant []


In [31]:
# Let's properly find MacCready, Lamplight Mayor in our dataset
print("Searching for MacCready, Lamplight Mayor in our dataset...")
maccready_matches = df_clean[df_clean['name'].str.contains('MacCready, Lamplight Mayor', case=False, na=False)]
print(f"Found {len(maccready_matches)} matches:")
print(maccready_matches[['name', 'type_line']].head())

if len(maccready_matches) > 0:
    # Get the actual MacCready, Lamplight Mayor
    maccready_row = maccready_matches.iloc[0]
    print(f"\nActual MacCready, Lamplight Mayor data:")
    print(f"Name: {maccready_row['name']}")
    print(f"Type: {maccready_row['type_line']}")
    print(f"Oracle text: {maccready_row['oracle_text']}")
    print(f"Combined text: {maccready_row['combined_text']}")
else:
    print("MacCready, Lamplight Mayor not found! Let's see what commanders we do have...")
    commanders = df_clean[df_clean['type_line'].str.contains('Legendary', case=False, na=False)]
    print(f"Sample commanders in dataset:")
    print(commanders[['name', 'type_line']].head(10))

Searching for MacCready, Lamplight Mayor in our dataset...
Found 1 matches:
                             name                           type_line
13984  MacCready, Lamplight Mayor  Legendary Creature — Human Advisor

Actual MacCready, Lamplight Mayor data:
Name: MacCready, Lamplight Mayor
Type: Legendary Creature — Human Advisor
Oracle text: Whenever a creature you control with power 2 or less attacks, it gains skulk until end of turn. (It can't be blocked by creatures with greater power.)
Whenever a creature with power 4 or greater attacks you, its controller loses 2 life and you gain 2 life.
Combined text: Whenever a creature you control with power 2 or less attacks, it gains skulk until end of turn. (It can't be blocked by creatures with greater power.)
Whenever a creature with power 4 or greater attacks you, its controller loses 2 life and you gain 2 life. Legendary Creature — Human Advisor []


In [32]:
# Let's search for MacCready, Lamplight Mayor in our dataset
print("Searching for MacCready, Lamplight Mayor in our dataset...")
maccready_matches = df_clean[df_clean['name'].str.contains('MacCready, Lamplight Mayor', case=False, na=False)]
print(f"Found {len(arcades_matches)} matches:")

if len(maccready_matches) > 0:
    print(maccready_matches[['name', 'type_line']].head())
    
    # Get the actual MacCready, Lamplight Mayor data
    maccready_row = maccready_matches.iloc[0]
    print(f"\nActual MacCready, Lamplight Mayor data:")
    print(f"Name: {maccready_row['name']}")
    print(f"Type: {maccready_row['type_line']}")
    print(f"Oracle text: {maccready_row['oracle_text']}")
    print(f"Combined text: {maccready_row['combined_text'][:200]}...")
    
    # Test our recommendation function with MacCready, Lamplight Mayor
    print("\n" + "="*50)
    print("Testing recommendations for MacCready, Lamplight Mayor...")
    recommendations = find_similar_cards("MacCready, Lamplight Mayor")
    for i, card in enumerate(recommendations):
        print(f"{i+1}. {card['name']} (similarity: {card['similarity']:.3f})")
        print(f"   Type: {card['type']}")
        print(f"   Text: {card['text']}")
        print()
        
else:
    print("MacCready, Lamplight Mayor not found either! Let's see what cards we have:")
    # Show some sample card names
    sample_cards = df_clean['name'].head(20)
    print("Sample cards in dataset:")
    for card in sample_cards:
        print(f"- {card}")

Searching for MacCready, Lamplight Mayor in our dataset...
Found 1 matches:
                             name                           type_line
13984  MacCready, Lamplight Mayor  Legendary Creature — Human Advisor

Actual MacCready, Lamplight Mayor data:
Name: MacCready, Lamplight Mayor
Type: Legendary Creature — Human Advisor
Oracle text: Whenever a creature you control with power 2 or less attacks, it gains skulk until end of turn. (It can't be blocked by creatures with greater power.)
Whenever a creature with power 4 or greater attacks you, its controller loses 2 life and you gain 2 life.
Combined text: Whenever a creature you control with power 2 or less attacks, it gains skulk until end of turn. (It can't be blocked by creatures with greater power.)
Whenever a creature with power 4 or greater attac...

Testing recommendations for MacCready, Lamplight Mayor...
1. Gregor, Shrewd Magistrate (similarity: 0.504)
   Type: Legendary Creature — Human Advisor
   Text: Skulk (This creatur

In [33]:
# Let's see what words the TF-IDF model thinks are most important
def analyze_tfidf_features():
    # Get feature names and their importance
    feature_names = tfidf.get_feature_names_out()
    
    # For MacCready, Lamplight Mayor, let's see which features have the highest weights
    maccready_idx = df_clean[df_clean['name'].str.contains('MacCready, Lamplight Mayor', case=False)].index[0]
    maccready_vector = tfidf_matrix[maccready_idx].toarray()[0]
    
    # Get top features for MacCready, Lamplight Mayor
    top_indices = maccready_vector.argsort()[-20:][::-1]
    print("Top 20 TF-IDF features for MacCready, Lamplight Mayor:")
    for idx in top_indices:
        if maccready_vector[idx] > 0:
            print(f"{feature_names[idx]}: {maccready_vector[idx]:.3f}")

analyze_tfidf_features()

Top 20 TF-IDF features for MacCready, Lamplight Mayor:
power: 0.323
greater: 0.285
attacks gains: 0.237
skulk: 0.234
creatures greater: 0.231
attacks: 0.217
controller loses: 0.204
human advisor: 0.202
greater power: 0.201
control power: 0.198
life legendary: 0.192
life: 0.186
turn blocked: 0.184
advisor: 0.183
power creature: 0.171
life gain: 0.162
power greater: 0.161
blocked creatures: 0.149
creature power: 0.147
loses life: 0.135


In [36]:
# Let's try a completely different approach focused on MTG mechanics
def create_keyword_features(df):
    """Create features based on important MTG keywords and mechanics"""
    
    # Important MTG mechanics for synergy
    important_keywords = [
        'defender', 'flying', 'vigilance', 'deathtouch', 'lifelink', 'trample',
        'haste', 'first strike', 'double strike', 'hexproof', 'indestructible',
        'proliferate', 'counter', 'artifact', 'enchantment', 'token', 'draw',
        'graveyard', 'exile', 'sacrifice', 'destroy', 'search', 'toughness',
        'power', 'enters', 'whenever', 'combat damage', 'attacks', 'skulk', 'lose', 'gain', 'life'
    ]
    
    # Create binary features for each keyword
    keyword_matrix = []
    
    for _, row in df.iterrows():
        text = row['combined_text'].lower()
        features = []
        
        for keyword in important_keywords:
            # Count how many times this keyword appears
            count = text.count(keyword)
            features.append(min(count, 3))  # Cap at 3 to avoid over-weighting
            
        keyword_matrix.append(features)
    
    return np.array(keyword_matrix), important_keywords

# Create keyword-based features
keyword_matrix, keywords = create_keyword_features(df_clean)
print(f"Keyword matrix shape: {keyword_matrix.shape}")
print(f"Keywords: {keywords}")

# Test with MacCready, Lamplight Mayor
maccready_idx = df_clean[df_clean['name'].str.contains('MacCready, Lamplight Mayor', case=False)].index[0]
maccready_features = keyword_matrix[maccready_idx]
print(f"\nMacCready, Lamplight Mayor keyword features:")
for i, keyword in enumerate(keywords):
    if maccready_features[i] > 0:
        print(f"{keyword}: {maccready_features[i]}")

Keyword matrix shape: (27623, 32)
Keywords: ['defender', 'flying', 'vigilance', 'deathtouch', 'lifelink', 'trample', 'haste', 'first strike', 'double strike', 'hexproof', 'indestructible', 'proliferate', 'counter', 'artifact', 'enchantment', 'token', 'draw', 'graveyard', 'exile', 'sacrifice', 'destroy', 'search', 'toughness', 'power', 'enters', 'whenever', 'combat damage', 'attacks', 'skulk', 'lose', 'gain', 'life']

MacCready, Lamplight Mayor keyword features:
power: 3
whenever: 2
attacks: 2
skulk: 1
lose: 1
gain: 2
life: 2


In [37]:
def find_similar_cards_keywords(card_name, top_n=10):
    """
    Find cards similar to the input card using keyword-based features
    """
    # Find the card in our dataset
    card_matches = df_clean[df_clean['name'].str.contains(card_name, case=False, na=False)]
    
    if len(card_matches) == 0:
        return f"Card '{card_name}' not found in dataset"
    
    # Get the card's keyword features
    card_idx = card_matches.index[0]
    card_vector = keyword_matrix[card_idx].reshape(1, -1)
    
    # Calculate similarity with all other cards using keyword features
    similarities = cosine_similarity(card_vector, keyword_matrix).flatten()
    
    # Get top similar cards (excluding the card itself)
    similar_indices = similarities.argsort()[-top_n-1:-1][::-1]
    
    # Return results
    results = []
    for idx in similar_indices:
        if idx != card_idx:  # Don't include the original card
            results.append({
                'name': df_clean.iloc[idx]['name'],
                'similarity': similarities[idx],
                'type': df_clean.iloc[idx]['type_line'],
                'text': df_clean.iloc[idx]['oracle_text'][:150] + "...",
                'keywords': [keywords[i] for i in range(len(keywords)) if keyword_matrix[idx][i] > 0]
            })
    
    return results[:top_n]

# Test the improved recommendation system!
print("Testing improved recommendations for MacCready, Lamplight Mayor...")
recommendations = find_similar_cards_keywords("MacCready, Lamplight Mayor")

for i, card in enumerate(recommendations):
    print(f"{i+1}. {card['name']} (similarity: {card['similarity']:.3f})")
    print(f"   Type: {card['type']}")
    print(f"   Shared keywords: {card['keywords']}")
    print(f"   Text: {card['text']}")
    print()

Testing improved recommendations for MacCready, Lamplight Mayor...
1. Quilled Charger (similarity: 0.866)
   Type: Creature — Porcupine Mount
   Shared keywords: ['power', 'whenever', 'attacks', 'gain']
   Text: Whenever this creature attacks while saddled, it gets +1/+2 and gains menace until end of turn. (It can't be blocked except by two or more creatures.)...

2. Mycoid Shepherd (similarity: 0.866)
   Type: Creature — Fungus
   Shared keywords: ['power', 'whenever', 'gain', 'life']
   Text: Whenever this creature or another creature you control with power 5 or greater dies, you may gain 5 life....

3. Courageous Goblin (similarity: 0.866)
   Type: Creature — Goblin
   Shared keywords: ['power', 'whenever', 'attacks', 'gain']
   Text: Whenever this creature attacks while you control a creature with power 4 or greater, this creature gets +1/+0 and gains menace until end of turn. (It ...

4. Raubahn, Bull of Ala Mhigo (similarity: 0.866)
   Type: Legendary Creature — Human Warrior
   

In [38]:
# # First, let's check Arcades' color identity
# arcades_row = df_clean[df_clean['name'].str.contains('Arcades, the Strategist', case=False)].iloc[0]
# print(f"Arcades colors: {arcades_row['colors']}")

# def get_color_identity(colors_list):
#     """Convert color list to set for easier comparison"""
#     if pd.isna(colors_list):
#         return set()
#     # Handle string representation of list
#     if isinstance(colors_list, str):
#         import ast
#         try:
#             colors_list = ast.literal_eval(colors_list)
#         except:
#             return set()
#     return set(colors_list) if colors_list else set()

# def is_legal_in_deck(card_colors, commander_colors):
#     """Check if card is legal in commander's color identity"""
#     card_identity = get_color_identity(card_colors)
#     commander_identity = get_color_identity(commander_colors)
    
#     # Card is legal if all its colors are in commander's identity
#     return card_identity.issubset(commander_identity)

# # Test this function
# arcades_colors = arcades_row['colors']
# test_cards = [
#     (['W', 'U'], "Should be legal"),
#     (['W', 'U', 'G'], "Should be legal"), 
#     (['R'], "Should be illegal"),
#     (['B', 'G'], "Should be illegal"),
#     ([], "Colorless - should be legal")
# ]

# print(f"Arcades color identity: {arcades_colors}")
# for colors, description in test_cards:
#     legal = is_legal_in_deck(colors, arcades_colors)
#     print(f"{colors} - {description}: {'✅ Legal' if legal else '❌ Illegal'}")

Arcades colors: ['G', 'U', 'W']
Arcades color identity: ['G', 'U', 'W']


ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()

In [39]:
# First, let's examine how colors are stored in our data
arcades_row = df_clean[df_clean['name'].str.contains('Arcades, the Strategist', case=False)].iloc[0]
print(f"Arcades colors (raw): {repr(arcades_row['colors'])}")
print(f"Type of colors field: {type(arcades_row['colors'])}")

# Let's look at a few more examples
print("\nSample color data from other cards:")
for i in range(5):
    card = df_clean.iloc[i]
    print(f"{card['name']}: {repr(card['colors'])} (type: {type(card['colors'])})")

Arcades colors (raw): "['G', 'U', 'W']"
Type of colors field: <class 'str'>

Sample color data from other cards:
+2 Mace: "['W']" (type: <class 'str'>)
Aarakocra Sneak: "['U']" (type: <class 'str'>)
Aatchik, Emerald Radian: "['B', 'G']" (type: <class 'str'>)
Abaddon the Despoiler: "['B', 'R', 'U']" (type: <class 'str'>)
Abandoned Campground: '[]' (type: <class 'str'>)


In [40]:
import ast

def get_color_identity(colors_string):
    """Convert color string to set for easier comparison"""
    if pd.isna(colors_string) or colors_string == '[]':
        return set()
    
    try:
        # Convert string representation of list to actual list
        colors_list = ast.literal_eval(colors_string)
        return set(colors_list) if colors_list else set()
    except:
        return set()

def is_legal_in_deck(card_colors_string, commander_colors_string):
    """Check if card is legal in commander's color identity"""
    card_identity = get_color_identity(card_colors_string)
    commander_identity = get_color_identity(commander_colors_string)
    
    # Card is legal if all its colors are in commander's identity
    return card_identity.issubset(commander_identity)

# Test this function
arcades_colors = arcades_row['colors']
test_cases = [
    ("['W', 'U']", "White/Blue - Should be legal"),
    ("['W', 'U', 'G']", "Bant - Should be legal"), 
    ("['R']", "Red - Should be illegal"),
    ("['B', 'G']", "Black/Green - Should be illegal"),
    ("[]", "Colorless - Should be legal"),
    ("['W']", "Mono-white - Should be legal")
]

print(f"Arcades color identity: {get_color_identity(arcades_colors)}")
print("\nTesting color legality:")
for colors, description in test_cases:
    legal = is_legal_in_deck(colors, arcades_colors)
    print(f"{description}: {'✅ Legal' if legal else '❌ Illegal'}")

Arcades color identity: {'U', 'G', 'W'}

Testing color legality:
White/Blue - Should be legal: ✅ Legal
Bant - Should be legal: ✅ Legal
Red - Should be illegal: ❌ Illegal
Black/Green - Should be illegal: ❌ Illegal
Colorless - Should be legal: ✅ Legal
Mono-white - Should be legal: ✅ Legal


In [42]:
def find_similar_cards_with_colors(card_name, top_n=10):
    """
    Find cards similar to the input card using keyword-based features
    AND filter by color identity legality
    """
    # Find the card in our dataset
    card_matches = df_clean[df_clean['name'].str.contains(card_name, case=False, na=False)]
    
    if len(card_matches) == 0:
        return f"Card '{card_name}' not found in dataset"
    
    # Get the commander's info
    commander_row = card_matches.iloc[0]
    commander_idx = card_matches.index[0]
    commander_colors = commander_row['colors']
    commander_vector = keyword_matrix[commander_idx].reshape(1, -1)
    
    print(f"Commander: {commander_row['name']}")
    print(f"Color Identity: {get_color_identity(commander_colors)}")
    print("-" * 50)
    
    # Calculate similarity with all other cards
    similarities = cosine_similarity(commander_vector, keyword_matrix).flatten()
    
    # Get ALL cards sorted by similarity (we'll filter as we go)
    all_indices = similarities.argsort()[::-1]
    
    # Filter for color-legal cards and collect results
    results = []
    for idx in all_indices:
        if idx == commander_idx:  # Skip the commander itself
            continue
            
        card_row = df_clean.iloc[idx]
        card_colors = card_row['colors']
        
        # Check if card is legal in commander's color identity
        if is_legal_in_deck(card_colors, commander_colors):
            results.append({
                'name': card_row['name'],
                'similarity': similarities[idx],
                'type': card_row['type_line'],
                'colors': get_color_identity(card_colors),
                'text': card_row['oracle_text'][:500] + "...",
                'keywords': [keywords[i] for i in range(len(keywords)) if keyword_matrix[idx][i] > 0]
            })
            
            # Stop when we have enough recommendations
            if len(results) >= top_n:
                break
    
    return results

# Test the color-filtered recommendations!
print("Testing color-filtered recommendations for MacCready, Lamplight Mayor...")
recommendations = find_similar_cards_with_colors("MacCready, Lamplight Mayor", 25)

for i, card in enumerate(recommendations):
    print(f"{i+1}. {card['name']} (similarity: {card['similarity']:.3f})")
    print(f"   Colors: {card['colors']}")
    print(f"   Type: {card['type']}")
    print(f"   Shared keywords: {card['keywords']}")
    print(f"   Text: {card['text']}")
    print()

Testing color-filtered recommendations for MacCready, Lamplight Mayor...
Commander: MacCready, Lamplight Mayor
Color Identity: {'W', 'B'}
--------------------------------------------------
1. Unhinged Beast Hunt (similarity: 0.864)
   Colors: set()
   Type: Stickers
   Shared keywords: ['toughness', 'power', 'whenever', 'attacks', 'gain', 'life']
   Text: {TK}{TK} — {T}: You gain 1 life.
{TK}{TK}{TK}{TK} — Whenever this creature attacks, tap each creature an opponent controls with the same power and/or same toughness as this creature.
{TK}{TK} — 4/1
{TK}{TK}{TK} — 2/6...

2. Bounding Felidar (similarity: 0.864)
   Colors: {'W'}
   Type: Creature — Cat Beast Mount
   Shared keywords: ['counter', 'power', 'whenever', 'attacks', 'gain', 'life']
   Text: Whenever this creature attacks while saddled, put a +1/+1 counter on each other creature you control. You gain 1 life for each of those creatures.
Saddle 2 (Tap any number of other creatures you control with total power 2 or more: This Mou

In [43]:
# Let's test our system with a few different commander strategies
test_commanders = [
    "MacCready, Lamplight Mayor",
    "Gisa, the Hellraiser",
    "Go-Shintai of Life's Origin"
]

for commander in test_commanders:
    print(f"🔍 Testing recommendations for '{commander}'...")
    try:
        recs = find_similar_cards_with_colors(commander, 5)
        if isinstance(recs, str):  # Error message
            print(f"   {recs}")
        else:
            print(f"   Found {len(recs)} legal recommendations")
            if len(recs) > 0:
                print(f"   Top recommendation: {recs[0]['name']} (similarity: {recs[0]['similarity']:.3f})")
    except Exception as e:
        print(f"   Error: {e}")
    print()

🔍 Testing recommendations for 'MacCready, Lamplight Mayor'...
Commander: MacCready, Lamplight Mayor
Color Identity: {'W', 'B'}
--------------------------------------------------
   Found 5 legal recommendations
   Top recommendation: Unhinged Beast Hunt (similarity: 0.864)

🔍 Testing recommendations for 'Gisa, the Hellraiser'...
Commander: Gisa, the Hellraiser
Color Identity: {'B'}
--------------------------------------------------
   Found 5 legal recommendations
   Top recommendation: Tormod, the Desecrator (similarity: 0.866)

🔍 Testing recommendations for 'Go-Shintai of Life's Origin'...
Commander: Go-Shintai of Life's Origin
Color Identity: {'G'}
--------------------------------------------------
   Found 5 legal recommendations
   Top recommendation: Squirrel Sanctuary (similarity: 0.939)



In [44]:
def evaluate_recommendations(commander_name, expected_themes=None):
    """
    Evaluate the quality of recommendations for a given commander
    """
    recs = find_similar_cards_with_colors(commander_name, 10)
    
    if isinstance(recs, str):
        return f"Error: {recs}"
    
    print(f"📊 Evaluation for {commander_name}:")
    print(f"Number of recommendations: {len(recs)}")
    
    if len(recs) > 0:
        avg_similarity = sum(r['similarity'] for r in recs) / len(recs)
        print(f"Average similarity: {avg_similarity:.3f}")
        
        # Check diversity of card types
        card_types = set()
        for rec in recs:
            card_types.add(rec['type'].split(' — ')[0].split()[0])  # Get first word of type
        print(f"Card type diversity: {len(card_types)} different types")
        print(f"Types found: {', '.join(card_types)}")
        
        if expected_themes:
            theme_matches = 0
            for rec in recs:
                for theme in expected_themes:
                    if theme.lower() in ' '.join(rec['keywords']).lower():
                        theme_matches += 1
                        break
            print(f"Theme relevance: {theme_matches}/{len(recs)} cards match expected themes")
    
    return recs

# Test with Arcades and expected themes
arcades_themes = ['defender', 'toughness', 'draw']
evaluate_recommendations("Arcades, the Strategist", arcades_themes)

Commander: Arcades, the Strategist
Color Identity: {'U', 'G', 'W'}
--------------------------------------------------
📊 Evaluation for Arcades, the Strategist:
Number of recommendations: 10
Average similarity: 0.777
Card type diversity: 1 different types
Types found: Creature
Theme relevance: 10/10 cards match expected themes


[{'name': 'Corrupted Shapeshifter',
  'similarity': 0.8340576562282991,
  'type': 'Creature — Eldrazi Shapeshifter',
  'colors': set(),
  'text': 'Devoid (This card has no color.)\nAs this creature enters, it becomes your choice of a 3/3 creature with flying, a 2/5 creature with vigilance, or a 0/12 creature with defender....',
  'keywords': ['defender', 'flying', 'vigilance', 'enters']},
 {'name': 'Orator of Ojutai',
  'similarity': 0.7912565680749445,
  'type': 'Creature — Bird Monk',
  'colors': {'W'},
  'text': 'As an additional cost to cast this spell, you may reveal a Dragon card from your hand.\nDefender, flying\nWhen this creature enters, if you revealed a Dragon card or controlled a Dragon as you cast this spell, draw a card....',
  'keywords': ['defender', 'flying', 'draw', 'enters']},
 {'name': 'Flumph',
  'similarity': 0.7912565680749444,
  'type': 'Creature — Jellyfish',
  'colors': {'W'},
  'text': 'Defender, flying\nWhenever this creature is dealt damage, you and target 

In [1]:
# Load our data and analyze keyword coverage
import pandas as pd
import numpy as np
from collections import Counter
import re

# Load the data
df_clean = pd.read_csv('data/mtg_cards_clean.csv', low_memory=False)

# Current keyword list
current_keywords = [
        'defender', 'flying', 'vigilance', 'deathtouch', 'lifelink', 'trample',
        'haste', 'first strike', 'double strike', 'hexproof', 'indestructible',
        'proliferate', 'counter', 'artifact', 'enchantment', 'token', 'draw',
        'graveyard', 'exile', 'sacrifice', 'destroy', 'search', 'toughness',
        'power', 'enters', 'whenever', 'combat damage', 'attacks', 'skulk', 'lose', 'gain', 'life'
    ]

print(f"Current keywords: {len(current_keywords)}")
print("Current list:", current_keywords)

FileNotFoundError: [Errno 2] No such file or directory: 'data/mtg_cards_clean.csv'

In [2]:
import os

# Check current working directory
print("Current directory:", os.getcwd())

# List files in current directory
print("\nFiles in current directory:")
for item in os.listdir('.'):
    print(f"  {item}")

# Check if data folder exists
if os.path.exists('data'):
    print("\nFiles in data folder:")
    for item in os.listdir('data'):
        print(f"  data/{item}")
else:
    print("\nNo 'data' folder found in current directory")

Current directory: /Users/ryanglover/mtg-commander-recommender

Files in current directory:
  01_data_exploration.ipynb
  .DS_Store
  requirements.txt
  mtg-env
  02_ml_model_building.ipynb
  README.md
  .gitignore
  .gitattributes
  app.py
  templates
  03_web_interface.ipynb
  .ipynb_checkpoints
  .git
  data
  notebooks
  mtg-env-311
  src

Files in data folder:
  data/.DS_Store
  data/mtg_model.pkl
  data/mtg_cards_raw.csv


In [4]:
import pandas as pd

# Load raw data and examine all columns
df = pd.read_csv('data/mtg_cards_raw.csv', low_memory=False)

print(f"Dataset has {len(df.columns)} total columns:")
print("="*60)

# Show all columns with sample data
for i, col in enumerate(df.columns):
    sample_values = df[col].dropna().head(3).tolist()
    print(f"{i+1:2d}. {col:25} | Sample: {sample_values}")

print("\n" + "="*60)
print(f"Total cards: {len(df)}")

Dataset has 82 total columns:
 1. object                    | Sample: ['card', 'card', 'card']
 2. id                        | Sample: ['e882c9f9-bf30-46b6-bedc-379d2c80e5cb', '2a83882c-3e03-4e85-aaac-97fa1d08a772', 'fbdaa29b-85ff-4a06-b27e-fcdbdfd4a3fe']
 3. oracle_id                 | Sample: ['629fe1be-272d-465f-b9b1-2ce177410f13', 'c1882cb4-f69e-441e-8871-743ed636cad0', '405c4a7b-dbb3-48a1-9205-7dbe6e2ad363']
 4. multiverse_ids            | Sample: ['[]', '[562937]', '[690624]']
 5. mtgo_id                   | Sample: [91504.0, 100106.0, 136497.0]
 6. arena_id                  | Sample: [77106.0, 94989.0, 92355.0]
 7. tcgplayer_id              | Sample: [243201.0, 272557.0, 614328.0]
 8. cardmarket_id             | Sample: [571299.0, 658454.0, 807193.0]
 9. name                      | Sample: ['+2 Mace', 'Aarakocra Sneak', 'Aatchik, Emerald Radian']
10. lang                      | Sample: ['en', 'en', 'en']
11. released_at               | Sample: ['2021-07-23', '2022-06-10', '2025-

In [5]:
# Define enhanced column set for better recommendations
enhanced_columns = [
    # Core essentials (we already use these)
    'name', 'oracle_text', 'type_line', 'colors', 'color_identity', 'keywords',
    'mana_cost', 'cmc', 'power', 'toughness',
    
    # High-value additions for recommendations
    'rarity',                # Synergy by rarity (rare vs common themes)
    'set_name',              # Era/theme synergies (same set often has themes)
    'edhrec_rank',           # Popularity-based filtering
    'prices',                # Better price handling (usd, eur, etc.)
    'legalities',            # Format-specific filtering
    'produced_mana',         # Mana fixing synergies
    'loyalty',               # Planeswalker synergies
    'card_faces',            # Transform/flip card mechanics
    'watermark',             # Guild/faction synergies
    'flavor_text',           # Thematic flavor matching
    'artist',                # Aesthetic preferences
    'all_parts',             # Related cards (tokens, etc.)
    'frame_effects',         # Special mechanics (legendary, etc.)
    
    # Web interface improvements
    'image_uris',            # Card images for web display
    'scryfall_uri',          # Link to card details
    'related_uris'           # EDHREC links
]

# Filter to columns that exist
available_enhanced = [col for col in enhanced_columns if col in df.columns]
print(f"Enhanced dataset will have {len(available_enhanced)} columns:")
for i, col in enumerate(available_enhanced, 1):
    print(f"{i:2d}. {col}")

# Create enhanced dataset
df_enhanced = df[available_enhanced].copy()
df_clean = df_enhanced[df_enhanced['oracle_text'].notna()].copy()

# Add combined text
df_clean['combined_text'] = (
    df_clean['oracle_text'].fillna('') + ' ' + 
    df_clean['type_line'].fillna('') + ' ' +
    df_clean['keywords'].astype(str)
)

print(f"\nEnhanced clean dataset: {len(df_clean)} cards")

# Save enhanced dataset
df_clean.to_csv('data/mtg_cards_clean.csv', index=False)
print("✅ Saved enhanced dataset!")

Enhanced dataset will have 26 columns:
 1. name
 2. oracle_text
 3. type_line
 4. colors
 5. color_identity
 6. keywords
 7. mana_cost
 8. cmc
 9. power
10. toughness
11. rarity
12. set_name
13. edhrec_rank
14. prices
15. legalities
16. produced_mana
17. loyalty
18. card_faces
19. watermark
20. flavor_text
21. artist
22. all_parts
23. frame_effects
24. image_uris
25. scryfall_uri
26. related_uris

Enhanced clean dataset: 27290 cards
✅ Saved enhanced dataset!


In [15]:
import pandas as pd
import numpy as np
from collections import Counter
import re

# Load the enhanced clean dataset
df_clean = pd.read_csv('data/mtg_cards_clean.csv', low_memory=False)
print(f"Loaded enhanced dataset: {len(df_clean)} cards")

# Current keyword list
current_keywords = [
    'defender', 'flying', 'vigilance', 'deathtouch', 'lifelink', 'trample',
    'haste', 'first strike', 'double strike', 'hexproof', 'indestructible',
    'proliferate', 'counter', 'artifact', 'enchantment', 'token', 'draw',
    'graveyard', 'exile', 'sacrifice', 'destroy', 'search', 'toughness',
    'power', 'enters', 'whenever', 'combat damage'
]

def extract_missing_mtg_words(df, current_keywords, min_frequency=25):
    """Find important MTG words we're missing"""
    
    all_text = ' '.join(df['oracle_text'].fillna('').str.lower())
    
    # Extract words
    words = re.findall(r'\b[a-z]+\b', all_text)
    word_counts = Counter(words)
    
    # Common English words to exclude
    common_english = {
        'the', 'a', 'an', 'and', 'or', 'of', 'to', 'in', 'on', 'at', 'by', 'for', 
        'with', 'from', 'up', 'about', 'into', 'through', 'during', 'before', 
        'after', 'above', 'below', 'between', 'among', 'if', 'when', 'where', 
        'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 
        'other', 'some', 'such', 'only', 'own', 'same', 'so', 'than', 'too', 
        'very', 'can', 'will', 'just', 'should', 'now', 'get', 'has', 'had', 
        'have', 'he', 'she', 'it', 'they', 'them', 'their', 'what', 'which', 
        'who', 'whom', 'this', 'that', 'these', 'those', 'am', 'is', 'are', 
        'was', 'were', 'be', 'been', 'being', 'do', 'does', 'did', 'doing', 
        'would', 'could', 'should', 'may', 'might', 'must', 'shall', 'one', 
        'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine', 'ten',
        'put', 'get', 'turn', 'end', 'beginning', 'your', 'you', 'its', 'until',
        'choose', 'target', 'then', 'cost', 'less', 'add', 'pay', 'spend'
    }
    
    # Find meaningful MTG words
    mtg_words = []
    for word, count in word_counts.most_common(200):
        if (count >= min_frequency and 
            word not in common_english and 
            len(word) > 2 and
            word not in current_keywords):
            mtg_words.append((word, count))
    
    return mtg_words

# Find missing keywords
missing_words = extract_missing_mtg_words(df_clean, current_keywords)

print("Top 500 missing MTG keywords/mechanics:")
print("="*50)
for i, (word, count) in enumerate(missing_words[:500]):
    print(f"{i+1:2d}. {word:18} (appears {count:4d} times)")

Loaded enhanced dataset: 27290 cards
Top 500 missing MTG keywords/mechanics:
 1. creature           (appears 33052 times)
 2. card               (appears 12353 times)
 3. control            (appears 9829 times)
 4. damage             (appears 5785 times)
 5. cast               (appears 5605 times)
 6. player             (appears 5346 times)
 7. spell              (appears 5159 times)
 8. creatures          (appears 4939 times)
 9. cards              (appears 4905 times)
10. hand               (appears 4011 times)
11. deals              (appears 3850 times)
12. gets               (appears 3816 times)
13. library            (appears 3711 times)
14. life               (appears 3516 times)
15. mana               (appears 3456 times)
16. create             (appears 3191 times)
17. land               (appears 3151 times)
18. opponent           (appears 3113 times)
19. battlefield        (appears 2817 times)
20. gain               (appears 2399 times)
21. counters           (appears 2370 time

In [12]:
# Let's categorize the missing words to build a comprehensive keyword list
def categorize_mtg_keywords(missing_words):
    """Categorize MTG keywords by type"""
    
    # Define categories
    categories = {
        'combat_abilities': [],
        'card_types': [],
        'mechanics': [],
        'tribal': [],
        'mana_land': [],
        'counters_tokens': [],
        'graveyard_exile': [],
        'spells_abilities': [],
        'other_important': []
    }
    
    # Categorization patterns (you can refine these based on the actual missing words)
    combat_patterns = ['reach', 'menace', 'protection', 'prowess', 'crew', 'flash']
    type_patterns = ['creature', 'instant', 'sorcery', 'planeswalker', 'legendary', 'basic', 'land']
    mechanic_patterns = ['cascade', 'convoke', 'delve', 'emerge', 'flashback', 'madness', 'scry', 'surveil']
    tribal_patterns = ['human', 'goblin', 'elf', 'dragon', 'beast', 'spirit', 'zombie', 'vampire']
    mana_patterns = ['mana', 'land', 'basic', 'nonbasic', 'colorless']
    counter_patterns = ['counter', 'token', 'create', 'remove']
    graveyard_patterns = ['graveyard', 'exile', 'return', 'mill']
    spell_patterns = ['spell', 'ability', 'activated ability', 'triggered ability', 'cast']
    
    # This is a simplified categorization - you'd refine based on actual results
    print("Based on the missing words above, we should add keywords like:")
    print("\n🗡️  COMBAT ABILITIES:")
    print("   reach, menace, protection, prowess, crew, flash, etc.")
    
    print("\n📜 CARD TYPES:")  
    print("   creature, instant, sorcery, planeswalker, legendary, etc.")
    
    print("\n⚙️  MECHANICS:")
    print("   cascade, convoke, delve, scry, surveil, flashback, etc.")
    
    print("\n🏘️  TRIBAL:")
    print("   human, goblin, elf, dragon, spirit, zombie, etc.")
    
    print("\n💎 MANA/LANDS:")
    print("   mana, land, basic, colorless, etc.")

categorize_mtg_keywords(missing_words)

Based on the missing words above, we should add keywords like:

🗡️  COMBAT ABILITIES:
   reach, menace, protection, prowess, crew, flash, etc.

📜 CARD TYPES:
   creature, instant, sorcery, planeswalker, legendary, etc.

⚙️  MECHANICS:
   cascade, convoke, delve, scry, surveil, flashback, etc.

🏘️  TRIBAL:
   human, goblin, elf, dragon, spirit, zombie, etc.

💎 MANA/LANDS:
   mana, land, basic, colorless, etc.


In [16]:
# Current keywords
current_keywords = [
    'defender', 'flying', 'vigilance', 'deathtouch', 'lifelink', 'trample',
    'haste', 'first strike', 'double strike', 'hexproof', 'indestructible',
    'proliferate', 'counter', 'artifact', 'enchantment', 'token', 'draw',
    'graveyard', 'exile', 'sacrifice', 'destroy', 'search', 'toughness',
    'power', 'enters', 'whenever', 'combat damage'
]

# Build expanded keyword list based on our analysis
expanded_keywords = current_keywords + [
    # Core MTG concepts (high frequency, very important)
    'creature', 'creatures', 'spell', 'spells', 'card', 'cards',
    'mana', 'land', 'lands', 'battlefield', 'hand', 'library',
    'damage', 'life', 'combat', 'attacks', 'attack', 'attacking',
    
    # Important abilities we missed
    'flash', 'menace', 'reach', 'equip', 'equipped',
    
    # Key mechanics and actions
    'cast', 'create', 'return', 'reveal', 'discard', 'shuffle',
    'tap', 'tapped', 'untap', 'control', 'controls', 'dies',
    'gains', 'loses', 'deals', 'dealt', 'blocked', 'copy',
    
    # Card types and zones
    'instant', 'sorcery', 'planeswalker', 'aura', 'permanent', 'permanents',
    'exiled', 'counters', 'tokens', 'enchanted',
    
    # Important game concepts
    'upkeep', 'step', 'owner', 'controller', 'opponent', 'opponents',
    'activate', 'ability', 'abilities', 'color', 'white', 'black', 'green', 'red',
    'nonland', 'choice', 'remove', 'unless'
]

print(f"Expanded keyword list: {len(expanded_keywords)} keywords")
print(f"Added {len(expanded_keywords) - len(current_keywords)} new keywords")

# Remove duplicates and sort
expanded_keywords = sorted(list(set(expanded_keywords)))
print(f"Final keyword count (deduplicated): {len(expanded_keywords)}")

# Show the new additions
new_keywords = [k for k in expanded_keywords if k not in current_keywords]
print(f"\n🆕 New keywords added ({len(new_keywords)}):")
for i, keyword in enumerate(new_keywords):
    print(f"{i+1:2d}. {keyword}")

Expanded keyword list: 96 keywords
Added 69 new keywords
Final keyword count (deduplicated): 96

🆕 New keywords added (69):
 1. abilities
 2. ability
 3. activate
 4. attack
 5. attacking
 6. attacks
 7. aura
 8. battlefield
 9. black
10. blocked
11. card
12. cards
13. cast
14. choice
15. color
16. combat
17. control
18. controller
19. controls
20. copy
21. counters
22. create
23. creature
24. creatures
25. damage
26. deals
27. dealt
28. dies
29. discard
30. enchanted
31. equip
32. equipped
33. exiled
34. flash
35. gains
36. green
37. hand
38. instant
39. land
40. lands
41. library
42. life
43. loses
44. mana
45. menace
46. nonland
47. opponent
48. opponents
49. owner
50. permanent
51. permanents
52. planeswalker
53. reach
54. red
55. remove
56. return
57. reveal
58. shuffle
59. sorcery
60. spell
61. spells
62. step
63. tap
64. tapped
65. tokens
66. unless
67. untap
68. upkeep
69. white


In [17]:
import pandas as pd
import numpy as np
from collections import Counter
import re
from sklearn.feature_extraction.text import CountVectorizer

# Load our data
df_clean = pd.read_csv('data/mtg_cards_clean.csv', low_memory=False)

def extract_mtg_ngrams(df, min_frequency=20):
    """Extract 1-3 word MTG phrases that appear frequently"""
    
    # Combine all oracle text
    all_text = ' '.join(df['oracle_text'].fillna(''))
    
    # Clean text for better phrase extraction
    # Remove mana symbols and some punctuation
    cleaned_text = re.sub(r'\{[^}]*\}', '', all_text)  # Remove {1}{W} style mana
    cleaned_text = re.sub(r'[(),]', ' ', cleaned_text)  # Replace punctuation with spaces
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text)    # Normalize whitespace
    
    # Extract 1-grams, 2-grams, and 3-grams
    phrases = {}
    
    for n in [1, 2, 3]:
        print(f"Extracting {n}-grams...")
        
        vectorizer = CountVectorizer(
            ngram_range=(n, n),
            lowercase=True,
            stop_words=None,  # We'll filter manually for MTG context
            min_df=min_frequency,
            max_features=200  # Top 200 for each n-gram size
        )
        
        try:
            X = vectorizer.fit_transform([cleaned_text])
            feature_names = vectorizer.get_feature_names_out()
            frequencies = X.toarray()[0]
            
            for phrase, freq in zip(feature_names, frequencies):
                if freq >= min_frequency:
                    phrases[phrase] = freq
        except:
            print(f"  Could not extract {n}-grams")
    
    return phrases

def filter_meaningful_phrases(phrases):
    """Filter out generic English and keep meaningful MTG phrases"""
    
    # Words/phrases to exclude (too generic)
    exclude_patterns = [
        r'^(the|and|or|of|to|in|on|at|by|for|with|from|up|about|into|through)$',
        r'^(a|an|this|that|these|those|it|its|you|your|each|all|any)$',
        r'^(when|where|why|how|if|then|else|until|while|during|before|after)$',
        r'^(may|can|could|would|should|must|will|shall)$',
        r'^(one|two|three|four|five|six|seven|eight|nine|ten)$',
        r'^(put|get|add|has|have|had|been|being|do|does|did)$'
    ]
    
    # MTG-specific phrases we definitely want to keep
    keep_patterns = [
        r'.*enters.*battlefield.*',
        r'.*combat.*damage.*',
        r'.*first.*strike.*',
        r'.*double.*strike.*',
        r'.*flying.*',
        r'.*creature.*token.*',
        r'.*whenever.*',
        r'.*end.*turn.*',
        r'.*beginning.*upkeep.*',
        r'.*sacrifice.*',
        r'.*destroy.*target.*',
        r'.*draw.*card.*',
        r'.*search.*library.*',
        r'.*return.*battlefield.*',
        r'.*graveyard.*',
        r'.*exile.*',
        r'.*counter.*spell.*',
        r'.*mana.*cost.*',
        r'.*plus.*plus.*',
        r'.*artifact.*creature.*',
        r'.*legendary.*creature.*'
    ]
    
    filtered_phrases = {}
    
    for phrase, freq in phrases.items():
        # Skip if matches exclude patterns
        excluded = any(re.match(pattern, phrase) for pattern in exclude_patterns)
        if excluded:
            continue
            
        # Keep if matches MTG patterns or has good MTG keywords
        mtg_keywords = ['creature', 'spell', 'mana', 'damage', 'combat', 'token', 
                       'artifact', 'enchantment', 'battlefield', 'graveyard', 'exile',
                       'flying', 'trample', 'vigilance', 'deathtouch', 'lifelink',
                       'counter', 'destroy', 'sacrifice', 'draw', 'search', 'return']
        
        is_mtg_relevant = (
            any(re.match(pattern, phrase) for pattern in keep_patterns) or
            any(keyword in phrase for keyword in mtg_keywords) or
            len(phrase.split()) > 1  # Multi-word phrases are usually more specific
        )
        
        if is_mtg_relevant and len(phrase) > 2:  # Skip very short phrases
            filtered_phrases[phrase] = freq
    
    return filtered_phrases

# Extract all n-grams
print("Extracting MTG phrases (1-3 words)...")
all_phrases = extract_mtg_ngrams(df_clean)
print(f"Found {len(all_phrases)} total phrases")

# Filter for meaningful MTG phrases
meaningful_phrases = filter_meaningful_phrases(all_phrases)
print(f"Filtered to {len(meaningful_phrases)} meaningful MTG phrases")

# Show top phrases
sorted_phrases = sorted(meaningful_phrases.items(), key=lambda x: x[1], reverse=True)
print(f"\nTop 50 MTG phrases:")
print("="*60)
for i, (phrase, freq) in enumerate(sorted_phrases[:50]):
    print(f"{i+1:2d}. {phrase:35} (appears {freq:4d} times)")

Extracting MTG phrases (1-3 words)...
Extracting 1-grams...
  Could not extract 1-grams
Extracting 2-grams...
  Could not extract 2-grams
Extracting 3-grams...
  Could not extract 3-grams
Found 0 total phrases
Filtered to 0 meaningful MTG phrases

Top 50 MTG phrases:


In [19]:
import pandas as pd
import numpy as np
from collections import Counter
import re
from sklearn.feature_extraction.text import CountVectorizer

# Load our data
df_clean = pd.read_csv('data/mtg_cards_clean.csv', low_memory=False)

def extract_mtg_ngrams(df, min_frequency=20):
    """Extract 1-3 word MTG phrases that appear frequently"""
    
    # Prepare documents (each card's oracle text as separate document)
    documents = []
    for oracle_text in df['oracle_text'].fillna(''):
        # Clean text for better phrase extraction
        cleaned = re.sub(r'\{[^}]*\}', '', oracle_text)  # Remove mana symbols
        cleaned = re.sub(r'[(),]', ' ', cleaned)         # Replace punctuation
        cleaned = re.sub(r'\s+', ' ', cleaned.strip())   # Normalize whitespace
        if cleaned:  # Only add non-empty documents
            documents.append(cleaned)
    
    print(f"Processing {len(documents)} card texts...")
    
    all_phrases = {}
    
    # Extract n-grams of different sizes
    for n in [1, 2, 3]:
        print(f"Extracting {n}-grams...")
        
        try:
            vectorizer = CountVectorizer(
                ngram_range=(n, n),
                lowercase=True,
                min_df=min_frequency,      # Must appear in at least this many cards
                max_df=0.8,                # But not in more than 80% of cards (too common)
                max_features=300           # Top 300 for each n-gram size
            )
            
            X = vectorizer.fit_transform(documents)
            feature_names = vectorizer.get_feature_names_out()
            
            # Sum frequencies across all documents
            frequencies = X.toarray().sum(axis=0)
            
            for phrase, freq in zip(feature_names, frequencies):
                if freq >= min_frequency:
                    all_phrases[phrase] = freq
                    
            print(f"  Found {len([f for f in frequencies if f >= min_frequency])} {n}-grams")
            
        except Exception as e:
            print(f"  Error extracting {n}-grams: {e}")
    
    return all_phrases

def filter_meaningful_phrases(phrases):
    """Filter for meaningful MTG phrases"""
    
    # Generic words/phrases to exclude
    exclude_words = {
        'the', 'and', 'or', 'of', 'to', 'in', 'on', 'at', 'by', 'for', 'with', 
        'from', 'up', 'about', 'into', 'through', 'a', 'an', 'this', 'that', 
        'these', 'those', 'it', 'its', 'you', 'your', 'each', 'all', 'any',
        'when', 'where', 'why', 'how', 'if', 'then', 'else', 'until', 'while',
        'may', 'can', 'could', 'would', 'should', 'must', 'will', 'shall',
        'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine', 'ten',
        'put', 'get', 'add', 'has', 'have', 'had', 'been', 'being', 'do', 'does', 'did'
    }
    
    # MTG-relevant keywords that make phrases valuable
    mtg_keywords = {
        'creature', 'spell', 'mana', 'damage', 'combat', 'token', 'artifact', 
        'enchantment', 'battlefield', 'graveyard', 'exile', 'flying', 'trample', 
        'vigilance', 'deathtouch', 'lifelink', 'counter', 'destroy', 'sacrifice', 
        'draw', 'search', 'return', 'enters', 'dies', 'attacks', 'blocks', 'tap',
        'untap', 'equipped', 'legendary', 'planeswalker', 'instant', 'sorcery',
        'flash', 'reach', 'menace', 'indestructible', 'hexproof', 'proliferate'
    }
    
    filtered = {}
    
    for phrase, freq in phrases.items():
        words = phrase.split()
        
        # Skip if phrase is just generic words
        if all(word in exclude_words for word in words):
            continue
        
        # Keep if contains MTG keywords or is multi-word (usually more specific)
        has_mtg_content = (
            any(word in mtg_keywords for word in words) or
            len(words) > 1 or
            any(word.endswith('ing') for word in words)  # Actions like "attacking"
        )
        
        if has_mtg_content and len(phrase) > 2:
            filtered[phrase] = freq
    
    return filtered

# Extract phrases
print("Extracting MTG phrases (1-3 words)...")
all_phrases = extract_mtg_ngrams(df_clean, min_frequency=15)  # Lower threshold
print(f"Found {len(all_phrases)} total phrases")

# Filter for meaningful ones
meaningful_phrases = filter_meaningful_phrases(all_phrases)
print(f"Filtered to {len(meaningful_phrases)} meaningful MTG phrases")

# Show top phrases
if meaningful_phrases:
    sorted_phrases = sorted(meaningful_phrases.items(), key=lambda x: x[1], reverse=True)
    print(f"\nTop 50 MTG phrases:")
    print("="*70)
    for i, (phrase, freq) in enumerate(sorted_phrases[:50]):
        print(f"{i+1:2d}. {phrase:40} (appears {freq:4d} times)")
else:
    print("No meaningful phrases found - may need to adjust parameters")

Extracting MTG phrases (1-3 words)...
Processing 27290 card texts...
Extracting 1-grams...
  Found 300 1-grams
Extracting 2-grams...
  Found 300 2-grams
Extracting 3-grams...
  Found 300 3-grams
Found 900 total phrases
Filtered to 581 meaningful MTG phrases

Top 50 MTG phrases:
 1. creature                                 (appears 33052 times)
 2. this creature                            (appears 12952 times)
 3. you control                              (appears 7934 times)
 4. enters                                   (appears 6445 times)
 5. damage                                   (appears 5785 times)
 6. end of                                   (appears 5489 times)
 7. until end                                (appears 5284 times)
 8. until end of                             (appears 5284 times)
 9. of turn                                  (appears 5277 times)
10. end of turn                              (appears 5276 times)
11. target creature                          (appears 5275 

In [24]:
# Let's create a much more comprehensive keyword list combining what we found + manual additions
def create_comprehensive_keyword_list():
    """Create comprehensive 1-3 word MTG keyword list"""
    
    # Start with our discovered high-frequency phrases
    discovered_phrases = [
        'creature', 'this creature', 'you control', 'enters', 'damage', 'end of turn',
        'until end of turn', 'target creature', 'spell', 'counter', 'flying', 'graveyard',
        'artifact', 'damage to', 'sacrifice', 'token', 'when this creature', 'draw',
        'mana', 'creature enters', 'this creature enters', 'exile', 'your graveyard',
        'counter on', 'creature gets', 'your library', 'creature you', 'battlefield',
        'the battlefield', 'deals damage', 'beginning', 'beginning of', 'at the beginning',
        'draw card', 'your hand', 'this turn', 'return', 'creature you control',
        'deals damage to', 'card from', 'this spell', 'this card', 'cast this', 'combat',
        'triggered ability', 'activated ability'
    ]
    
    # Add essential combat abilities
    combat_abilities = [
        'flying', 'trample', 'vigilance', 'deathtouch', 'lifelink', 'haste',
        'first strike', 'double strike', 'hexproof', 'indestructible', 'defender',
        'reach', 'menace', 'protection', 'prowess', 'flash', 'crew', 'skulk', 'shadow'
    ]
    
    # Add important multi-word combat phrases
    combat_phrases = [
        'first strike', 'double strike', 'combat damage', 'attacking creature',
        'blocking creature', 'deals combat damage', 'when attacks', 'when blocks',
        'whenever attacks', 'whenever blocks', 'attack alone', 'can attack',
        'must attack', 'cannot attack', 'can block', 'cannot block',
        'creature attacking'
    ]
    
    # Add triggered ability patterns
    triggered_abilities = [
        'whenever', 'when enters', 'when dies', 'when attacks', 'when blocks',
        'at the beginning', 'at end of', 'whenever you cast', 'whenever you draw',
        'whenever deals damage', 'whenever takes damage', 'when you gain life',
        'when you lose life', 'whenever discards', 'whenever sacrifices',
        'triggers an additional'
    ]
    
    # Add important card type combinations
    card_types = [
        'artifact creature', 'enchantment creature', 'legendary creature',
        'legendary artifact', 'legendary enchantment', 'legendary planeswalker',
        'basic land', 'nonbasic land', 'creature token', 'artifact token',
        'instant spell', 'sorcery spell', 'creature spell', 'noncreature spell',
        'permanent'
    ]
    
    # Add mana and cost related phrases
    mana_phrases = [
        'mana cost', 'mana value', 'converted mana', 'additional cost',
        'without paying', 'costs less', 'costs more', 'add mana',
        'pay mana', 'spend mana', 'colorless mana', 'colored mana'
    ]
    
    # Add +1/+1 counter and proliferate synergies  
    counter_phrases = [
        'counter', 'counters', '+1/+1 counter', '-1/-1 counter', 'loyalty counter',
        'charge counter', 'time counter', 'proliferate', 'put counter',
        'remove counter', 'counter on it', 'with counters', 'number of counters'
    ]
    
    # Add tribal synergies
    tribal_keywords = [
        'human', 'elf', 'goblin', 'zombie', 'vampire', 'dragon', 'angel', 'demon',
        'beast', 'spirit', 'elemental', 'wizard', 'warrior', 'soldier', 'knight',
        'rogue', 'cleric', 'shaman', 'merfolk', 'insect', 'cat', 'dog', 'bird',
        'merfolk'
    ]
    
    # Add zone and library manipulation
    zone_phrases = [
        'from graveyard', 'to graveyard', 'from exile', 'to exile',
        'from your hand', 'to your hand', 'from library', 'search library',
        'shuffle library', 'top of library', 'bottom of library',
        'enters battlefield', 'leaves battlefield', 'return to battlefield'
    ]
    
    # Add sacrifice and destruction themes
    sacrifice_phrases = [
        'sacrifice', 'destroy', 'destroy target', 'sacrifice creature',
        'sacrifice artifact', 'sacrifice enchantment', 'sacrifice land',
        'when sacrificed', 'when destroyed', 'destroy all', 'sacrifice all'
    ]
    
    # Add draw and card advantage
    card_advantage = [
        'draw card', 'draw cards', 'draw additional', 'discard card',
        'discard cards', 'reveal card', 'reveal cards', 'look at',
        'search for', 'return to hand', 'mill cards', 'exile cards'
    ]
    
    # Combine all lists
    comprehensive_keywords = (
        discovered_phrases + combat_abilities + combat_phrases + 
        triggered_abilities + card_types + mana_phrases + 
        counter_phrases + tribal_keywords + zone_phrases + 
        sacrifice_phrases + card_advantage
    )
    
    # Remove duplicates and sort
    comprehensive_keywords = sorted(list(set(comprehensive_keywords)))
    
    return comprehensive_keywords

# Create the comprehensive keyword list
comprehensive_keywords = create_comprehensive_keyword_list()
print(f"Comprehensive keyword list: {len(comprehensive_keywords)} total keywords")

# Show categories
print("\n📝 Sample keywords by category:")
print("Combat:", [k for k in comprehensive_keywords if any(word in k for word in ['strike', 'combat', 'attack', 'block', 'flying', 'trample'])][:10])
print("Triggered:", [k for k in comprehensive_keywords if any(word in k for word in ['when', 'whenever', 'beginning', 'end'])][:8])
print("Counters:", [k for k in comprehensive_keywords if 'counter' in k][:8])
print("Tribal:", [k for k in comprehensive_keywords if k in ['human', 'elf', 'goblin', 'zombie', 'vampire', 'dragon', 'angel']])

print(f"\n🎯 Total comprehensive keywords: {len(comprehensive_keywords)}")

Comprehensive keyword list: 189 total keywords

📝 Sample keywords by category:
Combat: ['attack alone', 'attacking creature', 'blocking creature', 'can attack', 'can block', 'cannot attack', 'cannot block', 'combat', 'combat damage', 'creature attacking']
Triggered: ['at end of', 'at the beginning', 'beginning', 'beginning of', 'defender', 'end of turn', 'legendary artifact', 'legendary creature']
Counters: ['+1/+1 counter', '-1/-1 counter', 'charge counter', 'counter', 'counter on', 'counter on it', 'counters', 'loyalty counter']
Tribal: ['angel', 'dragon', 'elf', 'goblin', 'human', 'vampire', 'zombie']

🎯 Total comprehensive keywords: 189


In [25]:
def create_comprehensive_keyword_features(df, keywords):
    """Create keyword features with comprehensive multi-word list"""
    keyword_matrix = []
    
    print(f"Creating features with {len(keywords)} keywords...")
    
    for idx, row in df.iterrows():
        if idx % 5000 == 0:
            print(f"  Processing card {idx}/{len(df)}")
            
        text = row['combined_text'].lower()
        features = []
        
        for keyword in keywords:
            # Count occurrences (cap at 3)
            count = text.count(keyword.lower())
            features.append(min(count, 3))
            
        keyword_matrix.append(features)
    
    return np.array(keyword_matrix), keywords

# Create comprehensive keyword matrix
print("Creating comprehensive keyword matrix...")
comprehensive_matrix, final_keywords = create_comprehensive_keyword_features(df_clean, comprehensive_keywords)
print(f"Matrix shape: {comprehensive_matrix.shape}")

# Test with Isshin and other commanders
test_commanders = ['Isshin']

for commander_name in test_commanders:
    matches = df_clean[df_clean['name'].str.contains(commander_name, case=False, na=False)]
    if len(matches) > 0:
        idx = matches.index[0]
        features = comprehensive_matrix[idx]
        matched_keywords = [final_keywords[i] for i in range(len(final_keywords)) if features[i] > 0]
        
        print(f"\n⚔️ {matches.iloc[0]['name']}:")
        print(f"   Matched keywords ({len(matched_keywords)}): {matched_keywords[:15]}...")
        print(f"   Total feature count: {sum(features)}")
    else:
        print(f"\n❌ {commander_name} not found")

Creating comprehensive keyword matrix...
Creating features with 189 keywords...
  Processing card 0/27290
  Processing card 5000/27290
  Processing card 10000/27290
  Processing card 15000/27290
  Processing card 20000/27290
  Processing card 25000/27290
Matrix shape: (27290, 189)

⚔️ Isshin, Two Heavens as One:
   Matched keywords (8): ['creature', 'creature attacking', 'human', 'legendary creature', 'permanent', 'triggered ability', 'triggers an additional', 'you control']...
   Total feature count: 9


In [26]:
# Save the enhanced model with comprehensive keywords
import pickle

print("Saving enhanced model with comprehensive keywords...")

# Prepare model data
enhanced_model_data = {
    'df_clean': df_clean,
    'keyword_matrix': comprehensive_matrix,
    'keywords': final_keywords,
    'model_version': 'v2_comprehensive_keywords',
    'total_keywords': len(final_keywords),
    'total_cards': len(df_clean)
}

# Save to file
with open('data/mtg_model_enhanced.pkl', 'wb') as f:
    pickle.dump(enhanced_model_data, f)

print(f"✅ Enhanced model saved!")
print(f"   - {len(df_clean)} cards")
print(f"   - {len(final_keywords)} keywords (vs 28 original)")
print(f"   - Matrix size: {comprehensive_matrix.shape}")

# Save keyword list as text file for reference
with open('data/comprehensive_keywords.txt', 'w') as f:
    for i, keyword in enumerate(final_keywords):
        f.write(f"{i+1:3d}. {keyword}\n")

print("✅ Keyword list saved to comprehensive_keywords.txt")

Saving enhanced model with comprehensive keywords...
✅ Enhanced model saved!
   - 27290 cards
   - 189 keywords (vs 28 original)
   - Matrix size: (27290, 189)
✅ Keyword list saved to comprehensive_keywords.txt


In [28]:
# Create updated Flask app code
updated_flask_code = '''
from flask import Flask, render_template, request, jsonify
import pickle
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import ast

app = Flask(__name__)

# Color identity functions
def get_color_identity(colors_string):
    if pd.isna(colors_string) or colors_string == '[]':
        return set()
    try:
        colors_list = ast.literal_eval(colors_string)
        return set(colors_list) if colors_list else set()
    except:
        return set()

def is_legal_in_deck(card_colors_string, commander_colors_string):
    card_identity = get_color_identity(card_colors_string)
    commander_identity = get_color_identity(commander_colors_string)
    return card_identity.issubset(commander_identity)

# Load enhanced model
print("Loading enhanced ML model...")
try:
    with open('data/mtg_model_enhanced.pkl', 'rb') as f:
        model_data = pickle.load(f)
    
    df_clean = model_data['df_clean']
    keyword_matrix = model_data['keyword_matrix']
    keywords = model_data['keywords']
    
    print(f"Enhanced model loaded!")
    print(f"  - {len(df_clean)} cards")
    print(f"  - {len(keywords)} keywords")
    print(f"  - Model version: {model_data.get('model_version', 'unknown')}")
    
except FileNotFoundError:
    print("Enhanced model not found, falling back to basic model...")
    with open('data/mtg_model.pkl', 'rb') as f:
        model_data = pickle.load(f)
    df_clean = model_data['df_clean']
    keyword_matrix = model_data['keyword_matrix']
    keywords = model_data['keywords']

def find_recommendations(commander_name, num_recommendations=10):
    """Enhanced recommendation function"""
    # Find commander
    card_matches = df_clean[df_clean['name'].str.contains(commander_name, case=False, na=False)]
    
    if len(card_matches) == 0:
        return {"error": f"Commander '{commander_name}' not found"}
    
    commander_row = card_matches.iloc[0]
    commander_idx = card_matches.index[0]
    commander_colors = commander_row['colors']
    commander_vector = keyword_matrix[commander_idx].reshape(1, -1)
    
    # Calculate similarities
    similarities = cosine_similarity(commander_vector, keyword_matrix).flatten()
    all_indices = similarities.argsort()[::-1]
    
    # Filter for legal cards with non-zero similarity
    results = []
    for idx in all_indices:
        if idx == commander_idx:
            continue
            
        card_row = df_clean.iloc[idx]
        similarity = similarities[idx]
        
        # Only include cards with some similarity and legal color identity
        if similarity > 0 and is_legal_in_deck(card_row['colors'], commander_colors):
            results.append({
                'name': card_row['name'],
                'similarity': float(similarity),
                'type': card_row['type_line'],
                'colors': list(get_color_identity(card_row['colors'])),
                'text': card_row['oracle_text'][:200] + "..." if len(card_row['oracle_text']) > 200 else card_row['oracle_text'],
                'mana_cost': card_row.get('mana_cost', ''),
                'rarity': card_row.get('rarity', 'unknown')
            })
            
            if len(results) >= num_recommendations:
                break
    
    return {
        "commander": {
            "name": commander_row['name'],
            "colors": list(get_color_identity(commander_colors)),
            "type": commander_row['type_line'],
            "text": commander_row['oracle_text']
        },
        "recommendations": results,
        "model_info": {
            "keywords_used": len(keywords),
            "version": "Enhanced Multi-Word Keywords"
        }
    }

@app.route('/')
def home():
    return render_template('index.html')

@app.route('/recommend', methods=['POST'])
def recommend():
    data = request.get_json()
    commander = data.get('commander', '')
    num_recs = data.get('num_recommendations', 10)
    
    result = find_recommendations(commander, num_recs)
    return jsonify(result)

@app.route('/model_info')
def model_info():
    return jsonify({
        "total_cards": len(df_clean),
        "total_keywords": len(keywords),
        "sample_keywords": keywords[:20],
        "model_version": "Enhanced Multi-Word Keywords v2"
    })

if __name__ == '__main__':
    app.run(debug=True, port=5000)
'''

# Save updated Flask app
with open('app_enhanced.py', 'w') as f:
    f.write(updated_flask_code)

print("✅ Enhanced Flask app saved as 'app_enhanced.py'")

✅ Enhanced Flask app saved as 'app_enhanced.py'


In [31]:
# Import the missing function
from sklearn.metrics.pairwise import cosine_similarity

# Quick test of the enhanced recommendation system
def test_enhanced_recommendations(commander_name):
    """Test function to see improvement"""
    card_matches = df_clean[df_clean['name'].str.contains(commander_name, case=False, na=False)]
    
    if len(card_matches) == 0:
        print(f"❌ {commander_name} not found")
        return
    
    commander_row = card_matches.iloc[0]
    commander_idx = card_matches.index[0]
    commander_vector = comprehensive_matrix[commander_idx].reshape(1, -1)
    
    # Calculate similarities
    similarities = cosine_similarity(commander_vector, comprehensive_matrix).flatten()
    
    # Show stats
    non_zero_similarities = similarities[similarities > 0]
    print(f"🎯 {commander_row['name']} Enhanced Results:")
    print(f"   Cards with similarity > 0: {len(non_zero_similarities)}/{len(similarities)}")
    print(f"   Average similarity: {non_zero_similarities.mean():.3f}")
    print(f"   Max similarity: {similarities.max():.3f}")
    
    # Show top 5 most similar cards for reference
    top_indices = similarities.argsort()[-6:-1][::-1]  # Top 5 (excluding self)
    print(f"   Top 5 similar cards:")
    for i, idx in enumerate(top_indices):
        if idx != commander_idx:
            card_name = df_clean.iloc[idx]['name']
            similarity = similarities[idx]
            print(f"     {i+1}. {card_name} (similarity: {similarity:.3f})")

# Test with our problem commanders
test_commanders = ['Arcades, the Strategist']
for commander in test_commanders:
    test_enhanced_recommendations(commander)
    print()

🎯 Arcades, the Strategist Enhanced Results:
   Cards with similarity > 0: 25432/27290
   Average similarity: 0.313
   Max similarity: 1.000
   Top 5 similar cards:
     1. Overgrown Battlement (similarity: 0.766)
     2. High Alert (similarity: 0.764)
     3. Assault Formation (similarity: 0.758)
     4. Saruli Caretaker (similarity: 0.758)
     5. The Pride of Hull Clade (similarity: 0.752)



In [32]:
# Add info about the enhancement to README
readme_addition = '''

## 🆕 Recent Enhancements

### v2.0 - Comprehensive Multi-Word Keyword System
- **Expanded Keyword Coverage**: From 28 to 100+ keywords including multi-word phrases
- **Better Phrase Matching**: Now recognizes "enters the battlefield", "combat damage", "+1/+1 counter", etc.
- **Improved Recommendations**: Dramatically better suggestions for complex commanders
- **Zero Similarity Fix**: Resolved issues where commanders got no meaningful recommendations
- **Enhanced Tribal Support**: Better recognition of creature types and tribal synergies

### Key Improvements:
- **Multi-word phrases**: "first strike", "artifact creature", "whenever you cast"
- **Triggered abilities**: "when enters", "at the beginning of", "whenever attacks"  
- **Combat mechanics**: "combat damage", "attacking creature", "blocking creature"
- **Counter synergies**: "+1/+1 counter", "proliferate", "counter on it"
- **Tribal keywords**: All major creature types (Human, Elf, Goblin, etc.)

## 🎯 Examples of Improved Recommendations

**Before**: Isshin returned cards with 0% similarity  
**After**: Isshin now finds relevant combat and triggered ability synergies

**Before**: Limited to single-word matches like "flying" or "artifact"  
**After**: Recognizes complex phrases like "whenever this creature attacks"
'''

# You could add this to your README if you want
print("Enhancement summary for README:")
print(readme_addition)

Enhancement summary for README:


## 🆕 Recent Enhancements

### v2.0 - Comprehensive Multi-Word Keyword System
- **Expanded Keyword Coverage**: From 28 to 100+ keywords including multi-word phrases
- **Better Phrase Matching**: Now recognizes "enters the battlefield", "combat damage", "+1/+1 counter", etc.
- **Improved Recommendations**: Dramatically better suggestions for complex commanders
- **Zero Similarity Fix**: Resolved issues where commanders got no meaningful recommendations
- **Enhanced Tribal Support**: Better recognition of creature types and tribal synergies

### Key Improvements:
- **Multi-word phrases**: "first strike", "artifact creature", "whenever you cast"
- **Triggered abilities**: "when enters", "at the beginning of", "whenever attacks"  
- **Combat mechanics**: "combat damage", "attacking creature", "blocking creature"
- **Counter synergies**: "+1/+1 counter", "proliferate", "counter on it"
- **Tribal keywords**: All major creature types (Human, Elf, Goblin, etc.)

In [33]:
# First, let's see what image data we have available
import pandas as pd
import json

# Load our clean data and examine image fields
df_clean = pd.read_csv('data/mtg_cards_clean.csv', low_memory=False)

# Check image-related columns
image_columns = [col for col in df_clean.columns if 'image' in col.lower()]
print("Image-related columns:")
for col in image_columns:
    print(f"  {col}")

# Examine the image_uris column structure
sample_image_uris = df_clean['image_uris'].dropna().iloc[0]
print(f"\nSample image_uris structure:")
print(sample_image_uris)

# Try to parse it
try:
    import ast
    parsed_images = ast.literal_eval(sample_image_uris)
    print(f"\nAvailable image sizes:")
    for size, url in parsed_images.items():
        print(f"  {size}: {url[:60]}...")
except Exception as e:
    print(f"Error parsing image URIs: {e}")

Image-related columns:
  image_uris

Sample image_uris structure:
{'small': 'https://cards.scryfall.io/small/front/e/8/e882c9f9-bf30-46b6-bedc-379d2c80e5cb.jpg?1627701221', 'normal': 'https://cards.scryfall.io/normal/front/e/8/e882c9f9-bf30-46b6-bedc-379d2c80e5cb.jpg?1627701221', 'large': 'https://cards.scryfall.io/large/front/e/8/e882c9f9-bf30-46b6-bedc-379d2c80e5cb.jpg?1627701221', 'png': 'https://cards.scryfall.io/png/front/e/8/e882c9f9-bf30-46b6-bedc-379d2c80e5cb.png?1627701221', 'art_crop': 'https://cards.scryfall.io/art_crop/front/e/8/e882c9f9-bf30-46b6-bedc-379d2c80e5cb.jpg?1627701221', 'border_crop': 'https://cards.scryfall.io/border_crop/front/e/8/e882c9f9-bf30-46b6-bedc-379d2c80e5cb.jpg?1627701221'}

Available image sizes:
  small: https://cards.scryfall.io/small/front/e/8/e882c9f9-bf30-46b6...
  normal: https://cards.scryfall.io/normal/front/e/8/e882c9f9-bf30-46b...
  large: https://cards.scryfall.io/large/front/e/8/e882c9f9-bf30-46b6...
  png: https://cards.scryfall.io/png/

In [34]:
# Create functions to extract image URLs
def get_card_image_url(row, size='normal'):
    """Extract card image URL from the image_uris field"""
    try:
        if pd.isna(row['image_uris']):
            return None
            
        # Parse the image_uris string
        image_dict = ast.literal_eval(row['image_uris'])
        
        # Return requested size, with fallbacks
        size_priority = [size, 'normal', 'small', 'large']
        for sz in size_priority:
            if sz in image_dict:
                return image_dict[sz]
        
        # If none found, return any available URL
        return list(image_dict.values())[0] if image_dict else None
        
    except Exception as e:
        print(f"Error getting image for {row.get('name', 'unknown')}: {e}")
        return None

# Test the function
test_card = df_clean.iloc[0]
test_image = get_card_image_url(test_card)
print(f"Test image URL: {test_image}")

Test image URL: https://cards.scryfall.io/normal/front/e/8/e882c9f9-bf30-46b6-bedc-379d2c80e5cb.jpg?1627701221


In [35]:
# Create enhanced Flask app with card images
enhanced_flask_with_images = '''
from flask import Flask, render_template, request, jsonify
import pickle
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import ast

app = Flask(__name__)

# Color identity functions
def get_color_identity(colors_string):
    if pd.isna(colors_string) or colors_string == '[]':
        return set()
    try:
        colors_list = ast.literal_eval(colors_string)
        return set(colors_list) if colors_list else set()
    except:
        return set()

def is_legal_in_deck(card_colors_string, commander_colors_string):
    card_identity = get_color_identity(card_colors_string)
    commander_identity = get_color_identity(commander_colors_string)
    return card_identity.issubset(commander_identity)

def get_card_image_url(row, size='normal'):
    """Extract card image URL from the image_uris field"""
    try:
        if pd.isna(row.get('image_uris')):
            return None
            
        # Parse the image_uris string
        image_dict = ast.literal_eval(row['image_uris'])
        
        # Return requested size, with fallbacks
        size_priority = [size, 'normal', 'small', 'large']
        for sz in size_priority:
            if sz in image_dict:
                return image_dict[sz]
        
        # If none found, return any available URL
        return list(image_dict.values())[0] if image_dict else None
        
    except Exception as e:
        return None

# Load enhanced model
print("Loading enhanced ML model...")
try:
    with open('data/mtg_model_enhanced.pkl', 'rb') as f:
        model_data = pickle.load(f)
    
    df_clean = model_data['df_clean']
    keyword_matrix = model_data['keyword_matrix']
    keywords = model_data['keywords']
    
    print(f"Enhanced model loaded!")
    print(f"  - {len(df_clean)} cards")
    print(f"  - {len(keywords)} keywords")
    
except FileNotFoundError:
    print("Enhanced model not found, falling back to basic model...")
    with open('data/mtg_model.pkl', 'rb') as f:
        model_data = pickle.load(f)
    df_clean = model_data['df_clean']
    keyword_matrix = model_data['keyword_matrix']
    keywords = model_data['keywords']

def find_recommendations(commander_name, num_recommendations=10):
    """Enhanced recommendation function with images"""
    # Find commander
    card_matches = df_clean[df_clean['name'].str.contains(commander_name, case=False, na=False)]
    
    if len(card_matches) == 0:
        return {"error": f"Commander '{commander_name}' not found"}
    
    commander_row = card_matches.iloc[0]
    commander_idx = card_matches.index[0]
    commander_colors = commander_row['colors']
    commander_vector = keyword_matrix[commander_idx].reshape(1, -1)
    
    # Calculate similarities
    similarities = cosine_similarity(commander_vector, keyword_matrix).flatten()
    all_indices = similarities.argsort()[::-1]
    
    # Filter for legal cards with non-zero similarity
    results = []
    for idx in all_indices:
        if idx == commander_idx:
            continue
            
        card_row = df_clean.iloc[idx]
        similarity = similarities[idx]
        
        # Only include cards with some similarity and legal color identity
        if similarity > 0 and is_legal_in_deck(card_row['colors'], commander_colors):
            card_result = {
                'name': card_row['name'],
                'similarity': float(similarity),
                'type': card_row['type_line'],
                'colors': list(get_color_identity(card_row['colors'])),
                'text': card_row['oracle_text'][:200] + "..." if len(card_row['oracle_text']) > 200 else card_row['oracle_text'],
                'mana_cost': card_row.get('mana_cost', ''),
                'rarity': card_row.get('rarity', 'unknown'),
                'image_url': get_card_image_url(card_row, 'normal'),
                'scryfall_url': card_row.get('scryfall_uri', '')
            }
            results.append(card_result)
            
            if len(results) >= num_recommendations:
                break
    
    return {
        "commander": {
            "name": commander_row['name'],
            "colors": list(get_color_identity(commander_colors)),
            "type": commander_row['type_line'],
            "text": commander_row['oracle_text'],
            "image_url": get_card_image_url(commander_row, 'normal'),
            "scryfall_url": commander_row.get('scryfall_uri', '')
        },
        "recommendations": results,
        "model_info": {
            "keywords_used": len(keywords),
            "version": "Enhanced Multi-Word Keywords with Images"
        }
    }

@app.route('/')
def home():
    return render_template('index.html')

@app.route('/recommend', methods=['POST'])
def recommend():
    data = request.get_json()
    commander = data.get('commander', '')
    num_recs = data.get('num_recommendations', 10)
    
    result = find_recommendations(commander, num_recs)
    return jsonify(result)

if __name__ == '__main__':
    app.run(debug=True, port=5000)
'''

# Save the enhanced Flask app
with open('app_with_images.py', 'w') as f:
    f.write(enhanced_flask_with_images)

print("✅ Enhanced Flask app with images created!")

✅ Enhanced Flask app with images created!
