In [5]:
# MTG Commander Recommendation System - Step 3: Data Preprocessing and Feature Engineering

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter, defaultdict
import ast
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import warnings
warnings.filterwarnings('ignore')

print("🃏 MTG Commander Recommendation System - Step 3")
print("🔧 Data Preprocessing and Feature Engineering")
print("=" * 60)

# Download required NLTK data.
# nltk.download() reports most failures by returning False instead of raising,
# so the return values are checked; otherwise a failed download would still
# print the success message. Both packages are attempted even if the first fails.
try:
    nltk_download_results = [
        nltk.download(package, quiet=True) for package in ('stopwords', 'punkt')
    ]
    nltk_ready = all(nltk_download_results)
except Exception:  # best-effort step: never abort the notebook over NLTK data
    nltk_ready = False

if nltk_ready:
    print("✅ NLTK data downloaded")
else:
    print("⚠️ NLTK download failed - proceeding without stopwords")

# =============================================================================
# 1. LOAD PREPROCESSED DATA
# =============================================================================

print("\n1️⃣ LOADING PREPROCESSED DATA...")

# Load the joined data from Step 2
try:
    df = pd.read_csv('data/processed/recommendations_with_features.csv')
    print(f"✅ Loaded joined data: {df.shape[0]:,} rows, {df.shape[1]} columns")
except FileNotFoundError:
    print("❌ Could not find recommendations_with_features.csv")
    print("Please run Step 2 (data exploration) first to create this file")
    raise

# Load original creatures data for the full candidate pool.
# Handled the same way as the load above: explain which file is missing,
# then re-raise so the notebook still stops with FileNotFoundError.
try:
    creatures_df = pd.read_csv('data/raw/all_creatures_clean.csv')
except FileNotFoundError:
    print("❌ Could not find data/raw/all_creatures_clean.csv")
    raise
print(f"✅ Loaded full creatures data: {creatures_df.shape[0]:,} creatures")

# =============================================================================
# 2. DATA CLEANING AND VALIDATION
# =============================================================================

print("\n2️⃣ DATA CLEANING AND VALIDATION...")

# Remove rows with missing critical data
print("🧹 Cleaning data...")
initial_count = len(df)

# Keep only rows with valid joins: the oracle text and both colour identities
# must be present. The explicit .copy() makes df_clean an independent frame,
# so the columns added to it further down are never chained assignments
# against `df`.
df_clean = df.dropna(
    subset=['oracle_text', 'color_identity_commander', 'color_identity_recommended']
).copy()
print(f"Removed {initial_count - len(df_clean):,} rows with missing critical data")

# Updated parse functions for new format
def safe_parse_list(val):
    """Parse a list-like cell value into a list of stripped strings.

    Handles the unquoted text format, e.g. ``"[W, U]"`` -> ``['W', 'U']``.
    Surrounding brackets are optional and empty items are dropped.

    Args:
        val: A string such as ``"[Flying, Trample]"``, an already-parsed
            list, or a missing value (None / NaN).

    Returns:
        A list of items. An existing list is returned as a shallow copy.
        Missing values, empty strings, ``"[]"`` and any other type give ``[]``.
    """
    # Lists are checked first: pd.isna() on a list yields an array whose truth
    # value is ambiguous, which previously turned every multi-element list
    # into [] through the blanket exception handler.
    if isinstance(val, list):
        return list(val)
    if not isinstance(val, str):
        # None, NaN and every other scalar carry no list content.
        return []

    text = val.strip()
    if text.startswith('[') and text.endswith(']'):
        text = text[1:-1]  # drop the surrounding brackets

    # Split on commas, trimming whitespace and discarding empty items.
    return [item.strip() for item in text.split(',') if item.strip()]

# Turn every list-like text column into a real Python list column.
# Each pair is (new parsed column, raw source column); secondary types use the
# same array-style text format as the other fields.
for parsed_col, raw_col in (
    ('commander_colors', 'color_identity_commander'),
    ('recommended_colors', 'color_identity_recommended'),
    ('keywords_list', 'keywords'),
    ('secondary_types_list', 'secondary_type'),
):
    df_clean[parsed_col] = df_clean[raw_col].apply(safe_parse_list)

print(f"✅ Cleaned data: {len(df_clean):,} valid recommendations")

# =============================================================================
# 3. COLOR IDENTITY VALIDATION
# =============================================================================

print("\n3️⃣ VALIDATING COLOR IDENTITY RULES...")

def is_valid_color_identity(recommended_colors, commander_colors):
    """Return True when the creature's colours all belong to the commander.

    A colourless creature (empty ``recommended_colors``) is always allowed.
    A coloured creature is never allowed under a colourless commander.
    Otherwise the creature's colours must be a subset of the commander's.
    """
    if recommended_colors:
        if commander_colors:
            return set(recommended_colors) <= set(commander_colors)
        # Coloured creature, colourless commander.
        return False
    # Nothing to violate: the creature has no colours.
    return True

# Validate color identity rules.
# The two list columns are walked in lockstep rather than with a row-wise
# DataFrame.apply; building the result as an explicitly boolean Series keeps
# the mask usable for filtering even when df_clean has no rows.
df_clean['color_valid'] = pd.Series(
    [
        is_valid_color_identity(recommended, commander)
        for recommended, commander in zip(
            df_clean['recommended_colors'], df_clean['commander_colors']
        )
    ],
    index=df_clean.index,
    dtype=bool,
)

valid_count = df_clean['color_valid'].sum()
total_count = len(df_clean)
# Guard the percentage so an empty frame reports 0.0% instead of raising
# ZeroDivisionError.
compliance_pct = valid_count / total_count * 100 if total_count else 0.0
print(f"Color identity compliance: {valid_count:,}/{total_count:,} ({compliance_pct:.1f}%)")

# Keep only valid recommendations for training
df_valid = df_clean[df_clean['color_valid']].copy()
print(f"✅ Training data: {len(df_valid):,} valid recommendations")

# =============================================================================
# 4. ORACLE TEXT PREPROCESSING
# =============================================================================

print("\n4️⃣ PREPROCESSING ORACLE TEXT...")

# Patterns and the shorthand table are built once at import time rather than
# on every call.
_BRACED_SYMBOL_RE = re.compile(r'\{[^}]*\}')   # any {...} symbol, e.g. {2}{R}{G}
_REMINDER_TEXT_RE = re.compile(r'\([^)]*\)')   # reminder text in parentheses
_NON_WORD_RE = re.compile(r'[^\w\s]')          # punctuation / special characters
_WHITESPACE_RE = re.compile(r'\s+')

# Phrase -> shorthand, applied in this order. The earlier table also mapped
# six card-type words to themselves, which changed nothing; only the entries
# that actually rewrite text are kept.
_ORACLE_SHORTHAND = (
    ('enters the battlefield', 'etb'),
    ('end of turn', 'eot'),
    ('beginning of', 'beginning'),
)


def clean_oracle_text(text):
    """Clean and normalize oracle text for ML processing.

    Steps, in order: lowercase, blank out ``{...}`` symbols, blank out
    parenthesised reminder text, apply the phrase shorthand, replace every
    remaining non-word character with a space, collapse whitespace.

    Args:
        text: Raw oracle text. Missing values (None / NaN) are accepted and
            non-string scalars are converted with ``str()``.

    Returns:
        The normalized string, or ``""`` for a missing value.
    """
    if pd.isna(text):
        return ""

    text = str(text).lower()
    text = _BRACED_SYMBOL_RE.sub(' ', text)
    text = _REMINDER_TEXT_RE.sub(' ', text)

    for phrase, shorthand in _ORACLE_SHORTHAND:
        text = text.replace(phrase, shorthand)

    text = _NON_WORD_RE.sub(' ', text)
    text = _WHITESPACE_RE.sub(' ', text)

    return text.strip()

# Normalize the oracle text of every valid recommendation.
print("🧹 Cleaning oracle text...")
df_valid['oracle_text_clean'] = df_valid['oracle_text'].apply(clean_oracle_text)

# Rows whose text cleaned down to nothing carry no signal; drop them.
has_oracle_text = df_valid['oracle_text_clean'] != ''
df_valid = df_valid[has_oracle_text].copy()
print(f"✅ Oracle text cleaned: {len(df_valid):,} recommendations with valid text")

# Show the first three cleaned texts, truncated to 100 characters each.
print("\nSample cleaned oracle text:")
for position, sample in enumerate(df_valid['oracle_text_clean'].head(3), start=1):
    print(f"  {position}. {sample[:100]}...")

# =============================================================================
# 5. POWER/TOUGHNESS AND PRICE PREPROCESSING
# =============================================================================

print("\n5️⃣ PREPROCESSING POWER/TOUGHNESS AND PRICES...")

def clean_power_toughness(val):
    """Convert a power/toughness value to a float.

    Args:
        val: Raw power or toughness: a number, a numeric string, one of the
            variable markers ``'*'`` / ``'X'``, or a missing value.

    Returns:
        ``float(val)`` when the value is numeric. ``0.0`` for missing values,
        for ``'*'`` / ``'X'``, and for anything else that cannot be converted
        (for example ``'1+*'``). The result is always a float.
    """
    try:
        if pd.isna(val) or val in ('*', 'X'):
            return 0.0  # default for variable or missing P/T
        return float(val)
    except (TypeError, ValueError, OverflowError):
        # Only conversion problems fall back to the default; unrelated
        # exceptions such as KeyboardInterrupt are no longer swallowed.
        return 0.0

# Add numeric power_clean / toughness_clean columns, then report their ranges.
for stat in ('power', 'toughness'):
    df_valid[f'{stat}_clean'] = df_valid[stat].apply(clean_power_toughness)

for stat in ('power', 'toughness'):
    cleaned_stat = df_valid[f'{stat}_clean']
    print(f"{stat.capitalize()} range: {cleaned_stat.min():.0f} to {cleaned_stat.max():.0f}")

# Clean prices (handle missing values)
def clean_price(val):
    """Convert a price value to a float, keeping unknown prices as NaN.

    Args:
        val: Raw price: a number, a numeric string, or a missing value.

    Returns:
        ``float(val)`` when it converts. ``np.nan`` for missing values and for
        anything that cannot be converted, so unknown prices are not mistaken
        for a price of 0.
    """
    try:
        return np.nan if pd.isna(val) else float(val)
    except (TypeError, ValueError, OverflowError):
        # Only conversion problems map to NaN; unrelated exceptions such as
        # KeyboardInterrupt are no longer swallowed.
        return np.nan

# Clean whichever price columns the joined data provides.
# Each entry: (raw column, cleaned column to create, label used in the report).
for raw_price_col, clean_price_col, price_label in (
    ('usd_price_commander', 'commander_price_clean', 'Commander'),
    ('usd_price_recommended', 'recommended_price_clean', 'Recommended creature'),
):
    if raw_price_col not in df_valid.columns:
        continue
    df_valid[clean_price_col] = df_valid[raw_price_col].apply(clean_price)
    print(f"{price_label} price range: ${df_valid[clean_price_col].min():.2f} to ${df_valid[clean_price_col].max():.2f}")

# =============================================================================
# 6. FEATURE ENGINEERING - ORACLE TEXT EMBEDDINGS
# =============================================================================

print("\n6️⃣ CREATING ORACLE TEXT EMBEDDINGS...")

# Create TF-IDF vectors for oracle text.
# NOTE(review): the vectorizer is fitted on df_valid, which has one row per
# recommendation, so a creature's text is counted once per recommendation row
# it appears in. Confirm this weighting of min_df / max_df is intended.
print("📝 Creating TF-IDF vectors...")
tfidf = TfidfVectorizer(
    max_features=1000,  # Limit vocabulary size
    stop_words='english',
    ngram_range=(1, 2),  # Unigrams and bigrams
    min_df=2,  # Must appear in at least 2 documents
    max_df=0.8  # Ignore terms that appear in >80% of documents
)

# Fit TF-IDF on all oracle texts.
# fillna('') is defensive: clean_oracle_text always returns a string.
oracle_texts = df_valid['oracle_text_clean'].fillna('')
tfidf_matrix = tfidf.fit_transform(oracle_texts)

print(f"✅ TF-IDF matrix: {tfidf_matrix.shape[0]:,} documents × {tfidf_matrix.shape[1]:,} features")

# Store feature names for later use (reported again in the final summary)
tfidf_feature_names = tfidf.get_feature_names_out()
print(f"Sample TF-IDF features: {list(tfidf_feature_names[:10])}")

# =============================================================================
# 7. ANALYZE COMMANDER-SPECIFIC PATTERNS
# =============================================================================

print("\n7️⃣ ANALYZING COMMANDER-SPECIFIC PATTERNS...")

def analyze_commander_patterns(commander_name, df):
    """Summarize the recommendations recorded for one commander.

    Args:
        commander_name: Value to match against the ``commander_name`` column.
        df: Recommendation frame with ``commander_name``,
            ``synergy_percentage``, ``keywords_list`` and
            ``secondary_types_list`` columns, and optionally
            ``recommended_price_clean``.

    Returns:
        None when the commander has no rows. Otherwise a dict with the
        commander name, the row count, the mean synergy, the ten most common
        keywords and secondary types as ``(value, count)`` pairs, and the mean
        and ``(min, max)`` recommended price (both None when the price column
        is absent).
    """
    recs = df[df['commander_name'] == commander_name].copy()
    if not len(recs):
        return None

    def top_ten(list_column):
        # Flatten the per-row lists and count how often each entry occurs.
        tally = Counter(entry for entries in recs[list_column] for entry in entries)
        return tally.most_common(10)

    # Price statistics are optional: they need the cleaned price column.
    if 'recommended_price_clean' in recs.columns:
        prices = recs['recommended_price_clean']
        mean_price = prices.mean()
        price_bounds = (prices.min(), prices.max())
    else:
        mean_price = None
        price_bounds = None

    return {
        'commander': commander_name,
        'total_recommendations': len(recs),
        'avg_synergy': recs['synergy_percentage'].mean(),
        'keywords': top_ten('keywords_list'),
        'secondary_types': top_ten('secondary_types_list'),
        'avg_recommended_price': mean_price,
        'price_range': price_bounds,
    }

# Analyze patterns for all commanders
print("🔍 Analyzing patterns for all commanders...")
commander_patterns = {}
commanders = df_valid['commander_name'].unique()

# groupby hands each commander only its own rows, so the helper no longer has
# to re-scan the whole frame once per commander. sort=False keeps the groups
# in order of first appearance, matching the order of `commanders`.
for commander, commander_rows in df_valid.groupby('commander_name', sort=False):
    patterns = analyze_commander_patterns(commander, commander_rows)
    if patterns:
        commander_patterns[commander] = patterns

print(f"✅ Analyzed patterns for {len(commander_patterns):,} commanders")

# Show example patterns for the first commander analyzed
if commander_patterns:
    example_cmd = next(iter(commander_patterns))
    example_patterns = commander_patterns[example_cmd]
    print(f"\nExample - {example_cmd}:")
    print(f"  Total recommendations: {example_patterns['total_recommendations']}")
    print(f"  Average synergy: {example_patterns['avg_synergy']:.2f}%")
    if example_patterns['keywords']:
        print(f"  Top keywords: {example_patterns['keywords'][:5]}")
    if example_patterns['secondary_types']:
        print(f"  Top secondary types: {example_patterns['secondary_types'][:5]}")
    if example_patterns['avg_recommended_price'] is not None:
        print(f"  Average recommended price: ${example_patterns['avg_recommended_price']:.2f}")

# =============================================================================
# 8. CREATE TRAINING FEATURES
# =============================================================================

print("\n8️⃣ CREATING TRAINING FEATURES...")

# Map each source column of df_valid to its name in the training table.
# Dict order defines the output column order.
feature_columns = {
    'commander_name': 'commander',
    'recommended_creature_name': 'recommended_creature',
    'synergy_percentage': 'synergy_score',
    'power_clean': 'power',
    'toughness_clean': 'toughness',
    'oracle_text_clean': 'oracle_text_clean',
    'keywords_list': 'keywords',
    'secondary_types_list': 'secondary_types',
}

# Price features are optional; check once for the frame, not once per row.
for price_source, price_target in (
    ('commander_price_clean', 'commander_price'),
    ('recommended_price_clean', 'recommended_price'),
):
    if price_source in df_valid.columns:
        feature_columns[price_source] = price_target

# Select and rename the columns in one step instead of looping over the frame
# with iterrows(); this also drops the per-row commander_patterns lookup whose
# result was never used. The index is reset so the table is numbered 0..n-1.
features_df = (
    df_valid[list(feature_columns)]
    .rename(columns=feature_columns)
    .reset_index(drop=True)
)

# Kept so later cells that expect the list-of-dicts form still work.
feature_data = features_df.to_dict('records')
print(f"✅ Created features for {len(features_df):,} recommendations")

# =============================================================================
# 9. SAVE PROCESSED DATA
# =============================================================================

print("\n9️⃣ SAVING PROCESSED DATA...")

# Save the processed training data (one row per recommendation)
features_df.to_csv('data/processed/training_features.csv', index=False)
print("✅ Saved training_features.csv")

# Persist the fitted TF-IDF vectorizer. Only the vectorizer object is written
# here; the sparse tfidf_matrix itself is not saved by this block.
# NOTE: pickle files should only ever be loaded back from a trusted location.
import pickle
with open('data/processed/tfidf_vectorizer.pkl', 'wb') as f:
    pickle.dump(tfidf, f)
print("✅ Saved TF-IDF vectorizer")

# Save commander patterns (dict keyed by commander name)
with open('data/processed/commander_patterns.pkl', 'wb') as f:
    pickle.dump(commander_patterns, f)
print("✅ Saved commander patterns")

# Build the cleaned candidate pool from the full creature list, using the same
# helpers that were applied to the recommendation rows.
creatures_clean = creatures_df.copy()
creatures_clean['oracle_text_clean'] = creatures_clean['oracle_text'].apply(clean_oracle_text)
for stat_col in ('power', 'toughness'):
    creatures_clean[f'{stat_col}_clean'] = creatures_clean[stat_col].apply(clean_power_toughness)

# (raw column, parsed column, label used in the verification printout)
list_fields = (
    ('color_identity', 'color_identity_parsed', 'Color Identity'),
    ('keywords', 'keywords_parsed', 'Keywords'),
    ('secondary_type', 'secondary_types_parsed', 'Secondary Types'),
)
for raw_field, parsed_field, _ in list_fields:
    creatures_clean[parsed_field] = creatures_clean[raw_field].apply(safe_parse_list)

# Prices are optional in the raw creature file.
if 'usd_price' in creatures_clean.columns:
    creatures_clean['price_clean'] = creatures_clean['usd_price'].apply(clean_price)

# Debug: print raw → parsed values for the first three creatures with keywords.
print("\n🔍 Verifying parsing results:")
with_keywords = creatures_clean['keywords'].notna()
sample_idx = creatures_clean[with_keywords].index[:3]
for idx in sample_idx:
    print(f"\nCreature: {creatures_clean.loc[idx, 'name']}")
    for raw_field, parsed_field, field_label in list_fields:
        print(f"  {field_label}: {creatures_clean.loc[idx, raw_field]} → {creatures_clean.loc[idx, parsed_field]}")

creatures_clean.to_csv('data/processed/creatures_processed.csv', index=False)
print("\n✅ Saved processed creatures data")

# =============================================================================
# 10. SUMMARY AND NEXT STEPS
# =============================================================================

# Final report: restates the counts produced by the steps above.
print("\n🔟 SUMMARY AND NEXT STEPS...")
print("=" * 60)
print("✅ COMPLETED:")
print(f"  • Cleaned and validated {len(df_valid):,} recommendations")
print(f"  • Created TF-IDF embeddings ({tfidf_matrix.shape[1]:,} features)")
print(f"  • Analyzed {len(commander_patterns):,} commander patterns")
print("  • Preprocessed oracle text, power/toughness, and prices")

print("\n📊 DATA SUMMARY:")
print(f"  • Training examples: {len(features_df):,}")
print(f"  • TF-IDF vocabulary: {len(tfidf_feature_names):,} terms")
print(f"  • Total creatures in database: {len(creatures_clean):,}")
print(f"  • Average synergy score: {features_df['synergy_score'].mean():.3f}")

# Price statistics exist only when the recommended price column was available.
if 'recommended_price' in features_df.columns:
    print(f"  • Average recommended creature price: ${features_df['recommended_price'].mean():.2f}")
    print(f"  • Median recommended creature price: ${features_df['recommended_price'].median():.2f}")

# Static checklist for the next notebook.
print("\n🚀 READY FOR STEP 4:")
print("  • Build TensorFlow similarity model")
print("  • Train on oracle text embeddings")
print("  • Implement power/toughness weighting")
print("  • Include price-based features")
print("  • Create recommendation pipeline")

# The four artifacts written to data/processed/ by the save steps above.
print("\n💾 FILES CREATED:")
print("  • data/processed/training_features.csv")
print("  • data/processed/tfidf_vectorizer.pkl")
print("  • data/processed/commander_patterns.pkl")
print("  • data/processed/creatures_processed.csv")

print("\n📋 Save this notebook as '03_preprocessing.ipynb'")
print("=" * 60)

🃏 MTG Commander Recommendation System - Step 3
🔧 Data Preprocessing and Feature Engineering
✅ NLTK data downloaded

1️⃣ LOADING PREPROCESSED DATA...
✅ Loaded joined data: 83,325 rows, 17 columns
✅ Loaded full creatures data: 15,545 creatures

2️⃣ DATA CLEANING AND VALIDATION...
🧹 Cleaning data...
Removed 415 rows with missing critical data
✅ Cleaned data: 82,910 valid recommendations

3️⃣ VALIDATING COLOR IDENTITY RULES...
Color identity compliance: 82,910/82,910 (100.0%)
✅ Training data: 82,910 valid recommendations

4️⃣ PREPROCESSING ORACLE TEXT...
🧹 Cleaning oracle text...
✅ Oracle text cleaned: 82,910 recommendations with valid text

Sample cleaned oracle text:
  1. flying ward whenever another nontoken dragon you control enters create a token that s a copy of it e...
  2. flying when tiamat enters if you cast it search your library for up to five dragon cards not named t...
  3. flying trample whenever a dragon you control attacks it gains double strike until eot...

5️⃣ PREPROCES