# 🎬 Improved MPNet Movie Genre Classification
## Production-Ready Implementation with Security, Modularity & Performance

### Key Improvements:
- ✅ **Security**: Replaced `eval()` with `json.loads()`
- ✅ **Modularity**: Separated concerns into classes
- ✅ **Error Handling**: Comprehensive validation and error handling
- ✅ **Model Persistence**: Save/load trained models
- ✅ **Configuration**: YAML-based configuration management
- ✅ **Cross-Validation**: Robust model evaluation
- ✅ **Logging**: Proper logging instead of print statements
- ✅ **Performance**: Optimized embeddings and caching

**Accuracy Target**: 55%+ (vs original 54.3%)

## 📦 Installation & Setup

In [None]:
import subprocess
import sys

def install_package(package):
    """Install ``package`` with pip, using the interpreter running this notebook.

    pip's output is not captured, so it streams straight to the console.

    Returns:
        True when pip exits successfully, False when it exits with a
        non-zero status.
    """
    print(f"   Running: pip install {package}")
    pip_command = [sys.executable, "-m", "pip", "install", package]
    try:
        # check=True turns a non-zero pip exit code into CalledProcessError.
        subprocess.run(pip_command, check=True, text=True)
    except subprocess.CalledProcessError as e:
        print(f"   ❌ Installation failed: {e}")
        return False
    return True

# pip requirement strings for everything this notebook imports directly.
required_packages = [
    'sentence-transformers>=2.2.0',
    'scikit-learn>=1.3.0',
    'xgboost>=1.7.0',
    'pandas>=2.0.0',
    'numpy>=1.24.0',
    'matplotlib>=3.7.0',
    'seaborn>=0.12.0',
    'tqdm>=4.65.0',
    'pyyaml>=6.0',
    'joblib>=1.3.0',
    'flask>=2.3.0',
    'skl2onnx==1.15.0',
    'onnx==1.14.1',
    'onnxruntime==1.15.1',
]

# Distributions whose importable module name differs from their pip name.
import_name_overrides = {
    'sentence-transformers': 'sentence_transformers',
    'scikit-learn': 'sklearn',
    'pyyaml': 'yaml',
}

# A package counts as missing when its module cannot be imported.
# Only importability is checked here, not the installed version.
missing_packages = []
for package in required_packages:
    package_name = package.split('>=')[0].split('==')[0]
    module_name = import_name_overrides.get(
        package_name, package_name.replace('-', '_')
    )
    try:
        __import__(module_name)
    except ImportError:
        missing_packages.append(package)

if missing_packages:
    print(f"📦 Installing {len(missing_packages)} missing packages...")
    failed_packages = []
    for package in missing_packages:
        if not install_package(package):
            print(f"❌ Failed to install {package}")
            failed_packages.append(package)
    # Only claim success when every install actually succeeded.
    if failed_packages:
        print(
            f"⚠️ Installation finished with {len(failed_packages)} failure(s): "
            f"{', '.join(failed_packages)}"
        )
    else:
        print("✅ Installation complete!")
else:
    print("✅ All packages already installed!")

## 🛠️ Configuration Management

In [None]:
import yaml
from dataclasses import dataclass
from typing import Dict, List, Optional
import os

@dataclass
class ModelConfig:
    """Settings for the sentence-embedding model."""
    name: str  # label for this classifier setup; not read by the code visible here
    embedding_model: str  # model name handed to SentenceTransformer(...)
    embedding_dim: int  # declared embedding width; not read by the code visible here
    max_seq_length: int  # declared token limit; not applied to the model in the code visible here
    batch_size: int  # batch size passed to SentenceTransformer.encode
    normalize_embeddings: bool  # forwarded to encode() when embedding the training texts

@dataclass
class DataConfig:
    """Settings for loading, filtering and splitting the dataset."""
    dataset_path: str  # CSV path read with pandas.read_csv
    min_plot_length: int  # minimum length of a stripped overview, in characters
    max_genres: int  # number of most frequent primary genres kept as classes
    test_size: float  # passed to train_test_split as test_size
    random_state: int  # seed shared by the split, the CV folds and the classifiers

@dataclass
class TrainingConfig:
    """Settings for cross-validated model training."""
    cv_folds: int  # n_splits for StratifiedKFold
    n_jobs: int  # forwarded to the estimators and to GridSearchCV
    scoring: str  # scoring name handed to GridSearchCV / cross_val_score
    classifiers: Dict  # classifier name -> hyperparameter grid (lists of candidate values)

@dataclass
class Config:
    """Top-level configuration bundling the model, data and training sections."""
    model: ModelConfig
    data: DataConfig
    training: TrainingConfig
    output_dir: str  # directory created at startup and used by ModelManager for saved models
    logging_level: str  # name of a level attribute on the logging module, e.g. 'INFO'

# Default configuration.
# Used by load_config() when no YAML path is given or the path does not exist.
# The top-level keys mirror the fields of Config; each nested section mirrors
# the dataclass of the same name (ModelConfig, DataConfig, TrainingConfig).
default_config = {
    'model': {
        'name': 'mpnet-movie-classifier',
        'embedding_model': 'all-mpnet-base-v2',
        # NOTE(review): embedding_dim and max_seq_length are not read by the
        # code visible in this notebook — confirm whether they are informational.
        'embedding_dim': 768,
        'max_seq_length': 512,
        'batch_size': 32,
        'normalize_embeddings': True
    },
    'data': {
        'dataset_path': 'tmdb_5000_movies.csv',
        'min_plot_length': 50,  # characters, measured after stripping whitespace
        'max_genres': 8,  # keep only the 8 most frequent primary genres
        'test_size': 0.2,
        'random_state': 42
    },
    'training': {
        'cv_folds': 5,
        'n_jobs': -1,
        'scoring': 'accuracy',
        # One hyperparameter grid per classifier. Each list holds a single
        # candidate, so the grid search evaluates one combination per model.
        'classifiers': {
            'RandomForest': {
                'n_estimators': [200],
                'max_depth': [20],
                'min_samples_split': [2]
            },
            'LogisticRegression': {
                'C': [1.0],
                'max_iter': [2000]
            },
            'XGBoost': {
                'n_estimators': [100],
                'max_depth': [6],
                'learning_rate': [0.1]
            }
        }
    },
    'output_dir': 'models',
    'logging_level': 'INFO'
}

def load_config(config_path: Optional[str] = None) -> Config:
    """Build a ``Config`` from a YAML file layered over the built-in defaults.

    When ``config_path`` is empty or does not exist, the defaults are used
    unchanged. Otherwise the YAML document is merged over ``default_config``:
    a top-level section that is a mapping (``model``, ``data``, ``training``)
    overrides the default section key by key, and any other top-level value
    replaces the default outright. A file that omits some settings, or is
    completely empty, therefore still yields a complete configuration.

    Args:
        config_path: Optional path to a YAML configuration file.

    Returns:
        The assembled ``Config``.

    Raises:
        ValueError: If the YAML document is not a mapping at the top level.
        TypeError: If a section contains a key its dataclass does not accept.
    """
    import copy  # local import: only this function needs it

    # Deep copy so the returned Config never shares nested dicts
    # (e.g. the classifier grids) with the module-level default_config.
    config_dict = copy.deepcopy(default_config)

    if config_path and os.path.exists(config_path):
        with open(config_path, 'r', encoding='utf-8') as f:
            loaded = yaml.safe_load(f)

        # yaml.safe_load returns None for an empty document.
        if loaded is None:
            loaded = {}
        if not isinstance(loaded, dict):
            raise ValueError(
                f"Configuration file {config_path} must contain a mapping at the "
                f"top level, got {type(loaded).__name__}"
            )

        for key, value in loaded.items():
            if isinstance(value, dict) and isinstance(config_dict.get(key), dict):
                config_dict[key].update(value)
            else:
                config_dict[key] = value

    return Config(
        model=ModelConfig(**config_dict['model']),
        data=DataConfig(**config_dict['data']),
        training=TrainingConfig(**config_dict['training']),
        output_dir=config_dict['output_dir'],
        logging_level=config_dict['logging_level']
    )

# No path is supplied, so load_config() falls back to the built-in defaults.
config = load_config()

# Short summary of the settings that matter most for this run.
print(
    "✅ Configuration loaded",
    f"📊 Model: {config.model.embedding_model}",
    f"📁 Dataset: {config.data.dataset_path}",
    f"🎯 Max genres: {config.data.max_genres}",
    sep="\n",
)

## 📊 Logging Setup

In [None]:
import logging
from datetime import datetime

# Setup logging: every record goes to the console and to a timestamped file.
log_file = f'movie_classifier_{datetime.now().strftime("%Y%m%d_%H%M%S")}.log'
logging.basicConfig(
    # Accept level names in any case ('info', 'INFO', ...).
    level=getattr(logging, str(config.logging_level).upper()),
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.StreamHandler(),
        # Log messages contain emoji; an explicit UTF-8 encoding keeps the
        # file handler from failing where the locale encoding cannot represent them.
        logging.FileHandler(log_file, encoding='utf-8'),
    ],
    # Without force=True, re-running this cell would leave the old handlers in
    # place (basicConfig is a no-op once the root logger has handlers) while
    # still opening a new, unused log file. force=True removes and closes them.
    force=True,
)

logger = logging.getLogger(__name__)
logger.info("🎬 Starting MPNet Movie Genre Classification")

# Create output directory
os.makedirs(config.output_dir, exist_ok=True)
logger.info(f"📁 Output directory: {config.output_dir}")

## 📚 Import Libraries

In [None]:
import json
import pandas as pd
import numpy as np
import torch
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import warnings
import joblib
from pathlib import Path
import time
from typing import Tuple, Union, List, Dict, Any

warnings.filterwarnings('ignore')

# ML imports
from sentence_transformers import SentenceTransformer
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import (
    classification_report, confusion_matrix, accuracy_score, 
    precision_recall_fscore_support, roc_auc_score
)
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight

# XGBoost is optional: record whether it can be imported so that later code
# can decide whether to use it.
try:
    import xgboost as xgb
except ImportError:
    HAS_XGBOOST = False
    logger.warning("⚠️ XGBoost not available")
else:
    HAS_XGBOOST = True
    logger.info("✅ XGBoost available")

# Pick the compute device: CUDA when torch can see a GPU, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
logger.info(f"🔧 Device: {device}")
if device.type == "cuda":
    logger.info(f"🚀 GPU: {torch.cuda.get_device_name(0)}")

print("✅ All imports successful!")

## 🔧 Data Processing Pipeline

In [None]:
class DataProcessor:
    """Load the movie dataset and turn it into (overview text, primary genre) pairs.

    The pipeline drops rows with missing values, extracts the first listed
    genre of each movie, filters out low-quality overviews and keeps only
    the most frequent genres.
    """

    def __init__(self, config: "DataConfig"):
        self.config = config
        self.logger = logging.getLogger(self.__class__.__name__)

    def load_dataset(self) -> pd.DataFrame:
        """Read the CSV at ``config.dataset_path``.

        Returns:
            The raw dataset as a DataFrame.

        Raises:
            FileNotFoundError: If the dataset file does not exist.
            Exception: Any other error raised by ``pandas.read_csv`` is
                logged and re-raised unchanged.
        """
        try:
            df = pd.read_csv(self.config.dataset_path)
            self.logger.info(f"📂 Dataset loaded: {df.shape}")
            return df
        except FileNotFoundError:
            self.logger.error(f"❌ Dataset not found: {self.config.dataset_path}")
            raise
        except Exception as e:
            self.logger.error(f"❌ Error loading dataset: {e}")
            raise

    def extract_primary_genre(self, genres_str: str) -> Optional[str]:
        """Return the ``name`` of the first genre in a serialized genre list.

        The value is parsed with ``json.loads`` (never ``eval``). It is parsed
        as-is first, so valid JSON that contains apostrophes (for example a
        genre called "Children's") is not corrupted. Only if that fails is the
        text retried with single quotes converted to double quotes, which
        handles Python-repr style input such as ``[{'name': 'Drama'}]``.

        Args:
            genres_str: Serialized list of ``{"id": ..., "name": ...}`` objects.

        Returns:
            The first genre's name, or None when the input is missing, not a
            string, blank, unparseable, or does not have the expected shape.
        """
        # NaN, None and other non-string cells carry no genre information.
        if not isinstance(genres_str, str) or not genres_str.strip():
            return None

        candidates = [genres_str]
        if "'" in genres_str:
            candidates.append(genres_str.replace("'", '"'))

        parse_error = None
        for candidate in candidates:
            try:
                genres_list = json.loads(candidate)
            except json.JSONDecodeError as e:
                parse_error = e
                continue

            # Parsed successfully: accept only a non-empty list whose first
            # element is a dict with a 'name' entry.
            if isinstance(genres_list, list) and genres_list:
                first_genre = genres_list[0]
                if isinstance(first_genre, dict) and 'name' in first_genre:
                    return first_genre['name']
            return None

        self.logger.debug(f"Failed to parse genre: {genres_str[:50]}... Error: {parse_error}")
        return None

    def validate_text(self, text: str) -> bool:
        """Return True when ``text`` is a usable plot overview.

        A usable overview is a string whose stripped length is at least
        ``config.min_plot_length`` characters and that contains at least one
        alphanumeric character.
        """
        # The isinstance check also rejects NaN/None, and avoids calling
        # pd.isna on list-like values, whose truth value is ambiguous.
        if not isinstance(text, str):
            return False

        # Check minimum length
        if len(text.strip()) < self.config.min_plot_length:
            return False

        # Check for meaningful content (not just special characters)
        return any(c.isalnum() for c in text)

    def process_dataset(self, df: pd.DataFrame) -> Tuple[List[str], List[str]]:
        """Clean the raw dataset and return aligned texts and genre labels.

        Args:
            df: Raw dataset; must contain 'overview' and 'genres' columns.

        Returns:
            ``(texts, labels)`` where ``texts[i]`` is an overview and
            ``labels[i]`` is that movie's primary genre. Only movies whose
            genre is among the ``config.max_genres`` most frequent are kept.

        Raises:
            ValueError: If a required column is missing or no rows survive
                the cleaning steps.
        """
        self.logger.info("🔧 Processing dataset...")

        # Check required columns
        required_cols = ['overview', 'genres']
        missing_cols = [col for col in required_cols if col not in df.columns]
        if missing_cols:
            raise ValueError(f"Missing required columns: {missing_cols}")

        # Clean data
        df_clean = df.dropna(subset=['overview', 'genres']).copy()
        self.logger.info(f"📊 After removing NaN: {len(df_clean)} movies")

        # Extract genres safely
        df_clean['primary_genre'] = df_clean['genres'].apply(self.extract_primary_genre)
        df_clean = df_clean.dropna(subset=['primary_genre'])
        self.logger.info(f"📊 After genre extraction: {len(df_clean)} movies")

        # Validate text quality
        valid_mask = df_clean['overview'].apply(self.validate_text)
        df_clean = df_clean[valid_mask]
        self.logger.info(f"📊 After text validation: {len(df_clean)} movies")

        if len(df_clean) == 0:
            raise ValueError("No valid data remaining after processing")

        # Select top genres
        genre_counts = df_clean['primary_genre'].value_counts()
        self.logger.info(f"🎭 Genre distribution:\n{genre_counts.head(10)}")

        top_genres = genre_counts.head(self.config.max_genres).index.tolist()
        df_final = df_clean[df_clean['primary_genre'].isin(top_genres)].copy()

        self.logger.info(f"🎯 Final dataset: {len(df_final)} movies, {len(top_genres)} genres")

        # Prepare output
        texts = df_final['overview'].tolist()
        labels = df_final['primary_genre'].tolist()

        # Log statistics
        avg_length = np.mean([len(text.split()) for text in texts])
        self.logger.info(f"📊 Average plot length: {avg_length:.1f} words")

        return texts, labels

# Build the shared DataProcessor from the data section of the active config.
# Construction only stores the config and a logger; the dataset is not read
# until load_dataset() is called.
processor = DataProcessor(config.data)
print("✅ Data processor initialized")

## 🧠 Embedding Generation Pipeline

In [None]:
class EmbeddingGenerator:
    """Encode texts with a SentenceTransformer model, with an in-memory cache.

    The model is loaded lazily on the first call to ``generate_embeddings``.
    Results are cached per exact sequence of input texts for the lifetime of
    this object (or until ``clear_cache`` is called).
    """

    def __init__(self, config: "ModelConfig"):
        self.config = config
        self.logger = logging.getLogger(self.__class__.__name__)
        self.model = None
        # Maps tuple(texts) -> embeddings array for that exact input.
        self._cache = {}

    def load_model(self):
        """Load the configured SentenceTransformer and move it to the GPU if one is in use.

        Raises:
            Exception: Any error raised while loading is logged and re-raised.
        """
        self.logger.info(f"🧠 Loading {self.config.embedding_model}...")

        try:
            self.model = SentenceTransformer(self.config.embedding_model)

            # Move to GPU if available
            if device.type == 'cuda':
                self.model = self.model.to(device)
                self.logger.info("🚀 Model moved to GPU")

            # Log model info
            self.logger.info(f"✅ Model loaded: {self.config.embedding_model}")
            self.logger.info(f"📐 Embedding dimension: {self.model.get_sentence_embedding_dimension()}")
            self.logger.info(f"📏 Max sequence length: {self.model.max_seq_length}")

        except Exception as e:
            self.logger.error(f"❌ Failed to load model: {e}")
            raise

    def generate_embeddings(self, texts: List[str], use_cache: bool = True) -> np.ndarray:
        """Encode ``texts`` and return the embeddings as a NumPy array.

        Args:
            texts: Texts to encode.
            use_cache: When True, return the stored result if this exact
                sequence of texts was encoded before, and store new results.

        Returns:
            The array produced by ``SentenceTransformer.encode``. A cache hit
            returns the stored array object itself, so callers should not
            modify it in place.

        Raises:
            Exception: Any error raised while encoding is logged and re-raised.
        """
        if self.model is None:
            self.load_model()

        # Key on the texts themselves rather than on hash(tuple(texts)):
        # the dict then compares keys for equality, so two different inputs
        # whose hashes collide can never be served each other's embeddings.
        cache_key = tuple(texts) if use_cache else None

        if use_cache and cache_key in self._cache:
            self.logger.info("📋 Using cached embeddings")
            return self._cache[cache_key]

        self.logger.info(f"🔄 Generating embeddings for {len(texts)} texts...")
        self.logger.info("⏰ This might take a few minutes, but it's worth the wait...")
        start_time = time.time()

        try:
            embeddings = self.model.encode(
                texts,
                batch_size=self.config.batch_size,
                show_progress_bar=True,
                device=device.type,
                normalize_embeddings=self.config.normalize_embeddings,
                convert_to_numpy=True
            )

            duration = time.time() - start_time
            self.logger.info(f"✅ Embeddings generated in {duration:.2f}s")
            self.logger.info(f"📊 Shape: {embeddings.shape}")
            self.logger.info(f"💾 Memory: {embeddings.nbytes / 1e6:.1f}MB")

            if use_cache:
                self._cache[cache_key] = embeddings

            return embeddings

        except Exception as e:
            self.logger.error(f"❌ Failed to generate embeddings: {e}")
            raise

    def clear_cache(self):
        """Drop every cached embedding array."""
        self._cache.clear()
        self.logger.info("🗑️ Cache cleared")

# Build the shared EmbeddingGenerator from the model section of the active config.
# The SentenceTransformer itself is not loaded here; that happens on the first
# call to generate_embeddings().
embedding_gen = EmbeddingGenerator(config.model)
print("✅ Embedding generator initialized")

## 🏋️ Enhanced Model Training Pipeline

In [None]:
class ModelTrainer:
    """Train several classifiers on embeddings and keep the best one.

    Each classifier is evaluated with stratified k-fold cross-validation,
    via ``GridSearchCV`` when a hyperparameter grid is configured for it.
    The model with the highest mean CV score is stored in ``best_model``.

    NOTE: the train/test split settings and the random seed are read from the
    module-level ``config`` object (``config.data``), not from the
    ``TrainingConfig`` passed to the constructor.
    """

    def __init__(self, config: TrainingConfig):
        self.config = config
        self.logger = logging.getLogger(self.__class__.__name__)
        self.label_encoder = LabelEncoder()
        self.best_model = None
        self.best_params = None
        self.results = {}
        # Filled in by prepare_data(); defined here so the attribute always exists.
        self.class_weight_dict = {}

    def prepare_data(self, embeddings: np.ndarray, labels: List[str]) -> Tuple:
        """Encode the labels and make a stratified train/test split.

        Also fits ``self.label_encoder`` and stores balanced class weights
        for the training labels in ``self.class_weight_dict``.

        Args:
            embeddings: Feature matrix aligned with ``labels``.
            labels: Genre name for each row of ``embeddings``.

        Returns:
            ``(X_train, X_test, y_train, y_test)`` with integer-encoded labels.
        """
        self.logger.info("📚 Preparing training data...")

        # Encode labels
        encoded_labels = self.label_encoder.fit_transform(labels)

        # Log label mapping
        self.logger.info("🏷️ Label mapping:")
        for i, genre in enumerate(self.label_encoder.classes_):
            count = (encoded_labels == i).sum()
            self.logger.info(f"  • {genre}: {count} movies (label {i})")

        # Split data (stratified so every genre keeps its share in both parts)
        X_train, X_test, y_train, y_test = train_test_split(
            embeddings,
            encoded_labels,
            test_size=config.data.test_size,
            random_state=config.data.random_state,
            stratify=encoded_labels
        )

        self.logger.info(f"📊 Training: {len(X_train)}, Testing: {len(X_test)}")

        # Compute class weights for balanced training
        train_classes = np.unique(y_train)
        class_weights = compute_class_weight(
            'balanced',
            classes=train_classes,
            y=y_train
        )
        self.class_weight_dict = dict(zip(train_classes, class_weights))

        return X_train, X_test, y_train, y_test

    def get_classifiers(self) -> Dict:
        """Return ``{name: (estimator, param_grid)}`` for every available classifier.

        The grid for each name comes from ``config.classifiers``; a missing
        entry yields an empty grid, which means "use default parameters".
        XGBoost is included only when it could be imported.
        """
        classifiers = {
            'RandomForest': (
                RandomForestClassifier(
                    random_state=config.data.random_state,
                    n_jobs=self.config.n_jobs,
                    class_weight='balanced'
                ),
                self.config.classifiers.get('RandomForest', {})
            ),
            'LogisticRegression': (
                LogisticRegression(
                    random_state=config.data.random_state,
                    class_weight='balanced',
                    n_jobs=self.config.n_jobs
                ),
                self.config.classifiers.get('LogisticRegression', {})
            )
        }

        if HAS_XGBOOST:
            classifiers['XGBoost'] = (
                xgb.XGBClassifier(
                    random_state=config.data.random_state,
                    eval_metric='mlogloss',
                    n_jobs=self.config.n_jobs
                ),
                self.config.classifiers.get('XGBoost', {})
            )

        return classifiers

    def train_with_cv(self, X_train: np.ndarray, y_train: np.ndarray) -> Dict:
        """Cross-validate every classifier and select the best one.

        A classifier that raises during training is logged and skipped, so
        one failing model does not abort the run.

        Args:
            X_train: Training feature matrix.
            y_train: Integer-encoded training labels.

        Returns:
            ``{name: result}`` for each classifier that trained, where
            ``result`` holds the fitted ``model``, the mean ``cv_score``, the
            standard deviation ``cv_std`` across folds, ``best_params`` and
            ``training_time`` in seconds. Also stored in ``self.results``.
        """
        self.logger.info("🏋️ Training models with cross-validation...")
        self.logger.info("⏰ Note: This training step will take 1-2 minutes - optimized for speed! ☕")

        classifiers = self.get_classifiers()
        cv = StratifiedKFold(n_splits=self.config.cv_folds, shuffle=True,
                             random_state=config.data.random_state)

        results = {}

        for name, (classifier, param_grid) in classifiers.items():
            self.logger.info(f"🔄 Training {name}...")
            start_time = time.time()

            try:
                if param_grid:  # Hyperparameter tuning
                    grid_search = GridSearchCV(
                        classifier,
                        param_grid,
                        cv=cv,
                        scoring=self.config.scoring,
                        n_jobs=self.config.n_jobs,
                        verbose=0
                    )
                    grid_search.fit(X_train, y_train)

                    best_estimator = grid_search.best_estimator_
                    best_score = grid_search.best_score_
                    best_params = grid_search.best_params_
                    # Fold-to-fold spread of the winning parameter combination.
                    best_std = float(
                        grid_search.cv_results_['std_test_score'][grid_search.best_index_]
                    )

                else:  # Default parameters
                    scores = cross_val_score(classifier, X_train, y_train,
                                             cv=cv, scoring=self.config.scoring)
                    classifier.fit(X_train, y_train)

                    best_estimator = classifier
                    best_score = scores.mean()
                    best_params = {}
                    best_std = float(scores.std())

                duration = time.time() - start_time

                results[name] = {
                    'model': best_estimator,
                    'cv_score': best_score,
                    'cv_std': best_std,
                    'best_params': best_params,
                    'training_time': duration
                }

                self.logger.info(f"✅ {name}: CV Score = {best_score:.3f} ({duration:.1f}s)")
                if best_params:
                    self.logger.info(f"   Best params: {best_params}")

            except Exception as e:
                self.logger.error(f"❌ Failed to train {name}: {e}")
                continue

        # Find best model
        if results:
            best_name = max(results, key=lambda x: results[x]['cv_score'])
            self.best_model = results[best_name]['model']
            self.best_params = results[best_name]['best_params']

            self.logger.info(f"🏆 Best model: {best_name} (CV: {results[best_name]['cv_score']:.3f})")
        else:
            # Make the failure visible; best_model keeps its previous value.
            self.logger.error("❌ No classifier trained successfully; best_model is unchanged")

        self.results = results
        return results

# Build the shared ModelTrainer from the training section of the active config.
# No model is trained here; construction only sets up the label encoder and
# empty result slots.
trainer = ModelTrainer(config.training)
print("✅ Model trainer initialized")

## 💾 Model Persistence

In [None]:
class ModelManager:
    """Save and load a trained classifier together with its metadata.

    Every saved model gets its own ``model_<timestamp>`` directory under
    ``output_dir`` containing the classifier, the label encoder, a JSON
    metadata file and the configuration used for the run.
    """

    def __init__(self, output_dir: str):
        self.output_dir = Path(output_dir)
        # parents=True so a nested output path works, matching os.makedirs.
        self.output_dir.mkdir(parents=True, exist_ok=True)
        self.logger = logging.getLogger(self.__class__.__name__)

    def save_model(self, trainer: ModelTrainer, embedding_gen: EmbeddingGenerator,
                   config: Config, metrics: Dict) -> str:
        """Write the best model, label encoder, metadata and config to disk.

        Args:
            trainer: Trainer holding ``best_model``, ``best_params`` and the
                fitted ``label_encoder``.
            embedding_gen: Embedding generator of the run. Its model is not
                saved; only the model name is recorded in the metadata.
            config: Configuration the model was trained with.
            metrics: Evaluation metrics to store in the metadata.

        Returns:
            Path of the created model directory, as a string.

        Raises:
            Exception: Any error raised while writing is logged and re-raised.
        """
        from dataclasses import asdict  # local import: only needed here

        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        model_dir = self.output_dir / f"model_{timestamp}"
        model_dir.mkdir(parents=True, exist_ok=True)

        try:
            # Save best model
            joblib.dump(trainer.best_model, model_dir / "classifier.pkl")

            # Save label encoder
            joblib.dump(trainer.label_encoder, model_dir / "label_encoder.pkl")

            # Save metadata
            metadata = {
                'timestamp': timestamp,
                'config': {
                    'embedding_model': config.model.embedding_model,
                    # Recorded so inference can embed text the same way as training.
                    'normalize_embeddings': config.model.normalize_embeddings,
                    'max_genres': config.data.max_genres,
                    'test_size': config.data.test_size
                },
                'best_params': trainer.best_params,
                'metrics': metrics,
                'label_classes': trainer.label_encoder.classes_.tolist()
            }

            with open(model_dir / "metadata.json", 'w', encoding='utf-8') as f:
                json.dump(metadata, f, indent=2, default=str)

            # Save the configuration that was actually used for this run,
            # not the built-in defaults.
            with open(model_dir / "config.yaml", 'w', encoding='utf-8') as f:
                yaml.dump(asdict(config), f, default_flow_style=False)

            self.logger.info(f"💾 Model saved to {model_dir}")
            return str(model_dir)

        except Exception as e:
            self.logger.error(f"❌ Failed to save model: {e}")
            raise

    def load_model(self, model_path: str) -> Tuple:
        """Load a model directory written by ``save_model``.

        SECURITY: ``joblib.load`` unpickles the files, which can execute
        arbitrary code. Only load model directories from a trusted source.

        Args:
            model_path: Path of a ``model_<timestamp>`` directory.

        Returns:
            ``(classifier, label_encoder, metadata)``.

        Raises:
            Exception: Any error raised while reading is logged and re-raised.
        """
        model_dir = Path(model_path)

        try:
            # Load model components
            classifier = joblib.load(model_dir / "classifier.pkl")
            label_encoder = joblib.load(model_dir / "label_encoder.pkl")

            # Load metadata
            with open(model_dir / "metadata.json", 'r', encoding='utf-8') as f:
                metadata = json.load(f)

            self.logger.info(f"📂 Model loaded from {model_dir}")
            return classifier, label_encoder, metadata

        except Exception as e:
            self.logger.error(f"❌ Failed to load model: {e}")
            raise

# Build the shared ModelManager; its constructor creates config.output_dir
# if the directory does not exist yet.
model_manager = ModelManager(config.output_dir)
print("✅ Model manager initialized")

## 🚀 Main Training Pipeline

In [None]:
# End-to-end training run. The steps are order-dependent: each one consumes
# the output of the previous step.

# Load and process data: read the CSV, then reduce it to aligned
# (overview text, primary genre) lists.
logger.info("📊 Starting data processing pipeline...")
df = processor.load_dataset()
texts, labels = processor.process_dataset(df)

# Generate embeddings (loads the embedding model on first use)
logger.info("🧠 Generating embeddings...")
embeddings = embedding_gen.generate_embeddings(texts)

# Prepare training data: encode the labels and make a stratified split
X_train, X_test, y_train, y_test = trainer.prepare_data(embeddings, labels)

# Train models: cross-validate each classifier on the training split only;
# the test split is held back for the evaluation step.
logger.info("🏋️ Training models...")
training_results = trainer.train_with_cv(X_train, y_train)

# Per-classifier summary: mean CV score and wall-clock training time
print("\n🎉 Training pipeline completed!")
print(f"📊 Results summary:")
for name, result in training_results.items():
    print(f"  • {name}: {result['cv_score']:.3f} ({result['training_time']:.1f}s)")

## 📊 Enhanced Evaluation

In [None]:
class ModelEvaluator:
    """Score the trainer's best model on held-out data and visualise the results."""

    def __init__(self, trainer: ModelTrainer):
        self.trainer = trainer
        self.logger = logging.getLogger(self.__class__.__name__)

    def evaluate_model(self, X_test: np.ndarray, y_test: np.ndarray) -> Dict:
        """Predict on the test set and compute summary metrics.

        Returns:
            A dict with 'metrics' (accuracy plus weighted precision, recall
            and F1), 'classification_report' (per-class report as a dict),
            'predictions' and 'probabilities'.
        """
        self.logger.info("📊 Evaluating best model...")

        model = self.trainer.best_model
        y_pred = model.predict(X_test)
        y_pred_proba = model.predict_proba(X_test)

        # Overall accuracy and support-weighted averages across genres
        accuracy = accuracy_score(y_test, y_pred)
        precision, recall, f1, _ = precision_recall_fscore_support(
            y_test, y_pred, average='weighted'
        )
        metrics = dict(accuracy=accuracy, precision=precision, recall=recall, f1_score=f1)

        # Per-genre breakdown, labelled with the original genre names
        class_report = classification_report(
            y_test,
            y_pred,
            target_names=self.trainer.label_encoder.classes_,
            output_dict=True,
        )

        self.logger.info(f"🎯 Test Accuracy: {accuracy:.3f} ({accuracy*100:.1f}%)")
        self.logger.info(f"📊 Precision: {precision:.3f}, Recall: {recall:.3f}, F1: {f1:.3f}")

        return {
            'metrics': metrics,
            'classification_report': class_report,
            'predictions': y_pred,
            'probabilities': y_pred_proba,
        }

    def plot_confusion_matrix(self, y_test: np.ndarray, y_pred: np.ndarray, save_path: str = None):
        """Draw the confusion matrix as an annotated heatmap.

        The figure is written to ``save_path`` when one is given, and is
        always shown.
        """
        genre_names = self.trainer.label_encoder.classes_
        matrix = confusion_matrix(y_test, y_pred)

        plt.figure(figsize=(12, 10))
        sns.heatmap(
            matrix,
            annot=True,
            fmt='d',
            cmap='Blues',
            xticklabels=genre_names,
            yticklabels=genre_names,
            cbar_kws={'label': 'Count'},
        )

        test_accuracy = accuracy_score(y_test, y_pred)
        plt.title(
            f'Confusion Matrix - MPNet Movie Genre Classifier\nTest Accuracy: {test_accuracy:.1%}',
            fontsize=14,
            fontweight='bold',
        )
        plt.xlabel('Predicted Genre', fontsize=12)
        plt.ylabel('True Genre', fontsize=12)
        plt.xticks(rotation=45, ha='right')
        plt.yticks(rotation=0)
        plt.tight_layout()

        if save_path:
            plt.savefig(save_path, dpi=300, bbox_inches='tight')
            self.logger.info(f"📊 Confusion matrix saved to {save_path}")

        plt.show()

    def plot_per_genre_performance(self, class_report: Dict, save_path: str = None):
        """Draw a 2x2 grid of bar charts: precision, recall, F1 and support per genre.

        ``class_report`` is the dict form of a classification report; its
        summary rows are skipped so that only per-genre entries are plotted.
        """
        summary_rows = ('accuracy', 'macro avg', 'weighted avg')
        genres = [key for key in class_report if key not in summary_rows]

        # (report field, panel title, bar colour, clamp y-axis to [0, 1]?)
        panel_specs = [
            ('precision', 'Precision by Genre', 'skyblue', True),
            ('recall', 'Recall by Genre', 'lightcoral', True),
            ('f1-score', 'F1-Score by Genre', 'lightgreen', True),
            ('support', 'Support (Number of Samples) by Genre', 'gold', False),
        ]

        # Collect every series before any figure is created.
        series = {
            field: [class_report[genre][field] for genre in genres]
            for field, _, _, _ in panel_specs
        }

        _, axes = plt.subplots(2, 2, figsize=(15, 10))

        # axes.flat walks the grid row by row: [0,0], [0,1], [1,0], [1,1]
        for ax, (field, title, color, is_ratio) in zip(axes.flat, panel_specs):
            ax.bar(genres, series[field], color=color)
            ax.set_title(title)
            if is_ratio:
                ax.set_ylim(0, 1)
            ax.tick_params(axis='x', rotation=45)

        plt.tight_layout()

        if save_path:
            plt.savefig(save_path, dpi=300, bbox_inches='tight')
            self.logger.info(f"📊 Performance plot saved to {save_path}")

        plt.show()

# Evaluate the trainer's best model on the held-out test split
evaluator = ModelEvaluator(trainer)
evaluation_results = evaluator.evaluate_model(X_test, y_test)

# Plot results (no save_path is given, so the figures are only displayed)
evaluator.plot_confusion_matrix(y_test, evaluation_results['predictions'])
evaluator.plot_per_genre_performance(evaluation_results['classification_report'])

print("\n✅ Evaluation completed!")

## 💾 Save Model

In [None]:
# Save the trained model: writes the classifier, label encoder, metadata and
# config into a new timestamped directory and returns that directory's path.
model_path = model_manager.save_model(trainer, embedding_gen, config, evaluation_results['metrics'])
print(f"💾 Model saved to: {model_path}")

## 🧪 Production-Ready Prediction Interface

In [None]:
class MovieGenrePredictor:
    """Predict a movie's genre from a free-text plot description.

    The predictor either loads a saved model directory or, when no path is
    given, reuses the objects trained in the current session (the
    module-level ``trainer``, ``embedding_gen`` and ``config``).
    """

    def __init__(self, model_path: Optional[str] = None):
        """Create a predictor.

        Args:
            model_path: Directory written by ``ModelManager.save_model``.
                When omitted, the in-session trained model is used.
        """
        self.logger = logging.getLogger(self.__class__.__name__)
        # Defaults; overwritten below depending on where the model comes from.
        self.config = None
        self.normalize_embeddings = True

        if model_path:
            self.load_model(model_path)
        else:
            # Use current session models
            if embedding_gen.model is None:
                # The embedding model is loaded lazily; make sure it exists.
                embedding_gen.load_model()
            self.classifier = trainer.best_model
            self.label_encoder = trainer.label_encoder
            self.embedding_model = embedding_gen.model
            self.config = config
            # Embed inputs the same way the training texts were embedded.
            self.normalize_embeddings = embedding_gen.config.normalize_embeddings

    def load_model(self, model_path: str):
        """Load the classifier, label encoder and embedding model from ``model_path``."""
        self.classifier, self.label_encoder, metadata = model_manager.load_model(model_path)
        saved_config = metadata['config']

        # Load embedding model
        self.embedding_model = SentenceTransformer(saved_config['embedding_model'])

        # Metadata files that do not record the flag fall back to True, which
        # matches the value this class previously always used.
        self.normalize_embeddings = bool(saved_config.get('normalize_embeddings', True))

        if device.type == 'cuda':
            self.embedding_model = self.embedding_model.to(device)

    def predict(self, plot_description: str, return_probabilities: bool = True,
                top_n: int = 3) -> Dict:
        """Predict the genre of a single plot description.

        Args:
            plot_description: Plot text; at least 10 non-blank characters.
            return_probabilities: When True, include the ``top_n`` most
                likely genres with their probabilities.
            top_n: Number of genres to list in 'confidence_scores'.

        Returns:
            A dict with 'predicted_genre', 'plot' (the input, truncated to
            100 characters plus '...' when longer) and, if requested,
            'confidence_scores' sorted from most to least likely.

        Raises:
            ValueError: If the plot description is empty or too short.
            Exception: Any other failure is logged and re-raised.
        """
        try:
            # Validate input
            if not plot_description or len(plot_description.strip()) < 10:
                raise ValueError("Plot description too short (minimum 10 characters)")

            # Generate embedding with the same normalization as in training
            plot_embedding = self.embedding_model.encode(
                [plot_description],
                device=device.type,
                normalize_embeddings=self.normalize_embeddings
            )

            # Get prediction
            prediction = self.classifier.predict(plot_embedding)[0]
            predicted_genre = self.label_encoder.classes_[prediction]

            result = {
                'predicted_genre': predicted_genre,
                'plot': plot_description[:100] + '...' if len(plot_description) > 100 else plot_description
            }

            if return_probabilities:
                probabilities = self.classifier.predict_proba(plot_embedding)[0]
                # Indices of the top_n highest probabilities, best first
                top_indices = np.argsort(probabilities)[::-1][:top_n]

                result['confidence_scores'] = [
                    {
                        'genre': self.label_encoder.classes_[idx],
                        'confidence': float(probabilities[idx])
                    }
                    for idx in top_indices
                ]

            return result

        except Exception as e:
            self.logger.error(f"Prediction failed: {e}")
            raise

    def predict_batch(self, plot_descriptions: List[str]) -> List[Dict]:
        """Run ``predict`` on each plot and return the results in input order."""
        return [self.predict(plot) for plot in plot_descriptions]

# Initialize predictor. No model_path is given, so it reuses the model,
# label encoder and embedding model from the current session.
predictor = MovieGenrePredictor()
print("✅ Production predictor ready!")

# Quick test with one example
test_plot = "A young wizard discovers his magical heritage and attends a school for magic"
result = predictor.predict(test_plot, top_n=3)

# confidence_scores is sorted best-first, so entry 0 is the predicted genre's confidence
print(f"\n🎬 Quick Test:")
print(f"Plot: {result['plot']}")
print(f"🎯 Predicted: {result['predicted_genre']} ({result['confidence_scores'][0]['confidence']:.1%})")

print(f"\n✅ Predictor working! Use interactive_movie_testing() for more tests.")

## 🎮 Interactive Testing Interface

In [None]:
def interactive_movie_testing():
    """Run a console loop that classifies movie plots typed by the user.

    The loop depends on notebook-level globals created by earlier cells:
    ``trainer``, ``embedding_gen``, ``X_test``, ``y_test``, ``X_train``,
    ``evaluation_results`` and ``predictor``. If any of them is missing,
    a hint is printed and the function returns without prompting.

    Prompt commands:
        quit / exit / q -- leave the loop
        stats           -- print the stored evaluation metrics

    Any other input of at least 10 characters is passed to
    ``predictor.predict(plot, top_n=5)`` and the returned confidence
    scores are rendered as text bars. Ctrl-C or end-of-input ends the
    session; any other error is reported and the loop prompts again.

    Returns:
        None
    """
    # `predictor` is part of the check because the prediction step below
    # calls it; without it every prompt would fail with a NameError.
    required_vars = [
        'trainer', 'embedding_gen', 'X_test', 'y_test', 'X_train',
        'evaluation_results', 'predictor',
    ]
    missing_vars = [var for var in required_vars if var not in globals()]

    if missing_vars:
        print(f"❌ Error: Missing required variables: {', '.join(missing_vars)}")
        print("💡 Please run all previous cells first to train the model!")
        print("🔄 Make sure you've completed the training pipeline before testing.")
        return

    medals = ("🥇", "🥈", "🥉")  # markers for the top three ranks
    bar_width = 20               # characters in a full confidence bar

    print("\n🎮 INTERACTIVE MOVIE GENRE PREDICTION")
    print("=" * 50)
    print("🎬 Enter movie plots to see AI predictions!")
    print("💡 Type 'quit' to exit, 'stats' for model performance")
    print()

    while True:
        try:
            plot = input("🎬 Enter movie plot (or 'quit'/'stats'): ").strip()
            command = plot.lower()

            if command in ('quit', 'exit', 'q'):
                print("👋 Thanks for testing!")
                break

            if command == 'stats':
                metrics = evaluation_results['metrics']
                print("\n📊 MODEL PERFORMANCE:")
                print(f"   🎯 Test Accuracy: {metrics['accuracy']:.1%}")
                print(f"   📈 Precision: {metrics['precision']:.3f}")
                print(f"   📉 Recall: {metrics['recall']:.3f}")
                print(f"   🎪 F1-Score: {metrics['f1_score']:.3f}")
                print(f"   🎭 Genres: {len(trainer.label_encoder.classes_)}")
                print(f"   📚 Training samples: {len(X_train)}")
                print(f"   🧪 Test samples: {len(X_test)}")
                continue

            if len(plot) < 10:
                print("⚠️ Please enter a longer plot (at least 10 characters)")
                continue

            # Make prediction
            result = predictor.predict(plot, top_n=5)

            print("\n🤖 AI Analysis:")
            print(f"🎯 Predicted Genre: {result['predicted_genre']}")
            print("📊 Confidence Breakdown:")

            # The score list is only present when the predictor returned
            # probabilities, so fall back to an empty breakdown.
            for i, score in enumerate(result.get('confidence_scores', [])):
                confidence = score['confidence']
                emoji = medals[i] if i < len(medals) else f"{i+1}."
                bar_length = int(confidence * bar_width)
                bar = "█" * bar_length + "░" * (bar_width - bar_length)
                print(f"   {emoji} {score['genre']:<15} {bar} {confidence:.3f} ({confidence*100:.1f}%)")

            print()

        except (KeyboardInterrupt, EOFError):
            # EOFError means input() has nothing left to read (closed stdin,
            # non-interactive kernel). Treat it like Ctrl-C; otherwise the
            # generic handler below would retry the prompt forever.
            print("\n👋 Session interrupted. Goodbye!")
            break
        except Exception as e:
            print(f"❌ Error: {e}")
            print("💡 Please try again with a different plot.")

# ⚠️ IMPORTANT: Uncomment the line below to start interactive testing
# NOTE: Make sure you've run all previous cells first!
# interactive_movie_testing()

## 📋 Summary & Next Steps

In [None]:
## 🚀 Deployment Preparation


In [None]:
# Verify that the ONNX export toolchain is importable (Cell 2 installs it)
deployment_packages = ['skl2onnx', 'onnx', 'onnxruntime']

all_available = True
for package in deployment_packages:
    try:
        # Distribution names may use '-', module names use '_'
        __import__(package.replace('-', '_'))
    except ImportError:
        # A single missing module fails the check; no need to probe further
        all_available = False
        break

print(
    "✅ All deployment dependencies available!"
    if all_available
    else "⚠️ Some deployment dependencies missing. Run Cell 2 to install all packages."
)


In [None]:
# Convert trained model to ONNX format for deployment
from skl2onnx import convert_sklearn
from skl2onnx.common.data_types import FloatTensorType

def convert_model_to_onnx(classifier, output_path="movie_genre_classifier.onnx", n_features=None):
    """Convert a fitted classifier to ONNX and write it to ``output_path``.

    Args:
        classifier: Fitted estimator handed to ``convert_sklearn``.
        output_path: Destination file for the serialized ONNX model.
        n_features: Width of the float input tensor. When ``None``, the
            classifier's ``n_features_in_`` attribute is used if present,
            otherwise 768 (the width that used to be hard-coded here).

    Returns:
        ``output_path``, unchanged.

    Raises:
        ValueError: If the resolved feature count is smaller than 1.
        Exception: Conversion or file errors are reported and re-raised.
    """
    if n_features is None:
        # Prefer the width the model was actually fitted with, so the ONNX
        # input signature cannot drift from the embedding size.
        n_features = getattr(classifier, "n_features_in_", 768)
    n_features = int(n_features)
    if n_features < 1:
        raise ValueError(f"n_features must be a positive integer, got {n_features}")

    try:
        # Batch dimension stays dynamic (None); only the feature width is fixed
        initial_type = [('float_input', FloatTensorType([None, n_features]))]
        onnx_model = convert_sklearn(classifier, initial_types=initial_type)

        with open(output_path, "wb") as f:
            f.write(onnx_model.SerializeToString())
        return output_path
    except Exception as e:
        print(f"❌ ONNX conversion failed: {e}")
        raise

# Export the best classifier to ONNX, provided a trained one exists
if 'trainer' not in globals() or trainer.best_model is None:
    print("⚠️ No trained model found. Run the training pipeline first!")
else:
    onnx_path = convert_model_to_onnx(trainer.best_model)
    print(f"✅ ONNX model saved: {onnx_path}")


In [None]:
## 🎯 Next Steps: Production Deployment

### **🏭 For OpenShift AI Model Serving:**
1. **Upload ONNX model** to MinIO or S3 storage
2. **Create an InferenceService** on the multi-model serving platform  
3. **Deploy Flask web app** for user interface
4. **Set up monitoring** and model management

### **📚 Deployment Resources:**
- **Complete Guide**: See `openshift_ai_movie_blog.txt` for full deployment walkthrough
- **MinIO Setup**: S3-compatible storage configuration
- **Model Serving**: ONNX model deployment on OpenShift AI
- **Web App**: Containerized Flask application with UI

### **🛠️ Alternative Deployments:**
- **Local API**: Use `predictor.predict()` in Flask/FastAPI
- **Batch Processing**: Use `predictor.predict_batch()` for bulk predictions  
- **Cloud Platforms**: Deploy to AWS SageMaker, Azure ML, or GCP AI Platform


In [None]:
# Closing report: headline metrics, what changed, and where to go next
print(
    "🎉 IMPROVED MPNET MOVIE CLASSIFIER - COMPLETE!",
    "=" * 60,
    "🏆 Final Performance:",
    sep="\n",
)
print(f"   • Test Accuracy: {evaluation_results['metrics']['accuracy']:.1%}")
print(f"   • Precision: {evaluation_results['metrics']['precision']:.3f}")
print(f"   • Recall: {evaluation_results['metrics']['recall']:.3f}")
print(f"   • F1-Score: {evaluation_results['metrics']['f1_score']:.3f}")
print(
    "\n🔧 Technical Improvements:",
    "   ✅ Security: Replaced eval() with json.loads()",
    "   ✅ Modularity: Separated into classes and modules",
    "   ✅ Error Handling: Comprehensive validation",
    "   ✅ Model Persistence: Save/load functionality",
    "   ✅ Configuration: YAML-based config management",
    sep="\n",
)
print(f"   ✅ Cross-Validation: {config.training.cv_folds}-fold CV with hyperparameter tuning")
print(
    "   ✅ Logging: Structured logging system",
    "   ✅ Performance: Optimized embeddings with caching",
    sep="\n",
)
print(f"\n💾 Model saved to: {model_path}")
print(
    "\n🚀 Ready for Production:",
    "   • Call predictor.predict('your plot') for single predictions",
    "   • Call predictor.predict_batch([plots]) for batch predictions",
    "   • Call interactive_movie_testing() for interactive mode",
    "   • Model can be loaded with MovieGenrePredictor(model_path)",
    "\n📈 Next Steps:",
    "   • Run deployment preparation cells (Cell 29-30) for ONNX conversion",
    "   • Follow openshift_ai_movie_blog.txt for OpenShift AI deployment",
    "   • Deploy as REST API with Flask/FastAPI",
    "   • Add A/B testing framework",
    "   • Implement model monitoring",
    "   • Add data drift detection",
    "   • Create automated retraining pipeline",
    sep="\n",
)