# 🤖 SpaceX Launch Success Prediction - Model Training

## Objectives:
1. Prepare data for modeling
2. Train baseline models
3. Evaluate and compare models (hold-out test metrics and 5-fold cross-validation)
4. Perform hyperparameter tuning
5. Analyze feature importance
6. Save the best model

In [None]:
# Import libraries
import sys
# Put ../src on the import path; presumably this is where the local
# data_preprocessing and model_training modules imported below live.
sys.path.append('../src')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# NOTE(review): this hides every warning for the rest of the session,
# including any raised by the modeling helpers during training/tuning.
warnings.filterwarnings('ignore')

from data_preprocessing import prepare_for_modeling, get_feature_target_split
# NOTE(review): the star import is presumably the source of the helpers used
# later without an explicit import (split_data, train_baseline_models,
# evaluate_models, plot_roc_curve, confusion_matrix, cross_validate_model,
# tune_random_forest, tune_xgboost, plot_feature_importance,
# get_model_report, save_model) — confirm against model_training.
from model_training import *

print("✓ Libraries imported successfully")

## 1. Load Cleaned Data

In [None]:
# Read the cleaned launch dataset and report its size and label balance.
cleaned_csv = '../data/spacex_cleaned.csv'
df = pd.read_csv(cleaned_csv)

print(f"Dataset shape: {df.shape}")
overall_success_rate = df['Mission_Success'].mean()
print(f"Success rate: {overall_success_rate:.2%}")

# Final expression of the cell, so the notebook renders the leading rows.
df.head()

## 2. Prepare Data for Modeling

In [None]:
# Build the modeling table, then list every column except the target.
df_model = prepare_for_modeling(df)
print(f"Model dataset shape: {df_model.shape}")

feature_columns = df_model.drop(columns='Mission_Success').columns.tolist()
print(f"\nFeatures: {feature_columns}")

In [None]:
# Separate the feature matrix (X) from the target (y) and show how the
# target values are distributed.
X, y = get_feature_target_split(df_model)

print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")

class_counts = y.value_counts()
print(f"\nClass distribution:\n{class_counts}")

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split reproducible between runs.
X_train, X_test, y_train, y_test = split_data(
    X, y, test_size=0.2, random_state=42
)

print(f"Training set: {X_train.shape}")
print(f"Test set: {X_test.shape}")

# Compare the success rate on each side of the split.
train_success_rate = y_train.mean()
print(f"\nTrain success rate: {train_success_rate:.2%}")
test_success_rate = y_test.mean()
print(f"Test success rate: {test_success_rate:.2%}")

## 3. Train Baseline Models

In [None]:
# Train baseline models
# Fit the baseline models on the training split only. `models` is later
# iterated with .items() as name -> fitted model, so it is used as a dict.
print("Training baseline models...\n")
models = train_baseline_models(X_train, y_train)
print("\n✓ All models trained successfully")

## 4. Evaluate Models

In [None]:
# Score every baseline model on the held-out test split and print the
# resulting table between two horizontal rules.
results = evaluate_models(models, X_test, y_test)

rule = "=" * 80
print("\nModel Performance Comparison:")
print(rule)
print(results.to_string(index=False))
print(rule)

In [None]:
# Grouped bar chart of the evaluation table: one group per metric, one bar
# per model.
# Reshape to long form first: one row per (Model, Metric, Score).
scores_long = results.melt(
    id_vars='Model',
    var_name='Metric',
    value_name='Score',
)

fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(data=scores_long, x='Metric', y='Score', hue='Model', ax=ax)

ax.set_title('Model Performance Comparison', fontsize=14, fontweight='bold')
# Leave headroom above 1.0 on the y-axis.
ax.set_ylim(0, 1.1)
# Anchor the legend outside the plotting area, to the right of the axes.
ax.legend(title='Model', bbox_to_anchor=(1.05, 1), loc='upper left')

plt.tight_layout()
plt.show()

In [None]:
# Plot ROC curves
# Draw ROC curves for the baseline models using the held-out test split.
# NOTE(review): presumably one curve per entry in `models` — confirm in
# model_training.plot_roc_curve.
fig = plot_roc_curve(models, X_test, y_test)
plt.show()

In [None]:
# Confusion matrices
# One heatmap panel per trained model. The grid is sized from len(models)
# instead of a fixed 3 so that adding or removing a baseline neither raises
# an IndexError nor leaves empty panels. Each panel keeps the original 6x5
# inch footprint (3 models -> 18x5).
n_panels = max(len(models), 1)  # subplots() needs at least one column
# squeeze=False always returns a 2-D array of axes, even for a single panel.
fig, axes = plt.subplots(1, n_panels, figsize=(6 * n_panels, 5), squeeze=False)

# NOTE(review): confusion_matrix is not imported explicitly in this notebook;
# presumably it arrives via `from model_training import *` — confirm.
for cm_ax, (name, model) in zip(axes[0], models.items()):
    y_pred = model.predict(X_test)
    cm = confusion_matrix(y_test, y_pred)
    # Tick labels assume a binary target ordered [failure, success] — confirm.
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=cm_ax,
                xticklabels=['Failure', 'Success'],
                yticklabels=['Failure', 'Success'])
    cm_ax.set_title(f'{name}\nConfusion Matrix')
    cm_ax.set_ylabel('True Label')
    cm_ax.set_xlabel('Predicted Label')

plt.tight_layout()
plt.show()

## 5. Cross-Validation

In [None]:
# Run 5-fold cross-validation for every baseline model on the full X / y,
# keeping each model's summary in cv_results and printing it as we go.
print("Performing 5-Fold Cross-Validation...\n")
cv_results = {}

for model_name, estimator in models.items():
    fold_summary = cross_validate_model(estimator, X, y, cv=5)
    cv_results[model_name] = fold_summary

    print(f"{model_name}:")
    mean_acc = fold_summary['mean_accuracy']
    std_acc = fold_summary['std_accuracy']
    print(f"  Mean Accuracy: {mean_acc:.4f} (+/- {std_acc:.4f})")
    print(f"  Individual Folds: {fold_summary['scores']}")
    print()

## 6. Hyperparameter Tuning

In [None]:
# Tune Random Forest
# Search hyperparameters using the training split only. The helper returns a
# pair: the tuned model (rf_best) and the chosen parameters (rf_params).
# NOTE(review): search strategy and parameter grid live in
# model_training.tune_random_forest and are not visible here.
print("Tuning Random Forest...")
rf_best, rf_params = tune_random_forest(X_train, y_train)
print(f"\nBest parameters: {rf_params}")

In [None]:
# Tune XGBoost
# Search hyperparameters using the training split only. The helper returns a
# pair: the tuned model (xgb_best) and the chosen parameters (xgb_params).
# NOTE(review): search strategy and parameter grid live in
# model_training.tune_xgboost and are not visible here.
print("Tuning XGBoost...")
xgb_best, xgb_params = tune_xgboost(X_train, y_train)
print(f"\nBest parameters: {xgb_params}")

In [None]:
# Score the two tuned models on the same held-out test split used for the
# baselines. The dict keys are the display names of the tuned models.
tuned_models = {}
tuned_models['Random Forest (Tuned)'] = rf_best
tuned_models['XGBoost (Tuned)'] = xgb_best

tuned_results = evaluate_models(tuned_models, X_test, y_test)

rule = "=" * 80
print("\nTuned Model Performance:")
print(rule)
print(tuned_results.to_string(index=False))
print(rule)

## 7. Feature Importance Analysis

In [None]:
# Feature importance for Random Forest
# Plot importances from the tuned Random Forest, labelled with X's column
# names; top_n=15 presumably caps how many features are shown — confirm.
fig = plot_feature_importance(rf_best, X.columns, top_n=15)
# NOTE(review): the truthiness guard implies the helper can return a falsy
# value (presumably None when the model exposes no importances) — confirm;
# `if fig is not None:` would state that intent explicitly.
if fig:
    # plt.title targets the current axes — presumably the one the helper
    # just drew on; confirm.
    plt.title('Random Forest - Feature Importance')
    plt.show()

In [None]:
# Feature importance for XGBoost
# Plot importances from the tuned XGBoost model, labelled with X's column
# names; top_n=15 presumably caps how many features are shown — confirm.
fig = plot_feature_importance(xgb_best, X.columns, top_n=15)
# NOTE(review): the truthiness guard implies the helper can return a falsy
# value (presumably None when the model exposes no importances) — confirm;
# `if fig is not None:` would state that intent explicitly.
if fig:
    # plt.title targets the current axes — presumably the one the helper
    # just drew on; confirm.
    plt.title('XGBoost - Feature Importance')
    plt.show()

## 8. Select Best Model and Save

In [None]:
# Stack the baseline and tuned result tables and pick the row with the
# highest ROC-AUC. ignore_index=True gives the combined table a fresh unique
# index, so the idxmax label identifies exactly one row.
all_results = pd.concat([results, tuned_results], ignore_index=True)

best_row = all_results.loc[all_results['ROC-AUC'].idxmax()]
best_model_name = best_row['Model']

print(f"Best Model: {best_model_name}")
print(f"ROC-AUC: {best_row['ROC-AUC']:.4f}")

In [None]:
# Select the best model
# Resolve the winning name against one combined mapping rather than testing
# the name for the substring 'Tuned'. Tuned entries are merged last, so they
# take precedence if a key ever appears in both dicts.
candidate_models = {**models, **tuned_models}
if best_model_name not in candidate_models:
    # Same exception type as a plain dict lookup, with a more useful message.
    raise KeyError(
        f"Best model {best_model_name!r} not found; "
        f"available models: {list(candidate_models)}"
    )
best_model = candidate_models[best_model_name]

# Get comprehensive report
# `report` is indexed by key: 'classification_report' is printed here, and
# the save cell reads 'accuracy', 'precision', 'recall', 'f1' and 'roc_auc'.
report = get_model_report(best_model, X_test, y_test)
print("\nBest Model Classification Report:")
print("=" * 80)
print(report['classification_report'])
print("=" * 80)

In [None]:
# Save the best model
import os

import joblib

# All artifacts go to one directory. Create it up front so a fresh checkout
# (where ../models may not exist yet) does not fail with FileNotFoundError
# when the files are written.
model_dir = '../models'
os.makedirs(model_dir, exist_ok=True)

model_path = f'{model_dir}/best_model.pkl'
feature_names_path = f'{model_dir}/feature_names.pkl'
metadata_path = f'{model_dir}/model_metadata.pkl'

save_model(best_model, model_path)

# Save feature names
# Column names of X, in order, stored alongside the model.
feature_names = X.columns.tolist()
joblib.dump(feature_names, feature_names_path)
print(f"Feature names saved to {feature_names_path}")

# Save model metadata
# Test-set metrics of the selected model plus the feature list. 'roc_auc' is
# read with .get() because the report may not contain it.
metadata = {
    'model_name': best_model_name,
    'accuracy': report['accuracy'],
    'precision': report['precision'],
    'recall': report['recall'],
    'f1': report['f1'],
    'roc_auc': report.get('roc_auc', None),
    'features': feature_names,
}
joblib.dump(metadata, metadata_path)
print(f"Model metadata saved to {metadata_path}")

## 9. Model Summary

### Best Model Performance:
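- Baseline and tuned models were trained and evaluated on the held-out test set, and the model with the highest ROC-AUC was selected as the best model
- Feature importance has been analyzed for the tuned Random Forest and XGBoost models
- The best model, its feature names, and its metadata are saved under `../models/` and ready for deployment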