# 🤖 مدل‌سازی ماشین لرنینگ - خانه‌های بوستون

این نوت‌بوک شامل آموزش و ارزیابی مدل‌های مختلف ماشین لرنینگ برای پیش‌بینی قیمت خانه‌ها است.

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from pathlib import Path
import sys

# Make the src/ directory under the parent of the current working directory
# importable. Guarded so that re-running this cell does not keep appending
# duplicate entries to sys.path.
src_dir = str(Path.cwd().parent / 'src')
if src_dir not in sys.path:
    sys.path.append(src_dir)

# Suppress warnings (all categories) to keep the notebook output readable
warnings.filterwarnings('ignore')

# Set plotting style
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

print("✅ Libraries imported successfully!")

## 📥 بارگذاری داده‌های پیش‌پردازش شده

In [None]:
# Load the project helpers (resolved through the src/ path entry)
import joblib
from data_loader import BostonHousingDataLoader
from preprocessing import DataPreprocessor
from models import ModelTrainer

# Fetch features, target and the feature-name list from the loader
loader = BostonHousingDataLoader()
features, target, feature_names = loader.load_data()

# Pipeline: clip outliers -> train/test split -> scale both partitions
# NOTE(review): outliers are handled on the full dataset before the split —
# confirm this does not leak test-set statistics into training.
preprocessor = DataPreprocessor(scaler_type='standard')
features_clean = preprocessor.handle_outliers(features, strategy='clip')
X_train, X_test, y_train, y_test = preprocessor.split_data(features_clean, target)
X_train_scaled, X_test_scaled = preprocessor.scale_features(X_train, X_test)

# Report the resulting shapes
print("📊 Data loaded and preprocessed successfully!")
for label, value in (
    ("Training set", X_train_scaled.shape),
    ("Testing set", X_test_scaled.shape),
    ("Features", len(feature_names)),
):
    print(f"{label}: {value}")

## 🤖 معرفی مدل‌های ماشین لرنینگ

In [None]:
# Create the trainer that owns the model registry
trainer = ModelTrainer()

# Print a numbered table: registry name on the left, estimator class on the right
available_models = trainer.get_models()
print("🤖 Available Machine Learning Models:")
for idx, (name, model) in enumerate(available_models.items(), start=1):
    model_type = type(model).__name__
    print(f"{idx:2d}. {name:<20} - {model_type}")

total_models = len(available_models)
print(f"\n📊 Total models available: {total_models}")

## 🚀 آموزش مدل‌ها

In [None]:
# Train all models
print("🚀 Training all models...")
print("=" * 50)

# `results` maps each model name to its metrics dict; later cells read the
# 'r2', 'rmse', 'mae', 'cv_mean', 'cv_std' and 'y_pred' entries from it.
results = trainer.train_models(X_train_scaled, y_train, X_test_scaled, y_test)

print(f"\n✅ Training completed!")
print(f"Models trained: {len(results)}")
# Explicit None check: the truthiness of an estimator object is not a reliable
# "a model exists" test (objects that define __len__ are evaluated through it).
print(f"Best model: {trainer.best_model.__class__.__name__ if trainer.best_model is not None else 'None'}")
print(f"Best R² score: {trainer.best_score:.4f}")

## 📊 خلاصه عملکرد مدل‌ها

In [None]:
# Build one summary row per trained model.
# Maps the output column name to the key used inside each `results` entry.
metric_columns = {
    'R²': 'r2',
    'RMSE': 'rmse',
    'MAE': 'mae',
    'CV_R²_Mean': 'cv_mean',
    'CV_R²_Std': 'cv_std',
}
performance_summary = [
    {'Model': model_name, **{column: result[key] for column, key in metric_columns.items()}}
    for model_name, result in results.items()
]

# Best model first
performance_df = pd.DataFrame(performance_summary).sort_values('R²', ascending=False)

print("📊 Model Performance Summary (Sorted by R²):")
display(performance_df.round(4))

## 📈 نمودار مقایسه مدل‌ها

In [None]:
# Plot model comparison
# Delegates entirely to the trainer; which metrics are drawn is decided inside
# ModelTrainer.plot_model_comparison (not visible in this notebook).
trainer.plot_model_comparison()

## 🏆 تحلیل بهترین مدل

In [None]:
# Get best model information
best_model_info = trainer.get_best_model_info()

if best_model_info:
    best_metrics = best_model_info['metrics']
    # Deliberately not called `r2_score`: that name belongs to
    # sklearn.metrics.r2_score, which another cell of this notebook imports.
    best_r2 = best_model_info['r2_score']

    print("🏆 Best Model Analysis:")
    print("=" * 40)
    print(f"Model Name: {best_model_info['name']}")
    print(f"R² Score: {best_r2:.4f}")
    print(f"RMSE: {best_metrics['rmse']:.4f}")
    print(f"MAE: {best_metrics['mae']:.4f}")
    print(f"CV R² Mean: {best_metrics['cv_mean']:.4f}")
    print(f"CV R² Std: {best_metrics['cv_std']:.4f}")

    # Show model type
    model_type = type(best_model_info['model']).__name__
    print(f"Model Type: {model_type}")

    # Performance category: thresholds are checked from highest to lowest and
    # the first one reached wins; anything below 0.4 (or NaN) stays "Poor".
    performance = "Poor ❌"
    for threshold, category in ((0.8, "Excellent 🎯"), (0.6, "Good 👍"), (0.4, "Fair ⚠️")):
        if best_r2 >= threshold:
            performance = category
            break

    print(f"Performance Category: {performance}")
else:
    print("❌ No best model found!")

## 🔍 تحلیل اهمیت ویژگی‌ها

In [None]:
# Collect feature importances for the tree-based models that were trained
tree_models = ['Random Forest', 'XGBoost', 'Gradient Boosting', 'Decision Tree']
feature_importance_results = {}

for model_name in tree_models:
    # Skip models the trainer does not hold
    if model_name not in trainer.models:
        print(f"⚠️ {model_name} not found in trained models")
        continue

    print(f"\n🔍 Getting feature importance for {model_name}...")
    importance = trainer.get_feature_importance(model_name, feature_names)

    # A falsy return value is treated as "no importance available"
    if not importance:
        print(f"❌ Could not get feature importance for {model_name}")
        continue

    feature_importance_results[model_name] = importance
    print(f"✅ Feature importance obtained for {model_name}")

    # Show the first five entries
    # NOTE(review): presumably the trainer returns them sorted by score — confirm.
    print(f"Top 5 features for {model_name}:")
    leading_features = list(importance.items())[:5]
    for rank, (feature, score) in enumerate(leading_features, start=1):
        print(f"  {rank}. {feature}: {score:.4f}")

In [None]:
# Plot feature importance for best tree-based model
if feature_importance_results:
    # Find the tree-based model with the highest test R² among those that
    # produced feature importances.
    # The running best starts at -inf, not -1: R² has no lower bound, so a
    # model scoring at or below -1 would otherwise never be selected.
    best_tree_model = None
    best_tree_score = float('-inf')

    for model_name in tree_models:
        if model_name not in results or model_name not in feature_importance_results:
            continue
        candidate_r2 = results[model_name]['r2']
        # Strict '>' keeps the first model on ties and skips NaN scores
        if candidate_r2 > best_tree_score:
            best_tree_score = candidate_r2
            best_tree_model = model_name

    if best_tree_model is not None:
        print(f"\n📊 Plotting feature importance for best tree-based model: {best_tree_model}")
        trainer.plot_feature_importance(best_tree_model, top_n=10)
    else:
        print("❌ No tree-based model with feature importance found!")
else:
    print("❌ No feature importance results available!")

## ⚙️ بهینه‌سازی هیپرپارامترها

In [None]:
# Grid of Random Forest settings handed to the trainer's tuning routine
rf_param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[10, 20, None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

print("⚙️ Performing hyperparameter tuning for Random Forest...")
rf_tuning_result = trainer.hyperparameter_tuning(
    X_train_scaled, y_train, 'Random Forest', rf_param_grid
)

# A falsy result is reported as a failed tuning run
if not rf_tuning_result:
    print("❌ Random Forest tuning failed!")
else:
    print("✅ Random Forest tuning completed!")
    print(f"Best parameters: {rf_tuning_result['best_params']}")
    print(f"Best CV score: {rf_tuning_result['best_score']:.4f}")

In [None]:
# Grid of XGBoost settings handed to the trainer's tuning routine
xgb_param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[3, 6, 9],
    learning_rate=[0.01, 0.1, 0.2],
    subsample=[0.8, 0.9, 1.0],
)

print("\n⚙️ Performing hyperparameter tuning for XGBoost...")
xgb_tuning_result = trainer.hyperparameter_tuning(
    X_train_scaled, y_train, 'XGBoost', xgb_param_grid
)

# A falsy result is reported as a failed tuning run
if not xgb_tuning_result:
    print("❌ XGBoost tuning failed!")
else:
    print("✅ XGBoost tuning completed!")
    print(f"Best parameters: {xgb_tuning_result['best_params']}")
    print(f"Best CV score: {xgb_tuning_result['best_score']:.4f}")

## 📊 مقایسه قبل و بعد از بهینه‌سازی

In [None]:
# Compare performance before and after tuning
# Imported once per cell run instead of on every loop iteration.
from sklearn.metrics import r2_score, mean_squared_error

print("📊 Performance Comparison (Before vs After Tuning):")
print("=" * 60)

models_to_compare = ['Random Forest', 'XGBoost']
comparison_data = []

for model_name in models_to_compare:
    # Both the baseline metrics and a model object are required; report the
    # model as unavailable instead of skipping it silently.
    if model_name not in results or model_name not in trainer.models:
        print(f"⚠️ {model_name} not available for comparison")
        continue

    # Baseline metrics recorded by the initial training run
    original_r2 = results[model_name]['r2']
    original_rmse = results[model_name]['rmse']

    # NOTE(review): nothing is retrained here. The "tuned" numbers are only
    # meaningful if trainer.hyperparameter_tuning replaced
    # trainer.models[model_name] with the tuned estimator — confirm; otherwise
    # this re-scores the original model and every improvement is ~0.
    tuned_model = trainer.models[model_name]
    y_pred_tuned = tuned_model.predict(X_test_scaled)

    tuned_r2 = r2_score(y_test, y_pred_tuned)
    tuned_rmse = np.sqrt(mean_squared_error(y_test, y_pred_tuned))

    # Signed so that a positive value always means "tuning helped"
    improvement_r2 = tuned_r2 - original_r2
    improvement_rmse = original_rmse - tuned_rmse

    comparison_data.append({
        'Model': model_name,
        'Original_R²': original_r2,
        'Tuned_R²': tuned_r2,
        'R²_Improvement': improvement_r2,
        'Original_RMSE': original_rmse,
        'Tuned_RMSE': tuned_rmse,
        'RMSE_Improvement': improvement_rmse
    })

if comparison_data:
    comparison_df = pd.DataFrame(comparison_data)
    print("\n📊 Detailed Comparison:")
    display(comparison_df.round(4))

    # Show improvements
    print("\n🎯 Summary of Improvements:")
    for _, row in comparison_df.iterrows():
        print(f"{row['Model']}:")
        print(f"  R²: {row['Original_R²']:.4f} → {row['Tuned_R²']:.4f} ({row['R²_Improvement']:+.4f})")
        print(f"  RMSE: {row['Original_RMSE']:.4f} → {row['Tuned_RMSE']:.4f} ({row['RMSE_Improvement']:+.4f})")
        print()
else:
    print("❌ No comparison data available!")

## 🎯 تحلیل خطاها

In [None]:
# Analyze prediction errors for best model
if best_model_info:
    best_model_name = best_model_info['name']
    y_pred_best = results[best_model_name]['y_pred']

    # Signed residuals: actual minus predicted
    errors = y_test - y_pred_best
    mean_error = errors.mean()
    std_error = errors.std()

    # Error statistics
    print(f"🎯 Error Analysis for {best_model_name}:")
    print("=" * 50)
    print(f"Mean Error: {mean_error:.4f}")
    print(f"Std Error: {std_error:.4f}")
    print(f"Min Error: {errors.min():.4f}")
    print(f"Max Error: {errors.max():.4f}")
    print(f"Mean Absolute Error: {np.abs(errors).mean():.4f}")
    print(f"Median Absolute Error: {np.median(np.abs(errors)):.4f}")

    # Error distribution
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))

    # Histogram of errors
    ax1.hist(errors, bins=30, alpha=0.7, edgecolor='black', color='lightcoral')
    ax1.set_title(f'Error Distribution - {best_model_name}')
    ax1.set_xlabel('Prediction Error')
    ax1.set_ylabel('Frequency')
    ax1.grid(True, alpha=0.3)

    # Scatter plot of actual vs predicted, with the identity line and a
    # ±1 standard-deviation band drawn around it
    value_range = [y_test.min(), y_test.max()]
    ax2.scatter(y_test, y_pred_best, alpha=0.6, color='blue')
    ax2.plot(value_range, value_range, 'r--', lw=2)
    ax2.fill_between(value_range,
                     [v - std_error for v in value_range],
                     [v + std_error for v in value_range],
                     alpha=0.2, color='red', label=f'±1 Std Error ({std_error:.2f})')
    ax2.set_xlabel('Actual Values')
    ax2.set_ylabel('Predicted Values')
    ax2.set_title(f'Actual vs Predicted - {best_model_name}')
    ax2.legend()
    ax2.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

    # Check for systematic errors.
    # errors = actual - predicted, so a positive mean means the predictions
    # sit below the actual values (underprediction); a negative mean means
    # they sit above (overprediction).
    print(f"\n🔍 Systematic Error Analysis:")
    if mean_error > 0.1:
        print(f"⚠️ Model tends to underpredict (mean error: {mean_error:.4f})")
    elif mean_error < -0.1:
        print(f"⚠️ Model tends to overpredict (mean error: {mean_error:.4f})")
    else:
        print(f"✅ Model predictions are well-balanced (mean error: {mean_error:.4f})")

    # Check error variance
    if std_error > 5:
        print(f"⚠️ High prediction variance (std error: {std_error:.4f})")
    else:
        print(f"✅ Predictions are consistent (std error: {std_error:.4f})")
else:
    print("❌ No best model available for error analysis!")

## 💾 ذخیره مدل‌ها و نتایج

In [None]:
# Persist the best model under a fixed name, then every trained model under a
# file name derived from its registry name (lower-cased, spaces -> underscores)
if not best_model_info:
    print("❌ No best model to save!")
else:
    best_model_name = best_model_info['name']
    print(f"💾 Saving best model: {best_model_name}")
    trainer.save_model(best_model_name, '../models/best_model.pkl')

    print("💾 Saving all trained models...")
    for model_name in results:
        file_stem = model_name.lower().replace(" ", "_")
        trainer.save_model(model_name, f'../models/{file_stem}.pkl')

    print("✅ All models saved successfully!")

In [None]:
# Save training results
import json
from pathlib import Path

# Per-model scalar metrics, converted to plain floats so they are JSON-serializable
metric_keys = ('r2', 'rmse', 'mae', 'cv_mean', 'cv_std')
training_results = {
    model_name: {key: float(result[key]) for key in metric_keys}
    for model_name, result in results.items()
}

# Add feature importance (stored under its own top-level key)
if feature_importance_results:
    training_results['feature_importance'] = {
        model_name: {feature: float(score) for feature, score in importance.items()}
        for model_name, importance in feature_importance_results.items()
    }

# Add best model info
if best_model_info:
    training_results['best_model'] = {
        'name': best_model_info['name'],
        'r2_score': float(best_model_info['r2_score']),
        'model_type': type(best_model_info['model']).__name__
    }

# Create the output directory first: opening the file for writing fails with
# FileNotFoundError when ../results does not exist yet.
results_path = Path('../results/training_results.json')
results_path.parent.mkdir(parents=True, exist_ok=True)
with results_path.open('w', encoding='utf-8') as f:
    json.dump(training_results, f, indent=2)

print("💾 Training results saved to 'results/training_results.json'")

## 📋 خلاصه مدل‌سازی

In [None]:
# Final recap of the modeling run
print("📋 Modeling Summary:")
print("=" * 50)
print(f"Total models trained: {len(results)}")
print(f"Best model: {best_model_info['name'] if best_model_info else 'None'}")

# The conditional must select between two finished strings. Placing it after
# the ':' makes it part of the format spec, which raises
# "ValueError: Invalid format specifier" at runtime.
if best_model_info and trainer.best_score is not None:
    best_score_text = f"{trainer.best_score:.4f}"
else:
    best_score_text = 'N/A'
print(f"Best R² score: {best_score_text}")

print(f"Models with feature importance: {len(feature_importance_results)}")
# NOTE(review): this flag only checks that both models were trained, not that
# the tuning cells actually ran or succeeded — confirm the intended meaning.
print(f"Hyperparameter tuning performed: {'Random Forest' in results and 'XGBoost' in results}")

print(f"\n🏆 Top 3 Models:")
top_3_models = sorted(results.items(), key=lambda x: x[1]['r2'], reverse=True)[:3]
for i, (model_name, result) in enumerate(top_3_models, 1):
    print(f"{i}. {model_name}: R² = {result['r2']:.4f}, RMSE = {result['rmse']:.4f}")

print(f"\n📁 Files generated:")
print(f"  - models/best_model.pkl")
print(f"  - models/*.pkl (all models)")
print(f"  - results/training_results.json")
print(f"  - Feature importance plots")
print(f"  - Model comparison plots")