# Model Development & Training - AI/ML Market Analysis

This notebook implements multiple machine learning models to predict AI market trends and analyze patterns.

## Objectives:
- Implement time series forecasting models (ARIMA/Prophet)
- Build machine learning models (XGBoost, Random Forest)
- Perform cross-validation and hyperparameter tuning
- Compare model performance and create ensembles
- Generate model evaluation metrics and diagnostics

## 1. Import Libraries and Load Data

In [None]:
# Import comprehensive ML libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Machine Learning
from sklearn.model_selection import train_test_split, cross_val_score, TimeSeriesSplit
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler
import xgboost as xgb

# Time Series
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.holtwinters import ExponentialSmoothing
try:
    from prophet import Prophet
    PROPHET_AVAILABLE = True
except ImportError:
    PROPHET_AVAILABLE = False

# Utilities
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')
import joblib
from datetime import datetime

# Load data and features
processed_dir = Path('../data/processed')
results_dir = Path('../results')
market_df = pd.read_csv(results_dir / 'ai_market_engineered.csv')
features_scaled = pd.read_csv(results_dir / 'ai_market_features_standard_scaled.csv')


✅ Data and libraries loaded successfully!
Market data: (8, 162)
Scaled features: (8, 16)
Prophet available: True


## 2. Model Preparation and Data Splitting

In [2]:
# Prepare data for modeling
def prepare_modeling_data(df, target_col='ai_software_revenue_in_billions'):
    """
    Prepare data for machine learning models
    """
    # Clean data
    model_df = df.dropna(subset=[target_col])
    
    # Select features (exclude target and non-predictive columns)
    exclude_cols = [target_col, 'year'] + [col for col in df.columns if target_col in col and col != target_col]
    feature_cols = [col for col in model_df.columns if col not in exclude_cols and model_df[col].dtype in ['int64', 'float64']]
    
    X = model_df[feature_cols].fillna(model_df[feature_cols].mean())
    y = model_df[target_col]
    
    
    return X, y, feature_cols

# Prepare data for multiple targets
targets = {
    'ai_software_revenue_in_billions': 'AI Software Revenue',
    'global_ai_market_value_in_billions': 'AI Market Value',
    'ai_adoption': 'AI Adoption Rate'
}

modeling_data = {}

for target_col, target_name in targets.items():
    if target_col in market_df.columns:
        X, y, feature_cols = prepare_modeling_data(market_df, target_col)
        
        if len(X) > 0:
            modeling_data[target_col] = {
                'X': X,
                'y': y,
                'features': feature_cols,
                'name': target_name
            }
        else:



🔧 Preparing data for AI Software Revenue...
🎯 Target Variable: ai_software_revenue_in_billions
📊 Features: 128
📈 Samples: 8
📋 Feature columns: ['global_ai_market_value_in_billions', 'ai_adoption_%', 'organizations_using_ai', 'organizations_planning_to_implement_ai', 'global_expectation_for_ai_adoption_%']...

🔧 Preparing data for AI Market Value...
🎯 Target Variable: global_ai_market_value_in_billions
📊 Features: 128
📈 Samples: 8
📋 Feature columns: ['ai_software_revenue_in_billions', 'ai_adoption_%', 'organizations_using_ai', 'organizations_planning_to_implement_ai', 'global_expectation_for_ai_adoption_%']...

✅ Prepared data for 2 target variables


## 3. Time Series Forecasting Models

In [3]:
# Time series forecasting
def build_time_series_models(df, target_col, year_col='year', forecast_periods=3):
    """
    Build various time series forecasting models
    """
    if target_col not in df.columns or year_col not in df.columns:
        return None
    
    # Prepare time series data
    ts_data = df[[year_col, target_col]].dropna()
    ts_data = ts_data.sort_values(year_col)
    
    if len(ts_data) < 5:
        return None
    
    models = {}
    forecasts = {}
    
    
    # 1. Linear Trend Model
    try:
        X_trend = ts_data[year_col].values.reshape(-1, 1)
        y_trend = ts_data[target_col].values
        
        linear_model = LinearRegression()
        linear_model.fit(X_trend, y_trend)
        
        # Forecast
        future_years = np.arange(ts_data[year_col].max() + 1, 
                                ts_data[year_col].max() + forecast_periods + 1).reshape(-1, 1)
        linear_forecast = linear_model.predict(future_years)
        
        models['linear_trend'] = linear_model
        forecasts['linear_trend'] = {
            'years': future_years.flatten(),
            'values': linear_forecast
        }
        
        # Calculate R²
        r2 = linear_model.score(X_trend, y_trend)
    
    except Exception as e:
    
    # 2. Exponential Smoothing
    try:
        exp_model = ExponentialSmoothing(
            ts_data[target_col].values,
            trend='add',
            seasonal=None,
            initialization_method='estimated'
        ).fit()
        
        exp_forecast = exp_model.forecast(forecast_periods)
        
        models['exponential_smoothing'] = exp_model
        forecasts['exponential_smoothing'] = {
            'years': np.arange(ts_data[year_col].max() + 1, 
                              ts_data[year_col].max() + forecast_periods + 1),
            'values': exp_forecast
        }
        
    
    except Exception as e:
    
    # 3. ARIMA Model
    try:
        # Auto-select ARIMA parameters (simple approach)
        arima_model = ARIMA(ts_data[target_col].values, order=(1, 1, 1))
        arima_fitted = arima_model.fit()
        
        arima_forecast = arima_fitted.forecast(steps=forecast_periods)
        
        models['arima'] = arima_fitted
        forecasts['arima'] = {
            'years': np.arange(ts_data[year_col].max() + 1, 
                              ts_data[year_col].max() + forecast_periods + 1),
            'values': arima_forecast
        }
        
    
    except Exception as e:
    
    # 4. Prophet Model (if available)
    if PROPHET_AVAILABLE:
        try:
            # Prepare Prophet data format
            prophet_df = pd.DataFrame({
                'ds': pd.to_datetime(ts_data[year_col], format='%Y'),
                'y': ts_data[target_col].values
            })
            
            prophet_model = Prophet(
                yearly_seasonality=False,
                weekly_seasonality=False,
                daily_seasonality=False
            )
            prophet_model.fit(prophet_df)
            
            # Create future dataframe
            future = prophet_model.make_future_dataframe(periods=forecast_periods, freq='Y')
            prophet_forecast = prophet_model.predict(future)
            
            models['prophet'] = prophet_model
            forecasts['prophet'] = {
                'years': np.arange(ts_data[year_col].max() + 1, 
                                  ts_data[year_col].max() + forecast_periods + 1),
                'values': prophet_forecast['yhat'].tail(forecast_periods).values
            }
            
        
        except Exception as e:
    
    return models, forecasts, ts_data

# Build time series models for key metrics
ts_models = {}
ts_forecasts = {}

for target_col, target_name in targets.items():
    if target_col in market_df.columns:
        models, forecasts, ts_data = build_time_series_models(market_df, target_col)
        
        if models:
            ts_models[target_col] = models
            ts_forecasts[target_col] = forecasts



🕒 Time Series Modeling for AI Software Revenue
⏰ Building time series models for ai_software_revenue_in_billions...
   ✅ Linear Trend Model: R² = 0.938
   ✅ Exponential Smoothing Model: AIC = 30.8
   ✅ ARIMA Model: AIC = 47.2


14:13:56 - cmdstanpy - INFO - Chain [1] start processing
14:13:57 - cmdstanpy - INFO - Chain [1] done processing


   ✅ Prophet Model: Successfully trained

🕒 Time Series Modeling for AI Market Value
⏰ Building time series models for global_ai_market_value_in_billions...
   ✅ Linear Trend Model: R² = 0.528
   ✅ Exponential Smoothing Model: AIC = 105.9
   ✅ ARIMA Model: AIC = 113.9


14:13:57 - cmdstanpy - INFO - Chain [1] start processing
14:13:57 - cmdstanpy - INFO - Chain [1] done processing


   ✅ Prophet Model: Successfully trained

✅ Time series models built for 2 targets


## 4. Machine Learning Models

In [4]:
# Build machine learning models
def build_ml_models(X, y, test_size=0.3, random_state=42):
    """
    Build and evaluate multiple ML models
    """
    if len(X) < 5:
        return None, None
    
    # Split data (time series aware)
    split_idx = int(len(X) * (1 - test_size))
    X_train, X_test = X.iloc[:split_idx], X.iloc[split_idx:]
    y_train, y_test = y.iloc[:split_idx], y.iloc[split_idx:]
    
    
    # Define models
    models = {
        'linear_regression': LinearRegression(),
        'ridge_regression': Ridge(alpha=1.0, random_state=random_state),
        'lasso_regression': Lasso(alpha=0.1, random_state=random_state),
        'random_forest': RandomForestRegressor(n_estimators=100, random_state=random_state),
        'gradient_boosting': GradientBoostingRegressor(n_estimators=100, random_state=random_state),
        'xgboost': xgb.XGBRegressor(n_estimators=100, random_state=random_state)
    }
    
    # Train and evaluate models
    results = {}
    trained_models = {}
    
    for name, model in models.items():
        try:
            # Train model
            model.fit(X_train, y_train)
            
            # Predictions
            y_train_pred = model.predict(X_train)
            y_test_pred = model.predict(X_test) if len(X_test) > 0 else np.array([])
            
            # Metrics
            train_r2 = r2_score(y_train, y_train_pred)
            train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
            train_mae = mean_absolute_error(y_train, y_train_pred)
            
            result = {
                'train_r2': train_r2,
                'train_rmse': train_rmse,
                'train_mae': train_mae,
                'train_pred': y_train_pred,
                'test_pred': y_test_pred
            }
            
            if len(X_test) > 0:
                test_r2 = r2_score(y_test, y_test_pred)
                test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
                test_mae = mean_absolute_error(y_test, y_test_pred)
                
                result.update({
                    'test_r2': test_r2,
                    'test_rmse': test_rmse,
                    'test_mae': test_mae
                })
            
            results[name] = result
            trained_models[name] = model
            
        
        except Exception as e:
    
    return results, trained_models

# Build ML models for each target
ml_results = {}
ml_models = {}

for target_col, data in modeling_data.items():
    
    results, models = build_ml_models(data['X'], data['y'])
    
    if results:
        ml_results[target_col] = results
        ml_models[target_col] = models



🤖 Building ML Models for AI Software Revenue
📊 Data split: Train=5, Test=3
   ✅ Linear Regression: R² = 1.000, RMSE = 0.00
   ✅ Ridge Regression: R² = 1.000, RMSE = 0.00
   ✅ Lasso Regression: R² = 1.000, RMSE = 0.00
   ✅ Random Forest: R² = 0.914, RMSE = 4.38
   ✅ Gradient Boosting: R² = 1.000, RMSE = 0.00
   ✅ Xgboost: R² = 1.000, RMSE = 0.00

🤖 Building ML Models for AI Market Value
📊 Data split: Train=5, Test=3
   ✅ Linear Regression: R² = 1.000, RMSE = 0.00
   ✅ Ridge Regression: R² = 1.000, RMSE = 0.02
   ✅ Lasso Regression: R² = 1.000, RMSE = 0.02
   ✅ Random Forest: R² = 0.860, RMSE = 15.09
   ✅ Gradient Boosting: R² = 1.000, RMSE = 0.00
   ✅ Xgboost: R² = 1.000, RMSE = 0.00

✅ ML models built for 2 targets


## 5. Model Comparison and Evaluation

In [5]:
# Compare model performance
def compare_model_performance(results_dict, target_name):
    """
    Create comprehensive model comparison
    """
    
    # Create comparison DataFrame
    comparison_data = []
    
    for model_name, metrics in results_dict.items():
        row = {
            'Model': model_name.replace('_', ' ').title(),
            'Train_R2': metrics.get('train_r2', np.nan),
            'Train_RMSE': metrics.get('train_rmse', np.nan),
            'Train_MAE': metrics.get('train_mae', np.nan),
            'Test_R2': metrics.get('test_r2', np.nan),
            'Test_RMSE': metrics.get('test_rmse', np.nan),
            'Test_MAE': metrics.get('test_mae', np.nan)
        }
        comparison_data.append(row)
    
    comparison_df = pd.DataFrame(comparison_data)
    
    # Display results
    pd.set_option('display.precision', 3)
    display(comparison_df)
    
    # Find best model
    if 'Train_R2' in comparison_df.columns:
        best_model_idx = comparison_df['Train_R2'].idxmax()
        best_model = comparison_df.loc[best_model_idx, 'Model']
        best_r2 = comparison_df.loc[best_model_idx, 'Train_R2']
        
    
    # Create visualization
    if len(comparison_df) > 1:
        fig = go.Figure()
        
        fig.add_trace(go.Bar(
            x=comparison_df['Model'],
            y=comparison_df['Train_R2'],
            name='Training R²',
            marker_color='lightblue'
        ))
        
        if 'Test_R2' in comparison_df.columns and not comparison_df['Test_R2'].isna().all():
            fig.add_trace(go.Bar(
                x=comparison_df['Model'],
                y=comparison_df['Test_R2'],
                name='Testing R²',
                marker_color='lightcoral'
            ))
        
        fig.update_layout(
            title=f'Model Performance Comparison - {target_name}',
            xaxis_title='Model',
            yaxis_title='R² Score',
            height=500
        )
        
        fig.show()
    
    return comparison_df

# Compare models for each target
model_comparisons = {}

for target_col, results in ml_results.items():
    target_name = targets.get(target_col, target_col)
    comparison = compare_model_performance(results, target_name)
    model_comparisons[target_col] = comparison


📊 MODEL PERFORMANCE COMPARISON - AI Software Revenue


Unnamed: 0,Model,Train_R2,Train_RMSE,Train_MAE,Test_R2,Test_RMSE,Test_MAE
0,Linear Regression,1.0,1.095e-14,7.816e-15,0.39,17.621,16.624
1,Ridge Regression,1.0,4.755e-05,3.659e-05,0.39,17.62,16.624
2,Lasso Regression,1.0,0.002275,0.001989,-92.514,218.156,141.793
3,Random Forest,0.914,4.375,3.309,-7.892,67.273,62.642
4,Gradient Boosting,1.0,0.0003952,0.0003478,-5.576,57.85,53.097
5,Xgboost,1.0,0.0006559,0.0004485,-4.13,51.097,45.848



🏆 Best Model: Linear Regression (R² = 1.000)



📊 MODEL PERFORMANCE COMPARISON - AI Market Value


Unnamed: 0,Model,Train_R2,Train_RMSE,Train_MAE,Test_R2,Test_RMSE,Test_MAE
0,Linear Regression,1.0,3.75e-14,2.629e-14,-0.17,751.77,490.663
1,Ridge Regression,1.0,0.01584,0.01234,-0.17,751.831,490.731
2,Lasso Regression,1.0,0.01678,0.01337,-0.38,816.463,531.413
3,Random Forest,0.86,15.09,11.56,-1.127,1013.66,736.287
4,Gradient Boosting,1.0,0.001079,0.0008415,-1.033,990.922,704.526
5,Xgboost,1.0,0.0006827,0.000457,-0.985,979.086,689.668



🏆 Best Model: Linear Regression (R² = 1.000)


## 6. Feature Importance Analysis

In [6]:
# Analyze feature importance from tree-based models
def analyze_feature_importance(models_dict, feature_names, target_name):
    """
    Extract and visualize feature importance
    """
    importance_data = {}
    
    # Extract importance from tree-based models
    tree_models = ['random_forest', 'gradient_boosting', 'xgboost']
    
    for model_name in tree_models:
        if model_name in models_dict:
            model = models_dict[model_name]
            
            if hasattr(model, 'feature_importances_'):
                importance_data[model_name] = model.feature_importances_
    
    if importance_data:
        # Create importance DataFrame
        importance_df = pd.DataFrame(importance_data, index=feature_names)
        
        # Calculate average importance
        importance_df['average'] = importance_df.mean(axis=1)
        importance_df = importance_df.sort_values('average', ascending=False)
        
        
        # Show top 10 features
        top_features = importance_df.head(10)
        for i, (feature, row) in enumerate(top_features.iterrows(), 1):
            avg_importance = row['average']
            feature_display = feature.replace('_', ' ').title()[:40]
        
        # Create visualization
        fig = go.Figure()
        
        colors = px.colors.qualitative.Set3
        for i, model_name in enumerate(importance_data.keys()):
            fig.add_trace(go.Bar(
                x=top_features.index,
                y=top_features[model_name],
                name=model_name.replace('_', ' ').title(),
                marker_color=colors[i % len(colors)]
            ))
        
        fig.update_layout(
            title=f'Feature Importance Comparison - {target_name}',
            xaxis_title='Features',
            yaxis_title='Importance Score',
            height=600,
            xaxis_tickangle=-45
        )
        
        fig.show()
        
        return importance_df
    
    return None

# Analyze feature importance for each target
feature_importance_results = {}

for target_col, models in ml_models.items():
    if target_col in modeling_data:
        target_name = modeling_data[target_col]['name']
        feature_names = modeling_data[target_col]['features']
        
        importance_df = analyze_feature_importance(models, feature_names, target_name)
        if importance_df is not None:
            feature_importance_results[target_col] = importance_df


🎯 FEATURE IMPORTANCE - AI Software Revenue
 1. Global Ai Market Value In Billions: 0.348
 2. Estimated Jobs Eliminated By Ai Millions: 0.066
 3. Estimated New Jobs Created By Ai Million: 0.060
 4. Organizations Using Ai Lag 3: 0.042
 5. Estimated Revenue Increase From Ai Trill: 0.036
 6. Investment Attractiveness: 0.029
 7. Global Ai Market Value In Billions Ratio: 0.024
 8. Jobs At High Risk Of Automation  Transpo: 0.013
 9. Early Period: 0.012
10. Marketers Believing Ai Improves Email Re: 0.012



🎯 FEATURE IMPORTANCE - AI Market Value
 1. Ai Software Revenue In Billions: 0.334
 2. Estimated New Jobs Created By Ai Million: 0.070
 3. Estimated Jobs Eliminated By Ai Millions: 0.052
 4. Estimated Revenue Increase From Ai Trill: 0.044
 5. Estimated Revenue Increase From Ai Trill: 0.040
 6. Investment Attractiveness: 0.037
 7. Ai Software Revenue In Billions Ratio La: 0.027
 8. Organizations Using Ai Ratio Lag 3: 0.020
 9. Ai Software Revenue In Billions Growth A: 0.019
10. Ai Software Revenue In Billions Std 5: 0.016


## 7. Cross-Validation and Model Robustness

In [7]:
# Perform cross-validation
def perform_cross_validation(X, y, models_dict, cv_folds=3):
    """
    Perform time series cross-validation
    """
    if len(X) < cv_folds + 2:
        return None
    
    # Use TimeSeriesSplit for time series data
    tscv = TimeSeriesSplit(n_splits=cv_folds)
    
    cv_results = {}
    
    for name, model in models_dict.items():
        try:
            # Perform cross-validation
            scores = cross_val_score(model, X, y, cv=tscv, scoring='r2')
            
            cv_results[name] = {
                'scores': scores,
                'mean': scores.mean(),
                'std': scores.std(),
                'min': scores.min(),
                'max': scores.max()
            }
            
        
        except Exception as e:
    
    return cv_results

# Perform cross-validation for each target
cv_results_all = {}

for target_col, models in ml_models.items():
    if target_col in modeling_data:
        target_name = modeling_data[target_col]['name']
        X = modeling_data[target_col]['X']
        y = modeling_data[target_col]['y']
        
        
        cv_results = perform_cross_validation(X, y, models)
        if cv_results:
            cv_results_all[target_col] = cv_results


🔄 Cross-Validation for AI Software Revenue
📊 Linear Regression:
   CV R² = -12.359 ± 8.556
   Range: [-24.274, -4.574]
📊 Ridge Regression:
   CV R² = -12.359 ± 8.556
   Range: [-24.274, -4.574]
📊 Lasso Regression:
   CV R² = -37.072 ± 25.640
   Range: [-68.484, -5.680]
📊 Random Forest:
   CV R² = -10.635 ± 3.025
   Range: [-13.503, -6.452]
📊 Gradient Boosting:
   CV R² = -7.465 ± 1.222
   Range: [-8.844, -5.873]
📊 Xgboost:
   CV R² = -6.175 ± 0.770
   Range: [-7.116, -5.229]

🔄 Cross-Validation for AI Market Value
📊 Linear Regression:
   CV R² = -1.424 ± 0.623
   Range: [-1.979, -0.553]
📊 Ridge Regression:
   CV R² = -1.424 ± 0.624
   Range: [-1.979, -0.552]
📊 Lasso Regression:
   CV R² = -0.512 ± 0.982
   Range: [-1.715, 0.691]
📊 Random Forest:
   CV R² = -3.888 ± 1.623
   Range: [-5.244, -1.607]
📊 Gradient Boosting:
   CV R² = -3.520 ± 1.499
   Range: [-4.626, -1.400]
📊 Xgboost:
   CV R² = -3.232 ± 1.321
   Range: [-4.323, -1.373]


## 8. Model Ensemble and Final Predictions

In [8]:
# Create ensemble models
def create_ensemble_model(models_dict, X, y):
    """
    Create simple ensemble using best performing models
    """
    # Get predictions from all models
    predictions = []
    model_names = []
    
    for name, model in models_dict.items():
        try:
            pred = model.predict(X)
            predictions.append(pred)
            model_names.append(name)
        except Exception as e:
    
    if len(predictions) >= 2:
        # Simple average ensemble
        ensemble_pred = np.mean(predictions, axis=0)
        
        # Weighted ensemble (weight by R² score)
        weights = []
        for name in model_names:
            try:
                pred = model.predict(X)
                r2 = r2_score(y, pred)
                weights.append(max(0, r2))  # Use 0 for negative R²
            except:
                weights.append(0)
        
        if sum(weights) > 0:
            weights = np.array(weights) / sum(weights)
            weighted_ensemble_pred = np.average(predictions, axis=0, weights=weights)
        else:
            weighted_ensemble_pred = ensemble_pred
        
        # Evaluate ensemble
        simple_r2 = r2_score(y, ensemble_pred)
        weighted_r2 = r2_score(y, weighted_ensemble_pred)
        
        
        return {
            'simple_ensemble': ensemble_pred,
            'weighted_ensemble': weighted_ensemble_pred,
            'simple_r2': simple_r2,
            'weighted_r2': weighted_r2,
            'model_names': model_names,
            'weights': weights
        }
    
    return None

# Create ensembles for each target
ensemble_results = {}

for target_col, models in ml_models.items():
    if target_col in modeling_data:
        target_name = modeling_data[target_col]['name']
        X = modeling_data[target_col]['X']
        y = modeling_data[target_col]['y']
        
        
        ensemble = create_ensemble_model(models, X, y)
        if ensemble:
            ensemble_results[target_col] = ensemble


🔗 Creating Ensemble for AI Software Revenue
🔗 Ensemble Results:
   Simple Average: R² = 0.857
   Weighted Average: R² = 0.857
   Models in ensemble: 6

🔗 Creating Ensemble for AI Market Value
🔗 Ensemble Results:
   Simple Average: R² = 0.086
   Weighted Average: R² = 0.086
   Models in ensemble: 6


## 9. Save Models and Results

In [9]:
# Save trained models and results
models_dir = Path('../models')
models_dir.mkdir(exist_ok=True)

results_dir = Path('../results')
results_dir.mkdir(exist_ok=True)


# Save ML models
for target_col, models in ml_models.items():
    target_dir = models_dir / target_col.replace('_', '')
    target_dir.mkdir(exist_ok=True)
    
    for model_name, model in models.items():
        model_path = target_dir / f'{model_name}.joblib'
        joblib.dump(model, model_path)

# Save model comparison results
for target_col, comparison in model_comparisons.items():
    comparison_path = results_dir / f'model_comparison_{target_col}.csv'
    comparison.to_csv(comparison_path, index=False)

# Save cross-validation results
cv_summary = []
for target_col, cv_data in cv_results_all.items():
    for model_name, scores in cv_data.items():
        cv_summary.append({
            'target': target_col,
            'model': model_name,
            'cv_mean': scores['mean'],
            'cv_std': scores['std'],
            'cv_min': scores['min'],
            'cv_max': scores['max']
        })

if cv_summary:
    cv_df = pd.DataFrame(cv_summary)
    cv_df.to_csv(results_dir / 'cross_validation_results.csv', index=False)

# Save ensemble results
ensemble_summary = []
for target_col, ensemble in ensemble_results.items():
    ensemble_summary.append({
        'target': target_col,
        'simple_ensemble_r2': ensemble['simple_r2'],
        'weighted_ensemble_r2': ensemble['weighted_r2'],
        'models_count': len(ensemble['model_names']),
        'models_used': ', '.join(ensemble['model_names'])
    })

if ensemble_summary:
    ensemble_df = pd.DataFrame(ensemble_summary)
    ensemble_df.to_csv(results_dir / 'ensemble_results.csv', index=False)

# Create model development summary


💾 SAVING MODELS AND RESULTS
   ✅ Saved linear_regression for ai_software_revenue_in_billions
   ✅ Saved ridge_regression for ai_software_revenue_in_billions
   ✅ Saved lasso_regression for ai_software_revenue_in_billions
   ✅ Saved random_forest for ai_software_revenue_in_billions
   ✅ Saved gradient_boosting for ai_software_revenue_in_billions
   ✅ Saved xgboost for ai_software_revenue_in_billions
   ✅ Saved linear_regression for global_ai_market_value_in_billions
   ✅ Saved ridge_regression for global_ai_market_value_in_billions
   ✅ Saved lasso_regression for global_ai_market_value_in_billions
   ✅ Saved random_forest for global_ai_market_value_in_billions
   ✅ Saved gradient_boosting for global_ai_market_value_in_billions
   ✅ Saved xgboost for global_ai_market_value_in_billions
   📊 Saved comparison for ai_software_revenue_in_billions
   📊 Saved comparison for global_ai_market_value_in_billions
   📊 Saved cross-validation results
   🔗 Saved ensemble results

📋 MODEL DEVELOPMENT SU