# Model Development & Training - AI/ML Market Analysis

This notebook implements multiple machine learning models to predict AI market trends and analyze patterns.

## Objectives:
- Implement time series forecasting models (ARIMA/Prophet)
- Build machine learning models (XGBoost, Random Forest)
- Perform cross-validation and hyperparameter tuning
- Compare model performance and create ensembles
- Generate model evaluation metrics and diagnostics

## 1. Import Libraries and Load Data

In [None]:
# Import comprehensive ML libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Machine Learning
from sklearn.model_selection import train_test_split, cross_val_score, TimeSeriesSplit
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler
import xgboost as xgb

# Time Series
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.holtwinters import ExponentialSmoothing
try:
    from prophet import Prophet
    PROPHET_AVAILABLE = True
except ImportError:
    print("⚠️ Prophet not available, will use alternative time series methods")
    PROPHET_AVAILABLE = False

# Utilities
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')
import joblib
from datetime import datetime

# Read the engineered market table and the standard-scaled feature table
processed_dir = Path('../data/processed')
market_df = pd.read_csv(processed_dir.joinpath('ai_market_engineered.csv'))
features_scaled = pd.read_csv(
    processed_dir.joinpath('ai_market_features_standard_scaled.csv')
)

# Report what was loaded (one line per fact, same text as separate prints)
print("\n".join([
    "✅ Data and libraries loaded successfully!",
    f"Market data: {market_df.shape}",
    f"Scaled features: {features_scaled.shape}",
    f"Prophet available: {PROPHET_AVAILABLE}",
]))

## 2. Model Preparation and Data Splitting

In [None]:
# Prepare data for modeling
def prepare_modeling_data(df, target_col='ai_software_revenue_in_billions'):
    """
    Prepare data for machine learning models

    Rows whose target is missing are dropped. Features are every numeric
    column except the target itself, ``'year'``, and any column whose name
    contains the target name. Remaining gaps are filled with the column
    mean; columns that have no observed value at all are dropped, because
    mean imputation cannot fill them and they would reach the models as
    all-NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Source table; must contain ``target_col``.
    target_col : str
        Name of the column to predict.

    Returns
    -------
    tuple
        ``(X, y, feature_cols)``: the imputed feature frame, the target
        series, and the list of feature column names (same order as ``X``).
    """
    # Clean data
    model_df = df.dropna(subset=[target_col])

    # Exclude the target, the year index and every column named after the
    # target (presumably derived from it, e.g. lags or growth rates).
    excluded = {target_col, 'year'}
    excluded.update(col for col in df.columns if target_col in str(col))

    # Accept every numeric dtype (int32, float32, ...), not only the 64-bit
    # ones; select_dtypes('number') leaves out bool and object columns.
    numeric_cols = model_df.select_dtypes(include='number').columns
    feature_cols = [col for col in numeric_cols if col not in excluded]

    # A column without a single observed value has no mean to impute with.
    if not model_df.empty:
        feature_cols = [col for col in feature_cols if model_df[col].notna().any()]

    X = model_df[feature_cols]
    X = X.fillna(X.mean())
    y = model_df[target_col]

    print(f"🎯 Target Variable: {target_col}")
    print(f"📊 Features: {len(feature_cols)}")
    print(f"📈 Samples: {len(X)}")
    if len(feature_cols) > 5:
        print(f"📋 Feature columns: {feature_cols[:5]}...")
    else:
        print(f"📋 Feature columns: {feature_cols}")

    return X, y, feature_cols

# Candidate target columns mapped to their display names
targets = {
    'ai_software_revenue_in_billions': 'AI Software Revenue',
    'global_ai_market_value_in_billions': 'AI Market Value',
    'ai_adoption': 'AI Adoption Rate'
}

# Per-target modeling inputs, keyed by target column name
modeling_data = {}

for target_col, target_name in targets.items():
    # Targets that are not present in the market table are skipped silently
    if target_col not in market_df.columns:
        continue

    print(f"\n🔧 Preparing data for {target_name}...")
    X, y, feature_cols = prepare_modeling_data(market_df, target_col)

    # Nothing to model when no row has a value for this target
    if len(X) == 0:
        print(f"❌ Insufficient data for {target_name}")
        continue

    modeling_data[target_col] = dict(
        X=X,
        y=y,
        features=feature_cols,
        name=target_name,
    )

print(f"\n✅ Prepared data for {len(modeling_data)} target variables")

## 3. Time Series Forecasting Models

In [None]:
# Time series forecasting
def build_time_series_models(df, target_col, year_col='year', forecast_periods=3):
    """
    Build various time series forecasting models

    Fits, independently of each other, a linear trend, an additive-trend
    exponential smoothing model, an ARIMA(1, 1, 1) model and (only when
    ``PROPHET_AVAILABLE`` is true) a Prophet model on the target ordered by
    ``year_col``, then forecasts ``forecast_periods`` steps past the last
    observed year. A model that raises is reported and skipped; the others
    are still returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding ``year_col`` and ``target_col``.
    target_col : str
        Column to forecast.
    year_col : str
        Column holding the (numeric) year of each observation.
    forecast_periods : int
        Number of future years to forecast.

    Returns
    -------
    tuple
        ``(models, forecasts, ts_data)``. ``models`` maps a model name to
        the fitted object, ``forecasts`` maps the same name to
        ``{'years': ..., 'values': ...}``, and ``ts_data`` is the sorted,
        NaN-free two-column frame used for fitting. When a column is
        missing or fewer than 5 rows remain, ``(None, None, None)`` is
        returned so callers can always unpack three values.
    """
    if target_col not in df.columns or year_col not in df.columns:
        return None, None, None

    # Prepare time series data
    ts_data = df[[year_col, target_col]].dropna()
    ts_data = ts_data.sort_values(year_col)

    if len(ts_data) < 5:
        print(f"⚠️ Insufficient data for time series modeling: {len(ts_data)} points")
        return None, None, None

    models = {}
    forecasts = {}

    # Every model forecasts the same horizon; compute it once.
    last_year = ts_data[year_col].max()
    future_years = np.arange(last_year + 1, last_year + forecast_periods + 1)

    print(f"⏰ Building time series models for {target_col}...")

    # 1. Linear Trend Model
    try:
        X_trend = ts_data[year_col].values.reshape(-1, 1)
        y_trend = ts_data[target_col].values

        linear_model = LinearRegression()
        linear_model.fit(X_trend, y_trend)

        # Forecast
        linear_forecast = linear_model.predict(future_years.reshape(-1, 1))

        models['linear_trend'] = linear_model
        forecasts['linear_trend'] = {
            'years': future_years.copy(),
            'values': linear_forecast
        }

        # In-sample R² of the trend line
        r2 = linear_model.score(X_trend, y_trend)
        print(f"   ✅ Linear Trend Model: R² = {r2:.3f}")

    except Exception as e:
        print(f"   ❌ Linear trend model failed: {e}")

    # 2. Exponential Smoothing
    try:
        exp_model = ExponentialSmoothing(
            ts_data[target_col].values,
            trend='add',
            seasonal=None,
            initialization_method='estimated'
        ).fit()

        exp_forecast = exp_model.forecast(forecast_periods)

        models['exponential_smoothing'] = exp_model
        forecasts['exponential_smoothing'] = {
            'years': future_years.copy(),
            'values': exp_forecast
        }

        print(f"   ✅ Exponential Smoothing Model: AIC = {exp_model.aic:.1f}")

    except Exception as e:
        print(f"   ❌ Exponential smoothing failed: {e}")

    # 3. ARIMA Model
    try:
        # Fixed (p, d, q) = (1, 1, 1); no order search is performed
        arima_model = ARIMA(ts_data[target_col].values, order=(1, 1, 1))
        arima_fitted = arima_model.fit()

        arima_forecast = arima_fitted.forecast(steps=forecast_periods)

        models['arima'] = arima_fitted
        forecasts['arima'] = {
            'years': future_years.copy(),
            'values': arima_forecast
        }

        print(f"   ✅ ARIMA Model: AIC = {arima_fitted.aic:.1f}")

    except Exception as e:
        print(f"   ❌ ARIMA model failed: {e}")

    # 4. Prophet Model (if available)
    if PROPHET_AVAILABLE:
        try:
            # Prepare Prophet data format ('ds' timestamps, 'y' values)
            prophet_df = pd.DataFrame({
                'ds': pd.to_datetime(ts_data[year_col], format='%Y'),
                'y': ts_data[target_col].values
            })

            prophet_model = Prophet(
                yearly_seasonality=False,
                weekly_seasonality=False,
                daily_seasonality=False
            )
            prophet_model.fit(prophet_df)

            # 'YS' (year start) keeps the future timestamps on the same
            # January-1 grid as the training 'ds' values; 'Y' is the
            # year-end alias and is deprecated in recent pandas.
            future = prophet_model.make_future_dataframe(periods=forecast_periods, freq='YS')
            prophet_forecast = prophet_model.predict(future)

            models['prophet'] = prophet_model
            forecasts['prophet'] = {
                'years': future_years.copy(),
                'values': prophet_forecast['yhat'].tail(forecast_periods).values
            }

            print(f"   ✅ Prophet Model: Successfully trained")

        except Exception as e:
            print(f"   ❌ Prophet model failed: {e}")

    return models, forecasts, ts_data

# Build time series models for key metrics
ts_models = {}
ts_forecasts = {}

for target_col, target_name in targets.items():
    if target_col not in market_df.columns:
        continue

    print(f"\n🕒 Time Series Modeling for {target_name}")
    print("=" * 50)

    # build_time_series_models signals "not enough data" with a bare None;
    # unpacking that directly would raise TypeError, so check first.
    ts_result = build_time_series_models(market_df, target_col)
    if ts_result is None:
        continue

    models, forecasts, ts_data = ts_result

    # Keep only targets for which at least one model was fitted
    if models:
        ts_models[target_col] = models
        ts_forecasts[target_col] = forecasts

print(f"\n✅ Time series models built for {len(ts_models)} targets")

## 4. Machine Learning Models

In [None]:
# Build machine learning models
def build_ml_models(X, y, test_size=0.3, random_state=42):
    """
    Train a fixed set of regressors and score each on train and test data.

    The first ``1 - test_size`` share of the rows (in their existing order)
    is used for training and the remainder for testing; rows are not
    shuffled. Returns ``(results, trained_models)``: ``results`` maps model
    name to its metrics and predictions, ``trained_models`` maps model name
    to the fitted estimator. A model that raises is reported and left out.
    With fewer than 5 rows nothing is trained and ``(None, None)`` is
    returned.
    """
    n_samples = len(X)
    if n_samples < 5:
        print("❌ Insufficient data for ML modeling")
        return None, None

    # Ordered (unshuffled) split: leading rows train, trailing rows test
    cut = int(n_samples * (1 - test_size))
    X_train, y_train = X.iloc[:cut], y.iloc[:cut]
    X_test, y_test = X.iloc[cut:], y.iloc[cut:]
    has_test = len(X_test) > 0

    print(f"📊 Data split: Train={len(X_train)}, Test={len(X_test)}")

    # Candidate estimators, evaluated in this order
    candidates = {
        'linear_regression': LinearRegression(),
        'ridge_regression': Ridge(alpha=1.0, random_state=random_state),
        'lasso_regression': Lasso(alpha=0.1, random_state=random_state),
        'random_forest': RandomForestRegressor(n_estimators=100, random_state=random_state),
        'gradient_boosting': GradientBoostingRegressor(n_estimators=100, random_state=random_state),
        'xgboost': xgb.XGBRegressor(n_estimators=100, random_state=random_state)
    }

    results, trained_models = {}, {}

    for name, estimator in candidates.items():
        try:
            estimator.fit(X_train, y_train)

            fitted_train = estimator.predict(X_train)
            predicted_test = estimator.predict(X_test) if has_test else np.array([])

            # Training metrics are always reported
            metrics = {
                'train_r2': r2_score(y_train, fitted_train),
                'train_rmse': np.sqrt(mean_squared_error(y_train, fitted_train)),
                'train_mae': mean_absolute_error(y_train, fitted_train),
                'train_pred': fitted_train,
                'test_pred': predicted_test
            }

            # Test metrics only exist when the split left test rows
            if has_test:
                metrics['test_r2'] = r2_score(y_test, predicted_test)
                metrics['test_rmse'] = np.sqrt(mean_squared_error(y_test, predicted_test))
                metrics['test_mae'] = mean_absolute_error(y_test, predicted_test)

            results[name] = metrics
            trained_models[name] = estimator

            label = name.replace('_', ' ').title()
            print(f"   ✅ {label}: R² = {metrics['train_r2']:.3f}, RMSE = {metrics['train_rmse']:.2f}")

        except Exception as e:
            print(f"   ❌ {name} failed: {e}")

    return results, trained_models

# Train the ML model suite once per prepared target
ml_results = {}
ml_models = {}

for target_col, data in modeling_data.items():
    print(f"\n🤖 Building ML Models for {data['name']}\n" + "=" * 50)

    results, models = build_ml_models(data['X'], data['y'])

    # Skip targets where nothing was trained (None or empty results)
    if not results:
        continue

    ml_results[target_col] = results
    ml_models[target_col] = models

print(f"\n✅ ML models built for {len(ml_results)} targets")

## 5. Model Comparison and Evaluation

In [None]:
# Compare model performance
def compare_model_performance(results_dict, target_name):
    """
    Create comprehensive model comparison

    Builds one row per model with its train/test R², RMSE and MAE (NaN where
    a metric is absent), shows the table, announces the best model and, when
    more than one model is present, draws a bar chart of the R² scores.

    The best model is chosen by test R² whenever at least one model has a
    test score, because training R² rewards overfitting; training R² is only
    used as a fallback when no test score exists.

    Parameters
    ----------
    results_dict : dict
        Maps model name to a dict that may contain ``train_r2``,
        ``train_rmse``, ``train_mae``, ``test_r2``, ``test_rmse`` and
        ``test_mae``.
    target_name : str
        Display name used in the printed header and the chart title.

    Returns
    -------
    pandas.DataFrame
        The comparison table (empty when ``results_dict`` is empty).
    """
    print(f"\n📊 MODEL PERFORMANCE COMPARISON - {target_name}")
    print("=" * 60)

    # Create comparison DataFrame
    comparison_data = [
        {
            'Model': model_name.replace('_', ' ').title(),
            'Train_R2': metrics.get('train_r2', np.nan),
            'Train_RMSE': metrics.get('train_rmse', np.nan),
            'Train_MAE': metrics.get('train_mae', np.nan),
            'Test_R2': metrics.get('test_r2', np.nan),
            'Test_RMSE': metrics.get('test_rmse', np.nan),
            'Test_MAE': metrics.get('test_mae', np.nan)
        }
        for model_name, metrics in results_dict.items()
    ]

    comparison_df = pd.DataFrame(comparison_data)

    # Display results; `display` only exists inside IPython/Jupyter, so fall
    # back to a plain print when the code runs as a script.
    pd.set_option('display.precision', 3)
    try:
        display(comparison_df)
    except NameError:
        print(comparison_df)

    # Find best model: prefer held-out performance, fall back to training
    # performance, and skip entirely when neither column has a value.
    for metric, label in (('Test_R2', 'Test R²'), ('Train_R2', 'Train R²')):
        if metric in comparison_df.columns and comparison_df[metric].notna().any():
            best_model_idx = comparison_df[metric].idxmax()
            best_model = comparison_df.loc[best_model_idx, 'Model']
            best_r2 = comparison_df.loc[best_model_idx, metric]

            print(f"\n🏆 Best Model: {best_model} ({label} = {best_r2:.3f})")
            break

    # Create visualization
    if len(comparison_df) > 1:
        fig = go.Figure()

        fig.add_trace(go.Bar(
            x=comparison_df['Model'],
            y=comparison_df['Train_R2'],
            name='Training R²',
            marker_color='lightblue'
        ))

        # Only add the test bars when at least one test score exists
        if 'Test_R2' in comparison_df.columns and not comparison_df['Test_R2'].isna().all():
            fig.add_trace(go.Bar(
                x=comparison_df['Model'],
                y=comparison_df['Test_R2'],
                name='Testing R²',
                marker_color='lightcoral'
            ))

        fig.update_layout(
            title=f'Model Performance Comparison - {target_name}',
            xaxis_title='Model',
            yaxis_title='R² Score',
            height=500
        )

        fig.show()

    return comparison_df

# One comparison table per modeled target; the display name falls back to
# the raw column name when the target is not listed in `targets`
model_comparisons = {
    target_col: compare_model_performance(
        results, targets.get(target_col, target_col)
    )
    for target_col, results in ml_results.items()
}

## 6. Feature Importance Analysis

In [None]:
# Analyze feature importance from tree-based models
def analyze_feature_importance(models_dict, feature_names, target_name):
    """
    Tabulate, print and plot feature importances of the tree-based models.

    Only the entries 'random_forest', 'gradient_boosting' and 'xgboost' of
    ``models_dict`` are considered, and only when the model exposes
    ``feature_importances_``. Returns a DataFrame indexed by feature name
    with one column per model plus an 'average' column, sorted by that
    average in descending order, or ``None`` when no model qualifies.
    """
    tree_model_keys = ('random_forest', 'gradient_boosting', 'xgboost')

    # Collect the importance vector of every usable tree-based model
    importance_data = {
        key: models_dict[key].feature_importances_
        for key in tree_model_keys
        if key in models_dict and hasattr(models_dict[key], 'feature_importances_')
    }

    if not importance_data:
        return None

    # One row per feature, one column per model, ranked by the row mean
    importance_df = pd.DataFrame(importance_data, index=feature_names)
    importance_df['average'] = importance_df.mean(axis=1)
    importance_df = importance_df.sort_values('average', ascending=False)

    print(f"\n🎯 FEATURE IMPORTANCE - {target_name}")
    print("=" * 50)

    # Print the ten highest-ranked features (names truncated to 40 chars)
    top_features = importance_df.head(10)
    for rank, (feature, avg_importance) in enumerate(top_features['average'].items(), start=1):
        feature_display = feature.replace('_', ' ').title()[:40]
        print(f"{rank:2d}. {feature_display}: {avg_importance:.3f}")

    # Grouped bars: one trace per model over the same top features
    palette = px.colors.qualitative.Set3
    fig = go.Figure()
    for position, key in enumerate(importance_data):
        fig.add_trace(go.Bar(
            x=top_features.index,
            y=top_features[key],
            name=key.replace('_', ' ').title(),
            marker_color=palette[position % len(palette)]
        ))

    fig.update_layout(
        title=f'Feature Importance Comparison - {target_name}',
        xaxis_title='Features',
        yaxis_title='Importance Score',
        height=600,
        xaxis_tickangle=-45
    )

    fig.show()

    return importance_df

# Feature-importance table per target, kept only when one could be built
feature_importance_results = {}

for target_col, models in ml_models.items():
    # Targets without prepared modeling data cannot be analyzed
    if target_col not in modeling_data:
        continue

    prepared = modeling_data[target_col]
    target_name = prepared['name']
    feature_names = prepared['features']

    importance_df = analyze_feature_importance(models, feature_names, target_name)
    if importance_df is None:
        continue

    feature_importance_results[target_col] = importance_df

## 7. Cross-Validation and Model Robustness

In [None]:
# Perform cross-validation
def perform_cross_validation(X, y, models_dict, cv_folds=3):
    """
    Perform time series cross-validation

    Scores every model in ``models_dict`` with R² over a ``TimeSeriesSplit``
    of ``cv_folds`` splits. A model whose cross-validation raises is
    reported and left out of the result.

    Parameters
    ----------
    X, y
        Features and target, ordered in time.
    models_dict : dict
        Maps model name to an estimator accepted by ``cross_val_score``.
    cv_folds : int
        Number of time-series splits.

    Returns
    -------
    dict or None
        Maps model name to ``{'scores', 'mean', 'std', 'min', 'max'}``.
        ``None`` when there are too few samples for every test fold to hold
        at least two rows.
    """
    # TimeSeriesSplit's default test fold holds n_samples // (n_splits + 1)
    # rows, and R² is undefined (NaN) on a single sample. Require two rows
    # per test fold so the reported scores are meaningful.
    n_samples = len(X)
    min_samples = 2 * (cv_folds + 1)
    if n_samples < min_samples:
        print(
            f"⚠️ Insufficient data for {cv_folds}-fold CV: {n_samples} samples "
            f"(need at least {min_samples})"
        )
        return None

    # Use TimeSeriesSplit for time series data
    tscv = TimeSeriesSplit(n_splits=cv_folds)

    cv_results = {}

    for name, model in models_dict.items():
        try:
            # Perform cross-validation
            scores = cross_val_score(model, X, y, cv=tscv, scoring='r2')

            cv_results[name] = {
                'scores': scores,
                'mean': scores.mean(),
                'std': scores.std(),
                'min': scores.min(),
                'max': scores.max()
            }

            print(f"📊 {name.replace('_', ' ').title()}:")
            print(f"   CV R² = {scores.mean():.3f} ± {scores.std():.3f}")
            print(f"   Range: [{scores.min():.3f}, {scores.max():.3f}]")

        except Exception as e:
            print(f"   ❌ CV failed for {name}: {e}")

    return cv_results

# Cross-validate the trained model suite of every prepared target
cv_results_all = {}

for target_col, models in ml_models.items():
    # Targets without prepared modeling data are skipped
    if target_col not in modeling_data:
        continue

    prepared = modeling_data[target_col]
    target_name = prepared['name']
    X = prepared['X']
    y = prepared['y']

    print(f"\n🔄 Cross-Validation for {target_name}\n" + "=" * 50)

    cv_results = perform_cross_validation(X, y, models)

    # Keep only non-empty results (None and {} are both dropped)
    if not cv_results:
        continue

    cv_results_all[target_col] = cv_results

## 8. Model Ensemble and Final Predictions

In [None]:
# Create ensemble models
def create_ensemble_model(models_dict, X, y):
    """
    Create simple and R²-weighted average ensembles of the given models

    Every model that can predict on ``X`` contributes one prediction vector.
    The simple ensemble is their plain mean; the weighted ensemble weights
    each model by its own R² against ``y`` (negative R² counts as 0). If no
    model has a positive R², the weighted ensemble falls back to the simple
    mean.

    Parameters
    ----------
    models_dict : dict
        Maps model name to a fitted object exposing ``predict``.
    X
        Features to predict on.
    y
        True target values used to score the models and the ensembles.

    Returns
    -------
    dict or None
        Keys ``simple_ensemble``, ``weighted_ensemble``, ``simple_r2``,
        ``weighted_r2``, ``model_names`` and ``weights``. ``None`` when
        fewer than two models produced predictions.
    """
    # Get predictions from all models
    predictions = []
    model_names = []

    for name, model in models_dict.items():
        try:
            pred = model.predict(X)
        except Exception as e:
            print(f"⚠️ Failed to get predictions from {name}: {e}")
        else:
            predictions.append(pred)
            model_names.append(name)

    # An ensemble needs at least two members
    if len(predictions) < 2:
        return None

    # Simple average ensemble
    ensemble_pred = np.mean(predictions, axis=0)

    # Weighted ensemble (weight by R² score). Each weight is computed from
    # that model's OWN stored prediction; reusing the loop variable left
    # over from the loop above would score only the last model and give
    # every member the same weight.
    weights = []
    for pred in predictions:
        try:
            weights.append(max(0, r2_score(y, pred)))  # Use 0 for negative R²
        except (ValueError, TypeError):
            # A prediction that cannot be scored gets no say in the ensemble
            weights.append(0)

    if sum(weights) > 0:
        weights = np.array(weights) / sum(weights)
        weighted_ensemble_pred = np.average(predictions, axis=0, weights=weights)
    else:
        weighted_ensemble_pred = ensemble_pred

    # Evaluate ensemble
    simple_r2 = r2_score(y, ensemble_pred)
    weighted_r2 = r2_score(y, weighted_ensemble_pred)

    print(f"🔗 Ensemble Results:")
    print(f"   Simple Average: R² = {simple_r2:.3f}")
    print(f"   Weighted Average: R² = {weighted_r2:.3f}")
    print(f"   Models in ensemble: {len(model_names)}")

    return {
        'simple_ensemble': ensemble_pred,
        'weighted_ensemble': weighted_ensemble_pred,
        'simple_r2': simple_r2,
        'weighted_r2': weighted_r2,
        'model_names': model_names,
        'weights': weights
    }

# Build an ensemble from the trained models of every prepared target
ensemble_results = {}

for target_col, models in ml_models.items():
    # Targets without prepared modeling data are skipped
    if target_col not in modeling_data:
        continue

    prepared = modeling_data[target_col]
    target_name = prepared['name']
    X = prepared['X']
    y = prepared['y']

    print(f"\n🔗 Creating Ensemble for {target_name}\n" + "=" * 50)

    ensemble = create_ensemble_model(models, X, y)

    # create_ensemble_model returns None when no ensemble could be formed
    if not ensemble:
        continue

    ensemble_results[target_col] = ensemble

## 9. Save Models and Results

In [None]:
# Output locations for fitted models and for result tables
models_dir = Path('../models')
models_dir.mkdir(exist_ok=True)

results_dir = Path('../results')
results_dir.mkdir(exist_ok=True)

print("💾 SAVING MODELS AND RESULTS\n" + "=" * 50)

# Fitted models: one sub-directory per target (underscores stripped from
# the target name), one .joblib file per model
for target_col, models in ml_models.items():
    target_dir = models_dir.joinpath(''.join(target_col.split('_')))
    target_dir.mkdir(exist_ok=True)

    for model_name, model in models.items():
        joblib.dump(model, target_dir.joinpath(f'{model_name}.joblib'))
        print(f"   ✅ Saved {model_name} for {target_col}")

# Model comparison tables: one CSV per target
for target_col, comparison in model_comparisons.items():
    comparison.to_csv(
        results_dir.joinpath(f'model_comparison_{target_col}.csv'),
        index=False,
    )
    print(f"   📊 Saved comparison for {target_col}")

# Cross-validation: flatten {target: {model: stats}} into one row per pair
cv_summary = [
    {
        'target': target_col,
        'model': model_name,
        'cv_mean': scores['mean'],
        'cv_std': scores['std'],
        'cv_min': scores['min'],
        'cv_max': scores['max']
    }
    for target_col, cv_data in cv_results_all.items()
    for model_name, scores in cv_data.items()
]

if cv_summary:
    cv_df = pd.DataFrame(cv_summary)
    cv_df.to_csv(results_dir.joinpath('cross_validation_results.csv'), index=False)
    print(f"   📊 Saved cross-validation results")

# Ensembles: one summary row per target
ensemble_summary = [
    {
        'target': target_col,
        'simple_ensemble_r2': ensemble['simple_r2'],
        'weighted_ensemble_r2': ensemble['weighted_r2'],
        'models_count': len(ensemble['model_names']),
        'models_used': ', '.join(ensemble['model_names'])
    }
    for target_col, ensemble in ensemble_results.items()
]

if ensemble_summary:
    ensemble_df = pd.DataFrame(ensemble_summary)
    ensemble_df.to_csv(results_dir.joinpath('ensemble_results.csv'), index=False)
    print(f"   🔗 Saved ensemble results")

# Closing report: counts of what was produced in this notebook
total_trained = sum(len(models) for models in ml_models.values())
for summary_line in (
    f"\n📋 MODEL DEVELOPMENT SUMMARY:",
    f"   ✅ Targets modeled: {len(ml_models)}",
    f"   ✅ Total models trained: {total_trained}",
    f"   ✅ Time series models: {len(ts_models)}",
    f"   ✅ Ensemble models: {len(ensemble_results)}",
    f"   ✅ Cross-validation completed: {len(cv_results_all)} targets",
):
    print(summary_line)

for next_step_line in (
    "\n🚀 NEXT STEPS:",
    "   1. Move to 06_predictions.ipynb for forecasting",
    "   2. Use best performing models for future predictions",
    "   3. Generate confidence intervals and scenarios",
):
    print(next_step_line)

print("\n✅ Model Development Phase Complete!")