In [27]:
import utils as utils
from pyspark.sql.functions import *
from pyspark.ml.feature import VectorAssembler, StandardScaler, StringIndexer
from pyspark.ml.regression import LinearRegression, RandomForestRegressor, GBTRegressor
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml import Pipeline
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from builtins import min as python_min

In [7]:
spark = utils.create_context()

In [8]:
# Load weather data
#print("  Loading weather data...")
#df_weather = utils.read_iceberg_table( spark, "trusted", "aemetTrustedDiario")
#
#weather_count = df_weather.count()
#print(f"    Weather records: {weather_count}")

# Load hotel occupancy data
print("  Loading hotel occupancy data...")
df_hotels = utils.read_iceberg_table(spark=spark, db_name="exploitation", table_name="f_ocupacion_barcelona")
hotel_count = df_hotels.count()
print(f"    Hotel records: {hotel_count}")

print("✅ Data loaded successfully")

  Loading hotel occupancy data...
    Hotel records: 265
✅ Data loaded successfully


In [15]:
def create_hotel_features(df_hotels):
        """Create hotel-based features."""
        print("  Creating hotel features...")
        
        df_hotel_features = df_hotels.groupBy(
            col('año'),
            col('mes')
        ).agg(
            sum('viajeros').alias('hotel_viajeros'),
            sum('pernoctaciones').alias('hotel_pernoctaciones'),
            avg('estanciaMedia').alias('hotel_estancia_media'),
            avg('gradoOcupaPlazas').alias('avg_ocupacion')
        ).withColumn(
            # Hotel availability score
            'hotel_availability_score',
            100 - col('avg_ocupacion')
        )
        
        return df_hotel_features

In [16]:
df_hotel_feat = create_hotel_features(df_hotels)

  Creating hotel features...


In [17]:
df_hotel_feat = (
    df_hotel_feat
    .withColumn(
            # Seasonal score
            'seasonal_score',
            when((col('mes').isin([6, 7, 8])), 90)  # Summer
            .when((col('mes').isin([4, 5, 9, 10])), 85)  # Spring/Fall
            .when((col('mes').isin([11, 12, 1, 2, 3])), 60)  # Winter
            .otherwise(70)
        )
    .withColumn(
            'visit_quality_score',
            (coalesce(col('hotel_availability_score'), lit(75)) * 0.1)
        )
)

df_final = (
    df_hotel_feat
    .na.drop()

)
final_count = df_final.count()

In [25]:
train_data, test_data = df_final.randomSplit([0.8, 0.2], seed=42)
print(f"  Training data: {train_data.count()} rows")
print(f"  Test data: {test_data.count()} rows")

optional_features = [
    'total_viajeros', 'total_pernoctaciones', 'avg_estancia_media',
    'hotel_viajeros', 'hotel_pernoctaciones', 'avg_ocupacion'
]

# Check which optional features are available
available_features = []
for feat in optional_features:
    if feat in df_final.columns:
        available_features.append(feat)

feature_names = available_features
def create_models(feature_names) -> dict:

    """Create ML models."""
    print("\n🤖 Creating ML models...")
    
    # Prepare features for ML
    assembler = VectorAssembler(
        inputCols=feature_names,
        outputCol='features'
    )
    
    scaler = StandardScaler(
        inputCol='features',
        outputCol='scaled_features'
    )
    
    # Create models
    lr = LinearRegression(
        featuresCol='scaled_features',
        labelCol='visit_quality_score',
        predictionCol='prediction'
    )
    
    rf = RandomForestRegressor(
        featuresCol='scaled_features',
        labelCol='visit_quality_score',
        predictionCol='prediction',
        numTrees=100,
        seed=42
    )
    
    gbt = GBTRegressor(
        featuresCol='scaled_features',
        labelCol='visit_quality_score',
        predictionCol='prediction',
        maxIter=100,
        seed=42
    )
    
    models = {
        'LinearRegression': Pipeline(stages=[assembler, scaler, lr]),
        'RandomForest': Pipeline(stages=[assembler, scaler, rf]),
        'GradientBoosting': Pipeline(stages=[assembler, scaler, gbt])
    }
    
    print(f"  Created {len(models)} models: {list(models.keys())}")
    print("✅ Models created")
    return models

  Training data: 133 rows
  Test data: 25 rows


In [28]:
def train_and_evaluate(models, train_data, test_data):
    """Train and evaluate all models."""
    print("\n🏋️ Training and evaluating models...")
    
    # Initialize results dictionary
    results = {}
    
    evaluator = RegressionEvaluator(
        labelCol='visit_quality_score',
        predictionCol='prediction'
    )
    
    for name, model in models.items():
        print(f"\n  Training {name}...")
        
        try:
            # Train model
            model_fitted = model.fit(train_data)
            
            # Make predictions
            predictions = model_fitted.transform(test_data)
            
            # Evaluate
            rmse = evaluator.evaluate(predictions, {evaluator.metricName: 'rmse'})
            mae = evaluator.evaluate(predictions, {evaluator.metricName: 'mae'})
            r2 = evaluator.evaluate(predictions, {evaluator.metricName: 'r2'})
            
            results[name] = {
                'RMSE': rmse,
                'MAE': mae,
                'R2': r2,
                'model': model_fitted,
                'predictions': predictions
            }
            
            print(f"    RMSE: {rmse:.3f}")
            print(f"    MAE: {mae:.3f}")
            print(f"    R²: {r2:.3f}")
            
        except Exception as e:
            print(f"    ❌ Error training {name}: {e}")
            continue
    
    # Find best model
    best_model_name = None
    best_model = None
    
    if results:
        best_model_name = python_min(results.keys(), key=lambda x: results[x]['RMSE'])
        best_model = results[best_model_name]['model']
        
        print(f"\n🏆 Best model: {best_model_name}")
        print(f"   RMSE: {results[best_model_name]['RMSE']:.3f}")
        print(f"   R²: {results[best_model_name]['R2']:.3f}")
    
    print("✅ Training and evaluation completed")
    
    return results, best_model_name, best_model

In [32]:
def analyze_results(results, best_model_name, best_model, feature_names):
    """Analyze and visualize results."""
    import pandas as pd
    import os
    
    print("\n📈 Analyzing results...")
    
    if not results:
        print("❌ No results to analyze")
        return
    
    # Feature importance analysis (for tree-based models)
    if best_model_name in ['Random Forest', 'Gradient Boosting']:
        print("\n🔍 Feature Importance Analysis:")
        try:
            feature_importance = best_model.stages[-1].featureImportances.toArray()
            
            importance_df = pd.DataFrame({
                'feature': feature_names,
                'importance': feature_importance
            }).sort_values('importance', ascending=False)
            
            print("\nTop 10 Most Important Features:")
            print(importance_df.head(10).to_string(index=False))
            
            # Create output directory if it doesn't exist
            os.makedirs('ml_models', exist_ok=True)
            
            # Save feature importance
            importance_df.to_csv('ml_models/feature_importance.csv', index=False)
            print("\n💾 Feature importance saved to feature_importance.csv")
            
            # Return for further analysis in notebook
            feature_importance_result = importance_df
            
        except Exception as e:
            print(f"❌ Error analyzing feature importance: {e}")
            feature_importance_result = None
    else:
        feature_importance_result = None
    
    # Prediction analysis
    prediction_stats = None
    pred_df = None
    
    try:
        best_predictions = results[best_model_name]['predictions']
        
        # Convert to Pandas for analysis
        pred_df = best_predictions.select(
             'visit_quality_score', 'prediction'
        ).toPandas()
        
        print(f"\n📊 Prediction Statistics:")
        print(f"   Mean Actual Score: {pred_df['visit_quality_score'].mean():.2f}")
        print(f"   Mean Predicted Score: {pred_df['prediction'].mean():.2f}")
        print(f"   Std Actual Score: {pred_df['visit_quality_score'].std():.2f}")
        print(f"   Std Predicted Score: {pred_df['prediction'].std():.2f}")
        
        # Create prediction statistics dictionary
        prediction_stats = {
            'mean_actual': pred_df['visit_quality_score'].mean(),
            'mean_predicted': pred_df['prediction'].mean(),
            'std_actual': pred_df['visit_quality_score'].std(),
            'std_predicted': pred_df['prediction'].std()
        }
        
        # Create output directory if it doesn't exist
        os.makedirs('ml_models', exist_ok=True)
        
        # Save predictions
        pred_df.to_csv('ml_models/predictions.csv', index=False)
        print("\n💾 Predictions saved to predictions.csv")
        
    except Exception as e:
        print(f"❌ Error analyzing predictions: {e}")
    
    print("✅ Results analysis completed")
    
    # Return analysis results for further use in notebook
    return {
        'feature_importance': feature_importance_result,
        'prediction_stats': prediction_stats,
        'predictions_df': pred_df
    }

In [33]:
models = create_models(feature_names)
results, best_model_name, best_model = train_and_evaluate(models,train_data,test_data)
analyze_results(results,best_model_name,best_model,feature_names)



🤖 Creating ML models...
  Created 3 models: ['LinearRegression', 'RandomForest', 'GradientBoosting']
✅ Models created

🏋️ Training and evaluating models...

  Training LinearRegression...
    RMSE: 0.000
    MAE: 0.000
    R²: 1.000

  Training RandomForest...
    RMSE: 0.400
    MAE: 0.317
    R²: 0.939

  Training GradientBoosting...
    RMSE: 0.140
    MAE: 0.100
    R²: 0.993

🏆 Best model: LinearRegression
   RMSE: 0.000
   R²: 1.000
✅ Training and evaluation completed

📈 Analyzing results...

📊 Prediction Statistics:
   Mean Actual Score: 4.34
   Mean Predicted Score: 4.34
   Std Actual Score: 1.65
   Std Predicted Score: 1.65

💾 Predictions saved to predictions.csv
✅ Results analysis completed


{'feature_importance': None,
 'prediction_stats': {'mean_actual': np.float64(4.34214),
  'mean_predicted': np.float64(4.3421399999999855),
  'std_actual': np.float64(1.652334236012799),
  'std_predicted': np.float64(1.6523342360128401)},
 'predictions_df':     visit_quality_score  prediction
 0                3.9640      3.9640
 1                1.7320      1.7320
 2                2.5180      2.5180
 3                4.9720      4.9720
 4                1.6570      1.6570
 5                5.5180      5.5180
 6                2.7390      2.7390
 7                5.2410      5.2410
 8                2.5170      2.5170
 9                4.0920      4.0920
 10               5.1650      5.1650
 11               5.4135      5.4135
 12               4.0350      4.0350
 13               1.5915      1.5915
 14               4.5525      4.5525
 15               3.6025      3.6025
 16               2.7225      2.7225
 17               6.6720      6.6720
 18               3.4300      3.4300
 19 