In [None]:
#!/usr/bin/env python3
#cell 1 imports and setup
"""

UBER FARE PREDICTION SYSTEM

Principal Objective: Accurate Fare Prediction using Integrated ML Approach


CRISP-DM PHASES IMPLEMENTED:

1. Business Understanding - Fare prediction for pricing optimization

2. Data Understanding - Explore and analyze Uber trip data

3. Data Preparation - Clean, transform, and engineer features

4. Modeling - Train Random Forest with clustering features

5. Evaluation - Validate model performance and accuracy

6. Deployment - Create prediction interface and visualizations


BUSINESS OBJECTIVES (BOS):

1. Client Segmentation & Profiling - Identify distinct client groups based on ride behavior patterns

2. Cluster Visualization & PCA Analysis - Visualize client segments and reduce dimensionality for interpretability

3. Fare Prediction - Accurate prediction of trip fares using machine learning

"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
import warnings
warnings.filterwarnings('ignore')

# Set style for visualizations
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (12, 8)

# New section

In [None]:
#cell 2 business understanding
class UberFarePredictor:
    """
    Uber fare prediction system: predicts fare amounts from trip characteristics.

    Business objectives served by the state held on each instance:
    - BOS-01: Client Segmentation & Profiling (K-means)
    - BOS-02: Cluster Visualization & PCA Analysis (PCA)
    - BOS-03: Fare Prediction (Random Forest regressor)

    The constructor only creates empty/unfitted state; nothing is loaded or
    trained at construction time.
    """

    def __init__(self):
        # --- BOS-03: fare prediction ---
        # Regressor is created during training; None until then.
        self.model = None
        # Scaler applied to the prediction features.
        self.scaler = StandardScaler()
        # Names of the columns the model was trained on (filled in by training).
        self.feature_names = list()
        # Flipped to True once a model has been fitted.
        self.is_trained = False

        # --- BOS-01 / BOS-02: segmentation and visualization ---
        # K-means estimator; None until segmentation has run.
        self.kmeans = None
        # PCA projection; None until segmentation has run.
        self.pca = None
        # Separate scaler dedicated to the clustering features.
        self.cluster_scaler = StandardScaler()

        # --- BOS-03: performance tracking ---
        # Hold-out evaluation metrics.
        self.metrics = dict()
        # Cross-validation results.
        self.cv_scores = dict()

In [None]:
#cell 3 data understanding and preparation
def load_and_process_data(self, path='datauber.csv'):
    """
    CRISP-DM PHASE 2 & 3: DATA UNDERSTANDING & DATA PREPARATION
    Objective: Load, explore, and preprocess Uber trip data

    Processing steps:
    1. Drop rows missing any required column or whose pickup timestamp
       cannot be parsed.
    2. Keep fares in the range (0, 200].
    3. Compute the haversine trip distance in km and keep trips within
       [0.1, 100] km.
    4. Add 'manhattan_distance' (sum of absolute latitude and longitude
       differences, in degrees), hour / day_of_week / month, and 0/1 flags
       for weekend, rush hour, night, late night, short trip and long trip.
    5. Show a correlation heatmap of the numeric and boolean columns.

    Supports all BOS:
    - BOS-01: Prepares features for client segmentation
    - BOS-02: Creates data for cluster visualization
    - BOS-03: Engineers features for fare prediction

    Args:
        path: Location of the CSV file to read.

    Returns:
        pandas.DataFrame: the cleaned trips with the engineered columns added.

    Raises:
        FileNotFoundError: if `path` does not exist (raised by pandas).
        KeyError: if a required column is missing from the file (raised by
            pandas when dropping incomplete rows).
    """
    print("CRISP-DM PHASE 2/3: Data Understanding & Preparation...")
    df = pd.read_csv(path)

    # DATA CLEANING: Remove invalid records
    required_cols = ['pickup_latitude', 'pickup_longitude',
                    'dropoff_latitude', 'dropoff_longitude',
                    'fare_amount', 'pickup_datetime', 'passenger_count']
    df = df.dropna(subset=required_cols)

    # Unparseable timestamps become NaT and are dropped, so a single bad
    # value does not abort the whole load.
    df['pickup_datetime'] = pd.to_datetime(df['pickup_datetime'], errors='coerce')
    df = df.dropna(subset=['pickup_datetime'])

    # BOS-03: TARGET VARIABLE - Clean fare amount for prediction.
    # Non-numeric fares become NaN and fail the `> 0` test below.
    df['fare_amount'] = pd.to_numeric(df['fare_amount'], errors='coerce')
    # .copy() so the column assignments below write to an independent frame
    # instead of a filtered slice.
    df = df[(df['fare_amount'] > 0) & (df['fare_amount'] <= 200)].copy()

    # FEATURE ENGINEERING for all BOS
    def haversine_distance(lat1, lon1, lat2, lon2):
        """Great-circle distance in km; works element-wise on whole columns."""
        R = 6371  # Earth radius in km
        lat1_rad, lon1_rad = np.radians(lat1), np.radians(lon1)
        lat2_rad, lon2_rad = np.radians(lat2), np.radians(lon2)
        dlat, dlon = lat2_rad - lat1_rad, lon2_rad - lon1_rad
        a = np.sin(dlat/2)**2 + np.cos(lat1_rad)*np.cos(lat2_rad)*np.sin(dlon/2)**2
        # Rounding can push `a` marginally outside [0, 1], which would make
        # the square roots below return NaN.
        a = np.clip(a, 0.0, 1.0)
        return R * 2 * np.arctan2(np.sqrt(a), np.sqrt(1-a))

    # Vectorized over the columns: one NumPy pass instead of a Python-level
    # call per row.
    df['distance_km'] = haversine_distance(
        df['pickup_latitude'], df['pickup_longitude'],
        df['dropoff_latitude'], df['dropoff_longitude']
    )
    df = df[(df['distance_km'] >= 0.1) & (df['distance_km'] <= 100)].copy()

    # BOS-03: Additional geographic feature for fare prediction
    df['manhattan_distance'] = (abs(df['pickup_latitude'] - df['dropoff_latitude']) +
                               abs(df['pickup_longitude'] - df['dropoff_longitude']))

    # BOS-01 & BOS-03: Temporal features for segmentation and prediction
    df['hour'] = df['pickup_datetime'].dt.hour
    df['day_of_week'] = df['pickup_datetime'].dt.dayofweek
    df['month'] = df['pickup_datetime'].dt.month

    # BOS-01 & BOS-03: Behavioral flags for segmentation and fare patterns
    df['is_weekend'] = (df['day_of_week'] >= 5).astype(int)
    df['is_rush_hour'] = df['hour'].isin([7, 8, 9, 17, 18, 19]).astype(int)
    df['is_night'] = df['hour'].isin([22, 23, 0, 1, 2, 3, 4, 5]).astype(int)
    df['is_late_night'] = df['hour'].isin([0, 1, 2, 3, 4]).astype(int)

    # BOS-03: Trip characteristics for fare prediction
    df['is_short_trip'] = (df['distance_km'] < 2).astype(int)
    df['is_long_trip'] = (df['distance_km'] > 10).astype(int)

    # Add correlation heatmap.
    # Restrict to numeric/boolean columns: the frame still carries the
    # datetime column and any text columns from the CSV, and recent pandas
    # versions raise when asked to correlate those.
    print("\nVisualizing feature correlations...")
    correlation_matrix = df.select_dtypes(include=['number', 'bool']).corr()
    plt.figure(figsize=(12, 10))
    sns.heatmap(correlation_matrix, annot=True, fmt=".2f", cmap="coolwarm", cbar=True)
    plt.title('Feature Correlation Heatmap')
    plt.show()
    print("‚úÖ Correlation heatmap displayed.")


    print(f"‚úÖ Processed {len(df)} trips for all BOS")
    return df

# Add method to class: bind the module-level function defined above as
# UberFarePredictor.load_and_process_data so it can be called on instances
# (its first parameter receives the instance as `self`).
UberFarePredictor.load_and_process_data = load_and_process_data

In [None]:
#Cell 4: Modeling - Client Segmentation (BOS-01)
def perform_client_segmentation(self, df):
    """
    BOS-01: CLIENT SEGMENTATION & PROFILING
    CRISP-DM PHASE 4: MODELING - Clustering model for client segmentation

    Scales four ride-behaviour columns, projects them with PCA, fits K-means
    on the projection and stores the fitted scaler, PCA and K-means on the
    instance.

    Args:
        df: Trip frame containing 'distance_km', 'hour', 'day_of_week' and
            'passenger_count'. It is modified in place: a
            'trip_pattern_cluster' column is added.

    Returns:
        tuple: (df, labels) where `labels` is the cluster id of each row.
    """
    print("BOS-01: Performing client segmentation...")

    # Behavioural columns used for segmentation; missing values count as 0.
    segmentation_columns = ['distance_km', 'hour', 'day_of_week', 'passenger_count']
    standardized = self.cluster_scaler.fit_transform(
        df[segmentation_columns].fillna(0).values
    )

    # BOS-02: keep at most three principal components.
    component_count = min(3, standardized.shape[1])
    self.pca = PCA(n_components=component_count, random_state=42)
    projected = self.pca.fit_transform(standardized)

    # BOS-01: choose k, then fit the final clustering on the PCA projection.
    optimal_k = self._find_optimal_clusters(projected)
    self.kmeans = KMeans(n_clusters=optimal_k, random_state=42, n_init=10)
    kmeans_labels = self.kmeans.fit_predict(projected)

    # The cluster id becomes an input feature for BOS-03 fare prediction.
    df['trip_pattern_cluster'] = kmeans_labels

    print(f"‚úÖ BOS-01: Discovered {optimal_k} client segments")
    return df, kmeans_labels

def _find_optimal_clusters(self, X, max_k=8):
    """
    BOS-01: OPTIMAL CLUSTER SELECTION
    Objective: Find the right number of clusters for meaningful segmentation
    """
    wcss = []
    for k in range(2, max_k + 1):
        kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
        kmeans.fit(X)
        wcss.append(kmeans.inertia_)

    differences = np.diff(wcss)
    second_diff = np.diff(differences)
    optimal_k = np.argmax(second_diff) + 3
    return min(optimal_k, 6)

# Add methods to class: bind the two segmentation functions defined above
# as methods of UberFarePredictor so they can be called on instances.
UberFarePredictor.perform_client_segmentation = perform_client_segmentation
UberFarePredictor._find_optimal_clusters = _find_optimal_clusters

In [None]:
#Cell 5: Evaluation - Cluster Visualization (BOS-02)
def visualize_clusters(self, df, cluster_labels, silhouette_sample_size=10000):
    """
    BOS-02: CLUSTER VISUALIZATION & PCA ANALYSIS
    CRISP-DM PHASE 5: EVALUATION - Visual assessment of clustering results
    Objective: Visualize client segments and reduce dimensionality for interpretability

    Draws a 2x2 figure (PCA scatter, cluster sizes, explained variance,
    text summary), saves it as 'cluster_analysis.png' and shows it.
    Requires the clustering scaler and PCA on the instance to be fitted.

    Args:
        df: Trip frame containing 'distance_km', 'hour', 'day_of_week' and
            'passenger_count'.
        cluster_labels: Cluster id for each row of `df`.
        silhouette_sample_size: Maximum number of rows used to compute the
            silhouette score. The score needs pairwise distances, so its
            cost grows quadratically with the row count; larger inputs are
            scored on a random sample of this size. Pass None to always use
            every row.

    Returns:
        None
    """
    print("BOS-02: Creating cluster visualizations...")

    # Prepare features for visualization (same columns as the segmentation)
    cluster_features = ['distance_km', 'hour', 'day_of_week', 'passenger_count']
    X_cluster = df[cluster_features].fillna(0).values
    X_scaled = self.cluster_scaler.transform(X_cluster)
    X_pca = self.pca.transform(X_scaled)

    # BOS-02: Create comprehensive visualization
    fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(15, 12))

    # Plot 1: PCA 2D Visualization
    scatter = ax1.scatter(X_pca[:, 0], X_pca[:, 1], c=cluster_labels,
                         cmap='viridis', alpha=0.6, s=30)
    ax1.set_xlabel(f'PC1 ({self.pca.explained_variance_ratio_[0]:.2%} variance)')
    ax1.set_ylabel(f'PC2 ({self.pca.explained_variance_ratio_[1]:.2%} variance)')
    ax1.set_title('BOS-02: Client Segments (PCA Visualization)')
    plt.colorbar(scatter, ax=ax1, label='Cluster')

    # Plot 2: Cluster Size Distribution
    unique, counts = np.unique(cluster_labels, return_counts=True)
    bars = ax2.bar(unique, counts, color=plt.cm.viridis(np.linspace(0, 1, len(unique))))
    ax2.set_xlabel('Cluster ID')
    ax2.set_ylabel('Number of Clients')
    ax2.set_title('BOS-02: Cluster Size Distribution')
    for bar, count in zip(bars, counts):
        ax2.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.1,
                f'{count}', ha='center', va='bottom')

    # Plot 3: PCA Variance Explained
    explained_variance = self.pca.explained_variance_ratio_
    cumulative_variance = np.cumsum(explained_variance)
    ax3.bar(range(1, len(explained_variance) + 1), explained_variance,
            alpha=0.6, color='steelblue', label='Individual')
    ax3.plot(range(1, len(cumulative_variance) + 1), cumulative_variance,
            'ro-', label='Cumulative')
    ax3.set_xlabel('Principal Component')
    ax3.set_ylabel('Explained Variance Ratio')
    ax3.set_title('BOS-02: PCA Variance Explained')
    ax3.legend()
    ax3.grid(True, alpha=0.3)

    # Plot 4: Cluster Characteristics
    ax4.axis('off')
    if len(unique) < 2:
        # The silhouette score is undefined for a single cluster.
        sil_score = float('nan')
    else:
        # Score a fixed-seed random sample when the data is larger than the
        # cap; smaller inputs are scored exactly as before.
        use_sample = (silhouette_sample_size is not None
                      and len(X_pca) > silhouette_sample_size)
        sil_score = silhouette_score(
            X_pca, cluster_labels,
            sample_size=silhouette_sample_size if use_sample else None,
            random_state=42,
        )
    # cumulative_variance[1] is the share of variance captured by the two
    # components shown in Plot 1.
    metrics_text = f"""
    BOS-02: CLUSTERING RESULTS

    Clusters: {len(unique)}
    Total Clients: {len(df):,}
    Silhouette Score: {sil_score:.3f}
    PCA Variance: {cumulative_variance[1]:.2%}

    Cluster Sizes:
    {chr(10).join([f' Cluster {i}: {count}' for i, count in zip(unique, counts)])}
    """
    ax4.text(0.1, 0.5, metrics_text, fontsize=12, family='monospace',
            verticalalignment='center')

    plt.tight_layout()
    plt.savefig('cluster_analysis.png', dpi=300, bbox_inches='tight')
    plt.show()
    print("‚úÖ BOS-02: Cluster visualization saved as 'cluster_analysis.png'")

# Add method to class: bind the plotting function defined above as
# UberFarePredictor.visualize_clusters so it can be called on instances.
UberFarePredictor.visualize_clusters = visualize_clusters

In [None]:
#cell 6 modeling fare prediction BOS-03
def train_fare_prediction_model(self, df):
    """
    BOS-03: FARE PREDICTION
    CRISP-DM PHASE 4: MODELING - Train predictive model for fare amounts

    Runs the segmentation and its visualization first (the cluster id is one
    of the model inputs), then fits a Random Forest on an 80/20 split of the
    scaled features and reports hold-out metrics, cross-validation results
    and feature importances.

    Args:
        df: Prepared trip frame containing 'fare_amount' and the engineered
            feature columns.

    Returns:
        The fitted RandomForestRegressor (also stored as `self.model`).
    """
    print("BOS-03: Training fare prediction model...")

    # BOS-01 then BOS-02: segment the trips and plot the result.
    df, cluster_labels = self.perform_client_segmentation(df)
    self.visualize_clusters(df, cluster_labels)

    # BOS-03: candidate inputs, grouped by origin.
    geographic = ['distance_km', 'manhattan_distance']
    temporal = ['hour', 'day_of_week', 'month',
                'is_weekend', 'is_rush_hour', 'is_night', 'is_late_night']
    trip_traits = ['passenger_count', 'is_short_trip', 'is_long_trip']
    segmentation = ['trip_pattern_cluster']  # BOS-01 output
    candidates = geographic + temporal + trip_traits + segmentation

    # Only columns actually present in the frame are used.
    self.feature_names = [name for name in candidates if name in df.columns]

    inputs = df[self.feature_names].fillna(0)
    target = df['fare_amount']

    # BOS-03: 80/20 hold-out split with a fixed seed.
    X_train, X_test, y_train, y_test = train_test_split(
        inputs, target, test_size=0.2, random_state=42
    )

    # BOS-03: the scaler is fitted on the training part only.
    train_scaled = self.scaler.fit_transform(X_train)
    test_scaled = self.scaler.transform(X_test)

    # BOS-03: fit the forest.
    forest_settings = dict(n_estimators=100, max_depth=15, random_state=42, n_jobs=-1)
    self.model = RandomForestRegressor(**forest_settings)
    self.model.fit(train_scaled, y_train)
    self.is_trained = True

    # BOS-03: evaluation reports.
    self._evaluate_model(train_scaled, y_train, test_scaled, y_test)
    self._perform_cross_validation(train_scaled, y_train)
    self._analyze_feature_importance()

    return self.model

# Add method to class
# Bind the training function defined above as a method of UberFarePredictor.
UberFarePredictor.train_fare_prediction_model = train_fare_prediction_model

In [None]:
#cell 7 evaluation model performance
def _evaluate_model(self, X_train, y_train, X_test, y_test):
    """
    BOS-03: MODEL PERFORMANCE EVALUATION
    CRISP-DM PHASE 5: EVALUATION - Assess prediction accuracy

    Scores the fitted model on the training and test data, stores the results
    in `self.metrics` and prints a summary. The 'r2' and 'test_r2' entries
    hold the same test-set R² value.

    Args:
        X_train, y_train: Training features and fares.
        X_test, y_test: Hold-out features and fares.
    """
    predicted_train = self.model.predict(X_train)
    predicted_test = self.model.predict(X_test)

    # Goodness of fit on both partitions.
    train_r2 = r2_score(y_train, predicted_train)
    test_r2 = r2_score(y_test, predicted_test)

    # Error magnitudes on the hold-out set.
    abs_error = mean_absolute_error(y_test, predicted_test)
    squared_error = mean_squared_error(y_test, predicted_test)
    root_squared_error = np.sqrt(squared_error)

    self.metrics = {
        'train_r2': train_r2,
        'test_r2': test_r2,
        'mae': abs_error,
        'mse': squared_error,
        'rmse': root_squared_error,
        'r2': test_r2
    }

    report_lines = [
        "\n" + "="*50,
        "BOS-03: FARE PREDICTION PERFORMANCE",
        "="*50,
        f"üìä Training R¬≤: {train_r2:.4f}",
        f"üìä Test R¬≤: {test_r2:.4f}",
        f"üí∞ Mean Absolute Error: ${abs_error:.2f}",
        f"üìà Root Mean Squared Error: ${root_squared_error:.2f}",
        f"üéØ R¬≤ Score: {test_r2:.4f}",
    ]
    for line in report_lines:
        print(line)

    # Put the absolute error in proportion to a typical fare.
    avg_fare = y_test.mean()
    print(f"üí° MAE is {abs_error/avg_fare*100:.1f}% of average fare (${avg_fare:.2f})")

def _perform_cross_validation(self, X_train, y_train):
    """
    BOS-03: CROSS-VALIDATION FOR RELIABILITY
    CRISP-DM PHASE 5: EVALUATION - Ensure model consistency

    Runs 5-fold shuffled cross-validation of `self.model` on the training
    data, stores per-fold scores and their aggregates in `self.cv_scores`
    and prints a summary.

    All three metrics are collected from a single cross-validation pass, so
    the model is refitted once per fold (5 fits) rather than once per fold
    per metric.

    Args:
        X_train: Training features.
        y_train: Training fares.
    """
    # Local import: the file-level scikit-learn imports do not include
    # cross_validate.
    from sklearn.model_selection import cross_validate

    print("\nBOS-03: Performing cross-validation...")

    cv = KFold(n_splits=5, shuffle=True, random_state=42)

    # Each requested scorer is returned under the key 'test_<scorer name>'.
    fold_results = cross_validate(
        self.model, X_train, y_train, cv=cv,
        scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error'),
    )
    cv_r2 = fold_results['test_r2']
    cv_neg_mse = fold_results['test_neg_mean_squared_error']
    cv_neg_mae = fold_results['test_neg_mean_absolute_error']

    # The error scorers are negated by scikit-learn; flip the sign so the
    # stored values are plain errors. (The std is unaffected by the sign.)
    self.cv_scores = {
        'r2_scores': cv_r2,
        'r2_mean': cv_r2.mean(),
        'r2_std': cv_r2.std(),
        'mse_scores': -cv_neg_mse,
        'mse_mean': -cv_neg_mse.mean(),
        'rmse_mean': np.sqrt(-cv_neg_mse.mean()),
        'mae_scores': -cv_neg_mae,
        'mae_mean': -cv_neg_mae.mean(),
        'mae_std': cv_neg_mae.std()
    }

    print(f"‚úÖ BOS-03: Cross-Validation Results (5-fold):")
    print(f" R¬≤: {self.cv_scores['r2_mean']:.4f} ¬± {self.cv_scores['r2_std']:.4f}")
    print(f" MAE: ${self.cv_scores['mae_mean']:.2f} ¬± ${self.cv_scores['mae_std']:.2f}")
    print(f" RMSE: ${self.cv_scores['rmse_mean']:.2f}")

def _analyze_feature_importance(self):
    """
    BOS-03: FEATURE IMPORTANCE ANALYSIS
    CRISP-DM PHASE 5: EVALUATION - Understand prediction drivers
    """
    importances = self.model.feature_importances_
    feature_importance = sorted(
        zip(self.feature_names, importances),
        key=lambda x: x[1],
        reverse=True
    )

    print("\nüîç BOS-03: TOP 10 FEATURES INFLUENCING FARE PREDICTIONS:")
    print("-" * 50)
    for i, (feat, imp) in enumerate(feature_importance[:10], 1):
        print(f"{i:2d}. {feat:20s}: {imp:.4f}")

# Add methods to class: bind the three evaluation helpers defined above as
# private methods of UberFarePredictor.
UberFarePredictor._evaluate_model = _evaluate_model
UberFarePredictor._perform_cross_validation = _perform_cross_validation
UberFarePredictor._analyze_feature_importance = _analyze_feature_importance

In [None]:
#cell 8 deployment prediction interface
def predict_fare(self, pickup_lat, pickup_lon, dropoff_lat, dropoff_lon,
                passenger_count=1, pickup_time=None):
    """
    BOS-03: FARE PREDICTION INTERFACE
    CRISP-DM PHASE 6: DEPLOYMENT - Make predictions for new trips

    Args:
        pickup_lat, pickup_lon: Pickup coordinates in degrees.
        dropoff_lat, dropoff_lon: Drop-off coordinates in degrees.
        passenger_count: Number of passengers.
        pickup_time: Pickup moment. May be a pandas Timestamp, a
            `datetime`, or a date/time string; None means "now".

    Returns:
        dict with keys 'predicted_fare' (after business rules),
        'distance_km', 'base_prediction' (raw model output, rounded),
        'surge_multiplier', 'time_of_day', 'day_type', 'demand_period'
        and 'trip_pattern'.

    Raises:
        ValueError: if no model has been trained yet, or if `pickup_time`
            cannot be interpreted as a timestamp.
    """
    if not self.is_trained:
        raise ValueError("Model not trained. Call train_fare_prediction_model first.")

    # Normalize to a Timestamp so strings and datetime objects are accepted
    # as well; a Timestamp passes through unchanged.
    if pickup_time is None:
        pickup_time = pd.Timestamp.now()
    else:
        pickup_time = pd.Timestamp(pickup_time)

    # Feature engineering for prediction
    features = self._engineer_features_for_prediction(
        pickup_lat, pickup_lon, dropoff_lat, dropoff_lon,
        passenger_count, pickup_time
    )

    # Make prediction.
    # Build a one-row frame carrying the training column names: training
    # fits the scaler on a DataFrame, and scikit-learn warns when a scaler
    # fitted with feature names is later given an unnamed array.
    feature_row = pd.DataFrame(
        [[features.get(col, 0) for col in self.feature_names]],
        columns=self.feature_names,
    )
    feature_scaled = self.scaler.transform(feature_row)
    predicted_fare = self.model.predict(feature_scaled)[0]

    # Apply business rules
    final_fare = self._apply_business_rules(predicted_fare, features)

    return {
        'predicted_fare': final_fare,
        'distance_km': round(features['distance_km'], 2),
        'base_prediction': round(predicted_fare, 2),
        'surge_multiplier': self._calculate_surge_multiplier(features),
        'time_of_day': f"{features['hour']:02d}:00",
        'day_type': 'Weekend' if features['is_weekend'] else 'Weekday',
        'demand_period': self._get_demand_period(features),
        'trip_pattern': features['trip_pattern_cluster']
    }

def _engineer_features_for_prediction(self, pickup_lat, pickup_lon, dropoff_lat, dropoff_lon,
                                    passenger_count, pickup_time):
    """Feature engineering for single prediction"""
    def haversine_distance(lat1, lon1, lat2, lon2):
        R = 6371
        lat1_rad, lon1_rad = np.radians(lat1), np.radians(lon1)
        lat2_rad, lon2_rad = np.radians(lat2), np.radians(lon2)
        dlat, dlon = lat2_rad - lat1_rad, lon2_rad - lon1_rad
        a = np.sin(dlat/2)**2 + np.cos(lat1_rad)*np.cos(lat2_rad)*np.sin(dlon/2)**2
        return R * 2 * np.arctan2(np.sqrt(a), np.sqrt(1-a))

    distance = haversine_distance(pickup_lat, pickup_lon, dropoff_lat, dropoff_lon)
    manhattan_dist = abs(pickup_lat - dropoff_lat) + abs(pickup_lon - dropoff_lon)

    hour = pickup_time.hour
    day_of_week = pickup_time.weekday()
    month = pickup_time.month

    is_weekend = 1 if day_of_week >= 5 else 0
    is_rush_hour = 1 if hour in [7, 8, 9, 17, 18, 19] else 0
    is_night = 1 if hour in [22, 23, 0, 1, 2, 3, 4, 5] else 0
    is_late_night = 1 if hour in [0, 1, 2, 3, 4] else 0

    is_short_trip = 1 if distance < 2 else 0
    is_long_trip = 1 if distance > 10 else 0

    # BOS-01: Predict cluster for this trip
    cluster_features = np.array([[distance, hour, day_of_week, passenger_count]])
    cluster_scaled = self.cluster_scaler.transform(cluster_features)
    cluster_pca = self.pca.transform(cluster_scaled)
    trip_pattern_cluster = self.kmeans.predict(cluster_pca)[0]

    features = {
        'distance_km': distance,
        'manhattan_distance': manhattan_dist,
        'hour': hour,
        'day_of_week': day_of_week,
        'month': month,
        'is_weekend': is_weekend,
        'is_rush_hour': is_rush_hour,
        'is_night': is_night,
        'is_late_night': is_late_night,
        'passenger_count': passenger_count,
        'is_short_trip': is_short_trip,
        'is_long_trip': is_long_trip,
        'trip_pattern_cluster': trip_pattern_cluster
    }

    return features

def _apply_business_rules(self, predicted_fare, features):
    """Apply business rules to predicted fare"""
    final_fare = predicted_fare
    final_fare = max(final_fare, 3.50)  # Minimum fare

    if features['is_short_trip']:
        final_fare = max(final_fare, 7.00)

    if features['is_late_night']:
        final_fare *= 1.30
    elif features['is_night']:
        final_fare *= 1.15

    if features['is_rush_hour']:
        final_fare *= 1.10

    if features['is_weekend']:
        final_fare *= 1.05

    return round(final_fare, 2)

def _calculate_surge_multiplier(self, features):
    """Calculate surge multiplier"""
    multiplier = 1.0
    if features['is_late_night']: multiplier *= 1.30
    elif features['is_night']: multiplier *= 1.15
    if features['is_rush_hour']: multiplier *= 1.10
    if features['is_weekend']: multiplier *= 1.05
    return round(multiplier, 2)

def _get_demand_period(self, features):
    """Classify demand period"""
    if features['is_late_night']: return "Late Night (High Demand)"
    elif features['is_night']: return "Night (Medium Demand)"
    elif features['is_rush_hour']: return "Rush Hour (High Demand)"
    else: return "Off-Peak (Low Demand)"

# Add methods to class: bind the prediction entry point and its four
# helpers, all defined above, as methods of UberFarePredictor.
UberFarePredictor.predict_fare = predict_fare
UberFarePredictor._engineer_features_for_prediction = _engineer_features_for_prediction
UberFarePredictor._apply_business_rules = _apply_business_rules
UberFarePredictor._calculate_surge_multiplier = _calculate_surge_multiplier
UberFarePredictor._get_demand_period = _get_demand_period

In [None]:
#cell 9 deployment performance dashboard
def create_performance_dashboard(self, df):
    """
    BOS-03: PERFORMANCE VISUALIZATION
    CRISP-DM PHASE 6: DEPLOYMENT - Model performance dashboard

    Predicts fares for every row of `df` with the trained model and draws a
    2x3 figure: actual vs predicted, residuals, error histogram, top feature
    importances, cross-validation means and a text summary. The figure is
    saved as 'fare_prediction_performance.png' and shown.

    Requires a trained model plus populated `self.metrics` and
    `self.cv_scores`.

    Args:
        df: Prepared trip frame containing 'fare_amount' and the feature
            columns.
    """
    print("BOS-03: Creating performance visualization dashboard...")

    inputs, actual = self.prepare_prediction_features(df)
    predicted = self.model.predict(self.scaler.transform(inputs))
    errors = actual - predicted

    # Ten most important features, largest first.
    ranked = sorted(zip(self.feature_names, self.model.feature_importances_),
                    key=lambda pair: pair[1], reverse=True)
    top_names, top_weights = zip(*ranked[:10])

    plt.figure(figsize=(20, 12))

    # Panel 1: actual vs predicted, with the identity line as reference.
    plt.subplot(2, 3, 1)
    plt.scatter(actual, predicted, alpha=0.5, s=20)
    fare_span = [actual.min(), actual.max()]
    plt.plot(fare_span, fare_span, 'r--', lw=2)
    plt.xlabel('Actual Fare ($)')
    plt.ylabel('Predicted Fare ($)')
    plt.title('BOS-03: Actual vs Predicted Fares')
    plt.grid(True, alpha=0.3)

    # Panel 2: residuals against the predicted value.
    plt.subplot(2, 3, 2)
    plt.scatter(predicted, errors, alpha=0.5, s=20)
    plt.axhline(y=0, color='r', linestyle='--', lw=2)
    plt.xlabel('Predicted Fare ($)')
    plt.ylabel('Residuals ($)')
    plt.title('BOS-03: Residual Plot')
    plt.grid(True, alpha=0.3)

    # Panel 3: distribution of the prediction errors.
    plt.subplot(2, 3, 3)
    plt.hist(errors, bins=50, edgecolor='black', alpha=0.7)
    plt.axvline(x=0, color='r', linestyle='--', lw=2)
    plt.xlabel('Prediction Error ($)')
    plt.ylabel('Frequency')
    plt.title('BOS-03: Prediction Error Distribution')
    plt.grid(True, alpha=0.3)

    # Panel 4: feature importances, most important at the top.
    plt.subplot(2, 3, 4)
    bar_slots = np.arange(len(top_names))
    plt.barh(bar_slots, top_weights, color='steelblue', edgecolor='black')
    plt.yticks(bar_slots, top_names, fontsize=9)
    plt.xlabel('Importance')
    plt.title('BOS-03: Top 10 Feature Importances')
    plt.gca().invert_yaxis()
    plt.grid(True, alpha=0.3)

    # Panel 5: mean cross-validation scores.
    plt.subplot(2, 3, 5)
    cv_labels = ['R¬≤', 'MAE', 'RMSE']
    cv_values = [self.cv_scores[key] for key in ('r2_mean', 'mae_mean', 'rmse_mean')]
    label_slots = np.arange(len(cv_labels))
    plt.bar(label_slots, cv_values, color=['green', 'orange', 'red'],
           alpha=0.7, edgecolor='black')
    plt.xticks(label_slots, cv_labels)
    plt.ylabel('Score')
    plt.title('BOS-03: Cross-Validation Performance')
    plt.grid(True, alpha=0.3)

    # Panel 6: text-only summary.
    plt.subplot(2, 3, 6)
    plt.axis('off')
    summary_text = f"""
    BOS-03: FARE PREDICTION PERFORMANCE

    Model: Random Forest Regressor
    Samples: {len(df):,} trips
    Features: {len(self.feature_names)}

    Accuracy Metrics:
    ‚Ä¢ R¬≤ Score: {self.metrics['r2']:.4f}
    ‚Ä¢ MAE: ${self.metrics['mae']:.2f}
    ‚Ä¢ RMSE: ${self.metrics['rmse']:.2f}

    Cross-Validation:
    ‚Ä¢ R¬≤: {self.cv_scores['r2_mean']:.4f} ¬± {self.cv_scores['r2_std']:.4f}
    ‚Ä¢ MAE: ${self.cv_scores['mae_mean']:.2f}
    """
    plt.text(0.1, 0.5, summary_text, fontsize=11, family='monospace',
            verticalalignment='center')

    plt.tight_layout()
    plt.savefig('fare_prediction_performance.png', dpi=300, bbox_inches='tight')
    plt.show()

    print("‚úÖ BOS-03: Performance dashboard saved as 'fare_prediction_performance.png'")

def prepare_prediction_features(self, df):
    """
    Split a prepared trip frame into model inputs and target.

    Args:
        df: Frame containing 'fare_amount' and any subset of the known
            feature columns.

    Returns:
        tuple: (X, y) where X holds the known feature columns present in
        `df` (in the canonical order, missing values replaced by 0) and y
        is the 'fare_amount' column.
    """
    known_features = (
        'distance_km', 'manhattan_distance', 'hour', 'day_of_week', 'month',
        'is_weekend', 'is_rush_hour', 'is_night', 'is_late_night',
        'passenger_count', 'is_short_trip', 'is_long_trip', 'trip_pattern_cluster'
    )

    # Keep only the columns that actually exist, preserving the order above.
    present = []
    for column in known_features:
        if column in df.columns:
            present.append(column)

    return df[present].fillna(0), df['fare_amount']

# Add methods to class: bind the dashboard function and its feature helper,
# both defined above, as methods of UberFarePredictor.
UberFarePredictor.create_performance_dashboard = create_performance_dashboard
UberFarePredictor.prepare_prediction_features = prepare_prediction_features

In [None]:
#cell 10 complete system execution
def run_complete_system(self, data_path='datauber.csv'):
    """
    COMPLETE SYSTEM EXECUTION
    CRISP-DM END-TO-END IMPLEMENTATION

    Loads and prepares the data, trains the fare model (which also runs the
    segmentation and its visualization), draws the performance dashboard and
    prints a summary.

    Args:
        data_path: CSV file handed to the data-loading step.

    Returns:
        The trained model returned by the training step.
    """
    rule = "="*50
    banner = (
        "üöï UBER FARE PREDICTION SYSTEM",
        rule,
        "EXECUTING ALL BUSINESS OBJECTIVES:",
        "BOS-01: Client Segmentation & Profiling",
        "BOS-02: Cluster Visualization & PCA Analysis",
        "BOS-03: Fare Prediction - Principal Objective",
        rule,
    )
    for line in banner:
        print(line)

    # Phase 2/3: data understanding and preparation.
    trips = self.load_and_process_data(data_path)

    # Phase 4/5/6: modeling, evaluation and deployment for all BOS.
    model = self.train_fare_prediction_model(trips)

    # BOS-03: performance visualization.
    self.create_performance_dashboard(trips)

    # Closing summary.
    wide_rule = "="*60
    completed = (
        "\n" + wide_rule,
        "SYSTEM EXECUTION COMPLETE - ALL BOS ACHIEVED",
        wide_rule,
        "üìä BOS-01: Client Segmentation - COMPLETE",
        " ‚Ä¢ Identified distinct client segments",
        " ‚Ä¢ Created trip pattern clusters",
        "üé® BOS-02: Cluster Visualization - COMPLETE",
        " ‚Ä¢ PCA dimensionality reduction applied",
        " ‚Ä¢ Cluster visualization created",
        " ‚Ä¢ Saved as 'cluster_analysis.png'",
        "üí∞ BOS-03: Fare Prediction - COMPLETE",
    )
    for line in completed:
        print(line)
    print(f" ‚Ä¢ Model R¬≤ Score: {self.metrics['r2']:.4f}")
    print(f" ‚Ä¢ Prediction MAE: ${self.metrics['mae']:.2f}")
    print(f" ‚Ä¢ Performance dashboard saved")

    print("\nüöÄ SYSTEM READY FOR FARE PREDICTIONS!")
    return model

# Add method to class: bind the end-to-end runner defined above as
# UberFarePredictor.run_complete_system.
UberFarePredictor.run_complete_system = run_complete_system

In [None]:
#cell 11 main execution
def main():
    """
    MAIN EXECUTION FUNCTION

    Builds a predictor, runs the complete pipeline with its default data
    path, then prints one sample prediction. Any exception raised along the
    way is reported with its traceback instead of propagating.
    """
    predictor = UberFarePredictor()

    try:
        predictor.run_complete_system()

        # Demo trip announcement.
        for line in ("\nüéØ DEMO: Sample Fare Prediction",
                     "From: Times Square, NYC",
                     "To: Central Park, NYC",
                     "Passengers: 2"):
            print(line)

        # Sample (latitude, longitude) pairs for the demonstration.
        times_square = (40.7580, -73.9855)
        central_park = (40.7829, -73.9654)
        quote = predictor.predict_fare(
            pickup_lat=times_square[0], pickup_lon=times_square[1],
            dropoff_lat=central_park[0], dropoff_lon=central_park[1],
            passenger_count=2
        )

        print(f"\nüí∞ Predicted Fare: ${quote['predicted_fare']}")
        print(f"üìè Distance: {quote['distance_km']} km")
        print(f"‚è∞ Time: {quote['time_of_day']} ({quote['day_type']})")
        print(f"üìä Demand: {quote['demand_period']}")
        print(f"üéØ Surge Multiplier: {quote['surge_multiplier']}x")

    except Exception as e:
        # Top-level boundary: report the failure with its traceback.
        print(f"‚ùå Error during system execution: {e}")
        import traceback
        traceback.print_exc()

# Run the full pipeline only when this module is executed as the main
# program, not when it is imported.
if __name__ == "__main__":
    main()

üöï UBER FARE PREDICTION SYSTEM
EXECUTING ALL BUSINESS OBJECTIVES:
BOS-01: Client Segmentation & Profiling
BOS-02: Cluster Visualization & PCA Analysis
BOS-03: Fare Prediction - Principal Objective
CRISP-DM PHASE 2/3: Data Understanding & Preparation...
‚ùå Error during system execution: [Errno 2] No such file or directory: 'datauber.csv'


Traceback (most recent call last):
  File "/tmp/ipython-input-2285623710.py", line 10, in main
    model = predictor.run_complete_system()
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/tmp/ipython-input-932171530.py", line 17, in run_complete_system
    df = self.load_and_process_data(data_path)
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/tmp/ipython-input-1752515884.py", line 13, in load_and_process_data
    df = pd.read_csv(path)
         ^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/pandas/io/parsers/readers.py", line 1026, in read_csv
    return _read(filepath_or_buffer, kwds)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/pandas/io/parsers/readers.py", line 620, in _read
    parser = TextFileReader(filepath_or_buffer, **kwds)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/pandas/io/parsers/readers.py", line 1620, in __init__
    self._engine =