In [None]:
# %% [markdown]
# # Vehicle Telemetry Analytics - Advanced Analysis & Insights
#
# ## Executive Summary
# This notebook performs comprehensive analysis on engineered features to extract actionable insights, build predictive models, and generate business recommendations for vehicle fleet optimization.
#
# ## Key Objectives
# 1. Predictive Modeling for Maintenance
# 2. Fuel Efficiency Analysis
# 3. Driver Behavior Scoring
# 4. Anomaly Detection at Scale
# 5. Business Insights Generation
# 6. Dashboard Preparation
#
# ## Technologies Used
# - XGBoost, LightGBM, CatBoost for predictive modeling
# - SHAP, LIME for model interpretation
# - Prophet for time series forecasting
# - MLflow for experiment tracking
# - Streamlit/Tableau for dashboard preparation

# %% [code]
# Install required packages
!pip install pandas numpy scikit-learn xgboost lightgbm catboost shap lime optuna mlflow prophet -q
!pip install plotly dash streamlit pycaret -q

# %% [code]
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Machine Learning
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, GridSearchCV
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score,
                           roc_auc_score, confusion_matrix, classification_report,
                           mean_squared_error, mean_absolute_error, r2_score)
from sklearn.ensemble import (RandomForestClassifier, GradientBoostingClassifier,
                            IsolationForest, VotingClassifier, StackingClassifier)
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.svm import SVC
from sklearn.cluster import KMeans, DBSCAN

# Advanced Models
import xgboost as xgb
import lightgbm as lgb
import catboost as cb
from prophet import Prophet

# Model Interpretation
import shap
import lime
import lime.lime_tabular

# Time Series
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.tsa.holtwinters import ExponentialSmoothing

# Visualization
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.figure_factory as ff

# Experiment Tracking
import mlflow
import mlflow.sklearn

# Display and plotting defaults
# set_option accepts several (pattern, value) pairs in a single call.
pd.set_option(
    'display.max_columns', None,   # None removes the column cap
    'display.width', 1000,
)
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

print("All libraries imported successfully!")

# %% [code]
# Load engineered features
#
# Two independent steps:
#   1. Read the engineered telemetry CSV. If the file cannot be opened, a
#      seeded synthetic dataset is generated instead so the notebook still runs.
#   2. Read the optional feature-importance JSON. A problem here never discards
#      a dataset that was loaded successfully in step 1.
import json

# Bound on every code path so later cells can reference it safely.
feature_importance = {}

try:
    telemetry_df = pd.read_csv('engineered_features/telemetry_engineered.csv')
    print(f"‚úÖ Loaded engineered dataset: {telemetry_df.shape}")

except OSError:
    # Only a missing/unreadable file triggers the synthetic fallback. Parsing
    # errors and programming mistakes propagate instead of being masked.
    print("‚ö†Ô∏è No engineered features found. Creating sample dataset...")

    # Create comprehensive sample dataset
    # The legacy global seed and the column order are kept as-is so the
    # generated values stay reproducible.
    np.random.seed(42)
    n_samples = 10000

    telemetry_df = pd.DataFrame({
        'vehicle_id': np.random.choice([f'VH{str(i).zfill(3)}' for i in range(1, 21)], n_samples),
        'timestamp': pd.date_range('2024-01-01', periods=n_samples, freq='1min'),
        'speed_kmh': np.random.gamma(shape=2, scale=15, size=n_samples) + 20,
        'engine_rpm': np.random.normal(2500, 500, n_samples),
        'fuel_consumption_lph': np.random.exponential(5, n_samples) + 3,
        'engine_temp_c': np.random.normal(90, 5, n_samples),
        'oil_temp_c': np.random.normal(85, 3, n_samples),
        'coolant_temp_c': np.random.normal(88, 4, n_samples),
        'battery_voltage': np.random.normal(12.5, 0.5, n_samples),
        'throttle_position': np.random.uniform(0, 100, n_samples),
        'brake_pressure': np.random.exponential(10, n_samples),
        'vehicle_load_kg': np.random.choice([1000, 1500, 2000, 2500], n_samples, p=[0.3, 0.4, 0.2, 0.1]),
        'fuel_level': np.random.uniform(10, 100, n_samples),
        'hour': np.random.randint(0, 24, n_samples),
        'day_of_week': np.random.randint(0, 7, n_samples),
        'is_weekend': np.random.choice([0, 1], n_samples, p=[0.7, 0.3]),
        'is_business_hours': np.random.choice([0, 1], n_samples, p=[0.4, 0.6]),
        'engine_stress_score': np.random.uniform(0, 1, n_samples),
        'battery_health_score': np.random.uniform(0.5, 1, n_samples),
        'instant_fuel_efficiency': np.random.uniform(5, 25, n_samples),
        'overheating_risk': np.random.choice([0, 1], n_samples, p=[0.95, 0.05]),
        'low_battery_warning': np.random.choice([0, 1], n_samples, p=[0.9, 0.1]),
        'speeding_indicator': np.random.choice([0, 1], n_samples, p=[0.8, 0.2]),
        'vehicle_health_score': np.random.uniform(0.6, 1, n_samples),
        'maintenance_required': np.random.choice([0, 1], n_samples, p=[0.85, 0.15]),
        'target_failure': np.random.choice([0, 1], n_samples, p=[0.9, 0.1])
    })

    print(f"üìä Created sample dataset: {telemetry_df.shape}")

else:
    # Load feature importance (optional metadata that accompanies the CSV)
    try:
        with open('engineered_features/feature_importance.json', 'r', encoding='utf-8') as f:
            feature_importance = json.load(f)
        print("‚úÖ Loaded feature importance metadata")
    except (OSError, ValueError) as exc:
        # json.JSONDecodeError is a ValueError subclass. Keep the real dataset
        # and continue with empty metadata instead of regenerating sample data.
        print(f"Feature importance metadata unavailable ({exc}); continuing without it")

print(f"\nüìã Dataset Columns: {list(telemetry_df.columns)}")

# %% [code]
# Advanced Predictive Modeling for Maintenance
class MaintenancePredictor:
    """
    Advanced predictive maintenance modeling system
    """
    def __init__(self):
        self.models = {}
        self.results = {}
        self.feature_importance = {}

    def prepare_data(self, df, target_col='maintenance_required', test_size=0.2,
                     random_state=42):
        """
        Build a stratified train/test split from a telemetry DataFrame.

        Args:
            df: DataFrame containing the feature columns and the target column.
                It is not modified.
            target_col: Name of the label column. If it is not a column of
                ``df``, ``'target_failure'`` is used instead.
            test_size: Forwarded to ``train_test_split`` as ``test_size``.
            random_state: Seed forwarded to ``train_test_split``. Defaults to
                42, the value that used to be hard-coded.

        Returns:
            ``(X_train, X_test, y_train, y_test, feature_names)`` where
            ``feature_names`` is the list of columns of the feature matrix.

        Raises:
            KeyError: If neither ``target_col`` nor ``'target_failure'`` is a
                column of ``df``.
        """
        print("üìä Preparing data for modeling...")

        # Separate features and target
        if target_col not in df.columns:
            print(f"‚ö†Ô∏è Target column '{target_col}' not found. Using 'target_failure'")
            requested_target = target_col
            target_col = 'target_failure'
            if target_col not in df.columns:
                raise KeyError(
                    f"Neither '{requested_target}' nor 'target_failure' is a column of df"
                )

        # Drop each identifier column that is present. Previously both had to
        # exist for either to be removed, so a frame with only one of them
        # leaked that column into the feature matrix.
        id_cols = [col for col in ('vehicle_id', 'timestamp') if col in df.columns]
        X = df.drop(columns=[target_col, *id_cols])
        y = df[target_col]
        # NOTE(review): when both 'maintenance_required' and 'target_failure'
        # exist, the one not chosen as target stays in X as a feature.
        # Confirm this is intended (possible label leakage).

        # Handle missing values
        # numeric_only=True keeps mean() from raising on non-numeric columns;
        # such columns are left unfilled.
        X = X.fillna(X.mean(numeric_only=True))

        # Split data
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=random_state, stratify=y
        )

        print(f"  Training set: {X_train.shape}")
        print(f"  Test set: {X_test.shape}")
        print(f"  Class distribution - Train: {y_train.value_counts().to_dict()}")
        print(f"  Class distribution - Test: {y_test.value_counts().to_dict()}")

        return X_train, X_test, y_train, y_test, X.columns.tolist()

    def train_models(self, X_train, X_test, y_train, y_test, feature_names):
        """
        Train multiple advanced models
        """
        print("\nü§ñ Training advanced models...")

        models_to_train = {
            'Random Forest': RandomForestClassifier(
                n_estimators=200,
                max_depth=10,
                min_samples_split=5,
                min_samples_leaf=2,
                class_weight='balanced',
                random_state=42,
                n_jobs=-1
            ),
            'XGBoost': xgb.XGBClassifier(
                n_estimators=200,
                max_depth=6,
                learning_rate=0.1,
                subsample=0.8,
                colsample_bytree=0.8,
                random_state=42,
                use_label_encoder=False,
                eval_metric='logloss'
            ),
            'LightGBM': lgb.LGBMClassifier(
                n_estimators=200,
                max_depth=6,
                learning_rate=0.1,
                num_leaves=31,
                random_state=42,
                class_weight='balanced'
            ),
            'CatBoost': cb.CatBoostClassifier(
                iterations=200,
                depth=6,
                learning_rate=0.1,
                random_state=42,
                verbose=0
            ),
            'Gradient Boosting': GradientBoostingClassifier(
                n_estimators=200,
                max_depth=5,
