In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
import warnings
warnings.filterwarnings('ignore')

# Set styling for plots
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette("viridis")

# Define a function to load and clean the data
def load_and_clean_data(file_path):
    """
    Load and clean the Uber driver signup data.
    
    Parameters:
    file_path (str): Path to the CSV file
    
    Returns:
    pd.DataFrame: Cleaned dataframe
    """
    # Read the data
    df = pd.read_csv(file_path, na_values="NA")
    
    # Check basic info
    print(f"Data shape: {df.shape}")
    print(f"Missing values:\n{df.isnull().sum()}")
    
    # Convert date columns to datetime
    date_columns = ['signup_date', 'bgc_date', 'vehicle_added_date', 'first_completed_date']
    for col in date_columns:
        if col in df.columns:
            df[col] = pd.to_datetime(df[col], errors='coerce')
    
    # Create target variable: did the driver take their first trip?
    df['started_driving'] = (~df['first_completed_date'].isna()).astype(int)
    
    # Create binary features for key steps
    df['bgc_completed'] = (~df['bgc_date'].isna()).astype(int)
    df['vehicle_added'] = (~df['vehicle_added_date'].isna()).astype(int)
    df['has_vehicle_info'] = (~df['vehicle_make'].isna()).astype(int)
    
    # Calculate days between signup and other events
    df['days_to_bgc'] = (df['bgc_date'] - df['signup_date']).dt.days
    df['days_to_vehicle'] = (df['vehicle_added_date'] - df['signup_date']).dt.days
    df['days_to_first_trip'] = (df['first_completed_date'] - df['signup_date']).dt.days
    
    # Create vehicle age feature
    current_year = pd.to_datetime('now').year
    df['vehicle_age'] = current_year - df['vehicle_year']
    
    # Create funnel completion status
    conditions = [
        (df['bgc_completed'] == 0) & (df['vehicle_added'] == 0),
        (df['bgc_completed'] == 1) & (df['vehicle_added'] == 0),
        (df['bgc_completed'] == 0) & (df['vehicle_added'] == 1),
        (df['bgc_completed'] == 1) & (df['vehicle_added'] == 1)
    ]
    choices = ['No BGC, No Vehicle', 'BGC Only', 'Vehicle Only', 'BGC and Vehicle']
    df['funnel_status'] = np.select(conditions, choices, default=None)
    
    return df

# Define function for exploratory data analysis
def perform_eda(df):
    """
    Perform exploratory data analysis on the driver signup data.
    
    Parameters:
    df (pd.DataFrame): Cleaned dataframe
    
    Returns:
    None
    """
    # Overall conversion rate
    total_drivers = len(df)
    converted_drivers = df['started_driving'].sum()
    conversion_rate = converted_drivers / total_drivers * 100
    
    print(f"\nOverall Conversion Stats:")
    print(f"Total Drivers: {total_drivers}")
    print(f"Drivers who took first trip: {converted_drivers}")
    print(f"Conversion Rate: {conversion_rate:.2f}%")
    
    # Conversion by background check status
    bgc_stats = df.groupby('bgc_completed')['started_driving'].agg(['count', 'sum'])
    bgc_stats['conversion_rate'] = bgc_stats['sum'] / bgc_stats['count'] * 100
    
    print("\nConversion by Background Check Status:")
    print(f"BGC Completed: {bgc_stats.loc[1, 'conversion_rate']:.2f}% ({bgc_stats.loc[1, 'sum']}/{bgc_stats.loc[1, 'count']})")
    print(f"No BGC: {bgc_stats.loc[0, 'conversion_rate']:.2f}% ({bgc_stats.loc[0, 'sum']}/{bgc_stats.loc[0, 'count']})")
    
    # Conversion by vehicle addition status
    vehicle_stats = df.groupby('vehicle_added')['started_driving'].agg(['count', 'sum'])
    vehicle_stats['conversion_rate'] = vehicle_stats['sum'] / vehicle_stats['count'] * 100
    
    print("\nConversion by Vehicle Addition Status:")
    print(f"Vehicle Added: {vehicle_stats.loc[1, 'conversion_rate']:.2f}% ({vehicle_stats.loc[1, 'sum']}/{vehicle_stats.loc[1, 'count']})")
    print(f"No Vehicle: {vehicle_stats.loc[0, 'conversion_rate']:.2f}% ({vehicle_stats.loc[0, 'sum']}/{vehicle_stats.loc[0, 'count']})")
    
    # Conversion by signup channel
    channel_stats = df.groupby('signup_channel')['started_driving'].agg(['count', 'sum'])
    channel_stats['conversion_rate'] = channel_stats['sum'] / channel_stats['count'] * 100
    
    print("\nConversion by Signup Channel:")
    for channel in channel_stats.index:
        print(f"{channel}: {channel_stats.loc[channel, 'conversion_rate']:.2f}% ({channel_stats.loc[channel, 'sum']}/{channel_stats.loc[channel, 'count']})")
    
    # Conversion by funnel completion
    funnel_stats = df.groupby('funnel_status')['started_driving'].agg(['count', 'sum'])
    funnel_stats['conversion_rate'] = funnel_stats['sum'] / funnel_stats['count'] * 100
    
    print("\nConversion by Funnel Completion:")
    for status in funnel_stats.index:
        print(f"{status}: {funnel_stats.loc[status, 'conversion_rate']:.2f}% ({funnel_stats.loc[status, 'sum']}/{funnel_stats.loc[status, 'count']})")
    
    return {
        'bgc_stats': bgc_stats,
        'vehicle_stats': vehicle_stats,
        'channel_stats': channel_stats,
        'funnel_stats': funnel_stats
    }

# Define function to create visualizations
def create_visualizations(df, stats):
    """
    Create visualizations for the driver signup data.
    
    Parameters:
    df (pd.DataFrame): Cleaned dataframe
    stats (dict): Statistics from EDA
    
    Returns:
    None
    """
    # Create a figure for multiple plots
    plt.figure(figsize=(15, 15))
    
    # Plot 1: Conversion by Funnel Status
    plt.subplot(2, 2, 1)
    funnel_df = stats['funnel_stats'].reset_index()
    sns.barplot(x='funnel_status', y='conversion_rate', data=funnel_df)
    plt.title('Conversion Rate by Funnel Completion')
    plt.xlabel('Funnel Status')
    plt.ylabel('Conversion Rate (%)')
    plt.xticks(rotation=45)
    
    # Plot 2: Conversion by Signup Channel
    plt.subplot(2, 2, 2)
    channel_df = stats['channel_stats'].reset_index()
    sns.barplot(x='signup_channel', y='conversion_rate', data=channel_df)
    plt.title('Conversion Rate by Signup Channel')
    plt.xlabel('Signup Channel')
    plt.ylabel('Conversion Rate (%)')
    
    # Plot 3: Time to Complete Steps vs Conversion
    plt.subplot(2, 2, 3)
    
    # Create bins for days to BGC
    df_bgc = df[df['bgc_completed'] == 1].copy()
    df_bgc['days_to_bgc_bin'] = pd.cut(
        df_bgc['days_to_bgc'],
        bins=[-1, 0, 3, 7, 14, float('inf')],
        labels=['Same day', '1-3 days', '4-7 days', '8-14 days', '15+ days']
    )
    
    bgc_time_stats = df_bgc.groupby('days_to_bgc_bin')['started_driving'].agg(['count', 'mean'])
    bgc_time_stats['conversion_rate'] = bgc_time_stats['mean'] * 100
    
    sns.barplot(x=bgc_time_stats.index, y='conversion_rate', data=bgc_time_stats)
    plt.title('Conversion Rate by Time to Complete Background Check')
    plt.xlabel('Days to Complete BGC')
    plt.ylabel('Conversion Rate (%)')
    plt.xticks(rotation=45)
    
    # Plot 4: Distribution of Converted vs Non-converted Drivers
    plt.subplot(2, 2, 4)
    conversion_counts = df['started_driving'].value_counts()
    plt.pie(conversion_counts, labels=['Did not start driving', 'Started driving'], 
            autopct='%1.1f%%', startangle=90, colors=['#ff9999','#66b3ff'])
    plt.title('Distribution of Driver Conversion')
    
    plt.tight_layout()
    plt.savefig('uber_driver_conversion_analysis.png')
    plt.close()
    
    # Create a second figure for vehicle and BGC impact
    plt.figure(figsize=(12, 5))
    
    # Plot 1: Impact of BGC completion
    plt.subplot(1, 2, 1)
    bgc_df = stats['bgc_stats'].reset_index()
    bgc_df['status'] = bgc_df['bgc_completed'].map({0: 'No BGC', 1: 'BGC Completed'})
    sns.barplot(x='status', y='conversion_rate', data=bgc_df)
    plt.title('Impact of Background Check on Conversion')
    plt.xlabel('Background Check Status')
    plt.ylabel('Conversion Rate (%)')
    
    # Plot 2: Impact of vehicle addition
    plt.subplot(1, 2, 2)
    vehicle_df = stats['vehicle_stats'].reset_index()
    vehicle_df['status'] = vehicle_df['vehicle_added'].map({0: 'No Vehicle', 1: 'Vehicle Added'})
    sns.barplot(x='status', y='conversion_rate', data=vehicle_df)
    plt.title('Impact of Vehicle Addition on Conversion')
    plt.xlabel('Vehicle Status')
    plt.ylabel('Conversion Rate (%)')
    
    plt.tight_layout()
    plt.savefig('uber_driver_key_factors.png')
    plt.close()

# Define function to build and evaluate a predictive model
def build_predictive_model(df):
    """
    Build and evaluate a logistic regression model to predict driver conversion.
    
    Parameters:
    df (pd.DataFrame): Cleaned dataframe
    
    Returns:
    model: Trained model
    X_test: Test features
    y_test: Test targets
    """
    # Select features and target
    features = ['bgc_completed', 'vehicle_added', 'has_vehicle_info', 'signup_channel', 'city_name']
    target = 'started_driving'
    
    # Select only the rows with non-null values for essential features
    model_df = df[features + [target]].dropna()
    
    # Split the data
    X = model_df[features]
    y = model_df[target]
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Create preprocessing pipeline
    categorical_features = ['signup_channel', 'city_name']
    categorical_transformer = OneHotEncoder(handle_unknown='ignore')
    
    preprocessor = ColumnTransformer(
        transformers=[
            ('cat', categorical_transformer, categorical_features)
        ],
        remainder='passthrough'
    )
    
    # Create model pipeline
    model = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', LogisticRegression(max_iter=1000, random_state=42))
    ])
    
    # Train the model
    model.fit(X_train, y_train)
    
    # Make predictions
    y_pred = model.predict(X_test)
    
    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    cm = confusion_matrix(y_test, y_pred)
    
    print("\nModel Performance:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")
    
    print("\nConfusion Matrix:")
    print(cm)
    
    # Get feature importances
    feature_names = (model.named_steps['preprocessor']
                    .transformers_[0][1]
                    .get_feature_names_out(categorical_features))
    feature_names = np.append(feature_names, ['bgc_completed', 'vehicle_added', 'has_vehicle_info'])
    
    # Extract coefficients
    coefficients = model.named_steps['classifier'].coef_[0]
    
    # Create DataFrame of feature importances
    feature_importance = pd.DataFrame({
        'Feature': feature_names,
        'Importance': np.abs(coefficients)
    }).sort_values(by='Importance', ascending=False)
    
    print("\nFeature Importance:")
    print(feature_importance)
    
    return model, X_test, y_test, feature_importance

# Function to visualize model performance
def visualize_model_performance(model, X_test, y_test, feature_importance):
    """
    Visualize model performance and feature importance.
    
    Parameters:
    model: Trained model
    X_test: Test features
    y_test: Test targets
    feature_importance: DataFrame of feature importances
    
    Returns:
    None
    """
    # Create a figure for multiple plots
    plt.figure(figsize=(12, 10))
    
    # Plot 1: Confusion Matrix
    plt.subplot(2, 1, 1)
    y_pred = model.predict(X_test)
    cm = confusion_matrix(y_test, y_pred)
    
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.title('Confusion Matrix')
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')
    
    # Plot 2: Feature Importance
    plt.subplot(2, 1, 2)
    top_features = feature_importance.head(10)
    sns.barplot(x='Importance', y='Feature', data=top_features)
    plt.title('Top 10 Feature Importances')
    plt.xlabel('Absolute Coefficient Value')
    plt.ylabel('Feature')
    
    plt.tight_layout()
    plt.savefig('uber_driver_model_performance.png')
    plt.close()

# Main function to run the entire analysis
def main(file_path):
    """
    Run the entire analysis pipeline.
    
    Parameters:
    file_path (str): Path to the CSV file
    
    Returns:
    None
    """
    print("Loading and cleaning data...")
    df = load_and_clean_data(file_path)
    
    print("\nPerforming exploratory data analysis...")
    stats = perform_eda(df)
    
    print("\nCreating visualizations...")
    create_visualizations(df, stats)
    
    print("\nBuilding predictive model...")
    model, X_test, y_test, feature_importance = build_predictive_model(df)
    
    print("\nVisualizing model performance...")
    visualize_model_performance(model, X_test, y_test, feature_importance)
    
    print("\nAnalysis complete. Visualizations saved to files.")

# Example usage
if __name__ == "__main__":
    main('data1.csv')  # Replace with your file path

Loading and cleaning data...
Data shape: (54681, 11)
Missing values:
id                          0
city_name                   0
signup_os                6857
signup_channel              0
signup_date                 0
bgc_date                21785
vehicle_added_date      41547
vehicle_make            41458
vehicle_model           41458
vehicle_year            41458
first_completed_date    48544
dtype: int64

Performing exploratory data analysis...

Overall Conversion Stats:
Total Drivers: 54681
Drivers who took first trip: 6137
Conversion Rate: 11.22%

Conversion by Background Check Status:
BGC Completed: 18.66% (6137/32896)
No BGC: 0.00% (0/21785)

Conversion by Vehicle Addition Status:
Vehicle Added: 44.71% (5872/13134)
No Vehicle: 0.64% (265/41547)

Conversion by Signup Channel:
Organic: 9.01% (1210/13427)
Paid: 6.19% (1482/23938)
Referral: 19.89% (3445/17316)

Conversion by Funnel Completion:
BGC Only: 1.32% (265/20017)
BGC and Vehicle: 45.59% (5872/12879)
No BGC, No Vehicle: 0.00

In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import roc_curve, auc
from datetime import datetime

def generate_recommendations(df, model, feature_importance):
    """
    Generate business recommendations based on the analysis results.
    
    Parameters:
    df (pd.DataFrame): Cleaned dataframe
    model: Trained model
    feature_importance: DataFrame of feature importances
    
    Returns:
    None
    """
    print("\n================= UBER DRIVER CONVERSION RECOMMENDATIONS =================\n")
    
    # 1. Summary of findings
    total_drivers = len(df)
    converted_drivers = df['started_driving'].sum()
    conversion_rate = converted_drivers / total_drivers * 100
    
    print(f"EXECUTIVE SUMMARY:")
    print(f"Analysis of {total_drivers:,} driver signups reveals an overall conversion rate of {conversion_rate:.2f}%")
    print(f"This means that {converted_drivers:,} drivers completed their first trip, while {total_drivers - converted_drivers:,} did not.")
    print(f"Based on our analysis, we've identified key factors affecting conversion and developed recommendations.")
    
    # 2. Key findings
    print("\nKEY FINDINGS:")
    
    # BGC completion impact
    bgc_completed = df[df['bgc_completed'] == 1]
    bgc_conversion = bgc_completed['started_driving'].mean() * 100
    print(f"1. Background Check Completion: Drivers who complete their background check are {bgc_conversion:.1f}x more likely to take their first trip.")
    
    # Vehicle addition impact
    vehicle_added = df[df['vehicle_added'] == 1]
    vehicle_conversion = vehicle_added['started_driving'].mean() * 100
    no_vehicle_conversion = df[df['vehicle_added'] == 0]['started_driving'].mean() * 100
    vehicle_lift = vehicle_conversion / (no_vehicle_conversion if no_vehicle_conversion > 0 else 1)
    print(f"2. Vehicle Addition: Drivers who add a vehicle are {vehicle_lift:.1f}x more likely to take their first trip.")
    
    # Referral channel impact
    referral_conversion = df[df['signup_channel'] == 'Referral']['started_driving'].mean() * 100
    paid_conversion = df[df['signup_channel'] == 'Paid']['started_driving'].mean() * 100
    channel_lift = referral_conversion / paid_conversion
    print(f"3. Signup Channel: Drivers from referrals convert at {referral_conversion:.2f}%, which is {channel_lift:.1f}x better than paid channels.")
    
    # Funnel completion
    both_steps = df[(df['bgc_completed'] == 1) & (df['vehicle_added'] == 1)]
    both_conversion = both_steps['started_driving'].mean() * 100
    print(f"4. Full Funnel Completion: Drivers who complete both BGC and add a vehicle convert at {both_conversion:.2f}%.")
    
    # 3. Recommendations
    print("\nRECOMMENDATIONS:")
    
    # Generate intervention priorities
    no_steps = df[(df['bgc_completed'] == 0) & (df['vehicle_added'] == 0)]
    bgc_only = df[(df['bgc_completed'] == 1) & (df['vehicle_added'] == 0)]
    vehicle_only = df[(df['bgc_completed'] == 0) & (df['vehicle_added'] == 1)]
    
    print("1. Prioritize Vehicle Addition Process")
    print("   - The data shows vehicle addition is the strongest predictor of first trips")
    print("   - Implement a simplified vehicle registration flow with fewer steps")
    print("   - Create guided tutorials for vehicle upload to reduce friction")
    print("   - Send targeted reminders to drivers who haven't added vehicle information")
    print(f"   - Targeting the {len(bgc_only):,} drivers who completed BGC but haven't added a vehicle would yield highest ROI")
    
    print("\n2. Streamline Background Check Process")
    print("   - Optimize the background check submission workflow to reduce drop-offs")
    print("   - Improve communication about the BGC timeline and expectations")
    print("   - Implement a progress tracker to encourage completion")
    print("   - Send automated reminders with direct links to resume the process")
    
    print("\n3. Expand the Referral Program")
    print(f"   - Referrals convert at {referral_conversion:.2f}%, significantly higher than other channels")
    print("   - Consider increasing referral bonuses to generate more referral signups")
    print("   - Create a buddy system to pair new drivers with experienced drivers")
    print("   - Develop team incentives for referring drivers to help others complete their first trip")
    
    print("\n4. Implement Targeted Interventions")
    print("   - Use the predictive model to identify drivers most likely to convert")
    print("   - Create a tiered re-engagement strategy for stalled applications")
    print("   - Pilot a high-touch concierge service for high-potential drivers")
    print("   - Develop custom messaging based on where drivers get stuck in the funnel")
    
    print("\n5. Improve Paid Channel Quality")
    print(f"   - Paid channels have the lowest conversion at {paid_conversion:.2f}%")
    print("   - Revisit targeting criteria to attract more qualified potential drivers")
    print("   - Test different messaging that sets clearer expectations about the process")
    print("   - Consider requiring more pre-qualification in paid campaigns")
    
    # 4. Implementation roadmap
    print("\nIMPLEMENTATION ROADMAP:")
    
    current_month = datetime.now().strftime("%B %Y")
    
    print(f"Phase 1: Quick Wins (Next 30 Days)")
    print(f"  • Email/SMS campaign for drivers who completed background checks but haven't added vehicles")
    print(f"  • Simplify the vehicle information form to require only essential information")
    print(f"  • Implement progress visualization in the driver app")
    
    print(f"\nPhase 2: Medium-Term Solutions (60-90 Days)")
    print(f"  • Develop a predictive intervention system using the model")
    print(f"  • Redesign the vehicle addition flow with a guided experience")
    print(f"  • Expand the referral program with new incentives")
    print(f"  • Create a concierge service pilot for high-potential drivers")
    
    print(f"\nPhase 3: Longer-Term Initiatives (90+ Days)")
    print(f"  • Build an integrated onboarding experience with personalized guidance")
    print(f"  • Develop partnerships to help drivers overcome vehicle barriers")
    print(f"  • Implement A/B testing program for continuous funnel optimization")
    print(f"  • Create a driver community platform to facilitate peer support")
    
    # 5. Expected impact
    current_conversion = conversion_rate / 100
    improvement_scenarios = {
        'Conservative': current_conversion * 1.15,  # 15% improvement
        'Moderate': current_conversion * 1.3,      # 30% improvement
        'Aggressive': current_conversion * 1.5      # 50% improvement
    }
    
    print("\nEXPECTED IMPACT:")
    print(f"Current Driver Conversion: {conversion_rate:.2f}% ({converted_drivers:,} of {total_drivers:,} drivers)")
    
    for scenario, new_rate in improvement_scenarios.items():
        new_drivers = int(total_drivers * new_rate)
        additional_drivers = new_drivers - converted_drivers
        print(f"{scenario} Scenario: {new_rate*100:.2f}% conversion (+{additional_drivers:,} drivers)")
    
    print("\n==========================================================================")
    
    # Create an impact visualization
    plt.figure(figsize=(10, 6))
    
    scenarios = list(improvement_scenarios.keys())
    current = ['Current']
    all_scenarios = current + scenarios
    
    rates = [current_conversion * 100] + [improvement_scenarios[s] * 100 for s in scenarios]
    drivers = [converted_drivers] + [int(total_drivers * improvement_scenarios[s]) for s in scenarios]
    
    # Create a bar chart
    bars = plt.bar(all_scenarios, rates, color=['#d3d3d3', '#90CAF9', '#42A5F5', '#1976D2'])
    
    # Add data labels
    for i, bar in enumerate(bars):
        height = bar.get_height()
        additional = ''
        if i > 0:
            additional = f" (+{drivers[i] - converted_drivers:,})"
        plt.text(bar.get_x() + bar.get_width()/2., height + 0.5,
                f"{rates[i]:.2f}%{additional}", ha='center', va='bottom', fontweight='bold')
    
    plt.title('Projected Impact of Recommendations on Driver Conversion Rate')
    plt.ylabel('Conversion Rate (%)')
    plt.ylim(0, max(rates) * 1.2)
    plt.grid(axis='y', linestyle='--', alpha=0.7)
    
    plt.tight_layout()
    plt.savefig('uber_driver_impact_projections.png')
    plt.close()
    
    # Create ROC curve for model evaluation
    if model is not None:
        try:
            # Get probabilities for the positive class
            y_true = df['started_driving']
            y_proba = model.predict_proba(df[['bgc_completed', 'vehicle_added', 'has_vehicle_info']])[:, 1]
            
            # Calculate ROC curve
            fpr, tpr, thresholds = roc_curve(y_true, y_proba)
            roc_auc = auc(fpr, tpr)
            
            # Plot ROC curve
            plt.figure(figsize=(8, 6))
            plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
            plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
            plt.xlim([0.0, 1.0])
            plt.ylim([0.0, 1.05])
            plt.xlabel('False Positive Rate')
            plt.ylabel('True Positive Rate')
            plt.title('Receiver Operating Characteristic (ROC) Curve')
            plt.legend(loc="lower right")
            plt.grid(alpha=0.3)
            
            plt.tight_layout()
            plt.savefig('uber_driver_model_roc_curve.png')
            plt.close()
        except:
            print("Note: ROC curve could not be generated with the provided model.")
            
    # Generate a summary document
    summary = f"""
    # Uber Driver Conversion Analysis - Executive Summary
    
    Date: {datetime.now().strftime("%B %d, %Y")}
    
    ## Current Performance
    - Total Driver Signups: {total_drivers:,}
    - Drivers Completed First Trip: {converted_drivers:,}
    - Overall Conversion Rate: {conversion_rate:.2f}%
    
    ## Key Findings
    
    1. **Background Check Completion** is a critical step, with drivers who complete it being {bgc_conversion:.1f}x more likely to take their first trip.
    
    2. **Vehicle Addition** is the strongest predictor of conversion, with a {vehicle_lift:.1f}x lift in conversion rate.
    
    3. **Referral Channel** produces the highest quality signups, with a conversion rate of {referral_conversion:.2f}%, which is {channel_lift:.1f}x better than paid channels.
    
    4. **Full Funnel Completion** (both BGC and vehicle) results in a {both_conversion:.2f}% conversion rate.
    
    ## Top Recommendations
    
    1. Prioritize Vehicle Addition Process
    2. Streamline Background Check Process
    3. Expand the Referral Program
    4. Implement Targeted Interventions using the Predictive Model
    5. Improve Paid Channel Quality
    
    ## Expected Impact
    
    - Conservative Scenario: {improvement_scenarios['Conservative']*100:.2f}% conversion (+{int(total_drivers * improvement_scenarios['Conservative']) - converted_drivers:,} drivers)
    - Moderate Scenario: {improvement_scenarios['Moderate']*100:.2f}% conversion (+{int(total_drivers * improvement_scenarios['Moderate']) - converted_drivers:,} drivers)
    - Aggressive Scenario: {improvement_scenarios['Aggressive']*100:.2f}% conversion (+{int(total_drivers * improvement_scenarios['Aggressive']) - converted_drivers:,} drivers)
    """
    
    with open('uber_driver_conversion_summary.md', 'w') as f:
        f.write(summary)
        
    print("\nExecutive summary saved to 'uber_driver_conversion_summary.md'")

# Example of how to use this with the analysis results
if __name__ == "__main__":
    # This would typically be run after the main analysis
    # For demonstration, we'll create some dummy data
    
    # Create dummy DataFrame
    data = {
        'started_driving': [0] * 48544 + [1] * 6137,
        'bgc_completed': [0] * 21785 + [1] * 32896,
        'vehicle_added': [0] * 41547 + [1] * 13134,
        'has_vehicle_info': [0] * 41458 + [1] * 13223,
        'signup_channel': ['Paid'] * 23938 + ['Organic'] * 13427 + ['Referral'] * 17316
    }
    
    df = pd.DataFrame(data)
    
    # Create dummy feature importance
    feature_importance = pd.DataFrame({
        'Feature': ['vehicle_added', 'has_vehicle_info', 'signup_channel_Paid', 
                   'signup_channel_Organic', 'signup_channel_Referral', 'bgc_completed'],
        'Importance': [1.9754, 1.9339, 1.3977, 1.2730, 0.8387, 0.4630]
    })
    
    # Generate recommendations without a model
    generate_recommendations(df, None, feature_importance)



EXECUTIVE SUMMARY:
Analysis of 54,681 driver signups reveals an overall conversion rate of 11.22%
This means that 6,137 drivers completed their first trip, while 48,544 did not.
Based on our analysis, we've identified key factors affecting conversion and developed recommendations.

KEY FINDINGS:
1. Background Check Completion: Drivers who complete their background check are 18.7x more likely to take their first trip.
2. Vehicle Addition: Drivers who add a vehicle are 46.7x more likely to take their first trip.
3. Signup Channel: Drivers from referrals convert at 35.44%, which is infx better than paid channels.
4. Full Funnel Completion: Drivers who complete both BGC and add a vehicle convert at 46.73%.

RECOMMENDATIONS:
1. Prioritize Vehicle Addition Process
   - The data shows vehicle addition is the strongest predictor of first trips
   - Implement a simplified vehicle registration flow with fewer steps
   - Create guided tutorials for vehicle upload to reduce friction
   - Send ta

In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.colors import LinearSegmentedColormap
import matplotlib.patches as mpatches

# Set style for better visualizations
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette("viridis")
plt.rcParams['font.family'] = 'sans-serif'
plt.rcParams['font.sans-serif'] = ['Arial']
plt.rcParams['axes.facecolor'] = '#f8f9fa'
plt.rcParams['figure.facecolor'] = 'white'

def create_advanced_visualizations(df):
    """
    Create advanced visualizations for the Uber driver conversion analysis.
    
    Parameters:
    df (pd.DataFrame): Cleaned dataframe
    
    Returns:
    None
    """
    # Create a custom colormap for Uber-like branding
    uber_colors = ['#276EF1', '#9CB3FF', '#000000', '#333333']
    uber_cmap = LinearSegmentedColormap.from_list('uber', uber_colors)
    
    # 1. Funnel Visualization
    create_conversion_funnel(df)
    
    # 2. Heatmap of conversion by channel and completion
    create_completion_heatmap(df)
    
    # 3. Driver journey timeline
    create_driver_journey_timeline(df)
    
    # 4. Feature importance visualization
    create_feature_importance_chart(df)
    
    # 5. Conversion prediction calibration
    create_conversion_prediction_chart(df)

def create_conversion_funnel(df):
    """Create a conversion funnel visualization."""
    # Count users at each stage of the funnel
    total_signups = len(df)
    bgc_completed = df['bgc_completed'].sum()
    vehicle_added = df['vehicle_added'].sum()
    both_completed = ((df['bgc_completed'] == 1) & (df['vehicle_added'] == 1)).sum()
    converted = df['started_driving'].sum()
    
    # Calculate percentages
    bgc_pct = bgc_completed / total_signups * 100
    vehicle_pct = vehicle_added / total_signups * 100
    both_pct = both_completed / total_signups * 100
    converted_pct = converted / total_signups * 100
    
    # Create the funnel chart
    plt.figure(figsize=(12, 8))
    
    # Define the stages
    stages = ['Signups', 'BGC Completed', 'Vehicle Added', 'Both Steps Completed', 'First Trip Completed']
    values = [total_signups, bgc_completed, vehicle_added, both_completed, converted]
    percentages = [100, bgc_pct, vehicle_pct, both_pct, converted_pct]
    
    # Plot the funnel
    colors = ['#1E88E5', '#42A5F5', '#64B5F6', '#90CAF9', '#BBDEFB']
    
    # Create bars
    y_pos = np.arange(len(stages))
    bars = plt.barh(y_pos, percentages, color=colors)
    
    # Add stage labels
    for i, (bar, value, pct) in enumerate(zip(bars, values, percentages)):
        plt.text(5, i, f"{stages[i]}", va='center', fontweight='bold', fontsize=12, color='black')
        plt.text(bar.get_width() + 2, i, f"{value:,} ({pct:.1f}%)", va='center')
    
    # Customize the plot
    plt.xlim(0, 105)  # Make room for the percentage labels
    plt.yticks([])  # Hide y-axis labels
    plt.xlabel('Percentage of Initial Signups')
    plt.title('Driver Conversion Funnel', fontsize=14, fontweight='bold', pad=20)
    plt.grid(axis='x', linestyle='--', alpha=0.7)
    
    # Add annotations
    drop_after_signup = 100 - bgc_pct
    plt.annotate(f'Drop-off: {drop_after_signup:.1f}%', 
                xy=(50, 0.5), xytext=(50, 0.5 - 0.4),
                arrowprops=dict(arrowstyle='->'))
    
    drop_before_trip = both_pct - converted_pct
    plt.annotate(f'Drop-off: {drop_before_trip:.1f}%', 
                xy=(50, 3.5), xytext=(50, 3.5 - 0.4),
                arrowprops=dict(arrowstyle='->'))
    
    plt.tight_layout()
    plt.savefig('uber_driver_conversion_funnel.png', dpi=300, bbox_inches='tight')
    plt.close()

def create_completion_heatmap(df):
    """Create a heatmap of conversion rates by channel and completion status."""
    # Create cross-tabulation of conversion rate by channel and funnel status
    crosstab_data = []
    for channel in df['signup_channel'].unique():
        for bgc in [0, 1]:
            for vehicle in [0, 1]:
                filtered = df[(df['signup_channel'] == channel) & 
                            (df['bgc_completed'] == bgc) & 
                            (df['vehicle_added'] == vehicle)]
                
                if len(filtered) > 0:
                    conversion_rate = filtered['started_driving'].mean() * 100
                    count = len(filtered)
                    crosstab_data.append({
                        'Channel': channel,
                        'BGC': 'Completed' if bgc == 1 else 'Not Completed',
                        'Vehicle': 'Added' if vehicle == 1 else 'Not Added',
                        'Conversion': conversion_rate,
                        'Count': count
                    })
    
    # Convert to DataFrame
    crosstab_df = pd.DataFrame(crosstab_data)
    
    # Pivot for the heatmap
    heatmap_data = crosstab_df.pivot_table(
        index=['Channel', 'BGC'],
        columns='Vehicle',
        values='Conversion',
        aggfunc='mean'
    ).fillna(0)
    
    # Count pivot for annotations
    count_data = crosstab_df.pivot_table(
        index=['Channel', 'BGC'],
        columns='Vehicle',
        values='Count',
        aggfunc='sum'
    ).fillna(0)
    
    # Create a figure
    plt.figure(figsize=(12, 10))
    
    # Create heatmap
    ax = sns.heatmap(heatmap_data, annot=True, fmt='.1f', cmap='YlGnBu', 
                   linewidths=.5, annot_kws={"size": 10})
    
    # Add count annotations
    for i, idx in enumerate(heatmap_data.index):
        for j, col in enumerate(heatmap_data.columns):
            count = count_data.loc[idx, col]
            plt.text(j+0.5, i+0.7, f"n={count:,}", ha='center', va='center', 
                   color='black', fontsize=8)
    
    # Customize the plot
    plt.title('Driver Conversion Rate (%) by Channel, BGC, and Vehicle Status', 
             fontsize=14, fontweight='bold', pad=20)
    plt.tight_layout()
    plt.savefig('uber_driver_conversion_heatmap.png', dpi=300, bbox_inches='tight')
    plt.close()

def create_driver_journey_timeline(df):
    """Create a visualization of the driver journey timeline."""
    # Filter only converted drivers with complete data
    timeline_df = df[(df['started_driving'] == 1) & 
                    (~df['days_to_bgc'].isnull()) & 
                    (~df['days_to_vehicle'].isnull()) & 
                    (~df['days_to_first_trip'].isnull())].copy()
    
    if len(timeline_df) == 0:
        return  # Not enough data
    
    # Calculate average days for each milestone
    avg_to_bgc = timeline_df['days_to_bgc'].median()
    avg_to_vehicle = timeline_df['days_to_vehicle'].median()
    avg_to_trip = timeline_df['days_to_first_trip'].median()
    
    # Create bins for time to first trip
    timeline_df['time_to_trip_category'] = pd.cut(
        timeline_df['days_to_first_trip'],
        bins=[0, 1, 3, 7, 14, 30, float('inf')],
        labels=['Same day', '1-3 days', '4-7 days', '8-14 days', '15-30 days', '30+ days']
    )
    
    trip_time_dist = timeline_df['time_to_trip_category'].value_counts(normalize=True) * 100
    
    # Create the figure
    fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 10), gridspec_kw={'height_ratios': [1, 2]})
    
    # Plot 1: Timeline
    milestones = ['Signup', 'BGC Completion', 'Vehicle Addition', 'First Trip']
    days = [0, avg_to_bgc, avg_to_vehicle, avg_to_trip]
    
    ax1.plot(days, [1, 1, 1, 1], 'o-', markersize=10, linewidth=2, color='#1976D2')
    
    # Add milestone labels
    for i, (milestone, day) in enumerate(zip(milestones, days)):
        ax1.annotate(f"{milestone}\nDay {day:.1f}", 
                   (day, 1), 
                   xytext=(0, 20), 
                   textcoords='offset points',
                   ha='center', 
                   fontweight='bold' if i == 0 or i == 3 else 'normal')
    
    # Configure the timeline
    ax1.set_ylim(0.5, 1.5)
    ax1.set_yticks([])
    ax1.set_xlabel('Days Since Signup')
    ax1.set_title('Driver Journey Timeline (Median Days)', fontsize=14, fontweight='bold')
    ax1.grid(axis='x', linestyle='--', alpha=0.7)
    
    # Plot 2: Distribution of time to first trip
    sns.barplot(x=trip_time_dist.index, y=trip_time_dist.values, ax=ax2, color='#64B5F6')
    
    # Add percentage labels
    for i, v in enumerate(trip_time_dist.values):
        ax2.text(i, v + 1, f"{v:.1f}%", ha='center')
    
    ax2.set_xticklabels(ax2.get_xticklabels(), rotation=45)
    ax2.set_title('Distribution of Time to First Trip', fontsize=14, fontweight='bold')
    ax2.set_xlabel('Time to First Trip')
    ax2.set_ylabel('Percentage of Converted Drivers')
    ax2.grid(axis='y', linestyle='--', alpha=0.7)
    
    plt.tight_layout()
    plt.savefig('uber_driver_journey_timeline.png', dpi=300, bbox_inches='tight')
    plt.close()

def create_feature_importance_chart(df):
    """Create a feature importance visualization based on conversion lift."""
    # Calculate lift for key features
    base_conversion = df['started_driving'].mean() * 100
    
    # Calculate lift for each feature
    feature_lifts = []
    
    # BGC completion
    bgc_conversion = df[df['bgc_completed'] == 1]['started_driving'].mean() * 100
    bgc_lift = bgc_conversion / base_conversion
    feature_lifts.append(('Background Check\nCompleted', bgc_lift))
    
    # Vehicle addition
    vehicle_conversion = df[df['vehicle_added'] == 1]['started_driving'].mean() * 100
    vehicle_lift = vehicle_conversion / base_conversion
    feature_lifts.append(('Vehicle\nAdded', vehicle_lift))
    
    # Referral channel
    referral_conversion = df[df['signup_channel'] == 'Referral']['started_driving'].mean() * 100
    referral_lift = referral_conversion / base_conversion
    feature_lifts.append(('Referral\nChannel', referral_lift))
    
    # Organic channel
    organic_conversion = df[df['signup_channel'] == 'Organic']['started_driving'].mean() * 100
    organic_lift = organic_conversion / base_conversion
    feature_lifts.append(('Organic\nChannel', organic_lift))
    
    # Both steps completed
    both_conversion = df[(df['bgc_completed'] == 1) & (df['vehicle_added'] == 1)]['started_driving'].mean() * 100
    both_lift = both_conversion / base_conversion
    feature_lifts.append(('Both BGC &\nVehicle', both_lift))
    
    # Create DataFrame
    lift_df = pd.DataFrame(feature_lifts, columns=['Feature', 'Lift'])
    lift_df = lift_df.sort_values('Lift', ascending=False)
    
    # Create the chart
    plt.figure(figsize=(10, 6))
    bars = plt.bar(lift_df['Feature'], lift_df['Lift'], color='#42A5F5')
    
    # Add baseline
    plt.axhline(y=1, color='red', linestyle='--', label='Baseline')
    
    # Add value labels
    for bar in bars:
        height = bar.get_height()
        plt.text(bar.get_x() + bar.get_width()/2., height + 0.1,
                f"{height:.1f}x", ha='center', va='bottom', fontweight='bold')
    
    plt.title('Conversion Rate Lift by Feature', fontsize=14, fontweight='bold')
    plt.ylabel('Lift Multiple (Compared to Baseline)')
    plt.grid(axis='y', linestyle='--', alpha=0.7)
    plt.legend()
    
    plt.tight_layout()
    plt.savefig('uber_driver_conversion_lift.png', dpi=300, bbox_inches='tight')
    plt.close()

def create_conversion_prediction_chart(df):
    """Create a chart showing predicted conversion based on completed steps."""
    # Calculate probability of conversion based on funnel stage
    stages = []
    
    # No steps
    no_steps_conv = df[(df['bgc_completed'] == 0) & (df['vehicle_added'] == 0)]['started_driving'].mean() * 100
    no_steps_count = len(df[(df['bgc_completed'] == 0) & (df['vehicle_added'] == 0)])
    stages.append(('No Steps\nCompleted', no_steps_conv, no_steps_count))
    
    # BGC only
    bgc_only_conv = df[(df['bgc_completed'] == 1) & (df['vehicle_added'] == 0)]['started_driving'].mean() * 100
    bgc_only_count = len(df[(df['bgc_completed'] == 1) & (df['vehicle_added'] == 0)])
    stages.append(('BGC Only', bgc_only_conv, bgc_only_count))
    
    # Vehicle only
    vehicle_only_conv = df[(df['bgc_completed'] == 0) & (df['vehicle_added'] == 1)]['started_driving'].mean() * 100
    vehicle_only_count = len(df[(df['bgc_completed'] == 0) & (df['vehicle_added'] == 1)])
    stages.append(('Vehicle Only', vehicle_only_conv, vehicle_only_count))
    
    # Both steps
    both_conv = df[(df['bgc_completed'] == 1) & (df['vehicle_added'] == 1)]['started_driving'].mean() * 100
    both_count = len(df[(df['bgc_completed'] == 1) & (df['vehicle_added'] == 1)])
    stages.append(('Both BGC &\nVehicle', both_conv, both_count))
    
    # Create DataFrame
    stages_df = pd.DataFrame(stages, columns=['Stage', 'Conversion', 'Count'])
    
    # Create the chart
    plt.figure(figsize=(12, 8))
    
    # Create gradient colors based on conversion rate
    norm = plt.Normalize(stages_df['Conversion'].min(), stages_df['Conversion'].max())
    colors = plt.cm.viridis(norm(stages_df['Conversion']))
    
    # Create bars with counts as width
    bars = plt.bar(stages_df['Stage'], stages_df['Conversion'], color=colors)
    
    # Add value labels
    for i, bar in enumerate(bars):
        height = bar.get_height()
        count = stages_df.iloc[i]['Count']
        pct_of_total = count / len(df) * 100
        
        plt.text(bar.get_x() + bar.get_width()/2., height + 1,
                f"{height:.1f}%", ha='center', va='bottom', fontweight='bold')
        
        plt.text(bar.get_x() + bar.get_width()/2., height/2,
                f"n={count:,}\n({pct_of_total:.1f}% of total)", 
                ha='center', va='center', color='white' if height > 20 else 'black',
                fontweight='bold')
    
    plt.title('Conversion Rate by Funnel Stage', fontsize=14, fontweight='bold')
    plt.ylabel('Conversion Rate (%)')
    plt.ylim(0, stages_df['Conversion'].max() * 1.2)
    plt.grid(axis='y', linestyle='--', alpha=0.7)
    
    # Add annotations for key insights
    if both_conv > 0:
        plt.annotate(f"{both_conv:.1f}% conversion when\nboth steps completed",
                   xy=(3, both_conv), xytext=(3.3, both_conv * 0.7),
                   arrowprops=dict(arrowstyle='->', color='black'))
    
    # Add "opportunity" annotation
    for i, row in stages_df.iterrows():
        if row['Stage'] == 'BGC Only' and row['Count'] > 1000:
            plt.annotate("Major opportunity:\nHelp these drivers\nadd vehicles",
                       xy=(i, row['Conversion']), xytext=(i - 0.5, row['Conversion'] + 20),
                       arrowprops=dict(arrowstyle='->', color='red'))
    
    plt.tight_layout()
    plt.savefig('uber_driver_stage_conversion.png', dpi=300, bbox_inches='tight')
    plt.close()

# Sample execution code
if __name__ == "__main__":
    # Create a sample dataset based on the statistics we've observed
    np.random.seed(42)
    
    # Create data for 54,681 drivers
    n_drivers = 54681
    
    # Create BGC and vehicle statuses based on observed proportions
    bgc_completed = np.zeros(n_drivers, dtype=int)
    bgc_completed[:32896] = 1
    
    vehicle_added = np.zeros(n_drivers, dtype=int)
    vehicle_added[:13134] = 1
    
    # Shuffle to ensure random distribution
    np.random.shuffle(bgc_completed)
    np.random.shuffle(vehicle_added)
    
    # Create signup channel values
    channels = np.empty(n_drivers, dtype=object)
    channels[:23938] = 'Paid'
    channels[23938:23938+13427] = 'Organic'
    channels[23938+13427:] = 'Referral'
    np.random.shuffle(channels)
    
    # Create the started_driving target variable based on combinations
    started_driving = np.zeros(n_drivers, dtype=int)
    
    # For BGC & vehicle completed, ~45% conversion rate
    both_indices = np.where((bgc_completed == 1) & (vehicle_added == 1))[0]
    conversion_mask = np.random.random(len(both_indices)) < 0.456
    started_driving[both_indices[conversion_mask]] = 1
    
    # For BGC only, ~1.3% conversion rate
    bgc_only_indices = np.where((bgc_completed == 1) & (vehicle_added == 0))[0]
    conversion_mask = np.random.random(len(bgc_only_indices)) < 0.013
    started_driving[bgc_only_indices[conversion_mask]] = 1
    
    # For vehicle only, ~0% conversion rate (negligible)
    
    # For referral channel, boost conversion slightly
    referral_indices = np.where(channels == 'Referral')[0]
    referral_boost_indices = np.random.choice(
        referral_indices, 
        size=int(len(referral_indices) * 0.05), 
        replace=False
    )
    started_driving[referral_boost_indices] = 1
    
    # Ensure the total conversion rate is around 11.22%
    total_converted = started_driving.sum()
    target_converted = int(n_drivers * 0.1122)
    
    if total_converted < target_converted:
        # Need to convert more
        not_converted = np.where(started_driving == 0)[0]
        to_convert = np.random.choice(
            not_converted,
            size=target_converted - total_converted,
            replace=False
        )
        started_driving[to_convert] = 1
    elif total_converted > target_converted:
        # Need to unconvert some
        converted = np.where(started_driving == 1)[0]
        to_unconvert = np.random.choice(
            converted,
            size=total_converted - target_converted,
            replace=False
        )
        started_driving[to_unconvert] = 0
    
    # Create time-related features
    days_to_bgc = np.zeros(n_drivers)
    days_to_vehicle = np.zeros(n_drivers)
    days_to_first_trip = np.zeros(n_drivers)
    
    # For drivers who completed BGC
    bgc_indices = np.where(bgc_completed == 1)[0]
    days_to_bgc[bgc_indices] = np.random.exponential(scale=5, size=len(bgc_indices))
    
    # For drivers who added vehicles
    vehicle_indices = np.where(vehicle_added == 1)[0]
    days_to_vehicle[vehicle_indices] = np.random.exponential(scale=7, size=len(vehicle_indices))
    
    # For drivers who started driving
    converted_indices = np.where(started_driving == 1)[0]
    days_to_first_trip[converted_indices] = np.maximum(
        days_to_bgc[converted_indices],
        days_to_vehicle[converted_indices]
    ) + np.random.exponential(scale=3, size=len(converted_indices))
    
    # Create the dataframe
    df = pd.DataFrame({
        'started_driving': started_driving,
        'bgc_completed': bgc_completed,
        'vehicle_added': vehicle_added,
        'has_vehicle_info': vehicle_added,  # Simplified assumption
        'signup_channel': channels,
        'days_to_bgc': days_to_bgc,
        'days_to_vehicle': days_to_vehicle,
        'days_to_first_trip': days_to_first_trip
    })
    
    # Create all visualizations
    create_advanced_visualizations(df)

In [12]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, 
    confusion_matrix, classification_report, roc_curve, auc, 
    precision_recall_curve, average_precision_score
)
from sklearn.inspection import permutation_importance
import warnings
warnings.filterwarnings('ignore')

def load_and_prepare_data(file_path):
    """
    Load and prepare the data for modeling.
    
    Parameters:
    file_path (str): Path to the CSV file
    
    Returns:
    tuple: X, y, feature_names
    """
    # Read the data
    df = pd.read_csv(file_path, na_values="NA")
    
    # Convert date columns to datetime
    date_columns = ['signup_date', 'bgc_date', 'vehicle_added_date', 'first_completed_date']
    for col in date_columns:
        if col in df.columns:
            df[col] = pd.to_datetime(df[col], errors='coerce')
    
    # Create target variable: did the driver take their first trip?
    df['started_driving'] = (~df['first_completed_date'].isna()).astype(int)
    
    # Create binary features for key steps
    df['bgc_completed'] = (~df['bgc_date'].isna()).astype(int)
    df['vehicle_added'] = (~df['vehicle_added_date'].isna()).astype(int)
    df['has_vehicle_info'] = (~df['vehicle_make'].isna()).astype(int)
    
    # Calculate days between signup and other events
    df['days_to_bgc'] = (df['bgc_date'] - df['signup_date']).dt.days
    df['days_to_vehicle'] = (df['vehicle_added_date'] - df['signup_date']).dt.days
    
    # Fill NA values for days with high values (indicating not completed)
    df['days_to_bgc'] = df['days_to_bgc'].fillna(999)
    df['days_to_vehicle'] = df['days_to_vehicle'].fillna(999)
    
    # Feature for whether vehicle was added quickly (within 3 days)
    df['vehicle_added_quickly'] = ((df['days_to_vehicle'] >= 0) & (df['days_to_vehicle'] <= 3)).astype(int)
    
    # Feature for whether BGC was completed quickly (within 3 days)
    df['bgc_completed_quickly'] = ((df['days_to_bgc'] >= 0) & (df['days_to_bgc'] <= 3)).astype(int)
    
    # Create feature for BGC completed before vehicle added
    df['bgc_before_vehicle'] = ((df['bgc_completed'] == 1) & 
                              (df['vehicle_added'] == 1) & 
                              (df['days_to_bgc'] < df['days_to_vehicle'])).astype(int)
    
    # Create a feature for vehicle age (current year - vehicle year)
    if 'vehicle_year' in df.columns:
        current_year = pd.Timestamp.now().year
        df['vehicle_age'] = current_year - df['vehicle_year']
        df['vehicle_age'] = df['vehicle_age'].clip(0, 25)  # Clip to sensible range
        df['vehicle_age'] = df['vehicle_age'].fillna(-1)  # -1 indicates no vehicle
    
    # Select features for the model
    feature_columns = [
        'bgc_completed', 'vehicle_added', 'has_vehicle_info',
        'vehicle_added_quickly', 'bgc_completed_quickly', 'bgc_before_vehicle',
        'signup_channel', 'signup_os', 'city_name'
    ]
    
    if 'vehicle_age' in df.columns:
        feature_columns.append('vehicle_age')
    
    X = df[feature_columns]
    y = df['started_driving']
    
    return X, y, feature_columns

def build_and_evaluate_models(X, y, feature_names):
    """
    Build and evaluate multiple models for predicting driver conversion.
    
    Parameters:
    X: Features
    y: Target variable
    feature_names: List of feature names
    
    Returns:
    dict: Trained models
    """
    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Create preprocessing pipeline
    numeric_features = [col for col in X.columns if X[col].dtype in ['int64', 'float64'] 
                      and col not in ['bgc_completed', 'vehicle_added', 'has_vehicle_info',
                                    'vehicle_added_quickly', 'bgc_completed_quickly', 'bgc_before_vehicle']]
    
    binary_features = ['bgc_completed', 'vehicle_added', 'has_vehicle_info',
                     'vehicle_added_quickly', 'bgc_completed_quickly', 'bgc_before_vehicle']
    
    categorical_features = [col for col in X.columns if X[col].dtype == 'object']
    
    # Define preprocessors
    numeric_transformer = Pipeline(steps=[
        ('scaler', StandardScaler())
    ])
    
    categorical_transformer = Pipeline(steps=[
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])
    
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, categorical_features)
        ],
        remainder='passthrough'  # Include binary features as-is
    )
    
    # Create model pipelines
    models = {
        'Logistic Regression': Pipeline(steps=[
            ('preprocessor', preprocessor),
            ('classifier', LogisticRegression(max_iter=1000, class_weight='balanced'))
        ]),
        
        'Random Forest': Pipeline(steps=[
            ('preprocessor', preprocessor),
            ('classifier', RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42))
        ]),
        
        'Gradient Boosting': Pipeline(steps=[
            ('preprocessor', preprocessor),
            ('classifier', GradientBoostingClassifier(n_estimators=100, random_state=42))
        ])
    }
    
    # Train and evaluate each model
    results = {}
    
    for name, model in models.items():
        print(f"Training {name}...")
        model.fit(X_train, y_train)
        
        # Make predictions
        y_pred = model.predict(X_test)
        y_prob = model.predict_proba(X_test)[:, 1]
        
        # Calculate metrics
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)
        
        # ROC AUC
        fpr, tpr, _ = roc_curve(y_test, y_prob)
        roc_auc = auc(fpr, tpr)
        
        # Precision-Recall AUC
        precision_curve, recall_curve, _ = precision_recall_curve(y_test, y_prob)
        pr_auc = average_precision_score(y_test, y_prob)
        
        print(f"Accuracy: {accuracy:.4f}")
        print(f"Precision: {precision:.4f}")
        print(f"Recall: {recall:.4f}")
        print(f"F1 Score: {f1:.4f}")
        print(f"ROC AUC: {roc_auc:.4f}")
        print(f"PR AUC: {pr_auc:.4f}")
        print(f"Confusion Matrix:")
        print(confusion_matrix(y_test, y_pred))
        print("\n")
        
        # Store results
        results[name] = {
            'model': model,
            'accuracy': accuracy,
            'precision': precision,
            'recall': recall,
            'f1': f1,
            'roc_auc': roc_auc,
            'pr_auc': pr_auc,
            'fpr': fpr,
            'tpr': tpr,
            'precision_curve': precision_curve,
            'recall_curve': recall_curve,
            'y_prob': y_prob
        }
    
    # Get feature importances
    try:
        # For logistic regression
        logistic_model = models['Logistic Regression'].named_steps['classifier']
        
        # Get the feature names after preprocessing
        preprocessor = models['Logistic Regression'].named_steps['preprocessor']
        cat_features = preprocessor.transformers_[1][1].named_steps['onehot'].get_feature_names_out(categorical_features)
        all_features = list(cat_features) + numeric_features + binary_features
        
        # Get coefficients
        coefs = logistic_model.coef_[0]
        
        # Create DataFrame of feature importance
        lr_importance = pd.DataFrame({
            'Feature': all_features,
            'Importance': np.abs(coefs)
        }).sort_values('Importance', ascending=False)
        
        print("Logistic Regression Feature Importance:")
        print(lr_importance.head(10))
        
        results['feature_importance'] = lr_importance
    except:
        print("Could not extract Logistic Regression feature importance.")
    
    try:
        # For Random Forest
        rf_model = models['Random Forest'].named_steps['classifier']
        result = permutation_importance(models['Random Forest'], X_test, y_test, n_repeats=10, random_state=42)
        rf_importance = pd.DataFrame({
            'Feature': X.columns,
            'Importance': result.importances_mean
        }).sort_values('Importance', ascending=False)
        
        print("\nRandom Forest Feature Importance (Permutation):")
        print(rf_importance.head(10))
        
        results['rf_importance'] = rf_importance
    except:
        print("Could not extract Random Forest feature importance.")
    
    return results, X_test, y_test

def visualize_model_results(results, X_test, y_test):
    """
    Visualize the model comparison results.
    
    Parameters:
    results: Dictionary of model results
    X_test: Test features
    y_test: Test target
    
    Returns:
    None
    """
    # Create a figure for model comparison
    plt.figure(figsize=(12, 10))
    
    # Plot 1: ROC curves
    plt.subplot(2, 2, 1)
    for name, result in results.items():
        if name not in ['feature_importance', 'rf_importance']:
            plt.plot(result['fpr'], result['tpr'], label=f"{name} (AUC = {result['roc_auc']:.3f})")
    
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curves')
    plt.legend(loc="lower right")
    plt.grid(alpha=0.3)
    
    # Plot 2: Precision-Recall curves
    plt.subplot(2, 2, 2)
    for name, result in results.items():
        if name not in ['feature_importance', 'rf_importance']:
            plt.plot(result['recall_curve'], result['precision_curve'], label=f"{name} (AUC = {result['pr_auc']:.3f})")
    
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Precision-Recall Curves')
    plt.legend(loc="lower left")
    plt.grid(alpha=0.3)
    
    # Plot 3: Model comparison metrics
    plt.subplot(2, 2, 3)
    model_names = [name for name in results if name not in ['feature_importance', 'rf_importance']]
    metrics = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc', 'pr_auc']
    metrics_display = ['Accuracy', 'Precision', 'Recall', 'F1', 'ROC AUC', 'PR AUC']
    
    # Create a dataframe for the metrics
    metrics_data = []
    for name in model_names:
        for metric in metrics:
            metrics_data.append({
                'Model': name,
                'Metric': metrics_display[metrics.index(metric)],
                'Value': results[name][metric]
            })
    
    metrics_df = pd.DataFrame(metrics_data)
    
    # Plot the metrics
    sns.barplot(x='Metric', y='Value', hue='Model', data=metrics_df)
    plt.title('Model Performance Comparison')
    plt.xticks(rotation=45)
    plt.ylim(0, 1)
    plt.legend(title='Model')
    plt.grid(axis='y', alpha=0.3)
    
    # Plot 4: Feature importance
    plt.subplot(2, 2, 4)
    
    if 'feature_importance' in results:
        top_features = results['feature_importance'].head(10)
        sns.barplot(x='Importance', y='Feature', data=top_features)
        plt.title('Top 10 Feature Importances (Logistic Regression)')
        plt.grid(axis='x', alpha=0.3)
    elif 'rf_importance' in results:
        top_features = results['rf_importance'].head(10)
        sns.barplot(x='Importance', y='Feature', data=top_features)
        plt.title('Top 10 Feature Importances (Random Forest)')
        plt.grid(axis='x', alpha=0.3)
    
    plt.tight_layout()
    plt.savefig('uber_driver_model_comparison.png', dpi=300, bbox_inches='tight')
    plt.close()
    
    # Create a figure for prediction distribution
    plt.figure(figsize=(15, 5))
    
    for i, name in enumerate(model_names):
        plt.subplot(1, len(model_names), i+1)
        
        # Extract predictions
        y_prob = results[name]['y_prob']
        
        # Plot distributions
        sns.histplot(y_prob[y_test == 0], bins=20, alpha=0.5, label='Did not convert', color='red')
        sns.histplot(y_prob[y_test == 1], bins=20, alpha=0.5, label='Converted', color='green')
        
        plt.title(f'{name} Prediction Distribution')
        plt.xlabel('Predicted Probability')
        plt.ylabel('Count')
        plt.legend()
        plt.grid(alpha=0.3)
    
    plt.tight_layout()
    plt.savefig('uber_driver_prediction_distribution.png', dpi=300, bbox_inches='tight')
    plt.close()
    
    # Create a calibration curve
    plt.figure(figsize=(10, 6))
    
    for name in model_names:
        # Create bins for calibration
        y_prob = results[name]['y_prob']
        bins = np.linspace(0, 1, 11)
        bin_midpoints = (bins[1:] + bins[:-1]) / 2
        bin_indices = np.digitize(y_prob, bins) - 1
        bin_indices = np.clip(bin_indices, 0, len(bins) - 2)
        
        bin_sums = np.bincount(bin_indices, minlength=len(bins) - 1)
        bin_true = np.bincount(bin_indices, weights=y_test, minlength=len(bins) - 1)
        bin_props = np.zeros(len(bins) - 1)
        
        nonzero = bin_sums > 0
        bin_props[nonzero] = bin_true[nonzero] / bin_sums[nonzero]
        
        plt.plot(bin_midpoints, bin_props, 'o-', label=name)
    
    plt.plot([0, 1], [0, 1], 'k--', label='Perfect Calibration')
    plt.xlabel('Predicted Probability')
    plt.ylabel('Actual Probability')
    plt.title('Calibration Curves')
    plt.legend()
    plt.grid(alpha=0.3)
    
    plt.tight_layout()
    plt.savefig('uber_driver_calibration_curves.png', dpi=300, bbox_inches='tight')
    plt.close()

def tune_best_model(X, y, best_model_name):
    """
    Tune the hyperparameters of the best model.
    
    Parameters:
    X: Features
    y: Target variable
    best_model_name: Name of the best model
    
    Returns:
    dict: Best parameter settings
    """
    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Create preprocessing pipeline
    numeric_features = [col for col in X.columns if X[col].dtype in ['int64', 'float64'] 
                      and col not in ['bgc_completed', 'vehicle_added', 'has_vehicle_info',
                                    'vehicle_added_quickly', 'bgc_completed_quickly', 'bgc_before_vehicle']]
    
    binary_features = ['bgc_completed', 'vehicle_added', 'has_vehicle_info',
                     'vehicle_added_quickly', 'bgc_completed_quickly', 'bgc_before_vehicle']
    
    categorical_features = [col for col in X.columns if X[col].dtype == 'object']
    
    # Define preprocessors
    numeric_transformer = Pipeline(steps=[
        ('scaler', StandardScaler())
    ])
    
    categorical_transformer = Pipeline(steps=[
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])
    
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, categorical_features)
        ],
        remainder='passthrough'  # Include binary features as-is
    )
    
    # Define parameter grids for each model
    param_grids = {
        'Logistic Regression': {
            'classifier__C': [0.01, 0.1, 1, 10, 100],
            'classifier__penalty': ['l1', 'l2'],
            'classifier__solver': ['liblinear', 'saga'],
            'classifier__class_weight': ['balanced', None]
        },
        
        'Random Forest': {
            'classifier__n_estimators': [50, 100, 200],
            'classifier__max_depth': [None, 10, 20, 30],
            'classifier__min_samples_split': [2, 5, 10],
            'classifier__class_weight': ['balanced', 'balanced_subsample', None]
        },
        
        'Gradient Boosting': {
            'classifier__n_estimators': [50, 100, 200],
            'classifier__learning_rate': [0.01, 0.1, 0.2],
            'classifier__max_depth': [3, 5, 7],
            'classifier__subsample': [0.8, 1.0]
        }
    }
    
    # Define the model pipeline
    if best_model_name == 'Logistic Regression':
        model = Pipeline(steps=[
            ('preprocessor', preprocessor),
            ('classifier', LogisticRegression(max_iter=1000))
        ])
    elif best_model_name == 'Random Forest':
        model = Pipeline(steps=[
            ('preprocessor', preprocessor),
            ('classifier', RandomForestClassifier(random_state=42))
        ])
    elif best_model_name == 'Gradient Boosting':
        model = Pipeline(steps=[
            ('preprocessor', preprocessor),
            ('classifier', GradientBoostingClassifier(random_state=42))
        ])
    else:
        raise ValueError(f"Unknown model name: {best_model_name}")
    
    # Create grid search
    grid_search = GridSearchCV(
        model,
        param_grids[best_model_name],
        cv=5,
        scoring='f1',
        n_jobs=-1,
        verbose=1
    )
    
    # Fit grid search
    print(f"Tuning {best_model_name}...")
    grid_search.fit(X_train, y_train)
    
    # Print best parameters
    print(f"Best parameters: {grid_search.best_params_}")
    print(f"Best cross-validation score: {grid_search.best_score_:.4f}")
    
    # Evaluate on test set
    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_test)
    
    print("\nTest set performance:")
    print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
    print(f"Precision: {precision_score(y_test, y_pred):.4f}")
    print(f"Recall: {recall_score(y_test, y_pred):.4f}")
    print(f"F1 Score: {f1_score(y_test, y_pred):.4f}")
    print(f"Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))
    
    return grid_search.best_params_, best_model, X_test, y_test

def generate_decision_rules(model, feature_names, threshold=0.5):
    """
    Generate interpretable decision rules from a logistic regression model.
    
    Parameters:
    model: Trained logistic regression model
    feature_names: List of feature names
    threshold: Decision threshold
    
    Returns:
    str: Decision rules
    """
    if not hasattr(model, 'coef_'):
        return "Model does not support coefficient extraction."
    
    # Extract coefficients
    coefficients = model.coef_[0]
    intercept = model.intercept_[0]
    
    # Sort features by absolute coefficient value
    sorted_indices = np.argsort(np.abs(coefficients))[::-1]
    sorted_features = [feature_names[i] for i in sorted_indices]
    sorted_coefficients = coefficients[sorted_indices]
    
    # Generate rules
    rules = []
    
    rules.append(f"Prediction formula: log-odds = {intercept:.4f}")
    
    for feature, coef in zip(sorted_features, sorted_coefficients):
        if coef > 0:
            rules.append(f"  + {coef:.4f} × {feature}")
        else:
            rules.append(f"  - {abs(coef):.4f} × {feature}")
    
    rules.append(f"\nDecision rule: Driver will start driving if log-odds > {np.log(threshold/(1-threshold)):.4f}")
    
    # Generate simplified rules for top features
    simplified_rules = ["\nSimplified rules:"]
    
    top_positive = [(feature, coef) for feature, coef in zip(sorted_features, sorted_coefficients) if coef > 0][:3]
    top_negative = [(feature, coef) for feature, coef in zip(sorted_features, sorted_coefficients) if coef < 0][:3]
    
    simplified_rules.append("Factors that increase likelihood of driving:")
    for feature, coef in top_positive:
        simplified_rules.append(f"  - {feature} (weight: +{coef:.4f})")
    
    simplified_rules.append("\nFactors that decrease likelihood of driving:")
    for feature, coef in top_negative:
        simplified_rules.append(f"  - {feature} (weight: {coef:.4f})")
    
    return "\n".join(rules + simplified_rules)

def main(file_path):
    """
    Main function to run the predictive modeling pipeline.
    
    Parameters:
    file_path (str): Path to the CSV file
    
    Returns:
    None
    """
    print("Loading and preparing data...")
    X, y, feature_names = load_and_prepare_data(file_path)
    
    print("\nBuilding and evaluating models...")
    results, X_test, y_test = build_and_evaluate_models(X, y, feature_names)
    
    print("\nVisualizing model results...")
    visualize_model_results(results, X_test, y_test)
    
    # Find the best model based on F1 score
    best_model_name = max([(name, result['f1']) for name, result in results.items() 
                         if name not in ['feature_importance', 'rf_importance']], 
                        key=lambda x: x[1])[0]
    
    print(f"\nBest model: {best_model_name} (F1: {results[best_model_name]['f1']:.4f})")
    
    print("\nTuning the best model...")
    best_params, tuned_model, X_test, y_test = tune_best_model(X, y, best_model_name)
    
    # Extract and display decision rules if logistic regression
    if best_model_name == 'Logistic Regression':
        # Get the feature names after preprocessing
        preprocessor = tuned_model.named_steps['preprocessor']
        classifier = tuned_model.named_steps['classifier']
        
        try:
            cat_features = preprocessor.transformers_[1][1].named_steps['onehot'].get_feature_names_out(
                [col for col in X.columns if X[col].dtype == 'object'])
            
            all_features = list(cat_features) + [col for col in X.columns if X[col].dtype != 'object']
            
            print("\nDecision Rules:")
            rules = generate_decision_rules(classifier, all_features)
            print(rules)
            
            # Save rules to file
            with open('uber_driver_decision_rules.txt', 'w') as f:
                f.write(rules)
        except:
            print("Could not extract decision rules.")
    
    print("\nAnalysis complete. Visualizations saved to files.")

# Example usage
if __name__ == "__main__":
    main('data1.csv')  # Replace with your file path

Loading and preparing data...

Building and evaluating models...
Training Logistic Regression...
Accuracy: 0.8650
Precision: 0.4533
Recall: 0.9603
F1 Score: 0.6159
ROC AUC: 0.9555
PR AUC: 0.7101
Confusion Matrix:
[[8276 1428]
 [  49 1184]]


Training Random Forest...
Accuracy: 0.8901
Precision: 0.5078
Recall: 0.8216
F1 Score: 0.6276
ROC AUC: 0.9241
PR AUC: 0.6340
Confusion Matrix:
[[8722  982]
 [ 220 1013]]


Training Gradient Boosting...
Accuracy: 0.9273
Precision: 0.7086
Recall: 0.6034
F1 Score: 0.6518
ROC AUC: 0.9574
PR AUC: 0.7273
Confusion Matrix:
[[9398  306]
 [ 489  744]]


Logistic Regression Feature Importance:
                    Feature  Importance
13            bgc_completed    5.547618
15         has_vehicle_info    2.739734
14            vehicle_added    2.169721
17    bgc_completed_quickly    1.093051
16    vehicle_added_quickly    1.039791
18       bgc_before_vehicle    0.931140
9          city_name_Berton    0.505974
6           signup_os_other    0.446545
2   signup_c

In [15]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.colors import LinearSegmentedColormap
import matplotlib.patches as patches

# Set style for better visualizations
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette("viridis")
plt.rcParams['font.family'] = 'sans-serif'
plt.rcParams['font.sans-serif'] = ['Arial']
plt.rcParams['figure.figsize'] = (14, 12)
plt.rcParams['figure.dpi'] = 100

# Confusion matrices for each model
confusion_matrices = {
    'Logistic Regression': np.array([[8276, 1428], [49, 1184]]),
    'Random Forest': np.array([[8722, 982], [220, 1013]]),
    'Gradient Boosting': np.array([[9398, 306], [489, 744]]),
    'Tuned Gradient Boosting': np.array([[9361, 343], [430, 803]])
}

def create_enhanced_confusion_matrix(model_name, cm):
    """
    Create an enhanced, visually appealing confusion matrix visualization
    with business context and explanations.
    
    Parameters:
    model_name (str): Name of the model
    cm (numpy.ndarray): Confusion matrix values
    """
    # Create figure
    fig, ax = plt.subplots(figsize=(12, 10))
    
    # Calculate percentages and totals
    total = cm.sum()
    tn, fp = cm[0]
    fn, tp = cm[1]
    
    # Calculate overall metrics
    accuracy = (tp + tn) / total
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0
    
    # Create a custom colormap (blue to green gradient)
    colors = ['#f0f9ff', '#c6e2ff', '#93c4ff', '#5da7f5', '#1976D2']
    cmap = LinearSegmentedColormap.from_list('blue_gradient', colors)
    
    # Create a normalized confusion matrix for coloring
    norm_cm = cm / total
    
    # Create the heatmap without annotations first
    sns.heatmap(cm, annot=False, cmap=cmap, cbar=False, ax=ax)
    
    # Add text manually with enhanced formatting
    fontsize_value = 16
    fontsize_pct = 14
    fontsize_label = 12
    
    # Add count and percentage for each cell
    labels = [['True Negative (TN)', 'False Positive (FP)'], 
              ['False Negative (FN)', 'True Positive (TP)']]
    
    explanations = [
        [
            "Driver correctly predicted\nNOT to take first trip",
            "Driver predicted to take\nfirst trip but did NOT"
        ],
        [
            "Driver predicted NOT to take\nfirst trip but DID take it",
            "Driver correctly predicted\nto take first trip"
        ]
    ]
    
    business_impact = [
        [
            "Correctly identified non-converters\nCan safely exclude from interventions",
            "Wasted resources on interventions\nfor drivers who wouldn't convert"
        ],
        [
            "Missed opportunity to convert\nthese drivers with interventions",
            "Correctly targeted drivers\nwho will convert"
        ]
    ]
    
    for i in range(2):
        for j in range(2):
            # Calculate percentage of total
            val = cm[i, j]
            pct = val / total * 100
            
            # Position text in the middle of the cell
            y, x = i + 0.5, j + 0.5
            
            # Add cell label
            ax.text(x, y - 0.30, labels[i][j], ha='center', va='center', 
                   fontsize=fontsize_label, fontweight='bold', 
                   color='black' if norm_cm[i, j] < 0.5 else 'white')
            
            # Add count value
            ax.text(x, y, f"{val:,}", ha='center', va='center', 
                   fontsize=fontsize_value, fontweight='bold', 
                   color='black' if norm_cm[i, j] < 0.5 else 'white')
            
            # Add percentage
            ax.text(x, y + 0.24, f"({pct:.1f}%)", ha='center', va='center', 
                   fontsize=fontsize_pct, 
                   color='black' if norm_cm[i, j] < 0.5 else 'white')
            
            # Add explanation
            ax.text(x, y + 0.42, explanations[i][j], ha='center', va='center', 
                   fontsize=10, style='italic', 
                   color='black' if norm_cm[i, j] < 0.4 else 'white')
    
    # Set axis labels and title
    ax.set_xlabel('Predicted Label', fontsize=14, fontweight='bold')
    ax.set_ylabel('Actual Label', fontsize=14, fontweight='bold')
    plt.title(f'Confusion Matrix: {model_name}', fontsize=18, fontweight='bold', pad=20)
    
    # Set custom tick labels
    ax.set_xticklabels(['Did Not Convert', 'Converted'], fontsize=12)
    ax.set_yticklabels(['Did Not Convert', 'Converted'], fontsize=12)
    
    # Add a summary box with metrics
    metrics_box = (
        f"Model Performance Metrics:\n"
        f"Accuracy: {accuracy:.1%}\n"
        f"Precision: {precision:.1%}\n"
        f"Recall: {recall:.1%}\n"
        f"F1 Score: {f1:.1%}\n\n"
        f"Total Drivers: {total:,}"
    )
    
    # Add metrics box
    plt.figtext(0.92, 0.5, metrics_box, 
               bbox=dict(facecolor='#f0f0f0', alpha=0.9, boxstyle='round,pad=0.5', 
                        edgecolor='#cccccc'),
               fontsize=12, ha='center')
    
    # Add business implications section under the matrix
    implications = (
        "Business Implications:\n"
        f"• {tp:,} drivers ({tp/total:.1%}) were correctly identified as converters (True Positives)\n"
        f"• {tn:,} drivers ({tn/total:.1%}) were correctly identified as non-converters (True Negatives)\n"
        f"• {fp:,} drivers ({fp/total:.1%}) received unnecessary interventions (False Positives)\n"
        f"• {fn:,} drivers ({fn/total:.1%}) were missed opportunities for conversion (False Negatives)"
    )
    
    plt.figtext(0.5, 0.08, implications, ha='center', fontsize=12,
              bbox=dict(facecolor='#eaf2f8', alpha=0.9, boxstyle='round,pad=0.5',
                       edgecolor='#a9cce3'))
    
    # Add a title for quadrant explanations
    plt.figtext(0.5, 0.15, "What Each Quadrant Means for Uber's Driver Conversion", 
              ha='center', fontsize=14, fontweight='bold')
    
    plt.tight_layout(rect=[0, 0.2, 0.85, 0.95])  # Adjust layout to make room for annotations
    
    # Save the figure
    plt.savefig(f'uber_confusion_matrix_{model_name.replace(" ", "_").lower()}.png', 
               dpi=300, bbox_inches='tight')
    plt.close()


def create_comparison_of_confusion_matrices():
    """Create a 2x2 grid comparing confusion matrices for all models."""
    fig, axes = plt.subplots(2, 2, figsize=(20, 16))
    axes = axes.flatten()
    
    class_names = ['Did Not Convert', 'Converted']
    
    # Create a custom colormap (blue to green gradient)
    colors = ['#f0f9ff', '#c6e2ff', '#93c4ff', '#5da7f5', '#1976D2']
    cmap = LinearSegmentedColormap.from_list('blue_gradient', colors)
    
    for i, (model_name, cm) in enumerate(confusion_matrices.items()):
        ax = axes[i]
        
        # Calculate metrics
        total = cm.sum()
        tn, fp = cm[0]
        fn, tp = cm[1]
        
        accuracy = (tp + tn) / total
        precision = tp / (tp + fp) if (tp + fp) > 0 else 0
        recall = tp / (tp + fn) if (tp + fn) > 0 else 0
        f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0
        
        # Create the heatmap
        sns.heatmap(cm, annot=True, fmt=',d', cmap=cmap, cbar=False, ax=ax)
        
        # Set labels
        ax.set_xlabel('Predicted Label', fontsize=12)
        ax.set_ylabel('Actual Label', fontsize=12)
        ax.set_title(f'{model_name}', fontsize=14, fontweight='bold')
        
        # Set tick labels
        ax.set_xticklabels(class_names, fontsize=10)
        ax.set_yticklabels(class_names, fontsize=10)
        
        # Add metrics
        metrics_text = (
            f"Accuracy: {accuracy:.1%}\n"
            f"Precision: {precision:.1%}\n"
            f"Recall: {recall:.1%}\n"
            f"F1 Score: {f1:.1%}"
        )
        
        ax.text(1.5, 1.5, metrics_text, ha='center', va='center', 
               bbox=dict(facecolor='white', alpha=0.8, boxstyle='round,pad=0.3'),
               fontsize=10)
    
    plt.tight_layout()
    plt.suptitle('Confusion Matrix Comparison Across Models', fontsize=18, fontweight='bold', y=1.02)
    plt.savefig('uber_confusion_matrix_comparison.png', dpi=300, bbox_inches='tight')
    plt.close()

def create_business_impact_viz(model_name='Tuned Gradient Boosting'):
    """Create a visualization focusing on the business impact of model predictions."""
    # Get confusion matrix for the specified model
    cm = confusion_matrices[model_name]
    tn, fp = cm[0]
    fn, tp = cm[1]
    total = cm.sum()
    
    # Calculate key metrics
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0
    
    # Create figure
    fig, ax = plt.subplots(figsize=(14, 10))
    
    # Define the quadrants
    width, height = 10, 10
    
    # Create the quadrants
    quadrants = [
        patches.Rectangle((0, 0), width/2, height/2, linewidth=1, edgecolor='black', facecolor='#c8e6c9'),  # TN
        patches.Rectangle((width/2, 0), width/2, height/2, linewidth=1, edgecolor='black', facecolor='#ffcdd2'),  # FP
        patches.Rectangle((0, height/2), width/2, height/2, linewidth=1, edgecolor='black', facecolor='#ffecb3'),  # FN
        patches.Rectangle((width/2, height/2), width/2, height/2, linewidth=1, edgecolor='black', facecolor='#bbdefb')  # TP
    ]
    
    # Add the quadrants to the plot
    for quadrant in quadrants:
        ax.add_patch(quadrant)
    
    # Calculate percentages and counts
    tn_pct = tn / total * 100
    fp_pct = fp / total * 100
    fn_pct = fn / total * 100
    tp_pct = tp / total * 100
    
    # Add quadrant labels and statistics
    quadrant_info = [
        {
            'position': (width/4, height/4),
            'title': 'True Negatives',
            'count': tn,
            'percent': tn_pct,
            'description': 'Drivers correctly predicted\nNOT to take first trip',
            'business_impact': 'No action needed - these drivers\nwould not convert anyway'
        },
        {
            'position': (width*3/4, height/4),
            'title': 'False Positives',
            'count': fp,
            'percent': fp_pct,
            'description': 'Drivers predicted to take\nfirst trip but did NOT',
            'business_impact': 'Resources wasted on\nunnecessary interventions'
        },
        {
            'position': (width/4, height*3/4),
            'title': 'False Negatives',
            'count': fn,
            'percent': fn_pct,
            'description': 'Drivers predicted NOT to take\nfirst trip but DID take it',
            'business_impact': 'Missed opportunity to help\nthese drivers convert faster'
        },
        {
            'position': (width*3/4, height*3/4),
            'title': 'True Positives',
            'count': tp,
            'percent': tp_pct,
            'description': 'Drivers correctly predicted\nto take first trip',
            'business_impact': 'Correctly targeted\nfor conversion support'
        }
    ]
    
    # Add the information to each quadrant
    for info in quadrant_info:
        x, y = info['position']
        
        # Add title
        ax.text(x, y+1.5, info['title'], ha='center', va='center', fontsize=14, fontweight='bold')
        
        # Add count and percentage
        ax.text(x, y+0.5, f"{info['count']:,} drivers ({info['percent']:.1f}%)", 
                ha='center', va='center', fontsize=12)
        
        # Add description
        ax.text(x, y-0.5, info['description'], ha='center', va='center', fontsize=10, style='italic')
        
        # Add business impact
        ax.text(x, y-1.5, info['business_impact'], ha='center', va='center', fontsize=10, 
                bbox=dict(facecolor='white', alpha=0.7, boxstyle='round,pad=0.2'))
    
    # Add axis labels
    ax.text(width/4, -0.7, 'Predicted: Will NOT Convert', ha='center', va='center', fontsize=12)
    ax.text(width*3/4, -0.7, 'Predicted: Will Convert', ha='center', va='center', fontsize=12)
    ax.text(-0.7, height/4, 'Actual: Did NOT Convert', ha='center', va='center', fontsize=12, rotation=90)
    ax.text(-0.7, height*3/4, 'Actual: Did Convert', ha='center', va='center', fontsize=12, rotation=90)
    
    # Set plot limits
    ax.set_xlim(-2, width+2)
    ax.set_ylim(-2, height+2)
    
    # Remove axes
    ax.set_axis_off()
    
    # Add title
    plt.suptitle(f'Business Impact of {model_name} Predictions', fontsize=16, fontweight='bold', y=0.98)
    
    # Add subtitle with precision and recall explanation
    subtitle = (
        f"Precision: {precision:.1%} of drivers predicted to convert actually did (TP / (TP + FP))\n"
        f"Recall: {recall:.1%} of drivers who actually converted were predicted correctly (TP / (TP + FN))"
    )
    plt.title(subtitle, fontsize=12, pad=20)
    
    # Add ROI information
    roi_text = (
        "Potential Business Impact:\n\n"
        f"• By correctly identifying {tp:,} converters, Uber can provide targeted support to its most promising drivers\n"
        f"• By avoiding {tn:,} likely non-converters, Uber can save resources and focus efforts efficiently\n"
        f"• The {fn:,} missed converters represent an opportunity to improve the model and capture more drivers\n"
        f"• The {fp:,} false positives represent wasted resources that could be better allocated"
    )
    
    plt.figtext(0.5, 0.05, roi_text, ha='center', fontsize=12,
               bbox=dict(facecolor='#eaf2f8', alpha=0.9, boxstyle='round,pad=0.5',
                         edgecolor='#a9cce3', linewidth=2))
    
    plt.tight_layout(rect=[0, 0.12, 1, 0.92])
    plt.savefig(f'uber_confusion_matrix_business_impact.png', dpi=300, bbox_inches='tight')
    plt.close()

# Create all visualizations
if __name__ == "__main__":
    # Create enhanced confusion matrix for each model
    for model_name, cm in confusion_matrices.items():
        create_enhanced_confusion_matrix(model_name, cm)
    
    # Create comparison visualization
    create_comparison_of_confusion_matrices()
    
    # Create business impact visualization
    create_business_impact_viz()
    
    print("All confusion matrix visualizations saved successfully!")

All confusion matrix visualizations saved successfully!


In [25]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.colors import LinearSegmentedColormap
import matplotlib.gridspec as gridspec

# Set style for better visualizations
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette("viridis")
plt.rcParams['font.family'] = 'sans-serif'
plt.rcParams['font.sans-serif'] = ['Arial']
plt.rcParams['figure.figsize'] = (10, 6)
plt.rcParams['figure.dpi'] = 100

# Create data for missing values
missing_data = {
    'column': ['signup_os', 'bgc_date', 'vehicle_added_date', 'vehicle_make', 'vehicle_model', 'vehicle_year', 'first_completed_date'],
    'missing_count': [6857, 21785, 41547, 41458, 41458, 41458, 48544],
    'total': [54681] * 7
}

# Create conversion rate data
conversion_data = {
    'bgc_completed': {'Yes': 18.66, 'No': 0.00},
    'vehicle_added': {'Yes': 44.71, 'No': 0.64},
    'signup_channel': {'Organic': 9.01, 'Paid': 6.19, 'Referral': 19.89},
    'funnel_completion': {
        'BGC Only': 1.32,
        'BGC and Vehicle': 45.59,
        'No BGC, No Vehicle': 0.00,
        'Vehicle Only': 0.00
    }
}

# Create feature importance data
feature_importance = {
    'Feature': ['bgc_completed', 'has_vehicle_info', 'vehicle_added', 
                'signup_channel_Referral', 'signup_channel_Paid', 'signup_channel_Organic',
                'city_name_Berton', 'city_name_Strark', 'city_name_Wrouver'],
    'Importance': [4.493994, 2.191576, 1.847814, 0.305712, 0.210220, 0.095577, 
                  0.065396, 0.054656, 0.010825]
}

# Model performance data
model_performance = {
    'Accuracy': 0.8933,
    'Precision': 0.5258,
    'Recall': 0.5450,
    'F1 Score': 0.5352
}

# Confusion matrix data
confusion_matrix = np.array([[9098, 606], [561, 672]])

def create_missing_data_visualization():
    """Create a visualization of missing data."""
    # Convert to DataFrame
    df = pd.DataFrame(missing_data)
    df['missing_percentage'] = df['missing_count'] / df['total'] * 100
    
    # Sort by missing percentage
    df = df.sort_values('missing_percentage', ascending=False)
    
    # Create figure
    fig, ax = plt.subplots(figsize=(12, 8))
    
    # Create bars
    bars = ax.barh(df['column'], df['missing_percentage'], color=sns.color_palette("viridis", len(df)))
    
    # Add value labels
    for i, bar in enumerate(bars):
        count = df.iloc[i]['missing_count']
        pct = df.iloc[i]['missing_percentage']
        ax.text(pct + 1, i, f"{int(count):,} ({pct:.1f}%)", 
                va='center', fontsize=10, fontweight='bold')
    
    # Customize plot
    ax.set_xlabel('Percentage of Missing Values', fontsize=12)
    ax.set_ylabel('Column', fontsize=12)
    ax.set_title('Missing Data in Uber Driver Dataset', fontsize=16, fontweight='bold')
    ax.set_xlim(0, 100)
    ax.grid(axis='x', linestyle='--', alpha=0.7)
    
    # Add a vertical line at 50%
    ax.axvline(x=50, color='red', linestyle='--', alpha=0.7)
    
    # Add annotations for missing data patterns
    high_missing = df[df['missing_percentage'] > 75]['column'].tolist()
    if high_missing:
        high_missing_text = ", ".join(high_missing)
        plt.figtext(0.5, 0.01, f"Columns with >75% missing data: {high_missing_text}\nMost of these represent conversion steps most drivers don't complete",
                   ha="center", fontsize=10, bbox={"facecolor":"orange", "alpha":0.2, "pad":5})
    
    plt.tight_layout(rect=[0, 0.05, 1, 0.98])
    plt.savefig('uber_driver_missing_data.png', dpi=300, bbox_inches='tight')
    plt.close()

def create_conversion_rate_visualization():
    """Create visualizations for conversion rates by different factors."""
    # Create figure with subplots
    fig = plt.figure(figsize=(16, 12))
    gs = gridspec.GridSpec(2, 2, figure=fig)
    
    # 1. BGC Completion
    ax1 = fig.add_subplot(gs[0, 0])
    bgc_labels = list(conversion_data['bgc_completed'].keys())
    bgc_values = list(conversion_data['bgc_completed'].values())
    
    bars1 = ax1.bar(bgc_labels, bgc_values, color=['#1976D2', '#64B5F6'])
    
    # Add value labels
    for bar in bars1:
        height = bar.get_height()
        ax1.text(bar.get_x() + bar.get_width()/2., height + 1,
                f"{height:.2f}%", ha='center', va='bottom', fontweight='bold')
    
    ax1.set_ylim(0, 100)
    ax1.set_title('Conversion Rate by Background Check Status', fontsize=14, fontweight='bold')
    ax1.set_ylabel('Conversion Rate (%)', fontsize=12)
    ax1.grid(axis='y', linestyle='--', alpha=0.7)
    
    # Add annotation
    ax1.text(0.5, 50, "No drivers without\nBGC convert", 
             ha='center', va='center', fontsize=12, 
             bbox=dict(facecolor='red', alpha=0.1, boxstyle='round,pad=0.5'))
    
    # 2. Vehicle Addition
    ax2 = fig.add_subplot(gs[0, 1])
    vehicle_labels = list(conversion_data['vehicle_added'].keys())
    vehicle_values = list(conversion_data['vehicle_added'].values())
    
    bars2 = ax2.bar(vehicle_labels, vehicle_values, color=['#388E3C', '#81C784'])
    
    # Add value labels
    for bar in bars2:
        height = bar.get_height()
        ax2.text(bar.get_x() + bar.get_width()/2., height + 1,
                f"{height:.2f}%", ha='center', va='bottom', fontweight='bold')
    
    ax2.set_ylim(0, 50)
    ax2.set_title('Conversion Rate by Vehicle Addition Status', fontsize=14, fontweight='bold')
    ax2.set_ylabel('Conversion Rate (%)', fontsize=12)
    ax2.grid(axis='y', linestyle='--', alpha=0.7)
    
    # Add annotation
    ax2.text(1, 35, "70x higher conversion\nwith vehicle added", 
             ha='center', va='center', fontsize=12, 
             bbox=dict(facecolor='green', alpha=0.1, boxstyle='round,pad=0.5'))
    
    # 3. Signup Channel
    ax3 = fig.add_subplot(gs[1, 0])
    channel_labels = list(conversion_data['signup_channel'].keys())
    channel_values = list(conversion_data['signup_channel'].values())
    
    # Sort by conversion rate
    sorted_indices = np.argsort(channel_values)[::-1]
    channel_labels = [channel_labels[i] for i in sorted_indices]
    channel_values = [channel_values[i] for i in sorted_indices]
    
    colors = ['#7B1FA2', '#9C27B0', '#BA68C8']
    bars3 = ax3.bar(channel_labels, channel_values, color=colors)
    
    # Add value labels
    for bar in bars3:
        height = bar.get_height()
        ax3.text(bar.get_x() + bar.get_width()/2., height + 0.5,
                f"{height:.2f}%", ha='center', va='bottom', fontweight='bold')
    
    ax3.set_ylim(0, 25)
    ax3.set_title('Conversion Rate by Signup Channel', fontsize=14, fontweight='bold')
    ax3.set_ylabel('Conversion Rate (%)', fontsize=12)
    ax3.grid(axis='y', linestyle='--', alpha=0.7)
    
    # Add annotation
    ax3.text(0, 20, "Referrals convert at 3x\nthe rate of paid channels", 
             ha='center', va='center', fontsize=12, 
             bbox=dict(facecolor='purple', alpha=0.1, boxstyle='round,pad=0.5'))
    
    # 4. Funnel Completion
    ax4 = fig.add_subplot(gs[1, 1])
    funnel_labels = list(conversion_data['funnel_completion'].keys())
    funnel_values = list(conversion_data['funnel_completion'].values())
    
    # Sort by conversion rate
    sorted_indices = np.argsort(funnel_values)[::-1]
    funnel_labels = [funnel_labels[i] for i in sorted_indices]
    funnel_values = [funnel_values[i] for i in sorted_indices]
    
    colors = ['#E65100', '#FB8C00', '#FFB74D', '#FFE0B2']
    bars4 = ax4.bar(funnel_labels, funnel_values, color=colors)
    
    # Add value labels
    for bar in bars4:
        height = bar.get_height()
        ax4.text(bar.get_x() + bar.get_width()/2., height + 1,
                f"{height:.2f}%", ha='center', va='bottom', fontweight='bold')
    
    ax4.set_ylim(0, 50)
    ax4.set_title('Conversion Rate by Funnel Completion', fontsize=14, fontweight='bold')
    ax4.set_ylabel('Conversion Rate (%)', fontsize=12)
    ax4.set_xticklabels(funnel_labels, rotation=45, ha='right')
    ax4.grid(axis='y', linestyle='--', alpha=0.7)
    
    # Add annotation
    ax4.text(0, 40, "Both steps are critical\nfor conversion", 
             ha='center', va='center', fontsize=12, 
             bbox=dict(facecolor='orange', alpha=0.1, boxstyle='round,pad=0.5'))
    
    plt.tight_layout()
    plt.suptitle('Uber Driver Conversion Rates by Different Factors', fontsize=18, fontweight='bold', y=1.02)
    plt.savefig('uber_driver_conversion_rates.png', dpi=300, bbox_inches='tight')
    plt.close()

def create_feature_importance_visualization():
    """Create a visualization of feature importance."""
    # Convert to DataFrame
    df = pd.DataFrame(feature_importance)
    df = df.sort_values('Importance', ascending=True)
    
    # Create figure
    fig, ax = plt.subplots(figsize=(12, 8))
    
    # Create bars with a color gradient
    colors = plt.cm.viridis(np.linspace(0.1, 0.9, len(df)))
    bars = ax.barh(df['Feature'], df['Importance'], color=colors)
    
    # Add value labels
    for i, bar in enumerate(bars):
        width = bar.get_width()
        ax.text(width + 0.1, i, f"{width:.2f}", va='center', fontsize=10, fontweight='bold')
    
    # Customize plot
    ax.set_xlabel('Importance Score', fontsize=12)
    ax.set_ylabel('Feature', fontsize=12)
    ax.set_title('Feature Importance in Driver Conversion Prediction', fontsize=16, fontweight='bold')
    ax.grid(axis='x', linestyle='--', alpha=0.7)
    
    # Add annotations for feature groups
    plt.figtext(0.5, 0.01, 
               "Key Insights:\n"
               "• Completing onboarding steps (BGC, vehicle) are by far the strongest predictors\n"
               "• Referral channel is the most important acquisition source\n"
               "• City has minimal impact on conversion likelihood",
               ha="center", fontsize=12, bbox={"facecolor":"lightblue", "alpha":0.2, "pad":5})
    
    plt.tight_layout(rect=[0, 0.07, 1, 0.98])
    plt.savefig('uber_driver_feature_importance.png', dpi=300, bbox_inches='tight')
    plt.close()

def create_model_performance_visualization():
    """Create visualizations for model performance."""
    # Create figure with subplots
    fig = plt.figure(figsize=(16, 8))
    gs = gridspec.GridSpec(1, 2, figure=fig, width_ratios=[1, 2])
    
    # 1. Performance Metrics
    ax1 = fig.add_subplot(gs[0, 0])
    
    metrics = list(model_performance.keys())
    values = list(model_performance.values())
    
    # Create bars with a color gradient
    colors = plt.cm.viridis(np.linspace(0.2, 0.8, len(metrics)))
    bars = ax1.bar(metrics, values, color=colors)
    
    # Add value labels
    for bar in bars:
        height = bar.get_height()
        ax1.text(bar.get_x() + bar.get_width()/2., height + 0.01,
                f"{height:.2f}", ha='center', va='bottom', fontweight='bold')
    
    ax1.set_ylim(0, 1)
    ax1.set_title('Model Performance Metrics', fontsize=14, fontweight='bold')
    ax1.set_ylabel('Score', fontsize=12)
    ax1.grid(axis='y', linestyle='--', alpha=0.7)
    
    # 2. Confusion Matrix
    ax2 = fig.add_subplot(gs[0, 1])
    
    # Create a normalized confusion matrix
    cm_norm = confusion_matrix / confusion_matrix.sum()
    
    # Create a custom colormap (blue gradient)
    colors = ['#f0f9ff', '#c6e2ff', '#93c4ff', '#5da7f5', '#1976D2']
    cmap = LinearSegmentedColormap.from_list('blue_gradient', colors)
    
    # Create the heatmap
    sns.heatmap(confusion_matrix, annot=True, fmt=',d', cmap=cmap, cbar=False, ax=ax2)
    
    # Add percentage annotations
    for i in range(2):
        for j in range(2):
            ax2.text(j + 0.5, i + 0.7, f"({cm_norm[i, j]*100:.1f}%)", 
                    ha='center', va='center', fontsize=10)
    
    # Set labels
    ax2.set_xlabel('Predicted Label', fontsize=12)
    ax2.set_ylabel('Actual Label', fontsize=12)
    ax2.set_title('Confusion Matrix', fontsize=14, fontweight='bold')
    
    # Set tick labels
    ax2.set_xticklabels(['Did Not Convert', 'Converted'], fontsize=10)
    ax2.set_yticklabels(['Did Not Convert', 'Converted'], fontsize=10)
    
    # Add explanatory labels for quadrants
    ax2.text(0.5, 0.3, "True Negatives", ha='center', va='center', fontsize=9, 
             color='black', weight='bold')
    ax2.text(1.5, 0.3, "False Positives", ha='center', va='center', fontsize=9, 
             color='black', weight='bold')
    ax2.text(0.5, 1.3, "False Negatives", ha='center', va='center', fontsize=9, 
             color='black', weight='bold')
    ax2.text(1.5, 1.3, "True Positives", ha='center', va='center', fontsize=9, 
             color='black', weight='bold')
    
    plt.tight_layout()
    plt.suptitle('Uber Driver Conversion Model Performance', fontsize=18, fontweight='bold', y=1.02)
    plt.savefig('uber_driver_model_performance.png', dpi=300, bbox_inches='tight')
    plt.close()

def create_funnel_visualization():
    """Create a conversion funnel visualization."""
    total_drivers = 54681
    drivers_with_bgc = 32896
    drivers_with_vehicle = 13134
    drivers_with_both = 12879
    converted_drivers = 6137
    
    # Calculate percentages
    bgc_pct = drivers_with_bgc / total_drivers * 100
    vehicle_pct = drivers_with_vehicle / total_drivers * 100
    both_pct = drivers_with_both / total_drivers * 100
    converted_pct = converted_drivers / total_drivers * 100
    
    # Calculate conversion rate for drivers with both BGC and vehicle
    both_to_converted_pct = converted_drivers / drivers_with_both * 100
    
    # Create figure
    fig, ax = plt.subplots(figsize=(12, 8))
    
    # Define the stages
    stages = ['Total Signups', 'BGC Completed', 'Vehicle Added', 'Both Steps Completed', 'First Trip Completed']
    values = [total_drivers, drivers_with_bgc, drivers_with_vehicle, drivers_with_both, converted_drivers]
    percentages = [100, bgc_pct, vehicle_pct, both_pct, converted_pct]
    
    # Create a gradient of colors
    colors = plt.cm.viridis(np.linspace(0.8, 0.3, len(stages)))
    
    # Create bars
    y_pos = np.arange(len(stages))
    bars = ax.barh(y_pos, percentages, color=colors)
    
    # Add stage labels and values
    for i, (bar, value, pct) in enumerate(zip(bars, values, percentages)):
        # Add stage label
        ax.text(-5, i, stages[i], ha='right', va='center', fontsize=12, fontweight='bold')
        
        # Add count and percentage inside the bar
        ax.text(5, i, f"{value:,} ({pct:.1f}%)", va='center', fontsize=10, 
                color='white' if i > 0 else 'black')
    
    # Customize the plot
    ax.set_xlim(0, 105)
    ax.set_yticks([])
    ax.set_xlabel('Percentage of Initial Signups', fontsize=12)
    ax.set_title('Driver Conversion Funnel', fontsize=16, fontweight='bold')
    ax.grid(axis='x', linestyle='--', alpha=0.7)
    
    # Add conversion rate annotation
    plt.figtext(0.5, 0.05, 
               f"Key Insight: Of drivers who complete both BGC and vehicle addition,\n{both_to_converted_pct:.1f}% go on to complete their first trip",
               ha="center", fontsize=12, bbox={"facecolor":"lightgreen", "alpha":0.3, "pad":5})
    
    plt.tight_layout(rect=[0.05, 0.07, 1, 0.98])
    plt.savefig('uber_driver_conversion_funnel.png', dpi=300, bbox_inches='tight')
    plt.close()

def create_combined_dashboard():
    """Create a combined dashboard with all visualizations."""
    # Create figure with a complex grid layout
    fig = plt.figure(figsize=(20, 16))
    gs = gridspec.GridSpec(3, 3, figure=fig, height_ratios=[1, 1, 1.2])
    
    # 1. Missing Data Visualization (top left)
    ax1 = fig.add_subplot(gs[0, 0])
    create_missing_data_mini(ax1)
    
    # 2. Conversion Funnel (top center and right)
    ax2 = fig.add_subplot(gs[0, 1:])
    create_funnel_mini(ax2)
    
    # 3. Conversion by BGC and Vehicle (middle left)
    ax3 = fig.add_subplot(gs[1, 0])
    create_bgc_vehicle_mini(ax3)
    
    # 4. Conversion by Channel (middle center)
    ax4 = fig.add_subplot(gs[1, 1])
    create_channel_mini(ax4)
    
    # 5. Funnel Completion (middle right)
    ax5 = fig.add_subplot(gs[1, 2])
    create_funnel_completion_mini(ax5)
    
    # 6. Feature Importance (bottom left and center)
    ax6 = fig.add_subplot(gs[2, :2])
    create_feature_importance_mini(ax6)
    
    # 7. Model Performance (bottom right)
    ax7 = fig.add_subplot(gs[2, 2])
    create_model_performance_mini(ax7)
    
    plt.tight_layout()
    plt.suptitle('Uber Driver Conversion Analysis Dashboard', fontsize=20, fontweight='bold', y=0.98)
    
    # Add explanatory text at the bottom
    plt.figtext(0.5, 0.01, 
               "Key Insights:\n"
               "• Only 11.22% of driver signups complete their first trip\n"
               "• Background check (BGC) and vehicle addition are essential steps - no drivers convert without BGC\n"
               "• Referral channel produces the highest quality signups (19.89% conversion vs. 6-9% for other channels)\n"
               "• Drivers who complete both BGC and vehicle steps convert at 45.59%, showing the importance of funnel completion\n"
               "• The predictive model achieves 89.33% accuracy, helping identify which drivers are likely to convert",
               ha="center", fontsize=12, bbox={"facecolor":"#f0f0f0", "alpha":0.9, "pad":5})
    
    plt.savefig('uber_driver_analysis_dashboard.png', dpi=300, bbox_inches='tight')
    plt.close()

# Mini visualization functions for the dashboard
def create_missing_data_mini(ax):
    """Create a mini visualization of missing data."""
    df = pd.DataFrame(missing_data)
    df['missing_percentage'] = df['missing_count'] / df['total'] * 100
    df = df.sort_values('missing_percentage', ascending=False)
    
    bars = ax.barh(df['column'], df['missing_percentage'], color=sns.color_palette("viridis", len(df)))
    
    ax.set_xlabel('% Missing', fontsize=10)
    ax.set_title('Missing Data', fontsize=12, fontweight='bold')
    ax.set_xlim(0, 100)
    
    # Remove ytick labels
    ax.set_yticklabels(df['column'], fontsize=8)

def create_funnel_mini(ax):
    """Create a mini conversion funnel visualization."""
    total_drivers = 54681
    drivers_with_bgc = 32896
    drivers_with_vehicle = 13134
    drivers_with_both = 12879
    converted_drivers = 6137
    
    # Calculate percentages
    bgc_pct = drivers_with_bgc / total_drivers * 100
    vehicle_pct = drivers_with_vehicle / total_drivers * 100
    both_pct = drivers_with_both / total_drivers * 100
    converted_pct = converted_drivers / total_drivers * 100
    
    # Define the stages
    stages = ['Total Signups', 'BGC Completed', 'Vehicle Added', 'Both Steps Completed', 'First Trip Completed']
    values = [total_drivers, drivers_with_bgc, drivers_with_vehicle, drivers_with_both, converted_drivers]
    percentages = [100, bgc_pct, vehicle_pct, both_pct, converted_pct]
    
    # Create a gradient of colors
    colors = plt.cm.viridis(np.linspace(0.8, 0.3, len(stages)))
    
    # Create bars
    y_pos = np.arange(len(stages))
    bars = ax.barh(y_pos, percentages, color=colors)
    
    # Add count and percentage inside the bar
    for i, (bar, value, pct) in enumerate(zip(bars, values, percentages)):
        ax.text(5, i, f"{value:,} ({pct:.1f}%)", va='center', fontsize=7, 
                color='white' if i > 0 else 'black')
    
    ax.set_yticks(y_pos)
    ax.set_yticklabels(stages, fontsize=9)
    ax.set_xlim(0, 105)
    ax.set_title('Driver Conversion Funnel', fontsize=12, fontweight='bold')
    ax.grid(axis='x', linestyle='--', alpha=0.7)

def create_bgc_vehicle_mini(ax):
    """Create mini visualizations for BGC and vehicle conversion rates."""
    # BGC Completion
    bgc_labels = list(conversion_data['bgc_completed'].keys())
    bgc_values = list(conversion_data['bgc_completed'].values())
    vehicle_labels = list(conversion_data['vehicle_added'].keys())
    vehicle_values = list(conversion_data['vehicle_added'].values())
    
    # Combine data
    labels = bgc_labels + vehicle_labels
    values = bgc_values + vehicle_values
    colors = ['#1976D2', '#64B5F6', '#388E3C', '#81C784']
    
    # Add group column
    group = ['BGC'] * len(bgc_labels) + ['Vehicle'] * len(vehicle_labels)
    
    # Create grouped bar chart
    x = np.arange(len(set(group)))
    width = 0.35
    
    # BGC bars
    ax.bar(x[0] - width/2, bgc_values[0], width, label=bgc_labels[0], color=colors[0])
    ax.bar(x[0] + width/2, bgc_values[1], width, label=bgc_labels[1], color=colors[1])
    
    # Vehicle bars
    ax.bar(x[1] - width/2, vehicle_values[0], width, label=vehicle_labels[0], color=colors[2])
    ax.bar(x[1] + width/2, vehicle_values[1], width, label=vehicle_labels[1], color=colors[3])
    
    ax.set_xticks(x)
    ax.set_xticklabels(['Background Check', 'Vehicle Addition'])
    ax.set_ylim(0, 50)
    ax.set_ylabel('Conversion Rate (%)')
    ax.set_title('Conversion by Onboarding Steps', fontsize=12, fontweight='bold')
    
    # Add value labels
    for i, v in enumerate([bgc_values[0], bgc_values[1], vehicle_values[0], vehicle_values[1]]):
        x_pos = (i % 2) * 1 + (0.5 if i % 2 else -0.5) * width
        ax.text(x_pos, v + 1, f"{v:.1f}%", ha='center', va='bottom', fontsize=8)
    
    ax.grid(axis='y', linestyle='--', alpha=0.7)

def create_channel_mini(ax):
    """Create a mini visualization for conversion by signup channel."""
    channel_labels = list(conversion_data['signup_channel'].keys())
    channel_values = list(conversion_data['signup_channel'].values())
    
    # Sort by conversion rate
    sorted_indices = np.argsort(channel_values)[::-1]
    channel_labels = [channel_labels[i] for i in sorted_indices]
    channel_values = [channel_values[i] for i in sorted_indices]
    
    colors = ['#7B1FA2', '#9C27B0', '#BA68C8']
    bars = ax.bar(channel_labels, channel_values, color=colors)
    
    # Add value labels
    for bar in bars:
        height = bar.get_height()
        ax.text(bar.get_x() + bar.get_width()/2., height + 0.5,
                f"{height:.1f}%", ha='center', va='bottom', fontsize=9)
    
    ax.set_ylim(0, 25)
    ax.set_title('Conversion by Signup Channel', fontsize=12, fontweight='bold')
    ax.set_ylabel('Conversion Rate (%)')
    ax.grid(axis='y', linestyle='--', alpha=0.7)

def create_funnel_completion_mini(ax):
    """Create a mini visualization for conversion by funnel completion."""
    funnel_labels = list(conversion_data['funnel_completion'].keys())
    funnel_values = list(conversion_data['funnel_completion'].values())
    
    # Sort by conversion rate
    sorted_indices = np.argsort(funnel_values)[::-1]
    funnel_labels = [funnel_labels[i] for i in sorted_indices]
    funnel_values = [funnel_values[i] for i in sorted_indices]
    
    colors = ['#E65100', '#FB8C00', '#FFB74D', '#FFE0B2']
    bars = ax.bar(funnel_labels, funnel_values, color=colors)
    
    # Add value labels
    for bar in bars:
        height = bar.get_height()
        ax.text(bar.get_x() + bar.get_width()/2., height + 1,
                f"{height:.1f}%", ha='center', va='bottom', fontsize=8)
    
    ax.set_ylim(0, 50)
    ax.set_title('Conversion by Funnel Completion', fontsize=12, fontweight='bold')
    ax.set_ylabel('Conversion Rate (%)')
    ax.set_xticklabels(funnel_labels, rotation=45, ha='right', fontsize=8)
    ax.grid(axis='y', linestyle='--', alpha=0.7)

def create_feature_importance_mini(ax):
    """Create a mini visualization of feature importance."""
    df = pd.DataFrame(feature_importance)
    df = df.sort_values('Importance', ascending=True)
    
    colors = plt.cm.viridis(np.linspace(0.1, 0.9, len(df)))
    bars = ax.barh(df['Feature'], df['Importance'], color=colors)
    
    # Add value labels
    for i, bar in enumerate(bars):
        width = bar.get_width()
        ax.text(width + 0.1, i, f"{width:.2f}", va='center', fontsize=8)
    
    ax.set_xlabel('Importance Score')
    ax.set_title('Feature Importance in Prediction Model', fontsize=12, fontweight='bold')
    ax.grid(axis='x', linestyle='--', alpha=0.7)

def create_model_performance_mini(ax):
    """Create a mini visualization of model performance metrics."""
    metrics = list(model_performance.keys())
    values = list(model_performance.values())
    
    # Create bars with a color gradient
    colors = plt.cm.viridis(np.linspace(0.2, 0.8, len(metrics)))
    bars = ax.bar(metrics, values, color=colors)
    
    # Add value labels
    for bar in bars:
        height = bar.get_height()
        ax.text(bar.get_x() + bar.get_width()/2., height + 0.01,
                f"{height:.2f}", ha='center', va='bottom', fontsize=8)
    
    ax.set_ylim(0, 1)
    ax.set_title('Model Performance Metrics', fontsize=12, fontweight='bold')
    ax.set_xticklabels(metrics, rotation=45, ha='right', fontsize=9)
    ax.grid(axis='y', linestyle='--', alpha=0.7)

# Create all visualizations
if __name__ == "__main__":
    create_missing_data_visualization()
    create_conversion_rate_visualization()
    create_feature_importance_visualization()
    create_model_performance_visualization()
    create_funnel_visualization()
    create_combined_dashboard()
    
    print("All visualizations saved successfully!")

All visualizations saved successfully!


In [24]:
import matplotlib.pyplot as plt
import numpy as np

# Set style for better visualizations
plt.style.use('seaborn-v0_8-whitegrid')
plt.rcParams['font.family'] = 'sans-serif'
plt.rcParams['font.sans-serif'] = ['Arial', 'Helvetica', 'DejaVu Sans']
plt.rcParams['font.size'] = 12
plt.rcParams['figure.figsize'] = (14, 8)
plt.rcParams['figure.dpi'] = 300

# Define the funnel data
stages = [
    'Total Signups',
    'BGC Completed',
    'Vehicle Added',
    'Both Steps Completed',
    'First Trip Completed'
]

values = [54681, 32896, 13134, 12879, 6137]
percentages = [100.0, 60.2, 24.0, 23.6, 11.2]

# Create specific colors for each stage
bar_colors = ['#455a64', '#4caf50', '#26a69a', '#5c6bc0', '#9c27b0']

# Create the figure
fig, ax = plt.subplots(figsize=(14, 8))

# Create horizontal bars - reversed to show in descending order
y_pos = np.arange(len(stages))
bars = ax.barh(y_pos[::-1], percentages, height=0.6, color=bar_colors, alpha=0.9, edgecolor='none')

# Remove the frame
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.spines['left'].set_visible(False)
ax.spines['bottom'].set_visible(False)

# Add a subtle grid
ax.grid(True, axis='x', alpha=0.2, linestyle='--')

# Set limits for better spacing
ax.set_xlim(0, 105)
ax.set_ylim(-0.5, len(stages) - 0.5)

# Add value labels with both absolute numbers and percentages
for i, (bar, value, percentage) in enumerate(zip(bars, values, percentages)):
    # Format with commas for thousands
    label_text = f"{value:,} ({percentage:.1f}%)"
    
    # Special handling for "First Trip Completed" to ensure full text is visible
    if i == 4:  # First Trip Completed
        x_pos = 5.5  # Position text inside bar but close to the left edge
        # First clear any previous text at this position to avoid overlapping
        for txt in ax.texts:
            if txt.get_position()[1] == len(stages) - 1 - i:
                txt.remove()
        # Add new text
        ax.text(x_pos, len(stages) - 1 - i, label_text, va='center', ha='left', 
                color='white', fontweight='bold', fontsize=12, zorder=10)
    else:
        # For other bars
        x_pos = 5  # Standard inside position
        ax.text(x_pos, len(stages) - 1 - i, label_text, va='center', ha='left', 
                color='white', fontweight='bold', fontsize=12)

# Add stage labels on the left side
for i, stage in enumerate(stages):
    ax.text(-2.5, len(stages) - 1 - i, stage, va='center', ha='right', fontweight='bold', fontsize=12)

# Create title and subtitle
plt.suptitle('Driver Conversion Funnel', fontsize=20, fontweight='bold', y=0.98)
ax.set_title('Percentage of drivers completing each stage of the onboarding process', 
             fontsize=13, pad=20, loc='left', color='#555555')

# Set axis labels
ax.set_xlabel(' ', fontsize=12, labelpad=10)
ax.set_yticks([])  # Hide y-axis ticks since we have custom labels

# Add a key insight text at the bottom
both_to_trip_percentage = (6137 / 12879) * 100
key_insight_text = f"Key Insight: Of drivers who complete both BGC and vehicle addition, {both_to_trip_percentage:.1f}% go on to complete their first trip"
plt.figtext(0.5, 0.05, key_insight_text, ha='center', va='center', 
           color='#2c3e50', fontsize=13, fontweight='bold',
           bbox=dict(facecolor='#f8f9fa', edgecolor='#3498db', boxstyle='round,pad=0.6', alpha=0.9))

# Add dropoff percentages between stages
for i in range(len(stages)-1):
    dropoff = percentages[i] - percentages[i+1]
    dropoff_text = f"↓ {dropoff:.1f}%"
    
    # Position between stages
    x_pos = percentages[i+1] + (percentages[i] - percentages[i+1])/2
    y_pos = len(stages) - 1 - i - 0.5
    
    # Only add if dropoff is significant
    if dropoff > 3:
        ax.text(x_pos, y_pos, dropoff_text, va='center', ha='center', 
               color='#e74c3c', fontsize=10, fontweight='bold',
               bbox=dict(boxstyle="round,pad=0.3", facecolor='white', alpha=0.7, edgecolor='#e74c3c', linewidth=1))

# Add a subtle background color to the plot
fig.patch.set_facecolor('#000000')
ax.set_facecolor('#f9f9f9')

# Disable box around the plot completely (to remove any blue lines)
for spine in ax.spines.values():
    spine.set_visible(False)

# Ensure xticks are visible
ax.tick_params(axis='x', colors='black')

# Disable the blue line issue by explicitly setting no line style for the plot
for line in ax.get_lines():
    line.set_visible(False)

# Save the visualization with high quality
plt.savefig('fixed_funnel_text_complete.png', dpi=300, bbox_inches='tight', facecolor='#f9f9f9')
plt.close()

print("Fixed funnel visualization saved as 'fixed_funnel_text_complete.png'")

Fixed funnel visualization saved as 'fixed_funnel_text_complete.png'


In [23]:
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.colors import LinearSegmentedColormap

# Set style for better visualizations
plt.style.use('seaborn-v0_8-whitegrid')
plt.rcParams['font.family'] = 'sans-serif'
plt.rcParams['font.sans-serif'] = ['Arial', 'Helvetica', 'DejaVu Sans']
plt.rcParams['font.size'] = 12
plt.rcParams['figure.figsize'] = (14, 8)
plt.rcParams['figure.dpi'] = 300

# Define the funnel data
stages = [
    'Total Signups',
    'BGC Completed',
    'Vehicle Added',
    'Both Steps Completed',
    'First Trip Completed'
]

values = [54681, 32896, 13134, 12879, 6137]
percentages = [100.0, 60.2, 24.0, 23.6, 11.2]

# Calculate percentage of drivers who complete first trip after completing both steps
both_to_trip_percentage = (6137 / 12879) * 100

# Create specific colors for each stage
bar_colors = ['#455a64', '#4caf50', '#26a69a', '#5c6bc0', '#9c27b0']

# Create the figure
fig, ax = plt.subplots(figsize=(14, 8))

# Create horizontal bars - reversed to show in descending order
y_pos = np.arange(len(stages))
bars = ax.barh(y_pos[::-1], percentages, height=0.6, color=bar_colors, alpha=0.9, edgecolor='white', linewidth=1)

# Add a subtle grid
ax.grid(True, axis='x', alpha=0.2, linestyle='--')

# Remove the frame and ticks from the axes
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.spines['left'].set_visible(False)
ax.spines['bottom'].set_visible(False)  # Also hide bottom spine to remove potential issues
ax.tick_params(left=False)

# Set limits for better spacing
ax.set_xlim(0, 105)  # A bit extra space for labels
ax.set_ylim(-0.5, len(stages) - 0.5)

# Add value labels with both absolute numbers and percentages - ALL INSIDE THE BARS
for i, (bar, value, percentage) in enumerate(zip(bars, values, percentages)):
    # Format with commas for thousands
    label_text = f"{value:,} ({percentage:.1f}%)"
    
    # Position the text inside the bar for all bars
    # For smaller bars, adjust position to make sure text is fully visible
    if percentage < 15 and i == 4:  # First Trip Completed
        # Center the text in the bar
        x_pos = percentage / 2
    else:
        x_pos = 5  # Standard inside position
    
    ax.text(x_pos, len(stages) - 1 - i, label_text, va='center', ha='left', 
            color='white', fontweight='bold', fontsize=12)

# Add stage labels on the left side
for i, stage in enumerate(stages):
    ax.text(-2.5, len(stages) - 1 - i, stage, va='center', ha='right', fontweight='bold', fontsize=12)

# Create title and subtitle
plt.suptitle('Driver Conversion Funnel', fontsize=20, fontweight='bold', y=0.98)
ax.set_title('Percentage of drivers completing each stage of the onboarding process', 
             fontsize=13, pad=20, loc='left', color='#555555')

# Set axis labels
ax.set_xlabel('Percentage of Initial Signups', fontsize=12, labelpad=10)
ax.set_yticks([])  # Hide y-axis ticks since we have custom labels

# Add a key insight text at the bottom
key_insight_text = f"Key Insight: Of drivers who complete both BGC and vehicle addition, {both_to_trip_percentage:.1f}% go on to complete their first trip"
plt.figtext(0.5, 0.05, key_insight_text, ha='center', va='center', 
           color='#2c3e50', fontsize=13, fontweight='bold',
           bbox=dict(facecolor='#f8f9fa', edgecolor='#3498db', boxstyle='round,pad=0.6', alpha=0.9))

# Add dropoff percentages between stages
for i in range(len(stages)-1):
    dropoff = percentages[i] - percentages[i+1]
    dropoff_text = f"↓ {dropoff:.1f}%"
    
    # Position between stages
    x_pos = percentages[i+1] + (percentages[i] - percentages[i+1])/2
    y_pos = len(stages) - 1 - i - 0.5
    
    # Only add if dropoff is significant
    if dropoff > 3:
        ax.text(x_pos, y_pos, dropoff_text, va='center', ha='center', 
               color='#e74c3c', fontsize=10, fontweight='bold',
               bbox=dict(boxstyle="round,pad=0.3", facecolor='white', alpha=0.7, edgecolor='#e74c3c', linewidth=1))

# IMPORTANT: Do not add any connecting lines - they cause the blue line problem

# Add a subtle background color to the plot
fig.patch.set_facecolor('#f9f9f9')
ax.set_facecolor('#f9f9f9')

# Save the visualization with high quality
plt.savefig('final_fixed_driver_conversion_funnel.png', dpi=300, bbox_inches='tight', facecolor='#f9f9f9')
plt.close()

print("Final fixed funnel visualization saved as 'final_fixed_driver_conversion_funnel.png'")

Final fixed funnel visualization saved as 'final_fixed_driver_conversion_funnel.png'
