In [None]:
# To run this notebook, first download the dataset from "https://www.kaggle.com/datasets/brllrb/uber-and-lyft-dataset-boston-ma" and place 'rideshare_kaggle.csv' in the working directory.

In [66]:
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder, StandardScaler
import matplotlib.pyplot as plt
import folium

# Pipeline: load the rideshare CSV, standardize the numeric columns, cluster
# rides with K-means, fit a random-forest price model, and save a folium map
# with one marker per cluster showing a predicted price.

# Reference pickup time used when predicting a price for each cluster.
# NOTE(review): taken from the original literals 9, 16, 12, read in time_cols
# order (hour, day, month) — confirm this is the intended moment.
REFERENCE_TIME = {'hour': 9, 'day': 16, 'month': 12}

try:
    # Step 1: Load the dataset
    df = pd.read_csv('rideshare_kaggle.csv')

    # Step 2: Preprocess the data
    # Replace missing prices with the mean. Assign the result back instead of
    # calling fillna(inplace=True) on a column selection, which is a chained
    # assignment and does not reliably update `df`.
    df['price'] = df['price'].fillna(df['price'].mean())

    # Encode categorical variables as integer codes
    cat_cols = ['source', 'cab_type', 'product_id']
    for col in cat_cols:
        df[col] = df[col].astype('category').cat.codes

    weather_cols = ['temperature', 'apparentTemperature', 'precipIntensity', 'humidity', 'windSpeed', 'cloudCover']
    time_cols = ['hour', 'day', 'month']
    num_cols = ['price', 'distance', 'latitude', 'longitude'] + weather_cols + time_cols
    feature_cols = ['distance', 'latitude', 'longitude'] + weather_cols + time_cols

    # Snapshot the real coordinates before scaling overwrites them; the map
    # must be drawn with these, not with standardized values.
    raw_coords = df[['latitude', 'longitude']].copy()

    # Scale numerical features (in place on df, as before)
    scaler = StandardScaler()
    df[num_cols] = scaler.fit_transform(df[num_cols])

    # Per-column statistics of the fitted scaler, used below to move single
    # values between original and standardized units.
    col_mean = dict(zip(num_cols, scaler.mean_))
    col_scale = dict(zip(num_cols, scaler.scale_))

    # Step 3: Unsupervised algorithm for high booking area (K-means clustering)
    features = ['distance', 'latitude', 'longitude'] + time_cols

    k = 3  # Number of clusters
    # n_init is pinned because the library default changed between
    # scikit-learn releases; 10 is the historical default.
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
    df['Cluster'] = kmeans.fit_predict(df[features])

    # Step 4: Supervised algorithm for price prediction (Random Forest Regression)
    X = df[feature_cols].copy()
    X['Cluster'] = df['Cluster'].astype('category')
    y = df['price']  # standardized price; predictions are converted back below

    rf = RandomForestRegressor(n_estimators=100, random_state=42)
    rf.fit(X, y)

    # Step 5: Map function to display data
    # Centre the map on the mean of the real (unscaled) coordinates.
    map_obj = folium.Map(
        location=[float(raw_coords['latitude'].mean()), float(raw_coords['longitude'].mean())],
        zoom_start=10,
    )

    # Add one marker per cluster
    for cluster_label in range(k):
        in_cluster = df['Cluster'] == cluster_label
        if not in_cluster.any():
            # An empty cluster has no centroid to plot.
            continue

        # Model input: the cluster's mean standardized features ...
        query = df.loc[in_cluster, feature_cols].mean()
        # ... with the time columns set to the reference time, standardized
        # with the same statistics the training data was scaled with.
        for col, value in REFERENCE_TIME.items():
            query[col] = (value - col_mean[col]) / col_scale[col]
        query['Cluster'] = cluster_label

        # Build a one-row frame in exactly the training column order so each
        # value lands in the feature it belongs to.
        query_frame = query.reindex(X.columns).to_frame().T

        predicted_scaled = rf.predict(query_frame)[0]
        # Undo the standardization so the popup shows a price in the
        # dataset's original units.
        predicted_price = predicted_scaled * col_scale['price'] + col_mean['price']

        # Place the marker at the cluster's mean real-world coordinates.
        cluster_lat = float(raw_coords.loc[in_cluster, 'latitude'].mean())
        cluster_lon = float(raw_coords.loc[in_cluster, 'longitude'].mean())
        folium.Marker(
            [cluster_lat, cluster_lon],
            popup=f'Cluster {cluster_label}, price: {predicted_price:.2f}',
        ).add_to(map_obj)

    # Save the map
    map_obj.save('map.html')

except Exception as e:
    print("An error occurred:", str(e))
    # Re-raise so the notebook shows the full traceback instead of hiding
    # where the failure happened.
    raise