In [1]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
#!pip install folium
import folium

In [2]:
# Load the dataset
data = pd.read_csv('rideshare.csv')
data.head()

Unnamed: 0,id,timestamp,hour,day,month,datetime,timezone,source,destination,cab_type,...,precipIntensityMax,uvIndexTime,temperatureMin,temperatureMinTime,temperatureMax,temperatureMaxTime,apparentTemperatureMin,apparentTemperatureMinTime,apparentTemperatureMax,apparentTemperatureMaxTime
0,424553bb-7174-41ea-aeb4-fe06d4f4b9d7,1544953000.0,9,16,12,2018-12-16 09:30:07,America/New_York,Haymarket Square,North Station,Lyft,...,0.1276,1544979600,39.89,1545012000,43.68,1544968800,33.73,1545012000,38.07,1544958000
1,4bd23055-6827-41c6-b23b-3c491f24e74d,1543284000.0,2,27,11,2018-11-27 02:00:23,America/New_York,Haymarket Square,North Station,Lyft,...,0.13,1543251600,40.49,1543233600,47.3,1543251600,36.2,1543291200,43.92,1543251600
2,981a3613-77af-4620-a42a-0c0866077d1e,1543367000.0,1,28,11,2018-11-28 01:00:22,America/New_York,Haymarket Square,North Station,Lyft,...,0.1064,1543338000,35.36,1543377600,47.55,1543320000,31.04,1543377600,44.12,1543320000
3,c2d88af2-d278-4bfd-a8d0-29ca77cc5512,1543554000.0,4,30,11,2018-11-30 04:53:02,America/New_York,Haymarket Square,North Station,Lyft,...,0.0,1543507200,34.67,1543550400,45.03,1543510800,30.3,1543550400,38.53,1543510800
4,e0126e1f-8ca9-4f2e-82b3-50505a09db9a,1543463000.0,3,29,11,2018-11-29 03:49:20,America/New_York,Haymarket Square,North Station,Lyft,...,0.0001,1543420800,33.1,1543402800,42.18,1543420800,29.11,1543392000,35.75,1543420800


In [3]:

# Data preprocessing
# Select relevant columns
selected_columns = ['latitude', 'longitude', 'cab_type', 'hour', 'day', 'month', 'price']
data = data[selected_columns]



In [4]:
# Handle missing values, if any
data = data.dropna()



In [5]:
# Feature engineering
# Create dummy variables for categorical features
data = pd.get_dummies(data, columns=['cab_type'])






In [6]:
# Unsupervised learning - Predict high booking areas
# Perform clustering using K-Means
X = data[['latitude', 'longitude']]
kmeans = KMeans(n_clusters=5)  # Adjust the number of clusters as needed
data['cluster'] = kmeans.fit_predict(X)




In [7]:
# Supervised learning - Predict prices
# Split the data into training and testing sets
X = data.drop(['price'], axis=1)
y = data['price']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [8]:
# Train a linear regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Predict prices on the testing set
y_pred = model.predict(X_test)


In [9]:
# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)


Mean Squared Error: 86.54085184705495


In [None]:
# Create a map centered around a specific location
map_center = [data['latitude'].mean(), data['longitude'].mean()]
map_zoom = 12
map_osm = folium.Map(location=map_center, zoom_start=map_zoom)

# Add markers to the map for each data point
for _, row in data.iterrows():
    location = [row['latitude'], row['longitude']]
    popup = f"Price: {row['price']}"
    folium.Marker(location=location, popup=popup).add_to(map_osm)

# Display the map
map_osm.save('map.html')