In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv("uber_rides_data.xlsx - sample_train.csv")
df

Unnamed: 0,ride_id,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,24238194,7.5,2015-05-07 19:52:06 UTC,-73.999817,40.738354,-73.999512,40.723217,1
1,27835199,7.7,2009-07-17 20:04:56 UTC,-73.994355,40.728225,-73.994710,40.750325,1
2,44984355,12.9,2009-08-24 21:45:00 UTC,-74.005043,40.740770,-73.962565,40.772647,1
3,25894730,5.3,2009-06-26 08:22:21 UTC,-73.976124,40.790844,-73.965316,40.803349,3
4,17610152,16.0,2014-08-28 17:47:00 UTC,-73.925023,40.744085,-73.973082,40.761247,5
...,...,...,...,...,...,...,...,...
199995,42598914,3.0,2012-10-28 10:49:00 UTC,-73.987042,40.739367,-73.986525,40.740297,1
199996,16382965,7.5,2014-03-14 01:09:00 UTC,-73.984722,40.736837,-74.006672,40.739620,1
199997,27804658,30.9,2009-06-29 00:42:00 UTC,-73.986017,40.756487,-73.858957,40.692588,2
199998,20259894,14.5,2015-05-20 14:56:25 UTC,-73.997124,40.725452,-73.983215,40.695416,1


In [3]:
df.shape

(200000, 8)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Data columns (total 8 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   ride_id            200000 non-null  int64  
 1   fare_amount        200000 non-null  float64
 2   pickup_datetime    200000 non-null  object 
 3   pickup_longitude   200000 non-null  float64
 4   pickup_latitude    200000 non-null  float64
 5   dropoff_longitude  199999 non-null  float64
 6   dropoff_latitude   199999 non-null  float64
 7   passenger_count    200000 non-null  int64  
dtypes: float64(5), int64(2), object(1)
memory usage: 12.2+ MB


In [5]:
df['dropoff_longitude'].isnull().sum()

1

In [6]:
df.dropna(inplace=True)

In [7]:
df.isnull().sum()

ride_id              0
fare_amount          0
pickup_datetime      0
pickup_longitude     0
pickup_latitude      0
dropoff_longitude    0
dropoff_latitude     0
passenger_count      0
dtype: int64

In [8]:
df['fare_amount'].mean()

11.359891549458371

In [9]:

def haversine(lat1, lon1, lat2, lon2):
    """
    Calculate the Haversine distance between two sets of coordinates (lat1, lon1) and (lat2, lon2).
    """
    # Radius of the Earth in kilometers
    R = 6371
    
    # Convert latitude and longitude from degrees to radians
    lat1, lon1, lat2, lon2 = np.radians([lat1, lon1, lat2, lon2])
    
    # Haversine formula
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat/2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2
    c = 2 * np.arcsin(np.sqrt(a))
    
    # Calculate the distance
    distance = R * c
    
    return distance

# Calculate the Haversine distance for each row in the DataFrame
df['haversine_distance'] = df.apply(lambda row: haversine(row['pickup_latitude'], row['pickup_longitude'], row['dropoff_latitude'], row['dropoff_longitude']), axis=1)

# Calculate the median Haversine distance
median_distance = df['haversine_distance'].median()

print("Median Haversine Distance:", median_distance)


Median Haversine Distance: 2.1209923961833708


In [10]:
# Calculate the maximum Haversine distance
max_distance = df['haversine_distance'].max()

print("Maximum Haversine Distance:", max_distance)


Maximum Haversine Distance: 16409.239135313164


In [11]:
# Count the number of rides with 0.0 Haversine distance
zero_distance_rides = df[df['haversine_distance'] == 0.0]
count_zero_distance_rides = len(zero_distance_rides)

print("Number of rides with 0.0 Haversine distance:", count_zero_distance_rides)


Number of rides with 0.0 Haversine distance: 5632


In [12]:
# Calculate the mean 'fare_amount' for rides with 0.0 Haversine distance
mean_fare_for_zero_distance_rides = zero_distance_rides['fare_amount'].mean()

print("Mean 'fare_amount' for rides with 0.0 Haversine distance:", mean_fare_for_zero_distance_rides)


Mean 'fare_amount' for rides with 0.0 Haversine distance: 11.585317826704578


In [13]:
# Calculate the maximum 'fare_amount'
max_fare_amount = df['fare_amount'].max()

print("Maximum 'fare_amount' for a ride:", max_fare_amount)


Maximum 'fare_amount' for a ride: 499.0


In [14]:
# Find the row with the maximum 'fare_amount'
costliest_ride = df[df['fare_amount'] == max_fare_amount].iloc[0]

# Calculate the Haversine distance for the costliest ride
haversine_distance_costliest_ride = haversine(
    costliest_ride['pickup_latitude'],
    costliest_ride['pickup_longitude'],
    costliest_ride['dropoff_latitude'],
    costliest_ride['dropoff_longitude']
)

print("Haversine distance for the costliest ride:", haversine_distance_costliest_ride)


Haversine distance for the costliest ride: 0.0007899213191009994


In [15]:
# Convert 'pickup_datetime' column to datetime type if not already done
df['pickup_datetime'] = pd.to_datetime(df['pickup_datetime'])

# Extract the year from the 'pickup_datetime' column and count the rides in 2014
rides_in_2014 = df[df['pickup_datetime'].dt.year == 2014]
count_rides_in_2014 = len(rides_in_2014)

print("Number of rides recorded in the year 2014:", count_rides_in_2014)


Number of rides recorded in the year 2014: 29968


In [16]:
# Convert 'pickup_datetime' column to datetime type if not already done
df['pickup_datetime'] = pd.to_datetime(df['pickup_datetime'])

# Extract the year and quarter from the 'pickup_datetime' column and count the rides in the first quarter of 2014
rides_in_first_quarter_2014 = df[(df['pickup_datetime'].dt.year == 2014) & (df['pickup_datetime'].dt.quarter == 1)]
count_rides_in_first_quarter_2014 = len(rides_in_first_quarter_2014)

print("Number of rides recorded in the first quarter of 2014:", count_rides_in_first_quarter_2014)


Number of rides recorded in the first quarter of 2014: 7687


In [17]:
# Convert 'pickup_datetime' column to datetime type if not already done
df['pickup_datetime'] = pd.to_datetime(df['pickup_datetime'])

# Filter the DataFrame to include only rides in September 2010
rides_in_september_2010 = df[(df['pickup_datetime'].dt.year == 2010) & (df['pickup_datetime'].dt.month == 9)]

# Calculate the day of the week and count the rides for each day
rides_in_september_2010['day_of_week'] = rides_in_september_2010['pickup_datetime'].dt.dayofweek
rides_count_by_day = rides_in_september_2010['day_of_week'].value_counts()

# Find the day with the maximum recorded rides
max_rides_day = rides_count_by_day.idxmax()

# Convert the day of the week index to the actual day name
days_of_week = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]
max_rides_day_name = days_of_week[max_rides_day]

print("Day of the week in September 2010 with the maximum recorded rides:", max_rides_day_name)


Day of the week in September 2010 with the maximum recorded rides: Thursday


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rides_in_september_2010['day_of_week'] = rides_in_september_2010['pickup_datetime'].dt.dayofweek


In [19]:
df

Unnamed: 0,ride_id,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,haversine_distance
0,24238194,7.5,2015-05-07 19:52:06+00:00,-73.999817,40.738354,-73.999512,40.723217,1,1.683323
1,27835199,7.7,2009-07-17 20:04:56+00:00,-73.994355,40.728225,-73.994710,40.750325,1,2.457590
2,44984355,12.9,2009-08-24 21:45:00+00:00,-74.005043,40.740770,-73.962565,40.772647,1,5.036377
3,25894730,5.3,2009-06-26 08:22:21+00:00,-73.976124,40.790844,-73.965316,40.803349,3,1.661683
4,17610152,16.0,2014-08-28 17:47:00+00:00,-73.925023,40.744085,-73.973082,40.761247,5,4.475450
...,...,...,...,...,...,...,...,...,...
199995,42598914,3.0,2012-10-28 10:49:00+00:00,-73.987042,40.739367,-73.986525,40.740297,1,0.112210
199996,16382965,7.5,2014-03-14 01:09:00+00:00,-73.984722,40.736837,-74.006672,40.739620,1,1.875050
199997,27804658,30.9,2009-06-29 00:42:00+00:00,-73.986017,40.756487,-73.858957,40.692588,2,12.850319
199998,20259894,14.5,2015-05-20 14:56:25+00:00,-73.997124,40.725452,-73.983215,40.695416,1,3.539715


In [20]:
# Convert 'pickup_datetime' column to datetime type if not already done
df['pickup_datetime'] = pd.to_datetime(df['pickup_datetime'])

# Extract the day of the week and add it as a new column 'ride_week_day'
df['ride_week_day'] = df['pickup_datetime'].dt.dayofweek

    ride_id  fare_amount           pickup_datetime  pickup_longitude  \
0  24238194          7.5 2015-05-07 19:52:06+00:00        -73.999817   
1  27835199          7.7 2009-07-17 20:04:56+00:00        -73.994355   
2  44984355         12.9 2009-08-24 21:45:00+00:00        -74.005043   
3  25894730          5.3 2009-06-26 08:22:21+00:00        -73.976124   
4  17610152         16.0 2014-08-28 17:47:00+00:00        -73.925023   

   pickup_latitude  dropoff_longitude  dropoff_latitude  passenger_count  \
0        40.738354         -73.999512         40.723217                1   
1        40.728225         -73.994710         40.750325                1   
2        40.740770         -73.962565         40.772647                1   
3        40.790844         -73.965316         40.803349                3   
4        40.744085         -73.973082         40.761247                5   

   haversine_distance  ride_week_day  
0            1.683323              3  
1            2.457590           

In [21]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import r2_score

# Assuming you have a DataFrame df with columns: 'passenger_count', 'distance', 'ride_week_day', and 'fare_amount'

# Features (X) and target (y)
X = df[['passenger_count', 'haversine_distance', 'ride_week_day']]
y = df['fare_amount']

# Split the data into training and testing sets (70% training, 30% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Linear Regression
linear_reg_model = LinearRegression()
linear_reg_model.fit(X_train, y_train)
linear_reg_predictions = linear_reg_model.predict(X_test)
linear_reg_r_squared = r2_score(y_test, linear_reg_predictions)

# Decision Tree Regressor
decision_tree_model = DecisionTreeRegressor(random_state=42)
decision_tree_model.fit(X_train, y_train)
decision_tree_predictions = decision_tree_model.predict(X_test)
decision_tree_r_squared = r2_score(y_test, decision_tree_predictions)

# Random Forest Regressor
random_forest_model = RandomForestRegressor(random_state=42)
random_forest_model.fit(X_train, y_train)
random_forest_predictions = random_forest_model.predict(X_test)
random_forest_r_squared = r2_score(y_test, random_forest_predictions)

# k-Nearest Neighbors Regressor
knn_model = KNeighborsRegressor()
knn_model.fit(X_train, y_train)
knn_predictions = knn_model.predict(X_test)
knn_r_squared = r2_score(y_test, knn_predictions)

print("R-squared for Linear Regression:", linear_reg_r_squared)
print("R-squared for Decision Tree Regressor:", decision_tree_r_squared)
print("R-squared for Random Forest Regressor:", random_forest_r_squared)
print("R-squared for k-Nearest Neighbors Regressor:", knn_r_squared)


R-squared for Linear Regression: 0.0004341139144697914
R-squared for Decision Tree Regressor: 0.47311852558447887
R-squared for Random Forest Regressor: 0.6292552491098994
R-squared for k-Nearest Neighbors Regressor: 0.6321500569245931


In [22]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import r2_score

# Assuming you have a DataFrame df with columns: 'passenger_count', 'distance', 'ride_week_day', and 'fare_amount'

# Features (X) and target (y)
X = df[['passenger_count', 'haversine_distance', 'ride_week_day']]
y = df['fare_amount']

# Split the data into training and testing sets (70% training, 30% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Initialize lists to store adjusted R-squared values
adjusted_r_squared_values = []

# Linear Regression
linear_reg_model = LinearRegression()
linear_reg_model.fit(X_train, y_train)
linear_reg_predictions = linear_reg_model.predict(X_test)
linear_reg_r_squared = r2_score(y_test, linear_reg_predictions)

# Calculate adjusted R-squared for Linear Regression
n = len(y_test)
p = X_test.shape[1]
adjusted_r_squared_linear = 1 - (1 - linear_reg_r_squared) * (n - 1) / (n - p - 1)
adjusted_r_squared_values.append(adjusted_r_squared_linear)

# Decision Tree Regressor
decision_tree_model = DecisionTreeRegressor(random_state=42)
decision_tree_model.fit(X_train, y_train)
decision_tree_predictions = decision_tree_model.predict(X_test)
decision_tree_r_squared = r2_score(y_test, decision_tree_predictions)

# Calculate adjusted R-squared for Decision Tree Regressor
n = len(y_test)
p = X_test.shape[1]
adjusted_r_squared_decision_tree = 1 - (1 - decision_tree_r_squared) * (n - 1) / (n - p - 1)
adjusted_r_squared_values.append(adjusted_r_squared_decision_tree)

# Random Forest Regressor
random_forest_model = RandomForestRegressor(random_state=42)
random_forest_model.fit(X_train, y_train)
random_forest_predictions = random_forest_model.predict(X_test)
random_forest_r_squared = r2_score(y_test, random_forest_predictions)

# Calculate adjusted R-squared for Random Forest Regressor
n = len(y_test)
p = X_test.shape[1]
adjusted_r_squared_random_forest = 1 - (1 - random_forest_r_squared) * (n - 1) / (n - p - 1)
adjusted_r_squared_values.append(adjusted_r_squared_random_forest)

# k-Nearest Neighbors Regressor
knn_model = KNeighborsRegressor()
knn_model.fit(X_train, y_train)
knn_predictions = knn_model.predict(X_test)
knn_r_squared = r2_score(y_test, knn_predictions)

# Calculate adjusted R-squared for k-Nearest Neighbors Regressor
n = len(y_test)
p = X_test.shape[1]
adjusted_r_squared_knn = 1 - (1 - knn_r_squared) * (n - 1) / (n - p - 1)
adjusted_r_squared_values.append(adjusted_r_squared_knn)

# Create a dictionary to map model names to adjusted R-squared values
model_names = {
    'Linear Regression': adjusted_r_squared_linear,
    'Decision Tree Regressor': adjusted_r_squared_decision_tree,
    'Random Forest Regressor': adjusted_r_squared_random_forest,
    'k-Nearest Neighbors Regressor': adjusted_r_squared_knn
}

# Find the model with the least adjusted R-squared value
min_adjusted_r_squared_model = min(model_names, key=model_names.get)
min_adjusted_r_squared_value = model_names[min_adjusted_r_squared_model]

# Display the adjusted R-squared values and the model with the least value
for model, adjusted_r_squared in model_names.items():
    print(f"Adjusted R-squared for {model}: {adjusted_r_squared:.4f}")

print(f"The model with the least adjusted R-squared value is: {min_adjusted_r_squared_model} ({min_adjusted_r_squared_value:.4f})")

Adjusted R-squared for Linear Regression: 0.0004
Adjusted R-squared for Decision Tree Regressor: 0.4731
Adjusted R-squared for Random Forest Regressor: 0.6292
Adjusted R-squared for k-Nearest Neighbors Regressor: 0.6321
The model with the least adjusted R-squared value is: Linear Regression (0.0004)
