In [1]:
import pandas as pd
import numpy as np

In [3]:
df=pd.read_excel('uber_rides_data.xlsx')

In [4]:
df.head()

Unnamed: 0,ride_id,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,24238194,7.5,2015-05-07 19:52:06 UTC,-73.999817,40.738354,-73.999512,40.723217,1
1,27835199,7.7,2009-07-17 20:04:56 UTC,-73.994355,40.728225,-73.99471,40.750325,1
2,44984355,12.9,2009-08-24 21:45:00 UTC,-74.005043,40.74077,-73.962565,40.772647,1
3,25894730,5.3,2009-06-26 08:22:21 UTC,-73.976124,40.790844,-73.965316,40.803349,3
4,17610152,16.0,2014-08-28 17:47:00 UTC,-73.925023,40.744085,-73.973082,40.761247,5


In [5]:
df.columns

Index(['ride_id', 'fare_amount', 'pickup_datetime', 'pickup_longitude',
       'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude',
       'passenger_count'],
      dtype='object')

In [8]:
df.isnull().sum()

ride_id              0
fare_amount          0
pickup_datetime      0
pickup_longitude     0
pickup_latitude      0
dropoff_longitude    1
dropoff_latitude     1
passenger_count      0
dtype: int64

In [10]:
df.shape

(200000, 8)

In [28]:
df.dtypes

ride_id                int64
fare_amount          float64
pickup_datetime       object
pickup_longitude     float64
pickup_latitude      float64
dropoff_longitude    float64
dropoff_latitude     float64
passenger_count        int64
dtype: object

In [32]:
df['pickup_datetime']=pd.to_datetime(df['pickup_datetime'])


In [33]:
df.dtypes

ride_id                            int64
fare_amount                      float64
pickup_datetime      datetime64[ns, UTC]
pickup_longitude                 float64
pickup_latitude                  float64
dropoff_longitude                float64
dropoff_latitude                 float64
passenger_count                    int64
dtype: object

In [34]:
cleaned_df=df.dropna()

In [36]:
cleaned_df.isnull().sum()

ride_id              0
fare_amount          0
pickup_datetime      0
pickup_longitude     0
pickup_latitude      0
dropoff_longitude    0
dropoff_latitude     0
passenger_count      0
dtype: int64

In [37]:
cleaned_df.columns

Index(['ride_id', 'fare_amount', 'pickup_datetime', 'pickup_longitude',
       'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude',
       'passenger_count'],
      dtype='object')

In [38]:
cleaned_df['fare_amount'].mean()

11.359891549458371

In [39]:
cleaned_df.head()

Unnamed: 0,ride_id,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,24238194,7.5,2015-05-07 19:52:06+00:00,-73.999817,40.738354,-73.999512,40.723217,1
1,27835199,7.7,2009-07-17 20:04:56+00:00,-73.994355,40.728225,-73.99471,40.750325,1
2,44984355,12.9,2009-08-24 21:45:00+00:00,-74.005043,40.74077,-73.962565,40.772647,1
3,25894730,5.3,2009-06-26 08:22:21+00:00,-73.976124,40.790844,-73.965316,40.803349,3
4,17610152,16.0,2014-08-28 17:47:00+00:00,-73.925023,40.744085,-73.973082,40.761247,5


In [40]:
def haversine(lat1, lon1, lat2, lon2):
    # Convert latitude and longitude from degrees to radians
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])

    # Haversine formula
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat/2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2
    c = 2 * np.arcsin(np.sqrt(a))
    
    # Radius of Earth in kilometers (mean value)
    r = 6371.0
    
    # Calculate the distance
    distance = c * r
    
    return distance


In [41]:
cleaned_df['distance_km'] = cleaned_df.apply(lambda row: haversine(row['pickup_latitude'], row['pickup_longitude'], row['dropoff_latitude'], row['dropoff_longitude']), axis=1)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaned_df['distance_km'] = cleaned_df.apply(lambda row: haversine(row['pickup_latitude'], row['pickup_longitude'], row['dropoff_latitude'], row['dropoff_longitude']), axis=1)


In [42]:
cleaned_df.head()

Unnamed: 0,ride_id,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,distance_km
0,24238194,7.5,2015-05-07 19:52:06+00:00,-73.999817,40.738354,-73.999512,40.723217,1,1.683323
1,27835199,7.7,2009-07-17 20:04:56+00:00,-73.994355,40.728225,-73.99471,40.750325,1,2.45759
2,44984355,12.9,2009-08-24 21:45:00+00:00,-74.005043,40.74077,-73.962565,40.772647,1,5.036377
3,25894730,5.3,2009-06-26 08:22:21+00:00,-73.976124,40.790844,-73.965316,40.803349,3,1.661683
4,17610152,16.0,2014-08-28 17:47:00+00:00,-73.925023,40.744085,-73.973082,40.761247,5,4.47545


In [43]:
cleaned_df['distance_km'].median()

2.120992396182902

In [44]:
cleaned_df['distance_km'].max()

16409.239135313168

In [45]:
# Count rides with a Haversine distance of 0.0
zero_distance_rides = cleaned_df[cleaned_df['distance_km'] == 0.0]

# Get the count of such rides
count_zero_distance_rides = len(zero_distance_rides)

print("Number of rides with 0.0 Haversine distance:", count_zero_distance_rides)

Number of rides with 0.0 Haversine distance: 5632


In [47]:
mean_fare_amount_zero_distance = zero_distance_rides['fare_amount'].mean()

In [48]:
print("Mean 'fare_amount' for rides with 0.0 Haversine distance:", mean_fare_amount_zero_distance)

Mean 'fare_amount' for rides with 0.0 Haversine distance: 11.585317826704578


If the distance is zero it means that there has been no ride done by the uber drivers which means that fare should be zero.But the dataset is showing fare amount mean as 11.5 usd for 0 distance covered.Possible reasons could be-
1.incorrect fare entries by the drivers
2.outliers in dataset
3.incomplete data set
4.drivers trying to cheat uber

In [50]:
cleaned_df['fare_amount'].max()

499.0

In [51]:
# Find the row with the highest 'fare_amount'
costliest_ride = cleaned_df[cleaned_df['fare_amount'] == cleaned_df['fare_amount'].max()]

# Retrieve the Haversine distance for the costliest ride
distance_costliest_ride = costliest_ride['distance_km'].values[0]

print("Haversine distance for the costliest ride:", distance_costliest_ride, "km")

Haversine distance for the costliest ride: 0.0007899213191009994 km


In the real world scenario its not possible to charge 499 for 0.000078 km.But in the data set the above case happened.It may happen beacause of the following possibilities
1.Wrong fare entries
2.outliers
3.incomplete data
its a very imp issue which needs to be resolved

In [52]:
# Extract the year from the 'pickup_datetime' column
cleaned_df['year'] = cleaned_df['pickup_datetime'].dt.year

# Count the number of rides in the year 2014
rides_2014 = len(cleaned_df[cleaned_df['year'] == 2014])

print("Number of rides recorded in the year 2014:", rides_2014)

Number of rides recorded in the year 2014: 29968


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaned_df['year'] = cleaned_df['pickup_datetime'].dt.year


In [54]:
# Extract the quarter from the 'pickup_datetime' column
cleaned_df['quarter'] = cleaned_df['pickup_datetime'].dt.quarter

# Count the number of rides in the first quarter of 2014
rides_first_quarter_2014 = len(cleaned_df[(cleaned_df['year'] == 2014) & (cleaned_df['quarter'] == 1)])

print("Number of rides recorded in the first quarter of 2014:", rides_first_quarter_2014)

Number of rides recorded in the first quarter of 2014: 7687


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaned_df['quarter'] = cleaned_df['pickup_datetime'].dt.quarter


In [56]:
# Create a new column 'ride_week_day' to store the day of the week
cleaned_df['ride_week_day'] = cleaned_df['pickup_datetime'].dt.day_name()

# Display the DataFrame with the new column
print(cleaned_df[['pickup_datetime', 'ride_week_day']])

                 pickup_datetime ride_week_day
0      2015-05-07 19:52:06+00:00      Thursday
1      2009-07-17 20:04:56+00:00        Friday
2      2009-08-24 21:45:00+00:00        Monday
3      2009-06-26 08:22:21+00:00        Friday
4      2014-08-28 17:47:00+00:00      Thursday
...                          ...           ...
199995 2012-10-28 10:49:00+00:00        Sunday
199996 2014-03-14 01:09:00+00:00        Friday
199997 2009-06-29 00:42:00+00:00        Monday
199998 2015-05-20 14:56:25+00:00     Wednesday
199999 2010-05-15 04:08:00+00:00      Saturday

[199999 rows x 2 columns]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaned_df['ride_week_day'] = cleaned_df['pickup_datetime'].dt.day_name()


In [57]:
cleaned_df.head()

Unnamed: 0,ride_id,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,distance_km,year,quarter,day_of_week,ride_week_day
0,24238194,7.5,2015-05-07 19:52:06+00:00,-73.999817,40.738354,-73.999512,40.723217,1,1.683323,2015,2,Thursday,Thursday
1,27835199,7.7,2009-07-17 20:04:56+00:00,-73.994355,40.728225,-73.99471,40.750325,1,2.45759,2009,3,Friday,Friday
2,44984355,12.9,2009-08-24 21:45:00+00:00,-74.005043,40.74077,-73.962565,40.772647,1,5.036377,2009,3,Monday,Monday
3,25894730,5.3,2009-06-26 08:22:21+00:00,-73.976124,40.790844,-73.965316,40.803349,3,1.661683,2009,2,Friday,Friday
4,17610152,16.0,2014-08-28 17:47:00+00:00,-73.925023,40.744085,-73.973082,40.761247,5,4.47545,2014,3,Thursday,Thursday


In [55]:
# Extract the day of the week from the 'pickup_datetime' column and create a new column
cleaned_df['day_of_week'] = cleaned_df['pickup_datetime'].dt.day_name()

# Filter the data for September 2010
september_2010_rides = cleaned_df[(cleaned_df['pickup_datetime'].dt.year == 2010) & (cleaned_df['pickup_datetime'].dt.month == 9)]

# Find the day of the week with the maximum rides
max_rides_day = september_2010_rides['day_of_week'].value_counts().idxmax()

print("Day of the week in September 2010 with the maximum rides:", max_rides_day)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaned_df['day_of_week'] = cleaned_df['pickup_datetime'].dt.day_name()


Day of the week in September 2010 with the maximum rides: Thursday


# ML SPRINT 2

In [58]:
#inputs and outputs
X=cleaned_df[['passenger_count','distance_km','ride_week_day']]
y=cleaned_df['fare_amount']

In [59]:
X.head()

Unnamed: 0,passenger_count,distance_km,ride_week_day
0,1,1.683323,Thursday
1,1,2.45759,Friday
2,1,5.036377,Monday
3,3,1.661683,Friday
4,5,4.47545,Thursday


In [60]:
y.head()

0     7.5
1     7.7
2    12.9
3     5.3
4    16.0
Name: fare_amount, dtype: float64

In [62]:
#train test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30, random_state = 15)

print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(139999, 3) (139999,)
(60000, 3) (60000,)


In [63]:
X_train.head()

Unnamed: 0,passenger_count,distance_km,ride_week_day
79822,1,2.017276,Friday
168562,1,0.100907,Friday
41499,1,0.693193,Monday
55964,1,1.041316,Friday
70100,5,3.496032,Thursday


In [64]:
X_train.dtypes

passenger_count      int64
distance_km        float64
ride_week_day       object
dtype: object

In [65]:
X_train_cat = X_train.select_dtypes(include=['object'])

X_train_cat.head()

Unnamed: 0,ride_week_day
79822,Friday
168562,Friday
41499,Monday
55964,Friday
70100,Thursday


In [66]:
X_train_num = X_train.select_dtypes(include=['int64', 'float64'])

X_train_num.head()

Unnamed: 0,passenger_count,distance_km
79822,1,2.017276
168562,1,0.100907
41499,1,0.693193
55964,1,1.041316
70100,5,3.496032


In [67]:
#scaling numerical features
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_num_rescaled = pd.DataFrame(scaler.fit_transform(X_train_num), 
                                    columns = X_train_num.columns, 
                                    index = X_train_num.index)

X_train_num_rescaled.head()

Unnamed: 0,passenger_count,distance_km
79822,-0.483522,-0.047917
168562,-0.483522,-0.053056
41499,-0.483522,-0.051468
55964,-0.483522,-0.050534
70100,2.334389,-0.043951


In [76]:
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder(drop='first', sparse=False)
X_train_cat_ohe = pd.DataFrame(encoder.fit_transform(X_train_cat), 
                               columns=encoder.get_feature_names_out(X_train_cat.columns), 
                               index = X_train_cat.index)

X_train_cat_ohe.head()

Unnamed: 0,ride_week_day_Monday,ride_week_day_Saturday,ride_week_day_Sunday,ride_week_day_Thursday,ride_week_day_Tuesday,ride_week_day_Wednesday
79822,0.0,0.0,0.0,0.0,0.0,0.0
168562,0.0,0.0,0.0,0.0,0.0,0.0
41499,1.0,0.0,0.0,0.0,0.0,0.0
55964,0.0,0.0,0.0,0.0,0.0,0.0
70100,0.0,0.0,0.0,1.0,0.0,0.0


In [77]:
X_train_transformed = pd.concat([X_train_num_rescaled, X_train_cat_ohe], axis=1)

X_train_transformed.head()

Unnamed: 0,passenger_count,distance_km,ride_week_day_Monday,ride_week_day_Saturday,ride_week_day_Sunday,ride_week_day_Thursday,ride_week_day_Tuesday,ride_week_day_Wednesday
79822,-0.483522,-0.047917,0.0,0.0,0.0,0.0,0.0,0.0
168562,-0.483522,-0.053056,0.0,0.0,0.0,0.0,0.0,0.0
41499,-0.483522,-0.051468,1.0,0.0,0.0,0.0,0.0,0.0
55964,-0.483522,-0.050534,0.0,0.0,0.0,0.0,0.0,0.0
70100,2.334389,-0.043951,0.0,0.0,0.0,1.0,0.0,0.0


In [70]:
#preparing test data

In [71]:
X_test.head()

Unnamed: 0,passenger_count,distance_km,ride_week_day
186250,5,0.601638,Friday
117887,1,0.443328,Monday
57591,1,1.371128,Thursday
196739,1,2.592268,Saturday
139627,1,1.46044,Saturday


In [72]:
X_test_cat = X_test.select_dtypes(include=['object'])

X_test_cat.head()

Unnamed: 0,ride_week_day
186250,Friday
117887,Monday
57591,Thursday
196739,Saturday
139627,Saturday


In [73]:
X_test_num = X_test.select_dtypes(include=['int64', 'float64'])

X_test_num.head()

Unnamed: 0,passenger_count,distance_km
186250,5,0.601638
117887,1,0.443328
57591,1,1.371128
196739,1,2.592268
139627,1,1.46044


In [74]:
X_test_num_rescaled = pd.DataFrame(scaler.transform(X_test_num), 
                                   columns = X_test_num.columns, 
                                   index = X_test_num.index)

X_test_num_rescaled.head()

Unnamed: 0,passenger_count,distance_km
186250,2.334389,-0.051713
117887,-0.483522,-0.052138
57591,-0.483522,-0.04965
196739,-0.483522,-0.046375
139627,-0.483522,-0.04941


In [112]:
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder(drop='first', sparse=False)
encoder.fit(X_train_cat)

# Transform test data using the same encoder
X_test_cat_ohe = encoder.transform(X_test_cat)

# Create a DataFrame with one-hot encoded columns
X_test_cat_ohe = pd.DataFrame(X_test_cat_ohe, 
                                  columns=encoder.get_feature_names_out(X_test_cat.columns), 
                                  index=X_test_cat.index)

In [113]:
X_test_cat_ohe.head()

Unnamed: 0,ride_week_day_Monday,ride_week_day_Saturday,ride_week_day_Sunday,ride_week_day_Thursday,ride_week_day_Tuesday,ride_week_day_Wednesday
186250,0.0,0.0,0.0,0.0,0.0,0.0
117887,1.0,0.0,0.0,0.0,0.0,0.0
57591,0.0,0.0,0.0,1.0,0.0,0.0
196739,0.0,1.0,0.0,0.0,0.0,0.0
139627,0.0,1.0,0.0,0.0,0.0,0.0


In [114]:
X_test_transformed = pd.concat([X_test_num_rescaled, X_test_cat_ohe], axis=1)

X_test_transformed.head()

Unnamed: 0,passenger_count,distance_km,ride_week_day_Monday,ride_week_day_Saturday,ride_week_day_Sunday,ride_week_day_Thursday,ride_week_day_Tuesday,ride_week_day_Wednesday
186250,2.334389,-0.051713,0.0,0.0,0.0,0.0,0.0,0.0
117887,-0.483522,-0.052138,1.0,0.0,0.0,0.0,0.0,0.0
57591,-0.483522,-0.04965,0.0,0.0,0.0,1.0,0.0,0.0
196739,-0.483522,-0.046375,0.0,1.0,0.0,0.0,0.0,0.0
139627,-0.483522,-0.04941,0.0,1.0,0.0,0.0,0.0,0.0


# Linear Regression

In [115]:
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train_transformed, y_train)

LinearRegression()

In [116]:
y_train_pred = regressor.predict(X_train_transformed)

In [117]:
from sklearn import metrics
print(metrics.r2_score(y_train, y_train_pred))

0.0012965528237314228


In [118]:
r2 = metrics.r2_score(y_train, y_train_pred)
n = len(y_test)
k = X_test.shape[1]

r2_adj = 1 - (1-r2)*(n-1)/(n-k-1)

print(r2_adj)

0.001246614322139239


In [119]:
y_test_pred = regressor.predict(X_test_transformed)

In [120]:
temp_df = pd.DataFrame({'Actual': y_test, 'Predicted': y_test_pred})

temp_df.head()

Unnamed: 0,Actual,Predicted
186250,4.1,11.650724
117887,4.5,11.355589
57591,8.0,11.401764
196739,11.7,10.958645
139627,5.7,10.957859


In [121]:
print(metrics.r2_score(y_test, y_test_pred))

0.0010397982046492782


In [128]:
r2 = metrics.r2_score(y_test, y_test_pred)
n = len(y_test)
k = X_test.shape[1]

r2_adj = 1 - (1-r2)*(n-1)/(n-k-1)

print(r2_adj)

0.000989846864470234


# Random Forest 

In [122]:
from sklearn.ensemble import RandomForestRegressor
regressor_rf = RandomForestRegressor()
regressor_rf.fit(X_train_transformed, y_train)

RandomForestRegressor()

In [129]:
y_train_pred = regressor.predict(X_train_transformed)

In [130]:
from sklearn import metrics
print(metrics.r2_score(y_train, y_train_pred))

0.0012965528237314228


In [131]:
r2 = metrics.r2_score(y_train, y_train_pred)
n = len(y_test)
k = X_test.shape[1]

r2_adj = 1 - (1-r2)*(n-1)/(n-k-1)

print(r2_adj)

0.001246614322139239


In [132]:
y_test_pred = regressor.predict(X_test_transformed)

In [133]:
print(metrics.r2_score(y_test, y_test_pred))

0.0010397982046492782


In [134]:
r2 = metrics.r2_score(y_train, y_train_pred)
n = len(y_test)
k = X_test.shape[1]

r2_adj = 1 - (1-r2)*(n-1)/(n-k-1)

print(r2_adj)

0.001246614322139239


# Decision tree

In [136]:
from sklearn.tree import DecisionTreeRegressor
regressor = DecisionTreeRegressor()
regressor.fit(X_train_transformed, y_train)

DecisionTreeRegressor()

In [137]:
y_train_pred = regressor.predict(X_train_transformed)

In [138]:
print(metrics.r2_score(y_train, y_train_pred))

0.9280622144246233


In [139]:
r2 = metrics.r2_score(y_test, y_test_pred)
n = len(y_test)
k = X_test.shape[1]

r2_adj = 1 - (1-r2)*(n-1)/(n-k-1)

print(r2_adj)

0.000989846864470234
