In [10]:
!pip install pandas scikit-learn matplotlib seaborn
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np



In [2]:
import pandas as pd

# Read the taxi trip CSV into a DataFrame; the path is relative to the
# notebook's working directory.
TRIPS_CSV = "chicago_taxi_trips_2016_01.csv"
data = pd.read_csv(TRIPS_CSV)

# Preview the first ten rows.
data.head(n=10)

Unnamed: 0,taxi_id,trip_start_timestamp,trip_end_timestamp,trip_seconds,trip_miles,pickup_census_tract,dropoff_census_tract,pickup_community_area,dropoff_community_area,fare,tips,tolls,extras,trip_total,payment_type,company,pickup_latitude,pickup_longitude,dropoff_latitude,dropoff_longitude
0,85.0,2016-1-13 06:15:00,2016-1-13 06:15:00,180.0,0.4,,,24.0,24.0,4.5,0.0,0.0,0.0,4.5,Cash,107.0,199.0,510.0,199.0,510.0
1,2776.0,2016-1-22 09:30:00,2016-1-22 09:45:00,240.0,0.7,,,,,4.45,4.45,0.0,0.0,8.9,Credit Card,,,,,
2,3168.0,2016-1-31 21:30:00,2016-1-31 21:30:00,0.0,0.0,,,,,42.75,5.0,0.0,0.0,47.75,Credit Card,119.0,,,,
3,4237.0,2016-1-23 17:30:00,2016-1-23 17:30:00,480.0,1.1,,,6.0,6.0,7.0,0.0,0.0,0.0,7.0,Cash,,686.0,500.0,686.0,500.0
4,5710.0,2016-1-14 05:45:00,2016-1-14 06:00:00,480.0,2.71,,,32.0,,10.25,0.0,0.0,0.0,10.25,Cash,,385.0,478.0,,
5,1987.0,2016-1-8 18:15:00,2016-1-8 18:45:00,1080.0,6.2,,,8.0,3.0,17.75,0.0,0.0,0.0,17.75,Cash,,599.0,346.0,660.0,120.0
6,4986.0,2016-1-14 04:30:00,2016-1-14 05:00:00,1500.0,18.4,,,,,45.0,12.0,0.0,0.0,57.0,Credit Card,,,,,
7,6400.0,2016-1-26 04:15:00,2016-1-26 04:15:00,60.0,0.2,,,16.0,16.0,3.75,0.0,0.0,0.0,3.75,Cash,107.0,527.0,24.0,527.0,24.0
8,7418.0,2016-1-22 11:30:00,2016-1-22 11:45:00,180.0,0.0,,504.0,8.0,32.0,5.0,2.0,0.0,1.5,8.5,Credit Card,82.0,210.0,470.0,744.0,605.0
9,6450.0,2016-1-7 21:15:00,2016-1-7 21:15:00,0.0,0.0,,,,,3.25,0.0,0.0,1.5,4.75,Cash,,,,,


Converting the trip start timestamp to a datetime and extracting the starting hour and the day of the week, since the model cannot use raw datetime values directly:

In [3]:
# Parse the start timestamp once, then derive the two calendar features from it.
start_times = pd.to_datetime(data['trip_start_timestamp'])
data = data.assign(
    trip_start_timestamp=start_times,
    hour=start_times.dt.hour,
    day_of_week=start_times.dt.dayofweek,
)

data.head()

Unnamed: 0,taxi_id,trip_start_timestamp,trip_end_timestamp,trip_seconds,trip_miles,pickup_census_tract,dropoff_census_tract,pickup_community_area,dropoff_community_area,fare,...,extras,trip_total,payment_type,company,pickup_latitude,pickup_longitude,dropoff_latitude,dropoff_longitude,hour,day_of_week
0,85.0,2016-01-13 06:15:00,2016-1-13 06:15:00,180.0,0.4,,,24.0,24.0,4.5,...,0.0,4.5,Cash,107.0,199.0,510.0,199.0,510.0,6,2
1,2776.0,2016-01-22 09:30:00,2016-1-22 09:45:00,240.0,0.7,,,,,4.45,...,0.0,8.9,Credit Card,,,,,,9,4
2,3168.0,2016-01-31 21:30:00,2016-1-31 21:30:00,0.0,0.0,,,,,42.75,...,0.0,47.75,Credit Card,119.0,,,,,21,6
3,4237.0,2016-01-23 17:30:00,2016-1-23 17:30:00,480.0,1.1,,,6.0,6.0,7.0,...,0.0,7.0,Cash,,686.0,500.0,686.0,500.0,17,5
4,5710.0,2016-01-14 05:45:00,2016-1-14 06:00:00,480.0,2.71,,,32.0,,10.25,...,0.0,10.25,Cash,,385.0,478.0,,,5,3


In [4]:
# Count the missing values in each column.
data.isna().sum()

taxi_id                        23
trip_start_timestamp            0
trip_end_timestamp            125
trip_seconds                  314
trip_miles                     14
pickup_census_tract       1705805
dropoff_census_tract       738326
pickup_community_area      285789
dropoff_community_area     313655
fare                           33
tips                           33
tolls                          33
extras                         33
trip_total                     33
payment_type                    0
company                    632726
pickup_latitude            285757
pickup_longitude           285757
dropoff_latitude           311682
dropoff_longitude          311682
hour                            0
day_of_week                     0
dtype: int64

Keeping only the required columns and removing rows whose distance, duration or total is missing or not positive:

In [5]:
# Columns kept for modelling, and the subset of them that must be strictly positive.
selected_columns = ['trip_miles', 'trip_seconds', 'hour', 'day_of_week', 'trip_total']
positive_columns = ['trip_miles', 'trip_seconds', 'trip_total']

trips = data.loc[:, selected_columns]

# A comparison against NaN evaluates to False, so rows with a missing value in
# any of these three columns are dropped together with the non-positive ones.
is_valid = trips[positive_columns].gt(0).all(axis=1)
data = trips[is_valid]
data.head()

Unnamed: 0,trip_miles,trip_seconds,hour,day_of_week,trip_total
0,0.4,180.0,6,2,4.5
1,0.7,240.0,9,4,8.9
3,1.1,480.0,17,5,7.0
4,2.71,480.0,5,3,10.25
5,6.2,1080.0,18,4,17.75


Splitting the dataset into two parts, one for training and one for testing:

In [8]:
# Feature matrix and target vector.
feature_cols = ['trip_miles', 'trip_seconds', 'hour', 'day_of_week']
x = data[feature_cols]
y = data['trip_total']

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=33,
)

Training the model:

In [9]:
# Fit a linear regression on the training split; `fit` returns the estimator
# itself, so the fitted model is bound to `model` and shown as the cell output.
model = LinearRegression().fit(x_train, y_train)
model

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


Computing and printing the evaluation metrics:

In [12]:
# Predict on the held-out split and score the predictions.
y_pred = model.predict(x_test)

mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # RMSE is the square root of the mean squared error
r2 = r2_score(y_test, y_pred)

# Report each metric on its own line.
for label, value in (
    ("Mean Absolute Error (MAE):", mae),
    ("Root Mean Squared Error (RMSE):", rmse),
    ("R² Score:", r2),
):
    print(label, value)

Mean Absolute Error (MAE): 6.319059931391108
Root Mean Squared Error (RMSE): 32.543616034839324
R² Score: 0.0918180221066699


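A very high RMSE together with a low R² score shows that the relationship is more complex than a straight line: linear regression is underfitting and is unable to capture the actual relationship. The RMSE (about 32.5) is also far larger than the MAE (about 6.3), which suggests that a small number of trips with very large errors dominate the squared error.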