In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score


In [5]:
# Absolute path of the 2017 yellow-taxi trip CSV read by this notebook.
data_path = "/content/2017_Yellow_Taxi_Trip_Data.csv"

# Parse the file only once. `data` holds the frame exactly as loaded, while
# `df` is an independent copy used by the following cells, so the two names
# never alias the same object (same outcome as reading the file twice, at
# half the I/O and parsing cost).
data = pd.read_csv(data_path)
df = data.copy()

In [7]:
print("\nDataFrame Info:\n")
# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() would append a stray "None" line after the summary.
df.info()


DataFrame Info:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22699 entries, 0 to 22698
Data columns (total 18 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Unnamed: 0             22699 non-null  int64  
 1   VendorID               22699 non-null  int64  
 2   tpep_pickup_datetime   22699 non-null  object 
 3   tpep_dropoff_datetime  22699 non-null  object 
 4   passenger_count        22699 non-null  int64  
 5   trip_distance          22699 non-null  float64
 6   RatecodeID             22699 non-null  int64  
 7   store_and_fwd_flag     22699 non-null  object 
 8   PULocationID           22699 non-null  int64  
 9   DOLocationID           22699 non-null  int64  
 10  payment_type           22699 non-null  int64  
 11  fare_amount            22699 non-null  float64
 12  extra                  22699 non-null  float64
 13  mta_tax                22699 non-null  float64
 14  tip_amount             22699 non-nul

In [8]:
# Heading followed by the per-column count of missing entries; sep="\n" puts
# the counts on their own line, exactly as two consecutive print calls would.
print("\nNull Values:\n", df.isna().sum(), sep="\n")


Null Values:

Unnamed: 0               0
VendorID                 0
tpep_pickup_datetime     0
tpep_dropoff_datetime    0
passenger_count          0
trip_distance            0
RatecodeID               0
store_and_fwd_flag       0
PULocationID             0
DOLocationID             0
payment_type             0
fare_amount              0
extra                    0
mta_tax                  0
tip_amount               0
tolls_amount             0
improvement_surcharge    0
total_amount             0
dtype: int64


In [9]:
# Heading followed by the describe() summary table of the frame; sep="\n"
# starts the table on a fresh line after the heading.
print("\nDataFrame Describe:\n", df.describe(), sep="\n")


DataFrame Describe:

         Unnamed: 0      VendorID  passenger_count  trip_distance  \
count  2.269900e+04  22699.000000     22699.000000   22699.000000   
mean   5.675849e+07      1.556236         1.642319       2.913313   
std    3.274493e+07      0.496838         1.285231       3.653171   
min    1.212700e+04      1.000000         0.000000       0.000000   
25%    2.852056e+07      1.000000         1.000000       0.990000   
50%    5.673150e+07      2.000000         1.000000       1.610000   
75%    8.537452e+07      2.000000         2.000000       3.060000   
max    1.134863e+08      2.000000         6.000000      33.960000   

         RatecodeID  PULocationID  DOLocationID  payment_type   fare_amount  \
count  22699.000000  22699.000000  22699.000000  22699.000000  22699.000000   
mean       1.043394    162.412353    161.527997      1.336887     13.026629   
std        0.708391     66.633373     70.139691      0.496211     13.243791   
min        1.000000      1.000000      1

In [11]:
# Order the trips from the longest distance to the shortest. The whole sorted
# frame is kept under this name because a later cell prints its leading rows.
sorted_trip_distance = df.sort_values("trip_distance", ascending=False)

# Show the ten largest distances under a heading.
print("\nSorted Trip Distance:\n", sorted_trip_distance["trip_distance"].iloc[:10], sep="\n")


Sorted Trip Distance:

9280     33.96
13861    33.92
6064     32.72
10291    31.95
29       30.83
18130    30.50
5792     30.33
15350    28.23
10302    28.20
2592     27.97
Name: trip_distance, dtype: float64


In [14]:
# Order the trips from the highest total charge to the lowest. The whole
# sorted frame is kept under this name because a later cell prints its
# leading rows.
sorted_total_amount = df.sort_values("total_amount", ascending=False)

# Show the ten largest totals under a heading.
print("\nSorted Total Amount:\n", sorted_total_amount["total_amount"].iloc[:10], sep="\n")


Sorted Total Amount:

8476     1200.29
20312     450.30
13861     258.21
12511     233.74
15474     211.80
6064      179.06
16379     157.06
3582      152.30
11269     151.82
9280      150.30
Name: total_amount, dtype: float64


In [15]:
print("\nTop Rows for Trip Distance and Total Amount:\n")
# First five rows of each pre-sorted frame, distance ranking first; sep="\n"
# separates the two tables the same way two consecutive print calls would.
print(sorted_trip_distance.iloc[:5], sorted_total_amount.iloc[:5], sep="\n")



Top Rows for Trip Distance and Total Amount:

       Unnamed: 0  VendorID    tpep_pickup_datetime   tpep_dropoff_datetime  \
9280     51810714         2  06/18/2017 11:33:25 PM  06/19/2017 12:12:38 AM   
13861    40523668         2   05/19/2017 8:20:21 AM   05/19/2017 9:20:30 AM   
6064     49894023         2  06/13/2017 12:30:22 PM   06/13/2017 1:37:51 PM   
10291    76319330         2  09/11/2017 11:41:04 AM  09/11/2017 12:18:58 PM   
29       94052446         2   11/06/2017 8:30:50 PM  11/07/2017 12:00:00 AM   

       passenger_count  trip_distance  RatecodeID store_and_fwd_flag  \
9280                 2          33.96           5                  N   
13861                1          33.92           5                  N   
6064                 1          32.72           3                  N   
10291                1          31.95           4                  N   
29                   1          30.83           1                  N   

       PULocationID  DOLocationID  payment_ty

In [19]:
# Column the model should estimate, and the predictor columns fed to it.
target = "total_amount"
features = [
    "passenger_count",
    "trip_distance",
    "RatecodeID",
    "payment_type",
    "extra",
    "mta_tax",
    "improvement_surcharge",
    "tolls_amount",
]

# Discard every row that is missing a predictor or the target; `df` is
# rebound to the filtered frame.
df = df.dropna(subset=[*features, target])

# Design matrix and response vector taken from the filtered frame.
X, y = df[features], df[target]

In [20]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [21]:
# Random-forest regressor with library-default settings apart from a fixed
# seed, trained on the training portion of the split.
model = RandomForestRegressor(random_state=42)
model.fit(X=X_train, y=y_train)


In [22]:
# Predictions of the fitted model for the held-out rows.
y_pred = model.predict(X=X_test)

In [23]:
# Score the held-out predictions: mean squared error and the coefficient of
# determination (R-squared).
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)

In [24]:
print("\nModel Evaluation:\n")
print(f"Mean Squared Error: {mse}")
print(f"R-squared: {r2}")
# R-squared is not an accuracy: it is the share of the target's variance that
# the model explains, and it can be negative for a model worse than predicting
# the mean. Label the percentage accordingly instead of calling it "Accuracy".
print(f"Variance explained (R-squared): {r2 * 100:.2f}%")



Model Evaluation:

Mean Squared Error: 308.75499543142644
R-squared: 0.3619410504266385
Accuracy: 36.19%
