In [97]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [98]:
R = 6371
def handle_outliers(df, column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    df=df[(df[column] >= lower_bound) & (df[column] <= upper_bound)]
    return df

In [99]:
def haversine(lat1, lon1, lat2, lon2):
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1;
    dlon = lon2 - lon1;
    a = np.sin(dlat/2.0)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2.0)**2

    c = 2*np.arcsin(np.sqrt(a))
    return R*c

In [100]:
def evaluate_model(y_test, y_pred, name):
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)

    print(f"***{name} Results ***")
    print(f"R2 Score: {r2:.4f}")
    print(f"RMSE: {rmse: .4f}")
    return r2, rmse

In [101]:
df = pd.read_csv("C:/Users/Sandhya/Downloads/archive/uber.csv")
df

Unnamed: 0.1,Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,24238194,2015-05-07 19:52:06.0000003,7.5,2015-05-07 19:52:06 UTC,-73.999817,40.738354,-73.999512,40.723217,1
1,27835199,2009-07-17 20:04:56.0000002,7.7,2009-07-17 20:04:56 UTC,-73.994355,40.728225,-73.994710,40.750325,1
2,44984355,2009-08-24 21:45:00.00000061,12.9,2009-08-24 21:45:00 UTC,-74.005043,40.740770,-73.962565,40.772647,1
3,25894730,2009-06-26 08:22:21.0000001,5.3,2009-06-26 08:22:21 UTC,-73.976124,40.790844,-73.965316,40.803349,3
4,17610152,2014-08-28 17:47:00.000000188,16.0,2014-08-28 17:47:00 UTC,-73.925023,40.744085,-73.973082,40.761247,5
...,...,...,...,...,...,...,...,...,...
199995,42598914,2012-10-28 10:49:00.00000053,3.0,2012-10-28 10:49:00 UTC,-73.987042,40.739367,-73.986525,40.740297,1
199996,16382965,2014-03-14 01:09:00.0000008,7.5,2014-03-14 01:09:00 UTC,-73.984722,40.736837,-74.006672,40.739620,1
199997,27804658,2009-06-29 00:42:00.00000078,30.9,2009-06-29 00:42:00 UTC,-73.986017,40.756487,-73.858957,40.692588,2
199998,20259894,2015-05-20 14:56:25.0000004,14.5,2015-05-20 14:56:25 UTC,-73.997124,40.725452,-73.983215,40.695415,1


In [102]:
df.shape

(200000, 9)

In [103]:
df.describe()

Unnamed: 0.1,Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
count,200000.0,200000.0,200000.0,200000.0,199999.0,199999.0,200000.0
mean,27712500.0,11.359955,-72.527638,39.935885,-72.525292,39.92389,1.684535
std,16013820.0,9.901776,11.437787,7.720539,13.117408,6.794829,1.385997
min,1.0,-52.0,-1340.64841,-74.015515,-3356.6663,-881.985513,0.0
25%,13825350.0,6.0,-73.992065,40.734796,-73.991407,40.733823,1.0
50%,27745500.0,8.5,-73.981823,40.752592,-73.980093,40.753042,1.0
75%,41555300.0,12.5,-73.967154,40.767158,-73.963658,40.768001,2.0
max,55423570.0,499.0,57.418457,1644.421482,1153.572603,872.697628,208.0


In [104]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Data columns (total 9 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   Unnamed: 0         200000 non-null  int64  
 1   key                200000 non-null  object 
 2   fare_amount        200000 non-null  float64
 3   pickup_datetime    200000 non-null  object 
 4   pickup_longitude   200000 non-null  float64
 5   pickup_latitude    200000 non-null  float64
 6   dropoff_longitude  199999 non-null  float64
 7   dropoff_latitude   199999 non-null  float64
 8   passenger_count    200000 non-null  int64  
dtypes: float64(5), int64(2), object(2)
memory usage: 13.7+ MB


In [105]:
df.isna().sum()

Unnamed: 0           0
key                  0
fare_amount          0
pickup_datetime      0
pickup_longitude     0
pickup_latitude      0
dropoff_longitude    1
dropoff_latitude     1
passenger_count      0
dtype: int64

In [106]:
df = df.dropna()

In [107]:
df.isna().sum()

Unnamed: 0           0
key                  0
fare_amount          0
pickup_datetime      0
pickup_longitude     0
pickup_latitude      0
dropoff_longitude    0
dropoff_latitude     0
passenger_count      0
dtype: int64

In [111]:
#data cleaning
#df['pickup_datetime'] = pd.to_datetime(df['pickup_datetime'])
df.loc[:, 'pickup_datetime'] = pd.to_datetime(df['pickup_datetime'], errors='coerce', infer_datetime_format=True)



  df.loc[:, 'pickup_datetime'] = pd.to_datetime(df['pickup_datetime'], errors='coerce', infer_datetime_format=True)


In [112]:
# return distance value for each row i.e dis between pickup and dropoff
df['distance'] = haversine(
    df['pickup_latitude'], df['pickup_longitude'],
    df['dropoff_latitude'], df['dropoff_longitude']
)

# calculate hour from pickup datetime
df['hour'] = df['pickup_datetime'].dt.hour
df['day_of_week'] = df['pickup_datetime'].dt.dayofweek

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['distance'] = haversine(


AttributeError: Can only use .dt accessor with datetimelike values

In [113]:
print(df['pickup_datetime'].dtypes)


object


In [114]:
df = handle_outliers(df, 'fare_amount')
df = handle_outliers(df, 'distance')
df = handle_outliers(df, 'passenger_count')

df = df[df['fare_amount'] > 0 ]
df = df[df['distance'] > 0]
print(f"Final cleaned Dataset Shape:{df.shape}\n")

Final cleaned Dataset Shape:(150327, 10)



In [115]:
# find correlation
feature_for_corr = ['fare_amount', 'distance', 'passenger_count', 'hour', 'day_of_week']
corr = df[feature_for_corr].corr()

plt.figure(figsize = (8,6))
sns.heatmap(corr, annot= True, cmap='coolwarm', fmt=".2f")
plt.title("correlation matrix")
plt.savefig('correlation_matrix.png')
print("Correlation Matrix saved to 'correlation_matrix.png")

KeyError: "['hour', 'day_of_week'] not in index"

In [92]:
X = df[['distance', 'passenger_count', 'hour', 'day_of_week']]
y = df['fare_amount']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

KeyError: "['hour', 'day_of_week'] not in index"

In [93]:
 
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [116]:
lr = LinearRegression()
lr.fit(X_train_scaled, y_train)
y_pred_lr = lr.predict(X_test_scaled)
evaluate_model(y_test, y_pred_lr, "Linear Regression")

***Linear Regression Results ***
R2 Score: 0.6300
RMSE:  2.2603


(0.6299770635633941, 2.2602824966609547)

In [117]:
# B. Random Forest Regression
# Using n_jobs=-1 to speed up training
rf = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1, max_depth=15)
rf.fit(X_train, y_train) 
y_pred_rf = rf.predict(X_test)
evaluate_model(y_test, y_pred_rf, "Random Forest Regression")


***Random Forest Regression Results ***
R2 Score: 0.6377
RMSE:  2.2367


(0.637673394848721, 2.2366524845831472)