In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Load the dataset, discard the identifier columns that carry no predictive
# signal, and drop every row that has a missing value.
file_path = r'C:\Users\Sowjanniya\OneDrive\Desktop\uber-sowjanya.csv'
df = (
    pd.read_csv(file_path)
    .drop(columns=['Unnamed: 0', 'key'])
    .dropna()
)

# Feature engineering: parse pickup_datetime and break it into calendar parts.
df['pickup_datetime'] = pd.to_datetime(df['pickup_datetime'])
df = df.assign(
    pickup_hour=lambda frame: frame['pickup_datetime'].dt.hour,
    pickup_day=lambda frame: frame['pickup_datetime'].dt.day,
    pickup_month=lambda frame: frame['pickup_datetime'].dt.month,
    pickup_year=lambda frame: frame['pickup_datetime'].dt.year,
    # The original pickup_datetime column is no longer needed once its
    # parts have been extracted, so it is dropped from the frame.
).drop(columns=['pickup_datetime'])

# Additional feature engineering: compute the distance between the pickup and dropoff points
def harversine(lat1, lon1, lat2, lon2):
    """Return the haversine (great-circle) distance in kilometers.

    The four coordinates are given in degrees. Only NumPy ufuncs are used,
    so scalars and array-likes are handled element-wise.
    """
    earth_radius_km = 6371.0  # radius of Earth in kilometers

    # The trigonometric functions below operate on radians.
    phi1 = np.radians(lat1)
    lam1 = np.radians(lon1)
    phi2 = np.radians(lat2)
    lam2 = np.radians(lon2)

    half_dphi = (phi2 - phi1) / 2
    half_dlam = (lam2 - lam1) / 2

    # Haversine of the central angle between the two points.
    hav = np.sin(half_dphi)**2 + np.cos(phi1) * np.cos(phi2) * np.sin(half_dlam)**2
    return earth_radius_km * 2 * np.arcsin(np.sqrt(hav))

# Conventionally spelled entry point for the haversine distance.
def haversine(pickup_latitude, pickup_longitude, dropoff_latitude, dropoff_longitude):
    """Return the distance in kilometers between pickup and dropoff points.

    Thin wrapper that delegates to ``harversine``; coordinates are in degrees
    and may be scalars or array-likes such as DataFrame columns.

    Previously this function called itself (infinite recursion) and was never
    invoked, so the ``distance_km`` feature was never added to ``df``.
    """
    return harversine(
        pickup_latitude, pickup_longitude, dropoff_latitude, dropoff_longitude
    )


# Apply the haversine formula to add the trip distance as a feature column.
df['distance_km'] = haversine(
    df['pickup_latitude'],
    df['pickup_longitude'],
    df['dropoff_latitude'],
    df['dropoff_longitude'],
)

# Target (y) is the fare; features (X) are every other remaining column.
y = df['fare_amount']
X = df.drop(columns=['fare_amount'])

# Hold out 20% of the rows as a test set, with a fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


def _regression_scores(actual, predicted):
    """Return the (MSE, RMSE, R2) triple for predictions against actual values."""
    mse = mean_squared_error(actual, predicted)
    return mse, np.sqrt(mse), r2_score(actual, predicted)


# Task 1: fit a linear regression model and predict on the test set.
linear_model = LinearRegression()
linear_model.fit(X_train, y_train)
y_pred_linear = linear_model.predict(X_test)

# Task 2: evaluate the linear regression model.
mse_linear, rmse_linear, r2_linear = _regression_scores(y_test, y_pred_linear)

print("Linear Regression Model Evaluation:")
print("MSE:", mse_linear)
print("RMSE:", rmse_linear)
print("R2:", r2_linear)

# Task 3: fine-tune using a more complex model, a random forest regressor.
rf_model = RandomForestRegressor(random_state=42)

# Hyperparameter grid explored by the search.
param_grid = dict(
    n_estimators=[50, 100],
    max_depth=[None, 10, 20],
    min_samples_split=[2, 5],
)

# Grid search with 3-fold cross-validation, scored by negated MSE, using all
# available cores.
grid_search = GridSearchCV(
    estimator=rf_model,
    param_grid=param_grid,
    cv=3,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Best estimator found by the search.
best_rf_model = grid_search.best_estimator_

# Evaluate the tuned random forest on the held-out test set.
y_pred_rf = best_rf_model.predict(X_test)
mse_rf, rmse_rf, r2_rf = _regression_scores(y_test, y_pred_rf)

print("\nRandom Forest Regressor Model Evaluation (After Tuning):")
print("Best Parameters:", grid_search.best_params_)
print("MSE:", mse_rf)
print("RMSE:", rmse_rf)
print("R2:", r2_rf)





     










      



Linear Regression Model Evaluation:
MSE: 102.29302211122773
RMSE: 10.114001290845662
R2: 0.016639555215572144
