In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler


In [3]:

# Load your dataset
# Replace 'uber_data.csv' with your dataset path
df = pd.read_csv('uber.csv')


In [5]:

# 1. Pre-process the dataset

# Drop rows with missing values (optional based on data quality)
df.dropna(inplace=True)


In [6]:

# Feature Engineering (Example for columns like pickup and dropoff coordinates)
# Convert pickup and dropoff coordinates to numerical distance (Euclidean)
def calculate_distance(lat1, lon1, lat2, lon2):
    return np.sqrt((lat1 - lat2)**2 + (lon1 - lon2)**2)

df['distance'] = calculate_distance(df['pickup_latitude'], df['pickup_longitude'],
                                    df['dropoff_latitude'], df['dropoff_longitude'])



In [7]:
# Select relevant features
X = df[['distance']]  # Add other relevant features as needed
y = df['fare_amount']  # Assuming 'fare_amount' is the target variable



In [8]:
# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [9]:

# Standardize the features (important for linear regression)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [10]:

# 2. Implement linear regression and random forest regression models

# Linear Regression Model
lr_model = LinearRegression()
lr_model.fit(X_train_scaled, y_train)
y_pred_lr = lr_model.predict(X_test_scaled)


In [11]:

# Random Forest Regression Model
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)


In [12]:

# 3. Evaluate the models and compare their respective scores

# Define a function to print evaluation metrics
def evaluate_model(y_true, y_pred, model_name):
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    r2 = r2_score(y_true, y_pred)
    print(f"{model_name} - RMSE: {rmse:.2f}, R^2 Score: {r2:.2f}")


In [13]:

# Evaluate Linear Regression
evaluate_model(y_test, y_pred_lr, "Linear Regression")


Linear Regression - RMSE: 10.20, R^2 Score: 0.00


In [14]:

# Evaluate Random Forest Regression
evaluate_model(y_test, y_pred_rf, "Random Forest Regression")


Random Forest Regression - RMSE: 6.48, R^2 Score: 0.60
