In [None]:
# Regression Analysis:
# A. Predict the price of the Uber ride from a given pickup point to the agreed drop-off
# location. Perform the following tasks:
# 1. Pre-process the dataset.
# 2. Identify outliers.
# 3. Check the correlation.
# 4. Implement Linear, Ridge, and Lasso regression models.
# 5. Evaluate the models and compare their respective scores like R2, RMSE, etc.

In [19]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.impute import SimpleImputer
# pandas, numpy, matplotlib: Libraries used for data manipulation, calculations, and visualization.
# train_test_split, StandardScaler: Tools for splitting data into training/testing sets and scaling features.
# LinearRegression, Ridge, Lasso: Regression models from sklearn.
# r2_score, mean_squared_error: Evaluation metrics for model performance.
# SimpleImputer: Used to handle missing values in the dataset.

In [2]:
# Read the raw Uber fares CSV; the relative path means the file must sit in
# the notebook's working directory.
csv_path = "uber.csv"
df = pd.read_csv(csv_path)

# Print the frame for a first look (pandas abbreviates long frames to the
# leading and trailing rows).
print(df)

        Unnamed: 0                            key  fare_amount  \
0         24238194    2015-05-07 19:52:06.0000003          7.5   
1         27835199    2009-07-17 20:04:56.0000002          7.7   
2         44984355   2009-08-24 21:45:00.00000061         12.9   
3         25894730    2009-06-26 08:22:21.0000001          5.3   
4         17610152  2014-08-28 17:47:00.000000188         16.0   
...            ...                            ...          ...   
199995    42598914   2012-10-28 10:49:00.00000053          3.0   
199996    16382965    2014-03-14 01:09:00.0000008          7.5   
199997    27804658   2009-06-29 00:42:00.00000078         30.9   
199998    20259894    2015-05-20 14:56:25.0000004         14.5   
199999    11951496   2010-05-15 04:08:00.00000076         14.1   

                pickup_datetime  pickup_longitude  pickup_latitude  \
0       2015-05-07 19:52:06 UTC        -73.999817        40.738354   
1       2009-07-17 20:04:56 UTC        -73.994355        40.728225 

In [3]:
# Parse the pickup timestamp strings once, then derive the time features from
# the parsed Series.
pickup_ts = pd.to_datetime(df['pickup_datetime'])
df['pickup_datetime'] = pickup_ts

# Hour of day (0-23) at which the ride was picked up.
# NOTE(review): the parsed values are timezone-aware UTC (they print with a
# +00:00 offset), so this is the UTC hour, not the local hour at the pickup
# location - confirm that is what is intended.
df['hour'] = pickup_ts.dt.hour

# Day of the week as an integer (pandas convention: Monday=0 ... Sunday=6).
df['day_of_week'] = pickup_ts.dt.dayofweek

In [4]:
# Print the frame again to confirm that the new 'hour' and 'day_of_week' columns were added
print(df)

        Unnamed: 0                            key  fare_amount  \
0         24238194    2015-05-07 19:52:06.0000003          7.5   
1         27835199    2009-07-17 20:04:56.0000002          7.7   
2         44984355   2009-08-24 21:45:00.00000061         12.9   
3         25894730    2009-06-26 08:22:21.0000001          5.3   
4         17610152  2014-08-28 17:47:00.000000188         16.0   
...            ...                            ...          ...   
199995    42598914   2012-10-28 10:49:00.00000053          3.0   
199996    16382965    2014-03-14 01:09:00.0000008          7.5   
199997    27804658   2009-06-29 00:42:00.00000078         30.9   
199998    20259894    2015-05-20 14:56:25.0000004         14.5   
199999    11951496   2010-05-15 04:08:00.00000076         14.1   

                 pickup_datetime  pickup_longitude  pickup_latitude  \
0      2015-05-07 19:52:06+00:00        -73.999817        40.738354   
1      2009-07-17 20:04:56+00:00        -73.994355        40.7282

In [5]:
# Drop the columns that are not used as model inputs:
#   'Unnamed: 0'      - the unnamed index column that came with the CSV
#   'key'             - per-row key string (presumably just a record id; verify)
#   'pickup_datetime' - already decomposed into 'hour' and 'day_of_week'
# Note: this cell rebinds df, so running it a second time raises KeyError
# because the columns are already gone.
unused_columns = ['Unnamed: 0', 'key', 'pickup_datetime']
df = df.drop(columns=unused_columns)

In [6]:
# Print the frame again to confirm that 'Unnamed: 0' (the unnamed first column), 'key' and 'pickup_datetime' were removed
print(df)

        fare_amount  pickup_longitude  pickup_latitude  dropoff_longitude  \
0               7.5        -73.999817        40.738354         -73.999512   
1               7.7        -73.994355        40.728225         -73.994710   
2              12.9        -74.005043        40.740770         -73.962565   
3               5.3        -73.976124        40.790844         -73.965316   
4              16.0        -73.925023        40.744085         -73.973082   
...             ...               ...              ...                ...   
199995          3.0        -73.987042        40.739367         -73.986525   
199996          7.5        -73.984722        40.736837         -74.006672   
199997         30.9        -73.986017        40.756487         -73.858957   
199998         14.5        -73.997124        40.725452         -73.983215   
199999         14.1        -73.984395        40.720077         -73.985508   

        dropoff_latitude  passenger_count  hour  day_of_week  
0           

In [7]:
# Handle missing values
target_column = 'fare_amount'

# A ride without a fare has no label to learn from. Filling the target with
# the column mean would fabricate labels and distort both the fitted models
# and the reported R2/RMSE, so such rows are dropped instead of imputed.
df_labeled = df.dropna(subset=[target_column])
feature_columns = [col for col in df_labeled.columns if col != target_column]

# Replace missing feature values with the mean of each feature column.
# NOTE(review): the imputer is fit on all rows before the train/test split, so
# test rows still contribute to the fill values. To remove that leakage
# entirely, fit the imputer on the training split only.
imputer = SimpleImputer(strategy='mean')
features_imputed = pd.DataFrame(
    imputer.fit_transform(df_labeled[feature_columns]),
    columns=feature_columns,
    index=df_labeled.index,  # keep features aligned with their fare_amount rows
)

# Reassemble a complete frame with the same columns, in the same order, as df.
df_imputed = pd.concat(
    [df_labeled[[target_column]], features_imputed], axis=1
)[df_labeled.columns.tolist()]

In [8]:
# Separate the prediction target from the model inputs.
target_column = 'fare_amount'
y = df_imputed[target_column]                  # Series holding only the fare
X = df_imputed.drop(columns=[target_column])   # every remaining column is a feature

In [9]:
# Hold out 20% of the rows as the test set; fixing random_state makes the
# shuffle, and therefore every score below, reproducible.
split_options = {"test_size": 0.2, "random_state": 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [10]:
# Standardize the features to zero mean and unit standard deviation.
# The scaler learns its statistics from the training split only...
scaler = StandardScaler().fit(X_train)

# ...and those same training statistics are then applied to both splits, so
# nothing about the test set leaks into the scaling step.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [11]:
# Linear Regression: unregularized baseline fitted on the scaled training data
# (fit() returns the estimator itself, so it can be chained).
lr_model = LinearRegression().fit(X_train_scaled, y_train)

# Predicted fare_amount for each row of the held-out test set.
y_pred_lr = lr_model.predict(X_test_scaled)

In [12]:
# Ridge Regression: linear model with a squared-coefficient penalty.
# alpha is the penalty strength; 1.0 is a starting point worth tuning.
ridge_alpha = 1.0
ridge_model = Ridge(alpha=ridge_alpha).fit(X_train_scaled, y_train)

# Predicted fare_amount for each row of the held-out test set.
y_pred_ridge = ridge_model.predict(X_test_scaled)

In [13]:
# Lasso Regression: linear model with an absolute-value coefficient penalty,
# which can set some coefficients exactly to zero.
# alpha is the penalty strength; 0.1 is a starting point worth tuning.
lasso_alpha = 0.1
lasso_model = Lasso(alpha=lasso_alpha).fit(X_train_scaled, y_train)

# Predicted fare_amount for each row of the held-out test set.
y_pred_lasso = lasso_model.predict(X_test_scaled)

In [16]:
# Evaluate the models
def evaluate_model(y_true, y_pred, model_name):
    """Print the R2 score and RMSE of one model's predictions.

    Args:
        y_true: Observed target values.
        y_pred: Values predicted by the model for the same rows.
        model_name: Label placed at the start of the printed line.

    Returns:
        None. The metrics are only printed, as a single line of the form
        "<model_name> - R2 Score: <4 decimals>, RMSE: <2 decimals>".
    """
    r2 = r2_score(y_true, y_pred)
    # RMSE is the square root of the mean squared error, so it is expressed
    # in the same units as the target.
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    report = f"{model_name} - R2 Score: {r2:.4f}, RMSE: {rmse:.2f}"
    print(report)

In [18]:
# Score every model against the same held-out targets so the printed metrics
# are directly comparable.
predictions_by_model = [
    ("Linear Regression", y_pred_lr),
    ("Ridge Regression", y_pred_ridge),
    ("Lasso Regression", y_pred_lasso),
]
for model_name, y_pred in predictions_by_model:
    evaluate_model(y_test, y_pred, model_name)

Linear Regression - R2 Score: 0.0007, RMSE: 10.31
Ridge Regression - R2 Score: 0.0007, RMSE: 10.31
Lasso Regression - R2 Score: 0.0003, RMSE: 10.31


In [None]:
# 1. Understanding Regression Models in General
# Regression is a statistical method used to model and analyze relationships between a dependent variable (the target we’re trying to predict) and one or more independent variables (features or predictors). It’s commonly used to predict numerical values, like prices, quantities, and scores.

# Linear Regression: This is the simplest regression model. It assumes a linear relationship between the input variables and the output variable. The goal is to find the "line of best fit" that minimizes the difference between the actual data points and the predictions made by this line.

# Ridge Regression: A type of linear regression that includes a penalty (called regularization) to avoid overfitting. This penalty term, defined by alpha, reduces the impact of variables that have little predictive power by shrinking their coefficients, helping the model generalize better to new data.

# Lasso Regression: Like Ridge, Lasso regression also applies regularization but does so differently. It can shrink some feature coefficients entirely to zero, effectively selecting a subset of predictors. This makes Lasso useful for identifying the most important features in a dataset, as it essentially "drops" less important ones.

# 2. Breaking Down Each Section of Code
# Let’s discuss how each code section contributes to creating and evaluating these regression models.

# 2.1. Data Preprocessing
# Data preprocessing is about making the dataset ready for modeling. Here’s what each preprocessing step accomplishes:

# Date Parsing:

# By extracting the hour of the day and day of the week from pickup_datetime, the model can identify patterns in fare pricing based on time factors (e.g., higher fares during rush hours or weekends).
# Dropping Columns:

# Columns that don’t contribute to fare prediction (like pickup_datetime, Unnamed: 0, and key) are removed. This avoids noise in the model, improving its performance.
# Handling Missing Values:

# Missing data can skew predictions or make the model unstable. Here, the missing values are replaced with the mean of each column using SimpleImputer, which ensures all rows are complete.
# Feature Scaling:

# Scaling ensures that all features have similar ranges, which is especially important for models like Ridge and Lasso that are sensitive to the magnitude of feature values. StandardScaler standardizes data to have a mean of 0 and a standard deviation of 1.
# 2.2. Implementing Regression Models
# Each regression model has its specific characteristics:

# Linear Regression:

# Finds the line that best fits the data by minimizing the sum of squared differences between actual and predicted fare values.
# Ridge Regression:

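# Adds a penalty proportional to the square of the coefficients. This shrinks all coefficients toward zero, which reduces the influence of features with less predictive power and makes the model more stable when features are correlated and less prone to overfitting. (The penalty does not make the model robust to outliers in the data; the loss being minimized is still squared error.)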
# Lasso Regression:

# Adds a penalty based on the absolute values of coefficients, which can drive some coefficients to zero. This is useful for feature selection, as it effectively removes unimportant features.
# 2.3. Evaluating Model Performance
# The models are evaluated using:

# R² Score:

# R-squared measures how well the model explains the variation in the target variable. Higher values mean the model captures more of the data’s variance, indicating a better fit.
# RMSE (Root Mean Squared Error):

# RMSE is the square root of the average squared difference between actual and predicted values, expressed in the same units as the target (here, the fare amount). Lower values indicate better model accuracy.
# 3. Real-World Applications of Each Model
# Let’s look at practical situations where these models could be applied.

# Linear Regression

# House Price Prediction: You could predict house prices based on features like size, location, number of bedrooms, and age. A linear relationship often approximates such scenarios.
# Stock Price Prediction: Linear regression can offer insights into price movement over time based on factors like historical prices or economic indicators, though it’s often used as a simple baseline model in this case.
# Ridge Regression

# Predicting Loan Default Rates: A bank could use ridge regression to predict the likelihood of loan defaults based on multiple customer attributes (e.g., income, debt, and credit score). Ridge regression helps manage the noise from many variables and reduces overfitting.
# Medical Costs Prediction: In healthcare, ridge regression can help predict treatment costs based on numerous patient attributes. The regularization can prevent overfitting, especially with many variables.
# Lasso Regression

# Feature Selection in Marketing: Lasso regression could help a company decide which customer attributes (like age, location, spending habits) are most important when predicting the likelihood of making a purchase. Unimportant variables are automatically zeroed out, leaving only significant ones.
# Customer Churn Prediction: In telecommunications, companies can predict customer churn by identifying important features (e.g., call frequency, support ticket history) while ignoring less relevant ones.
# 4. How the Code Works in Practice
# In real applications:

# Data Preprocessing is a crucial step. Poor data preparation can lead to misleading results or overfitting. Handling missing values, transforming data, and scaling all ensure that the model has the best conditions to learn from the data.

# Model Selection:

# Linear Regression can work well when there’s a straightforward relationship between variables.
# Ridge and Lasso are beneficial when dealing with complex datasets with many features and potential multicollinearity (where features are correlated with each other). These models help create more stable predictions that generalize well to new data.
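# By applying these models to predict Uber fares, you can in principle see the impact of time-based factors (e.g., peak hours) and geographical factors (e.g., distance traveled) on pricing. In this notebook, however, all three models reach an R² below 0.001 with an RMSE of about 10.31, meaning they explain almost none of the variation in fares: in the cells shown, trip distance is never computed from the pickup and drop-off coordinates, and outliers are not identified or removed. Comparing the performance of each model allows you to choose the best approach depending on the data characteristics, but here the comparison mainly shows that better features are needed before the choice of regularization matters.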