The goal of this exercise is to solidify your understanding of transforming classifiers into regressors and using pipelines to streamline machine learning workflows. By the end of this exercise, you should be able to:

1. Implement classification regression using KNN, SVM, Decision Trees, and Random Forests.
2. Preprocess data using transformers like StandardScaler, CategoricalEncoder.
3. Build and use pipelines to combine preprocessing steps and classification/regression models.

In [8]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline


In [None]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("yasserh/uber-fares-dataset")

print("Path to dataset files:", path)

In [10]:
# Install dependencies as needed:
# pip install kagglehub[pandas-datasets]
import kagglehub
from kagglehub import KaggleDatasetAdapter

# Set the path to the file you'd like to load
file_path = pd.read_csv(r"C:/Users/Prasad/.cache/kagglehub/datasets/yasserh/uber-fares-dataset/versions/1/uber.csv")


In [12]:
df = file_path.copy()

In [14]:
df.head()

Unnamed: 0.1,Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,24238194,2015-05-07 19:52:06.0000003,7.5,2015-05-07 19:52:06 UTC,-73.999817,40.738354,-73.999512,40.723217,1
1,27835199,2009-07-17 20:04:56.0000002,7.7,2009-07-17 20:04:56 UTC,-73.994355,40.728225,-73.99471,40.750325,1
2,44984355,2009-08-24 21:45:00.00000061,12.9,2009-08-24 21:45:00 UTC,-74.005043,40.74077,-73.962565,40.772647,1
3,25894730,2009-06-26 08:22:21.0000001,5.3,2009-06-26 08:22:21 UTC,-73.976124,40.790844,-73.965316,40.803349,3
4,17610152,2014-08-28 17:47:00.000000188,16.0,2014-08-28 17:47:00 UTC,-73.925023,40.744085,-73.973082,40.761247,5


In [16]:
df.columns

Index(['Unnamed: 0', 'key', 'fare_amount', 'pickup_datetime',
       'pickup_longitude', 'pickup_latitude', 'dropoff_longitude',
       'dropoff_latitude', 'passenger_count'],
      dtype='object')

In [18]:
df.isna().sum()

Unnamed: 0           0
key                  0
fare_amount          0
pickup_datetime      0
pickup_longitude     0
pickup_latitude      0
dropoff_longitude    1
dropoff_latitude     1
passenger_count      0
dtype: int64

In [20]:
df.dropna(inplace=True)

In [22]:
df.isna().sum()

Unnamed: 0           0
key                  0
fare_amount          0
pickup_datetime      0
pickup_longitude     0
pickup_latitude      0
dropoff_longitude    0
dropoff_latitude     0
passenger_count      0
dtype: int64

In [24]:
df.drop(['key','Unnamed: 0','pickup_datetime'], axis=1, inplace=True)

In [26]:
df.columns

Index(['fare_amount', 'pickup_longitude', 'pickup_latitude',
       'dropoff_longitude', 'dropoff_latitude', 'passenger_count'],
      dtype='object')

In [28]:
X = df.drop(['fare_amount'], axis=1)

In [30]:
X.head()

Unnamed: 0,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,-73.999817,40.738354,-73.999512,40.723217,1
1,-73.994355,40.728225,-73.99471,40.750325,1
2,-74.005043,40.74077,-73.962565,40.772647,1
3,-73.976124,40.790844,-73.965316,40.803349,3
4,-73.925023,40.744085,-73.973082,40.761247,5


In [32]:
from sklearn.preprocessing import StandardScaler

In [34]:
std_df = StandardScaler()
std_df =std_df.fit_transform(X)

In [36]:
X.columns

Index(['pickup_longitude', 'pickup_latitude', 'dropoff_longitude',
       'dropoff_latitude', 'passenger_count'],
      dtype='object')

In [38]:
std_df = pd.DataFrame(std_df, columns=['pickup_longitude', 'pickup_latitude', 'dropoff_longitude',
       'dropoff_latitude', 'passenger_count'])

In [40]:
std_df

Unnamed: 0,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,-0.128712,0.103940,-0.112387,0.117638,-0.493902
1,-0.128235,0.102628,-0.112021,0.121627,-0.493902
2,-0.129169,0.104253,-0.109570,0.124912,-0.493902
3,-0.126641,0.110739,-0.109780,0.129431,0.949109
4,-0.122173,0.104682,-0.110372,0.123235,2.392119
...,...,...,...,...,...
199994,-0.127596,0.104071,-0.111397,0.120151,-0.493902
199995,-0.127393,0.103744,-0.112933,0.120052,-0.493902
199996,-0.127506,0.106289,-0.101672,0.113130,0.227604
199997,-0.128477,0.102269,-0.111144,0.113546,-0.493902


In [42]:
y = df['fare_amount']

In [44]:
y.head()

0     7.5
1     7.7
2    12.9
3     5.3
4    16.0
Name: fare_amount, dtype: float64

# KNN Regressor

In [48]:
from sklearn.neighbors import KNeighborsRegressor

knn_regressor = KNeighborsRegressor(n_neighbors=3)
knn_regressor.fit(X, y)

print(knn_regressor.predict([[-0.128712,	0.103940,	-0.112387,	0.117638,	-0.493902]]))

[12.]




# SVM Regressor

In [59]:
from sklearn.svm import SVR

svr = SVR(kernel='rbf', C=1, epsilon=0.1)
svr.fit(X, y)

print(svr.predict([[-0.128712,	0.103940,	-0.112387,	0.117638,	-0.493902]]))

[8.53420645]




# Decision Tree Regression

In [51]:
from sklearn.tree import DecisionTreeRegressor

dt_regressor = DecisionTreeRegressor()
dt_regressor.fit(X, y)

print(dt_regressor.predict([[-0.128712,	0.103940,	-0.112387,	0.117638,	-0.493902]]))

[5.3]




## Random Forest Regressor

In [54]:
from sklearn.ensemble import RandomForestRegressor

rf_regressor = RandomForestRegressor(n_estimators=10, random_state=42)
rf_regressor.fit(X, y)

print(rf_regressor.predict([[-0.128712,	0.103940,	-0.112387,	0.117638,	-0.493902]]))

[12.52625]




## Pipeline

In [57]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor

pipeline = Pipeline([
    ('scaler', StandardScaler()), # Transformer
    ('regressor', RandomForestRegressor()) # Estimator
])

pipeline.fit(X, y)

print(pipeline.predict([[-0.128712,	0.103940,	-0.112387,	0.117638,	-0.493902 ]]))

[10.08401189]


