# PreProcessing

In [1]:
import pandas as pd

df = pd.read_csv("uber.csv")

df.head()

Unnamed: 0.1,Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,24238194,2015-05-07 19:52:06.0000003,7.5,2015-05-07 19:52:06 UTC,-73.999817,40.738354,-73.999512,40.723217,1
1,27835199,2009-07-17 20:04:56.0000002,7.7,2009-07-17 20:04:56 UTC,-73.994355,40.728225,-73.99471,40.750325,1
2,44984355,2009-08-24 21:45:00.00000061,12.9,2009-08-24 21:45:00 UTC,-74.005043,40.74077,-73.962565,40.772647,1
3,25894730,2009-06-26 08:22:21.0000001,5.3,2009-06-26 08:22:21 UTC,-73.976124,40.790844,-73.965316,40.803349,3
4,17610152,2014-08-28 17:47:00.000000188,16.0,2014-08-28 17:47:00 UTC,-73.925023,40.744085,-73.973082,40.761247,5


In [2]:
df.drop(['Unnamed: 0','key'],axis =1,inplace = True)

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Data columns (total 7 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   fare_amount        200000 non-null  float64
 1   pickup_datetime    200000 non-null  object 
 2   pickup_longitude   200000 non-null  float64
 3   pickup_latitude    200000 non-null  float64
 4   dropoff_longitude  199999 non-null  float64
 5   dropoff_latitude   199999 non-null  float64
 6   passenger_count    200000 non-null  int64  
dtypes: float64(5), int64(1), object(1)
memory usage: 10.7+ MB


In [3]:
df.fillna(method='ffill', inplace=True) # Forward fill
df.isnull().sum()

fare_amount          0
pickup_datetime      0
pickup_longitude     0
pickup_latitude      0
dropoff_longitude    0
dropoff_latitude     0
passenger_count      0
dtype: int64

In [4]:
df.columns

Index(['fare_amount', 'pickup_datetime', 'pickup_longitude', 'pickup_latitude',
       'dropoff_longitude', 'dropoff_latitude', 'passenger_count'],
      dtype='object')

In [5]:
df.describe()

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
count,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0
mean,11.359955,-72.527638,39.935885,-72.525299,39.923895,1.684535
std,9.901776,11.437787,7.720539,13.117375,6.794812,1.385997
min,-52.0,-1340.64841,-74.015515,-3356.6663,-881.985513,0.0
25%,6.0,-73.992065,40.734796,-73.991407,40.733824,1.0
50%,8.5,-73.981823,40.752592,-73.980093,40.753042,1.0
75%,12.5,-73.967154,40.767158,-73.963659,40.768002,2.0
max,499.0,57.418457,1644.421482,1153.572603,872.697628,208.0


In [6]:
df.drop(df[df['fare_amount'].values<=0].index,inplace=True)
df[df['fare_amount'].values<=0]

Unnamed: 0,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count


In [7]:
# Convert 'pickup_datetime' to a datetime data type
df['pickup_datetime'] = pd.to_datetime(df['pickup_datetime'])

# Extract the day of the week (0 = Monday, 6 = Sunday)
df['day_of_week'] = df['pickup_datetime'].dt.dayofweek

In [8]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

X = scaler.fit_transform(df[['passenger_count', 'pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude', 'day_of_week']])
y = df['fare_amount']

In [9]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [10]:
from sklearn.metrics import mean_squared_error, r2_score
from math import sqrt

result = {}

# Models

## Linear Regression

In [11]:
from sklearn.linear_model import LinearRegression

linear_reg_model = LinearRegression()

linear_reg_model.fit(X_train,y_train)
y_pred_linear=linear_reg_model.predict(X_test)

result["Linear Regression"] = {"rmse": sqrt(mean_squared_error(y_test, y_pred_linear)), "r2": r2_score(y_test, y_pred_linear)}

## Lasso Regression

In [12]:
from sklearn.linear_model import Lasso

lasso_model = Lasso(alpha=1.0)

lasso_model.fit(X_train, y_train)
y_pred_lasso = lasso_model.predict(X_test)

result["Lasso Regression"] = {"rmse": sqrt(mean_squared_error(y_test, y_pred_lasso)), "r2": r2_score(y_test, y_pred_lasso)}

## Ridge Regression

In [13]:
from sklearn.linear_model import Ridge

ridge_model = Ridge(alpha=1.0)

ridge_model.fit(X_train, y_train)
y_pred_ridge = ridge_model.predict(X_test)

result["Ridge Regression"] = {"rmse": sqrt(mean_squared_error(y_test, y_pred_ridge)), "r2": r2_score(y_test, y_pred_ridge)}

## polynomial regression

In [14]:
from sklearn.preprocessing import PolynomialFeatures

degree = 2
poly = PolynomialFeatures(degree=degree)
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)

linear_poly_model = LinearRegression()

linear_poly_model.fit(X_train_poly, y_train)
y_pred_poly = linear_poly_model.predict(X_test_poly)

result["Polynomial Regression Regression"] = {"rmse": sqrt(mean_squared_error(y_test, y_pred_poly)), "r2": r2_score(y_test, y_pred_poly)}

## Principle Component Regression

In [15]:
from sklearn.decomposition import PCA

n_components = 6
pca = PCA(n_components=n_components)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

linear_pca_model = LinearRegression()

linear_pca_model.fit(X_train_pca, y_train)
y_pred_pca = linear_pca_model.predict(X_test_pca)

result["Principle Component Regression"] = {"rmse": sqrt(mean_squared_error(y_test, y_pred_pca)), "r2": r2_score(y_test, y_pred_pca)}

## Quantile Regressor

In [16]:
from sklearn.utils.fixes import sp_version, parse_version

solver = "highs" if sp_version >= parse_version("1.6.0") else "interior-point"

from sklearn.linear_model import QuantileRegressor

quantile_model = QuantileRegressor(alpha=0, solver=solver)

quantile_model.fit(X_train, y_train)
y_pred_quantile = quantile_model.predict(X_test)

result["Quantile Regression"] = {"rmse": sqrt(mean_squared_error(y_test, y_pred_quantile)), "r2": r2_score(y_test, y_pred_quantile)}

## Random Forest

In [17]:
from sklearn.ensemble import RandomForestRegressor

rf_model = RandomForestRegressor(n_estimators=100, random_state=42)

rf_model.fit(X_train,y_train)
y_pred_rf=linear_reg_model.predict(X_test)

result["Random Forest"] = {"rmse": sqrt(mean_squared_error(y_test, y_pred_rf)), "r2": r2_score(y_test, y_pred_rf)}

## PLSRegression

In [18]:
from sklearn.cross_decomposition import PLSRegression

pls_model = PLSRegression(n_components=2)

pls_model.fit(X_train, y_train)
y_pred_pls = pls_model.predict(X_test)

result["PLS Regression"] = {"rmse": sqrt(mean_squared_error(y_test, y_pred_pls)), "r2": r2_score(y_test, y_pred_pls)}

## Elastic Net

In [19]:
from sklearn.linear_model import ElasticNet


elastic_net_model = ElasticNet(alpha=0.5, l1_ratio=0.5)

elastic_net_model.fit(X_train, y_train)
y_pred_elastic = elastic_net_model.predict(X_test)

result["ElasticNet Regression"] = {"rmse": sqrt(mean_squared_error(y_test, y_pred_elastic)), "r2": r2_score(y_test, y_pred_elastic)}

# Result

In [20]:
r_df = pd.DataFrame(result)
r_df.T

Unnamed: 0,rmse,r2
Linear Regression,9.83219,7.8e-05
Lasso Regression,9.832655,-1.6e-05
Ridge Regression,9.83219,7.8e-05
Polynomial Regression Regression,496.973751,-2553.655523
Principle Component Regression,9.83219,7.8e-05
Quantile Regression,10.249301,-0.086561
Random Forest,9.83219,7.8e-05
PLS Regression,9.831716,0.000175
ElasticNet Regression,9.832655,-1.6e-05
