# Test and compare datasets
- `df_full` is the original data with every row that contains a missing value dropped
- `df_cleaned` is the cleaned data, where missing values have been filled in

In [1]:
import pandas as pd

# Original dataset: any row containing a missing value is discarded.
df_full = (
    pd.read_csv("../src/taxipred/data/taxi_trip_pricing.csv")
    .dropna()
)

# Cleaned dataset (missing values filled); the first CSV column is the index.
df_cleaned = pd.read_csv(
    "../src/taxipred/data/taxi_trip_pricing_cleaned.csv",
    index_col=0,
)

# Show row count, column dtypes and non-null counts for the dropna baseline.
df_full.info()

<class 'pandas.core.frame.DataFrame'>
Index: 562 entries, 0 to 998
Data columns (total 11 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Trip_Distance_km       562 non-null    float64
 1   Time_of_Day            562 non-null    object 
 2   Day_of_Week            562 non-null    object 
 3   Passenger_Count        562 non-null    float64
 4   Traffic_Conditions     562 non-null    object 
 5   Weather                562 non-null    object 
 6   Base_Fare              562 non-null    float64
 7   Per_Km_Rate            562 non-null    float64
 8   Per_Minute_Rate        562 non-null    float64
 9   Trip_Duration_Minutes  562 non-null    float64
 10  Trip_Price             562 non-null    float64
dtypes: float64(7), object(4)
memory usage: 52.7+ KB


In [2]:
# Same summary for the cleaned dataset, to compare its row count with df_full.
df_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
Index: 925 entries, 0 to 999
Data columns (total 11 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Trip_Distance_km       925 non-null    float64
 1   Time_of_Day            925 non-null    object 
 2   Day_of_Week            925 non-null    object 
 3   Passenger_Count        925 non-null    float64
 4   Traffic_Conditions     925 non-null    object 
 5   Weather                925 non-null    object 
 6   Base_Fare              925 non-null    float64
 7   Per_Km_Rate            925 non-null    float64
 8   Per_Minute_Rate        925 non-null    float64
 9   Trip_Duration_Minutes  925 non-null    float64
 10  Trip_Price             925 non-null    float64
dtypes: float64(7), object(4)
memory usage: 86.7+ KB


## Encode data with one-hot encoding

In [3]:
# One-hot encode the categorical columns of both datasets in the same way.
# drop_first=True drops the first level of each category, so a column with
# k categories becomes k-1 indicator columns.
df_cleaned, df_full = (
    pd.get_dummies(frame, drop_first=True) for frame in (df_cleaned, df_full)
)

# Inspect the encoded layout of the cleaned dataset.
df_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
Index: 925 entries, 0 to 999
Data columns (total 15 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Trip_Distance_km           925 non-null    float64
 1   Passenger_Count            925 non-null    float64
 2   Base_Fare                  925 non-null    float64
 3   Per_Km_Rate                925 non-null    float64
 4   Per_Minute_Rate            925 non-null    float64
 5   Trip_Duration_Minutes      925 non-null    float64
 6   Trip_Price                 925 non-null    float64
 7   Time_of_Day_Evening        925 non-null    bool   
 8   Time_of_Day_Morning        925 non-null    bool   
 9   Time_of_Day_Night          925 non-null    bool   
 10  Day_of_Week_Weekend        925 non-null    bool   
 11  Traffic_Conditions_Low     925 non-null    bool   
 12  Traffic_Conditions_Medium  925 non-null    bool   
 13  Weather_Rain               925 non-null    bool   
 14 

In [4]:
# Check that df_full was encoded into the same set of columns as df_cleaned.
df_full.info()

<class 'pandas.core.frame.DataFrame'>
Index: 562 entries, 0 to 998
Data columns (total 15 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Trip_Distance_km           562 non-null    float64
 1   Passenger_Count            562 non-null    float64
 2   Base_Fare                  562 non-null    float64
 3   Per_Km_Rate                562 non-null    float64
 4   Per_Minute_Rate            562 non-null    float64
 5   Trip_Duration_Minutes      562 non-null    float64
 6   Trip_Price                 562 non-null    float64
 7   Time_of_Day_Evening        562 non-null    bool   
 8   Time_of_Day_Morning        562 non-null    bool   
 9   Time_of_Day_Night          562 non-null    bool   
 10  Day_of_Week_Weekend        562 non-null    bool   
 11  Traffic_Conditions_Low     562 non-null    bool   
 12  Traffic_Conditions_Medium  562 non-null    bool   
 13  Weather_Rain               562 non-null    bool   
 14 

## Use the `test_models` function to get a DataFrame of performance scores for different models and scalers

In [7]:
from model_testing import test_models

# Evaluate each dataset against the same target column and tag every result
# row with the name of the dataset it came from.
result_full = test_models(df_full, "Trip_Price").assign(dataset="df_full")
result_cleaned = test_models(df_cleaned, "Trip_Price").assign(dataset="df_cleaned")

# Stack both result tables and rank all runs by RMSE, lowest first.
results = pd.concat([result_full, result_cleaned], ignore_index=True)
results.sort_values(by="rmse", ignore_index=True)



Unnamed: 0,model,scaler,mae,mse,rmse,dataset
0,RandomForestRegressor,StandardScaler,4.343948,44.674463,6.683896,df_cleaned
1,RandomForestRegressor,MinMaxScaler,4.526564,46.24885,6.800651,df_cleaned
2,RandomForestRegressor,No Scaler,4.573389,47.836114,6.916366,df_cleaned
3,XGBRegressor,StandardScaler,5.041203,52.002562,7.21128,df_cleaned
4,XGBRegressor,MinMaxScaler,5.041203,52.002562,7.21128,df_cleaned
5,XGBRegressor,No Scaler,5.041203,52.002562,7.21128,df_cleaned
6,ElasticNetCV,StandardScaler,6.569476,70.447528,8.393303,df_cleaned
7,ElasticNetCV,MinMaxScaler,6.596511,70.800735,8.414317,df_cleaned
8,RidgeCV,No Scaler,6.591788,70.812323,8.415006,df_cleaned
9,RidgeCV,MinMaxScaler,6.59007,70.815279,8.415181,df_cleaned


### `df_cleaned` scored better overall (lower error); random forest was the best model, and the standard scaler gave the best score

**conclusion for the dataset:**
- use cleaned dataset
    - standard scaler is better for this dataset if scaling is needed
    - random forest model is the best model for this dataset