# Cleaning the data

In [57]:
import pandas as pd

# Load the raw taxi trip pricing data. The path is relative, so it resolves
# against the directory the notebook kernel was started in.
df = pd.read_csv("../data/raw/taxi_trip_pricing.csv")
# Preview the first three rows to check that the file parsed as expected.
df.head(3)

Unnamed: 0,Trip_Distance_km,Time_of_Day,Day_of_Week,Passenger_Count,Traffic_Conditions,Weather,Base_Fare,Per_Km_Rate,Per_Minute_Rate,Trip_Duration_Minutes,Trip_Price
0,19.35,Morning,Weekday,3.0,Low,Clear,3.56,0.8,0.32,53.82,36.2624
1,47.59,Afternoon,Weekday,1.0,High,Clear,,0.62,0.43,40.57,
2,36.87,Evening,Weekend,1.0,High,Clear,2.7,1.21,0.15,37.27,52.9032


### Converting the column names to lowercase and stripping whitespace

In [58]:
# Normalise every header: trim surrounding whitespace, lowercase it, and
# replace inner spaces with underscores so the names are snake_case.
df.columns = [
    header.strip().lower().replace(" ", "_")
    for header in df.columns
]
df.columns

Index(['trip_distance_km', 'time_of_day', 'day_of_week', 'passenger_count',
       'traffic_conditions', 'weather', 'base_fare', 'per_km_rate',
       'per_minute_rate', 'trip_duration_minutes', 'trip_price'],
      dtype='object')

### First approach: filling the NaN values to preserve the size of the dataset

In [59]:
# Inspect dtypes and non-null counts per column to see where values are missing.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 11 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   trip_distance_km       950 non-null    float64
 1   time_of_day            950 non-null    object 
 2   day_of_week            950 non-null    object 
 3   passenger_count        950 non-null    float64
 4   traffic_conditions     950 non-null    object 
 5   weather                950 non-null    object 
 6   base_fare              950 non-null    float64
 7   per_km_rate            950 non-null    float64
 8   per_minute_rate        950 non-null    float64
 9   trip_duration_minutes  950 non-null    float64
 10  trip_price             951 non-null    float64
dtypes: float64(7), object(4)
memory usage: 86.1+ KB


In [60]:
# Work on a copy so the original frame `df` stays untouched for the
# alternative cleaning strategy further down.
df_filled = df.copy()

# Fill strategy per column: "median" or "mode" (most frequent value).
# "trip_price" is not listed, so this cell leaves its NaNs in place.
fillna_method = {
    "trip_distance_km": "median",
    "passenger_count": "median",
    "trip_duration_minutes": "median",
    "time_of_day": "mode",
    "traffic_conditions": "mode",
    "weather": "mode",
    "day_of_week": "mode",
    "base_fare": "median",
    "per_km_rate": "median",
    "per_minute_rate": "median"
}

for column, method in fillna_method.items():
    if method == "median":
        fill_value = df_filled[column].median()
    elif method == "mode":
        # mode() returns a Series because ties are possible; take the first entry.
        fill_value = df_filled[column].mode().iloc[0]
    else:
        # Fail loudly: a typo in the mapping would otherwise leave NaNs behind unnoticed.
        raise ValueError(f"Unknown fill method {method!r} for column {column!r}")
    df_filled[column] = df_filled[column].fillna(fill_value)

# Sanity check: every column listed in the mapping must now be complete.
assert df_filled[list(fillna_method)].notna().all().all(), "imputation left NaN values"

df_filled.head()

Unnamed: 0,trip_distance_km,time_of_day,day_of_week,passenger_count,traffic_conditions,weather,base_fare,per_km_rate,per_minute_rate,trip_duration_minutes,trip_price
0,19.35,Morning,Weekday,3.0,Low,Clear,3.56,0.8,0.32,53.82,36.2624
1,47.59,Afternoon,Weekday,1.0,High,Clear,3.52,0.62,0.43,40.57,
2,36.87,Evening,Weekend,1.0,High,Clear,2.7,1.21,0.15,37.27,52.9032
3,30.33,Evening,Weekday,4.0,Low,Clear,3.48,0.51,0.15,116.81,36.4698
4,25.83,Evening,Weekday,3.0,High,Clear,2.93,0.63,0.32,22.64,15.618


In [61]:
# Re-check the non-null counts after filling to see which columns still have NaNs.
df_filled.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 11 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   trip_distance_km       1000 non-null   float64
 1   time_of_day            1000 non-null   object 
 2   day_of_week            1000 non-null   object 
 3   passenger_count        1000 non-null   float64
 4   traffic_conditions     1000 non-null   object 
 5   weather                1000 non-null   object 
 6   base_fare              1000 non-null   float64
 7   per_km_rate            1000 non-null   float64
 8   per_minute_rate        1000 non-null   float64
 9   trip_duration_minutes  1000 non-null   float64
 10  trip_price             951 non-null    float64
dtypes: float64(7), object(4)
memory usage: 86.1+ KB


In [62]:
# Remove every row that still holds at least one NaN. Per the info() output
# above, only trip_price has missing values left at this point.
df_filled = df_filled.dropna(axis="index", how="any")
df_filled.info()

<class 'pandas.core.frame.DataFrame'>
Index: 951 entries, 0 to 999
Data columns (total 11 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   trip_distance_km       951 non-null    float64
 1   time_of_day            951 non-null    object 
 2   day_of_week            951 non-null    object 
 3   passenger_count        951 non-null    float64
 4   traffic_conditions     951 non-null    object 
 5   weather                951 non-null    object 
 6   base_fare              951 non-null    float64
 7   per_km_rate            951 non-null    float64
 8   per_minute_rate        951 non-null    float64
 9   trip_duration_minutes  951 non-null    float64
 10  trip_price             951 non-null    float64
dtypes: float64(7), object(4)
memory usage: 89.2+ KB


In [67]:
# Save the filled dataset with all columns; index=False keeps the row index out of the file.
df_filled.to_csv("../data/processed/taxi_fillednan_all_columns.csv", index=False)

### Since the NaN values were evenly distributed, I also try dropping every row that contains one

In [63]:
# Alternative strategy: start again from the original frame and discard
# every row that has at least one missing value, in any column.
df_no_nan = df.dropna(axis="index", how="any")
df_no_nan.info()

<class 'pandas.core.frame.DataFrame'>
Index: 562 entries, 0 to 998
Data columns (total 11 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   trip_distance_km       562 non-null    float64
 1   time_of_day            562 non-null    object 
 2   day_of_week            562 non-null    object 
 3   passenger_count        562 non-null    float64
 4   traffic_conditions     562 non-null    object 
 5   weather                562 non-null    object 
 6   base_fare              562 non-null    float64
 7   per_km_rate            562 non-null    float64
 8   per_minute_rate        562 non-null    float64
 9   trip_duration_minutes  562 non-null    float64
 10  trip_price             562 non-null    float64
dtypes: float64(7), object(4)
memory usage: 52.7+ KB


### From the EDA, I decided to drop some columns
(since they had little to no effect on the final price)

However, I first save the new "cleaned" data to a separate file, so I can try training on it later and compare.

In [64]:
# Save the NaN-free dataset with all columns; index=False keeps the row index out of the file.
df_no_nan.to_csv("../data/processed/taxi_nonan_all_columns.csv", index=False)

And one without extra columns:

In [65]:
# Columns removed from the reduced datasets.
columns_to_drop = ["base_fare", "passenger_count", "per_minute_rate"]

# `columns=` already names the axis, so no separate `axis` argument is needed.
df_dropped = df_no_nan.drop(columns=columns_to_drop)

In [66]:
# Save the NaN-free, reduced-column dataset; index=False keeps the row index out of the file.
df_dropped.to_csv("../data/processed/taxi_nonan_dropped_columns.csv", index=False)

In [68]:
# Apply the same column removal to the filled frame.
# `columns=` already names the axis, so no separate `axis` argument is needed.
df_filled_nan_dropped_columns = df_filled.drop(columns=columns_to_drop)

In [69]:
# Save the filled, reduced-column dataset. index=False matches the other
# three exports: without it the row index (non-contiguous after dropna) is
# written as an extra unnamed column that reappears as "Unnamed: 0" on reload.
df_filled_nan_dropped_columns.to_csv("../data/processed/taxi_fillednan_dropped_columns.csv", index=False)