# Cleaning the data

In [1]:
import pandas as pd
from taxipred.utils.constants import TAXI_CSV_RAW, CLEANED_DATA

df = pd.read_csv(TAXI_CSV_RAW)
df.head(3)

Unnamed: 0,Trip_Distance_km,Time_of_Day,Day_of_Week,Passenger_Count,Traffic_Conditions,Weather,Base_Fare,Per_Km_Rate,Per_Minute_Rate,Trip_Duration_Minutes,Trip_Price
0,19.35,Morning,Weekday,3.0,Low,Clear,3.56,0.8,0.32,53.82,36.2624
1,47.59,Afternoon,Weekday,1.0,High,Clear,,0.62,0.43,40.57,
2,36.87,Evening,Weekend,1.0,High,Clear,2.7,1.21,0.15,37.27,52.9032


In [2]:
df.shape

(1000, 11)

### Changing the column-names to lowercase and strip of spaces

In [3]:
df.columns = (
    df.columns
    .str.strip()
    .str.lower()
    .str.replace(" ", "_")
)
df.columns

Index(['trip_distance_km', 'time_of_day', 'day_of_week', 'passenger_count',
       'traffic_conditions', 'weather', 'base_fare', 'per_km_rate',
       'per_minute_rate', 'trip_duration_minutes', 'trip_price'],
      dtype='object')

### Removing some columns/features

Based on EDA, these features have little to no effect on price:
- Passanger count
- Possibly base_fare and per_minute_rate, however I decided to keep these to make the prediciton more realistic

In [4]:
df = df.drop(columns="passenger_count")
df.columns, df.shape

(Index(['trip_distance_km', 'time_of_day', 'day_of_week', 'traffic_conditions',
        'weather', 'base_fare', 'per_km_rate', 'per_minute_rate',
        'trip_duration_minutes', 'trip_price'],
       dtype='object'),
 (1000, 10))

### Removing the NaN-values in target
- Supervised learning requires a "truth" for each training examples, so no NaN-values are accepted.

In [5]:
rows_before_nan = len(df)
df = df.dropna(subset=["trip_price"])
rows_after_nan = len(df)
print(f"Removed {rows_before_nan - rows_after_nan} nan-values")

Removed 49 nan-values


### Adding some rules to remove unwanted/invalid data

In [6]:
data_rules = (
    (df["trip_price"] > 0) &
    (df["trip_distance_km"] > 0) &
    (df["trip_duration_minutes"] > 0)
)
rows_before_rules = len(df)

df = df[data_rules]

rows_after_rules = len(df)
print(f"Removed {rows_before_rules - rows_after_rules} rows by applying rules")

Removed 93 rows by applying rules


In [7]:
numeric_columns = df.select_dtypes(include="number").columns

### Removing outliers from target, based on EDA

In [8]:
Q1 = df[numeric_columns].quantile(0.25)
Q3 = df[numeric_columns].quantile(0.75)
IQR = Q3 - Q1

lower = Q1 - 1.5 * IQR
upper = Q3 + 1.5 * IQR

rows_before = len(df)

mask = ~(
    (df[numeric_columns] < lower) |
    (df[numeric_columns] > upper)
).any(axis=1)

df = df[mask]

rows_after = len(df)

print(f"Removed {rows_before - rows_after} outliers")

Removed 25 outliers


In [9]:
df["trip_distance_km"].sort_values(ascending=False).head(20)

531    49.99
847    49.85
475    49.66
390    49.53
705    49.51
518    49.38
69     49.36
441    49.35
661    49.34
499    49.32
154    49.30
810    49.26
932    49.24
675    49.14
581    49.02
305    48.82
498    48.75
226    48.68
139    48.62
471    48.52
Name: trip_distance_km, dtype: float64

In [10]:
df["trip_price"].sort_values(ascending=False).head(20)

245    118.0321
278    117.7468
385    116.4206
478    114.9417
140    110.2544
560    109.8965
441    109.4796
810    109.4304
351    108.9734
758    108.2325
265    107.7025
437    106.2963
115    106.0042
93     105.9418
248    105.1440
76     104.2421
50     104.1764
717    104.1555
869    102.8850
200    102.8806
Name: trip_price, dtype: float64

In [11]:
df["per_minute_rate"].sort_values(ascending=False).head(3)

964    0.5
43     0.5
601    0.5
Name: per_minute_rate, dtype: float64

### Keeping NaN in features

I will handle the feature-NaN-values in pipeline/after train test split, to avoid data leakage

### Saving cleaned CSV

In [12]:
df.to_csv(CLEANED_DATA/"taxi_prices_cleaned.csv", index=False)