# Cleaning the data

In [47]:
import pandas as pd
from taxipred.utils.constants import TAXI_CSV_RAW

# Load the raw taxi dataset from the path constant imported above.
df = pd.read_csv(TAXI_CSV_RAW)
# Preview the first three rows as a quick sanity check of the load.
df.head(3)

Unnamed: 0,Trip_Distance_km,Time_of_Day,Day_of_Week,Passenger_Count,Traffic_Conditions,Weather,Base_Fare,Per_Km_Rate,Per_Minute_Rate,Trip_Duration_Minutes,Trip_Price
0,19.35,Morning,Weekday,3.0,Low,Clear,3.56,0.8,0.32,53.82,36.2624
1,47.59,Afternoon,Weekday,1.0,High,Clear,,0.62,0.43,40.57,
2,36.87,Evening,Weekend,1.0,High,Clear,2.7,1.21,0.15,37.27,52.9032


In [48]:
# (rows, columns) of the frame as loaded, before any cleaning step runs.
df.shape

(1000, 11)

### Normalizing column names: strip whitespace, lowercase, and replace spaces with underscores

In [49]:
# Normalise the headers to snake_case: trim surrounding whitespace,
# lowercase everything, then turn remaining spaces into underscores.
normalised_columns = df.columns.str.strip().str.lower().str.replace(" ", "_")
df.columns = normalised_columns
df.columns

Index(['trip_distance_km', 'time_of_day', 'day_of_week', 'passenger_count',
       'traffic_conditions', 'weather', 'base_fare', 'per_km_rate',
       'per_minute_rate', 'trip_duration_minutes', 'trip_price'],
      dtype='object')

### Removing the NaN-values in target
- Supervised learning requires a ground-truth value for every training example, so rows with a NaN target cannot be used.

In [50]:
# Discard every row whose target is missing and report how many were lost.
target_column = "trip_price"

rows_before_nan = df.shape[0]
df = df.dropna(subset=[target_column])
rows_after_nan = df.shape[0]

removed_nan_rows = rows_before_nan - rows_after_nan
print(f"Removed {removed_nan_rows} nan-values")

Removed 49 nan-values


### Adding some rules to remove unwanted/invalid data

In [51]:
# Validity rules: price, distance and duration must all be strictly positive.
# NOTE(review): a comparison against NaN evaluates to False, so any row with a
# missing distance or duration is dropped by this mask as well — confirm that
# this is intended rather than imputing those values.
positive_columns = ["trip_price", "trip_distance_km", "trip_duration_minutes"]
data_rules = df[positive_columns].gt(0).all(axis=1)

rows_before_rules = df.shape[0]
df = df[data_rules]
rows_after_rules = df.shape[0]

removed_by_rules = rows_before_rules - rows_after_rules
print(f"Removed {removed_by_rules} rows by applying rules")

Removed 93 rows by applying rules


### Removing outliers from target, based on EDA

In [52]:
# IQR-based outlier filter on the target. Only the upper fence
# (Q3 + 1.5 * IQR) is applied; low prices are left untouched.
price = df["trip_price"]
Q1, Q3 = price.quantile(0.25), price.quantile(0.75)
IQR = Q3 - Q1
upper = Q3 + 1.5 * IQR

rows_before_outliers = df.shape[0]
df = df[price <= upper]
rows_after_outliers = df.shape[0]

removed_outliers = rows_before_outliers - rows_after_outliers
print(f"Removed {removed_outliers} outliers")

Removed 25 outliers


### Removing some columns/features

Based on EDA, these features have little to no effect on price:
- Passenger count
- Possibly base_fare and per_minute_rate; however, I decided to keep these to make the prediction more realistic

In [53]:
# Drop the feature the EDA found uninformative, then show the remaining
# columns together with the final (rows, columns) shape.
df = df.drop(columns=["passenger_count"])
(df.columns, df.shape)

(Index(['trip_distance_km', 'time_of_day', 'day_of_week', 'traffic_conditions',
        'weather', 'base_fare', 'per_km_rate', 'per_minute_rate',
        'trip_duration_minutes', 'trip_price'],
       dtype='object'),
 (833, 10))