# Import libraries

In [None]:
import pandas as pd
import numpy as np

# Load the auto_fares_kollam dataset

In [2]:
# Load the auto_fares_kollam CSV into a DataFrame. The path is relative,
# so it resolves against the working directory the notebook is run from.
df = pd.read_csv(filepath_or_buffer="../data/auto_fares_kollam.csv")
df

Unnamed: 0,ride_id,pickup_type,distance_km,time_of_day,govt_expected_fare,actual_fare,remarks
0,1,railway_station,3.0,day,53,100,Google Maps distance used; daytime trip; no wa...
1,2,railway_station,2.0,day,38,50,Short distance ride; small fare increase
2,3,railway_station,3.0,night,79,120,Night journey with valid surcharge
3,4,railway_station,5.0,day,98,150,Rainy weather; higher quoted fare


# Dataset Information

In [3]:
# Print the frame's structure: index range, column names, non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   ride_id             4 non-null      int64  
 1   pickup_type         4 non-null      object 
 2   distance_km         4 non-null      float64
 3   time_of_day         4 non-null      object 
 4   govt_expected_fare  4 non-null      int64  
 5   actual_fare         4 non-null      int64  
 6   remarks             4 non-null      object 
dtypes: float64(1), int64(3), object(3)
memory usage: 356.0+ bytes


# Statistics

In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,ride_id,distance_km,govt_expected_fare,actual_fare
count,4.0,4.0,4.0,4.0
mean,2.5,3.25,67.0,105.0
std,1.290994,1.258306,26.720778,42.031734
min,1.0,2.0,38.0,50.0
25%,1.75,2.75,49.25,87.5
50%,2.5,3.0,66.0,110.0
75%,3.25,3.5,83.75,127.5
max,4.0,5.0,98.0,150.0


# Fare Deviation
#### How much more or less was charged compared to the Kerala government's fare rules?

In [5]:
# Signed gap between the fare actually charged and the government-expected fare:
# positive means the rider paid more than expected, negative means less.
df["fare_deviation"] = df["actual_fare"].sub(df["govt_expected_fare"])
df

Unnamed: 0,ride_id,pickup_type,distance_km,time_of_day,govt_expected_fare,actual_fare,remarks,fare_deviation
0,1,railway_station,3.0,day,53,100,Google Maps distance used; daytime trip; no wa...,47
1,2,railway_station,2.0,day,38,50,Short distance ride; small fare increase,12
2,3,railway_station,3.0,night,79,120,Night journey with valid surcharge,41
3,4,railway_station,5.0,day,98,150,Rainy weather; higher quoted fare,52


# Check for Overcharging

In [6]:
# Flag rides with a strictly positive fare deviation; a ride charged exactly
# the expected fare (deviation of 0) is not marked as overcharged.
df["overcharged"] = df["fare_deviation"].gt(0)
df

Unnamed: 0,ride_id,pickup_type,distance_km,time_of_day,govt_expected_fare,actual_fare,remarks,fare_deviation,overcharged
0,1,railway_station,3.0,day,53,100,Google Maps distance used; daytime trip; no wa...,47,True
1,2,railway_station,2.0,day,38,50,Short distance ride; small fare increase,12,True
2,3,railway_station,3.0,night,79,120,Night journey with valid surcharge,41,True
3,4,railway_station,5.0,day,98,150,Rainy weather; higher quoted fare,52,True


# Normalizing Categorical Text Fields
#### To avoid inconsistencies during analysis and modeling, categorical text fields such as `time_of_day` and `pickup_type` are normalized by converting them to lowercase and stripping leading and trailing whitespace.

#### Although the current dataset already contains consistent lowercase values, this ensures robustness against future data additions or user inputs where variations in casing or stray leading/trailing spaces may occur.

In [7]:
# Canonicalise the categorical text columns: lowercase first, then trim
# leading/trailing whitespace, writing each result back to the same column.
for text_column in ("time_of_day", "pickup_type"):
    df[text_column] = df[text_column].str.lower().str.strip()
df

Unnamed: 0,ride_id,pickup_type,distance_km,time_of_day,govt_expected_fare,actual_fare,remarks,fare_deviation,overcharged
0,1,railway_station,3.0,day,53,100,Google Maps distance used; daytime trip; no wa...,47,True
1,2,railway_station,2.0,day,38,50,Short distance ride; small fare increase,12,True
2,3,railway_station,3.0,night,79,120,Night journey with valid surcharge,41,True
3,4,railway_station,5.0,day,98,150,Rainy weather; higher quoted fare,52,True
