# Import libraries

In [25]:
import pandas as pd
import numpy as np

# Load the auto_fares_kollam dataset

In [26]:
# Read the ride records into a DataFrame. The path is relative, so it is
# resolved against the kernel's current working directory.
df = pd.read_csv("../data/auto_fares_kollam.csv")
# Bare last expression: render the loaded frame as the cell output.
df

Unnamed: 0,ride_id,pickup_type,distance_km,time_of_day,govt_expected_fare,actual_fare,remarks
0,1,railway_station,3.0,day,53,100,Google Maps distance used; daytime trip; no wa...
1,2,railway_station,2.0,day,38,50,Short distance ride; small fare increase
2,3,railway_station,3.0,night,79,120,Night journey with valid surcharge
3,4,railway_station,5.0,day,98,150,Rainy weather; higher quoted fare


# Dataset Information

In [27]:
# Print each column's non-null count and dtype to spot missing values or
# columns that were parsed with an unexpected type.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   ride_id             4 non-null      int64  
 1   pickup_type         4 non-null      object 
 2   distance_km         4 non-null      float64
 3   time_of_day         4 non-null      object 
 4   govt_expected_fare  4 non-null      int64  
 5   actual_fare         4 non-null      int64  
 6   remarks             4 non-null      object 
dtypes: float64(1), int64(3), object(3)
memory usage: 356.0+ bytes


# Statistics

In [28]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,ride_id,distance_km,govt_expected_fare,actual_fare
count,4.0,4.0,4.0,4.0
mean,2.5,3.25,67.0,105.0
std,1.290994,1.258306,26.720778,42.031734
min,1.0,2.0,38.0,50.0
25%,1.75,2.75,49.25,87.5
50%,2.5,3.0,66.0,110.0
75%,3.25,3.5,83.75,127.5
max,4.0,5.0,98.0,150.0


# Fare Deviation 
#### How much more or less was charged compared with the Kerala government fare rules?

In [29]:
# Signed gap between the fare actually charged and the government fare:
# a positive value means the rider paid more than the official rate.
df["fare_deviation"] = df["actual_fare"].sub(df["govt_expected_fare"])
df

Unnamed: 0,ride_id,pickup_type,distance_km,time_of_day,govt_expected_fare,actual_fare,remarks,fare_deviation
0,1,railway_station,3.0,day,53,100,Google Maps distance used; daytime trip; no wa...,47
1,2,railway_station,2.0,day,38,50,Short distance ride; small fare increase,12
2,3,railway_station,3.0,night,79,120,Night journey with valid surcharge,41
3,4,railway_station,5.0,day,98,150,Rainy weather; higher quoted fare,52


# To check for Overcharge

In [30]:
# Flag rides whose charged fare is strictly above the government fare.
df["overcharged"] = df["fare_deviation"].gt(0)
df

Unnamed: 0,ride_id,pickup_type,distance_km,time_of_day,govt_expected_fare,actual_fare,remarks,fare_deviation,overcharged
0,1,railway_station,3.0,day,53,100,Google Maps distance used; daytime trip; no wa...,47,True
1,2,railway_station,2.0,day,38,50,Short distance ride; small fare increase,12,True
2,3,railway_station,3.0,night,79,120,Night journey with valid surcharge,41,True
3,4,railway_station,5.0,day,98,150,Rainy weather; higher quoted fare,52,True


# Normalizing Categorical Text Fields
#### To avoid inconsistencies during analysis and modeling, categorical text fields such as `time_of_day` and `pickup_type` are normalized by converting them to lowercase and removing extra spaces.

#### Although the current dataset already contains consistent lowercase values, this ensures robustness against future data additions or user inputs where variations in casing or extra spaces may occur.

In [31]:
# Lower-case and trim both categorical text columns so that values such as
# " Day" and "day" are treated as the same category.
for text_col in ("time_of_day", "pickup_type"):
    df[text_col] = df[text_col].str.lower().str.strip()
df

Unnamed: 0,ride_id,pickup_type,distance_km,time_of_day,govt_expected_fare,actual_fare,remarks,fare_deviation,overcharged
0,1,railway_station,3.0,day,53,100,Google Maps distance used; daytime trip; no wa...,47,True
1,2,railway_station,2.0,day,38,50,Short distance ride; small fare increase,12,True
2,3,railway_station,3.0,night,79,120,Night journey with valid surcharge,41,True
3,4,railway_station,5.0,day,98,150,Rainy weather; higher quoted fare,52,True


# Choosing Features for the First Model

Before building any model, it’s important to clearly decide what information the model should learn from.  
For this initial version, I’ve intentionally kept the feature set small and simple to make the model easy to understand and explain.

Trip distance and time of day are chosen as the primary inputs, since these factors directly influence auto fare calculation in real-world scenarios.  
Starting with fewer features also helps avoid unnecessary complexity at an early stage of the project.


In [32]:
# Initial feature selection. `time_of_day` still holds the strings "day"/"night"
# at this point; X is rebound to the numerically encoded columns in a later cell
# before the model is fitted.
X = df[["distance_km", "time_of_day"]]
# Target: the fare that was actually charged.
y = df["actual_fare"]

# Encoding Time of Day

Since ML models work with numerical values, the `time_of_day` feature needs to be converted into a numeric format.  
As this feature has only two possible values (day and night), a simple binary encoding is used where day is represented as 0 and night as 1.

This approach keeps the model interpretable and avoids unnecessary complexity at this stage.

In [33]:
# Encode time of day as a binary flag: day = 0, night = 1.
time_of_day_codes = {"day": 0, "night": 1}
df["time_of_day_encoded"] = df["time_of_day"].map(time_of_day_codes)

# Series.map leaves NaN for any value that is not a key of the mapping
# (e.g. "evening" or a missing entry). Fail here with a clear message instead
# of letting NaN reach the model-fitting step.
unmapped = df.loc[df["time_of_day_encoded"].isna(), "time_of_day"].unique()
if len(unmapped) > 0:
    raise ValueError(f"Unexpected time_of_day values: {list(unmapped)}")

# Show the original label next to its numeric code.
df[["time_of_day", "time_of_day_encoded"]]

Unnamed: 0,time_of_day,time_of_day_encoded
0,day,0
1,day,0
2,night,1
3,day,0


In [34]:
# Final model inputs: trip distance plus the 0/1 time-of-day flag.
X = df[["distance_km", "time_of_day_encoded"]]
# Target: the fare that was actually charged.
y = df["actual_fare"]

In [35]:
# Print a heading, then rich-display the matching object, for the feature
# matrix first and the target second.
for heading, shown in (("Features (X):", X), ("\nTarget (y):", y)):
    print(heading)
    display(shown)

Features (X):


Unnamed: 0,distance_km,time_of_day_encoded
0,3.0,0
1,2.0,0
2,3.0,1
3,5.0,0



Target (y):


0    100
1     50
2    120
3    150
Name: actual_fare, dtype: int64

# Train the linear regression model

In [36]:
from sklearn.linear_model import LinearRegression

# Fit an ordinary least-squares model of actual fare on distance and the
# time-of-day flag; fit() returns the estimator itself, so it can be chained.
model = LinearRegression().fit(X, y)
# Bare last expression: render the fitted estimator as the cell output.
model

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


# Make predictions

In [37]:
# Predict on the same feature matrix the model was fitted on, so these values
# are in-sample fits rather than held-out predictions.
df["predicted_fare"] = model.predict(X)

# Show the predictions next to the fares that were actually charged.
df[["distance_km", "time_of_day", "actual_fare", "predicted_fare"]]

Unnamed: 0,distance_km,time_of_day,actual_fare,predicted_fare
0,3.0,day,100,89.285714
1,2.0,day,50,57.142857
2,3.0,night,120,120.0
3,5.0,day,150,153.571429


## Comparing Government Fare and Predicted Real-World Fare

At this stage, the government-expected fare and the ML-predicted fare are compared to understand how real-world charging behavior deviates from official pricing rules.  
This comparison helps highlight gaps between fair pricing and commonly observed fare quotations.

In [38]:
# Side-by-side view of the official fare, the fare actually charged, and the
# model's prediction for each ride.
df[[
    "distance_km",
    "time_of_day",
    "govt_expected_fare",
    "actual_fare",
    "predicted_fare"
]]

Unnamed: 0,distance_km,time_of_day,govt_expected_fare,actual_fare,predicted_fare
0,3.0,day,53,100,89.285714
1,2.0,day,38,50,57.142857
2,3.0,night,79,120,120.0
3,5.0,day,98,150,153.571429


## Overcharge Risk Indicator
#### A fare up to 20% above the predicted fare is treated as normal variation

To make the comparison easier for users, an overcharge risk indicator is derived by comparing the quoted fare with the ML-predicted real-world fare.  
Instead of showing only raw numbers, this indicator categorizes the fare into low, medium, or high risk levels, helping users quickly understand whether a quoted price is unusually high.


In [40]:
def overcharge_risk(actual, predicted, tolerance=0.2):
    """Classify a charged fare against the model's predicted fare.

    Parameters
    ----------
    actual : float
        Fare that was quoted or charged for the ride.
    predicted : float
        Fare predicted by the model for the same ride.
    tolerance : float, default 0.2
        Fraction above ``predicted`` that still counts as normal variation.
        The default keeps the original 20% band.

    Returns
    -------
    str
        ``"Low"`` when ``actual`` does not exceed ``predicted``;
        ``"Medium"`` when it exceeds ``predicted`` by at most ``tolerance``;
        ``"High"`` otherwise.

    Raises
    ------
    ValueError
        If ``tolerance`` is negative.
    """
    if tolerance < 0:
        raise ValueError("tolerance must be non-negative")

    if actual <= predicted:
        return "Low"
    # Upper edge of the "normal variation" band above the prediction.
    if actual <= (1 + tolerance) * predicted:
        return "Medium"
    # Note: NaN inputs fail both comparisons above and therefore land here.
    return "High"

# Classify every ride by pairing its charged fare with the model's prediction.
# Iterating over just the two needed columns avoids constructing a Series for
# each row (as DataFrame.apply with axis=1 does) and yields a plain, possibly
# empty, list of labels regardless of how many rows the frame has.
df["overcharge_risk"] = [
    overcharge_risk(actual, predicted)
    for actual, predicted in zip(df["actual_fare"], df["predicted_fare"])
]

# Show the official fare, predicted fare, charged fare and resulting risk label.
df[[
    "distance_km",
    "time_of_day",
    "govt_expected_fare",
    "predicted_fare",
    "actual_fare",
    "overcharge_risk"
]]

Unnamed: 0,distance_km,time_of_day,govt_expected_fare,predicted_fare,actual_fare,overcharge_risk
0,3.0,day,53,89.285714,100,Medium
1,2.0,day,38,57.142857,50,Low
2,3.0,night,79,120.0,120,Low
3,5.0,day,98,153.571429,150,Low


## Summary of Model Behavior

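At this stage, the regression model is able to capture basic fare patterns based on distance and time of day using a small, real-world dataset.  
The model is not intended to provide exact fare predictions, but to estimate typical charging behavior. The overcharge risk indicator compares each quoted fare against this estimate, while the government-prescribed fare is shown alongside as the official reference. Because the model was fitted and evaluated on the same four rides, the predictions shown above are in-sample and should not be read as a measure of accuracy.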

This completes the initial machine learning pipeline for the project.
