## Importing and loading data from data.gov.my
Daily ridership data for the 5 main KTMB services (Komuter, Komuter Utara, Intercity, ETS and Shuttle Tebrau) from 2020 to 2025.

In [21]:
import pandas as pd

# Read the daily KTMB ridership CSV into a DataFrame and preview the first rows
csv_path = "ridership_ktmb_daily.csv"
df = pd.read_csv(csv_path)
df.head()


Unnamed: 0,date,service,ridership
0,2020-10-15,ets,737
1,2020-10-16,ets,1065
2,2020-10-17,ets,856
3,2020-10-18,ets,1037
4,2020-10-19,ets,845


Checking how many rows we have

In [23]:
# len() of a DataFrame is its number of rows
numrows = len(df)
print("Number of rows:", numrows)


Number of rows: 7573


Computing mean, median and mode of daily ridership across the full dataset (2020–2025)

In [31]:
# Work on the numeric "ridership" column only
ridership = df["ridership"]

# Central-tendency statistics over every row of the column.
# mode() returns a Series (there can be ties), so take its first entry.
mean_val, median_val = ridership.mean(), ridership.median()
mode_val = ridership.mode()[0]

# Report each statistic on its own line
for label, value in (("Mean:", mean_val), ("Median:", median_val), ("Mode:", mode_val)):
    print(label, value)


Mean: 9951.979268453717
Median: 8383.0
Mode: 117


## Without using pandas

In [32]:
#Reading CSV and extracting ridership data

#Creating a list for ridership
ridership = []

#Using loops to extract ridership column from the CSV
with open("ridership_ktmb_daily.csv", "r") as f:
    # Skip the header row; the default stops an empty file from raising StopIteration
    next(f, None)
    for line in f:
        row = line.strip()
        # A blank line (e.g. extra newline at the end of the file) has no fields,
        # so indexing into it would raise IndexError - skip it instead
        if not row:
            continue
        parts = row.split(",")
        # The ridership value is the third comma-separated field;
        # rows where it is empty are left out of the list
        if parts[2]:
            ridership.append(int(parts[2]))

In [33]:
#Mean (average) ridership across every value read from the CSV:
#the sum of the values divided by how many values there are
count = len(ridership)
total = sum(ridership)
mean_val = total / count
print("Mean:", mean_val)

Mean: 9951.979268453717


In [34]:
#Median ridership across every value read from the CSV
# Sort a copy so the original list keeps its file order
sorted_r = list(ridership)
sorted_r.sort()
n = len(sorted_r)

# mid is the upper-middle index; is_odd is 1 when there is a single middle value
mid, is_odd = divmod(n, 2)
# Odd count: the middle value itself. Even count: average of the two middle values.
median_val = sorted_r[mid] if is_odd else (sorted_r[mid - 1] + sorted_r[mid]) / 2
print("Median:", median_val)


Median: 8383


In [None]:
#Computing mode
#I struggled with writing the code for this, so I used AI to help me understand
#how I can use a dictionary to help with computing the mode.

# Tally how many times each ridership value occurs
counts = {}
for val in ridership:
    counts[val] = counts.get(val, 0) + 1  # increment count for each value

max_count = max(counts.values())  # find highest frequency

# Several values can share the highest frequency. Take the smallest of them so the
# result agrees with pandas' Series.mode()[0] (mode() returns the modes sorted in
# ascending order) rather than depending on which value appears first in the file.
mode_val = min(k for k, v in counts.items() if v == max_count)
print("Mode:", mode_val)

Mode: 117


## Data Visualization for KTMB Ridership Between 2020 - 2025

In [39]:
#Total ridership per calendar year, 2020 - 2025
# NOTE(review): the preview rows start in Oct 2020, so the first (and possibly the
# last) year may not cover a full 12 months - confirm before comparing years

# Parse the date strings once and store them back on the frame
dates = pd.to_datetime(df["date"])
df["date"] = dates

# Calendar year of each row
df["year"] = dates.dt.year

# Sum the ridership column within each year
yearly_totals = df["ridership"].groupby(df["year"]).sum()

print(yearly_totals)

year
2020      536931
2021     1985380
2022     9073854
2023    16324981
2024    26464882
2025    20980311
Name: ridership, dtype: int64


In [41]:
#For the data visualisation part, I will use a text bar chart: one row of '*' per year.
#It reads the yearly_totals Series computed in the previous cell instead of
#re-typing the numbers, so the chart always matches the loaded data.

# Define unit per star (rough estimation where one '*' is 500k)
unit = 500_000

# The legend is derived from `unit` so the two cannot drift apart
print(f"Yearly Ridership (each * = {unit:,})\n")
for year, val in yearly_totals.items():
    # Floor division: a partly filled unit is not drawn as a star
    stars = int(val // unit)
    print(f"{year}: {'*' * stars} ({val})")

Yearly Ridership (each * = 500,000)

2020: * (536931)
2021: *** (1985380)
2022: ****************** (9073854)
2023: ******************************** (16324981)
2024: **************************************************** (26464882)
2025: ***************************************** (20980311)
