# Transportation Data Analysis (Python)

**Goal:** explore how trip delays vary by hour of day, by route, and by month (seasonality).

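> This notebook reads a synthetic CSV (`data/transportation_synthetic.csv`). You can swap in a real dataset at any time, as long as it provides the `date`, `hour`, `route`, and `delay_minutes` columns used below.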


In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Load the trip records and convert the `date` column to pandas timestamps
# in one chained expression.
df = pd.read_csv('data/transportation_synthetic.csv').assign(
    date=lambda frame: pd.to_datetime(frame['date'])
)
# Preview the first rows (rendered as the cell output).
df.head()


## Data quality checks


In [None]:
# Print the frame summary (columns, dtypes, non-null counts).
df.info()
# Number of missing values in each column; shown as the cell output.
df.isnull().sum()


In [None]:
# Drop fully duplicated rows, then label every row with its calendar month
# (monthly period rendered as a string) for the monthly trend section.
df = df.drop_duplicates().assign(
    month=lambda frame: frame['date'].dt.to_period('M').astype(str)
)
# Summary statistics of the cleaned frame (cell output).
df.describe()


## 1) Delay distribution


In [None]:
# Histogram of trip delays (30 bins), drawn on an explicit Figure/Axes pair.
fig, ax = plt.subplots()
df['delay_minutes'].hist(bins=30, ax=ax)
ax.set_title('Delay Minutes Distribution')
ax.set_xlabel('Delay minutes')
ax.set_ylabel('Trips')
fig.tight_layout()
plt.show()


## 2) Average delay by hour


In [None]:
# Mean delay for each value of `hour`, as a two-column frame.
avg_by_hour = df.groupby('hour')['delay_minutes'].mean().reset_index()

# Line chart of the hourly averages with a tick every two hours.
fig, ax = plt.subplots()
ax.plot(avg_by_hour['hour'], avg_by_hour['delay_minutes'])
ax.set_title('Average Delay by Hour')
ax.set_xlabel('Hour of day')
ax.set_ylabel('Avg delay (min)')
ax.set_xticks(range(0, 24, 2))
fig.tight_layout()
plt.show()

# The five hours with the highest average delay (cell output).
avg_by_hour.sort_values(by='delay_minutes', ascending=False).head(5)


## 3) Average delay by route


In [None]:
# Mean delay per route, ordered from the most to the least delayed route.
route_means = df.groupby('route')['delay_minutes'].mean().reset_index()
avg_by_route = route_means.sort_values(by='delay_minutes', ascending=False)

# Bar chart of the per-route averages.
fig, ax = plt.subplots()
ax.bar(avg_by_route['route'], avg_by_route['delay_minutes'])
ax.set_title('Average Delay by Route')
ax.set_xlabel('Route')
ax.set_ylabel('Avg delay (min)')
fig.tight_layout()
plt.show()

# Full ranking table (cell output).
avg_by_route


## 4) Monthly trend


In [None]:
# Mean delay for each `month` label.
monthly = df.groupby('month')['delay_minutes'].mean().reset_index()

# Line chart of the monthly averages; x labels are rotated so they do not overlap.
fig, ax = plt.subplots()
ax.plot(monthly['month'], monthly['delay_minutes'])
ax.set_title('Average Delay by Month')
ax.set_xlabel('Month')
ax.set_ylabel('Avg delay (min)')
plt.xticks(rotation=45)
fig.tight_layout()
plt.show()

# Monthly averages table (cell output).
monthly


## Summary (edit this)

- Peak delays occur during commuting hours (morning/evening).
- Some routes show consistently higher average delays.
- Average delay varies only slightly from month to month, suggesting mild seasonality.

**Next steps:** replace the synthetic data with a real dataset and add deeper segmentation.
