In [1]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score, roc_curve


In [2]:
 
# Fix the legacy global NumPy seed so every np.random.* draw below is repeatable.
np.random.seed(42)
n = 2000  # total number of records to generate

# One timestamp per record, spaced an hour apart. The lowercase 'h' alias is
# used to avoid the deprecation warning raised for the uppercase spelling.
date_range = pd.date_range('2025-01-01', periods=n, freq='h')

# Category vocabularies sampled from when the synthetic frame is built.
routes = [f'R{idx}' for idx in range(1, 5)]
stops = ['S' + str(idx) for idx in range(1, 21)]
weathers = ['Clear', 'Rainy', 'Cloudy']

In [3]:

# Draw every synthetic column up front. The draws are made in the same order
# as the columns appear in the frame, so the global NumPy random stream is
# consumed in exactly that sequence.
route_draws = np.random.choice(routes, n, p=[0.35, 0.30, 0.20, 0.15])
stop_draws = np.random.choice(stops, n)
onboard_draws = np.random.poisson(lam=25, size=n)
boarding_draws = np.random.poisson(lam=5, size=n)
alighting_draws = np.random.poisson(lam=4, size=n)
weather_draws = np.random.choice(weathers, n, p=[0.6, 0.25, 0.15])
holiday_draws = np.random.choice([0, 1], n, p=[0.95, 0.05])

# Assemble the raw observation table: one row per hourly timestamp.
data = pd.DataFrame({
    'timestamp': date_range,
    'route': route_draws,
    'stop_id': stop_draws,
    'passenger_count': onboard_draws,
    'boardings': boarding_draws,
    'alightings': alighting_draws,
    'weather': weather_draws,
    'is_holiday': holiday_draws,
})


In [4]:

# Fold boardings and alightings into the on-board count, flooring at zero.
# NOTE: passenger_count is overwritten in place, so running this cell a second
# time applies the adjustment again.
data['passenger_count'] = (
    data['passenger_count']
    .add(data['boardings'])
    .sub(data['alightings'])
    .clip(lower=0)
)

# Calendar features taken from the timestamp column.
timestamps = data['timestamp']
data['hour'] = timestamps.dt.hour
data['day_of_week'] = timestamps.dt.day_name()

# Capacity assigned to each route code.
capacity_map = dict(R1=55, R2=50, R3=45, R4=40)
data['capacity'] = data['route'].map(capacity_map)

# Flag the hours treated as peak: 07-09 and 17-19.
peak_hours = [7, 8, 9, 17, 18, 19]
data['is_peak'] = data['hour'].isin(peak_hours).astype(int)

In [5]:

# Sanity-check the derived features: hour, day of week, route capacity, peak flag.
# These columns are built once in the previous cell; they are validated here
# rather than recomputed, so the capacity table and the peak-hour list live in
# a single place and cannot drift apart.
expected_cols = {'hour', 'day_of_week', 'capacity', 'is_peak'}
missing_cols = expected_cols - set(data.columns)
assert not missing_cols, f"derived columns missing from data: {sorted(missing_cols)}"

# A NaN capacity would mean a route code that is absent from the capacity table.
assert data['capacity'].notna().all(), "route without a capacity mapping"
assert data['hour'].between(0, 23).all(), "hour outside 0-23"
assert data['is_peak'].isin([0, 1]).all(), "is_peak must be a 0/1 flag"
assert data['passenger_count'].ge(0).all(), "negative passenger_count"

In [6]:

# Row selectors for the two demand effects.
peak_rows = data['is_peak'] == 1
rainy_rows = data['weather'] == 'Rainy'

# Add extra riders: first to peak-hour rows, then to rainy rows. The order
# matters because both draws come from the shared NumPy random stream.
# NOTE: these are in-place increments, so re-running the cell adds riders again.
data.loc[peak_rows, 'passenger_count'] += np.random.poisson(lam=10, size=peak_rows.sum())
data.loc[rainy_rows, 'passenger_count'] += np.random.poisson(lam=3, size=rainy_rows.sum())

# Target variable: 1 when the bus carries more passengers than its capacity.
data['overloaded'] = data['passenger_count'].gt(data['capacity']).astype(int)


In [7]:
# Persist the synthetic dataset to a CSV in the current working directory.
out_path = 'bus_counter.csv'
# index=False keeps the row index out of the file.
data.to_csv(out_path, index=False)
print("✅ CSV file saved as:", out_path)
# Last expression of the cell: preview the first five rows.
data.head()

✅ CSV file saved as: bus_counter.csv


Unnamed: 0,timestamp,route,stop_id,passenger_count,boardings,alightings,weather,is_holiday,hour,day_of_week,capacity,is_peak,overloaded
0,2025-01-01 00:00:00,R2,S7,30,2,2,Clear,0,0,Wednesday,50,0,0
1,2025-01-01 01:00:00,R4,S3,26,2,7,Rainy,0,1,Wednesday,40,0,0
2,2025-01-01 02:00:00,R3,S14,32,6,3,Clear,0,2,Wednesday,45,0,0
3,2025-01-01 03:00:00,R2,S10,20,1,4,Clear,0,3,Wednesday,50,0,0
4,2025-01-01 04:00:00,R1,S16,25,4,4,Clear,0,4,Wednesday,55,0,0


In [8]:
# Reload the saved dataset, parsing the timestamp column back into datetimes.
df = pd.read_csv('bus_counter.csv', parse_dates=['timestamp'])

# Quick health checks: null count per column and the class balance of the target.
missing_per_column = df.isna().sum()
target_share = df['overloaded'].value_counts(normalize=True)

print("Shape:", df.shape)
print("\nMissing values per column:\n", missing_per_column)
print("\nTarget distribution:\n", target_share)
df.head()


Shape: (2000, 13)

Missing values per column:
 timestamp          0
route              0
stop_id            0
passenger_count    0
boardings          0
alightings         0
weather            0
is_holiday         0
hour               0
day_of_week        0
capacity           0
is_peak            0
overloaded         0
dtype: int64

Target distribution:
 overloaded
0    0.9765
1    0.0235
Name: proportion, dtype: float64


Unnamed: 0,timestamp,route,stop_id,passenger_count,boardings,alightings,weather,is_holiday,hour,day_of_week,capacity,is_peak,overloaded
0,2025-01-01 00:00:00,R2,S7,30,2,2,Clear,0,0,Wednesday,50,0,0
1,2025-01-01 01:00:00,R4,S3,26,2,7,Rainy,0,1,Wednesday,40,0,0
2,2025-01-01 02:00:00,R3,S14,32,6,3,Clear,0,2,Wednesday,45,0,0
3,2025-01-01 03:00:00,R2,S10,20,1,4,Clear,0,3,Wednesday,50,0,0
4,2025-01-01 04:00:00,R1,S16,25,4,4,Clear,0,4,Wednesday,55,0,0


In [9]:

# ------------------ Step 5: Feature selection + train-test split ------------------
# Column groups consumed by the preprocessing step of the model.
categorical_cols = ['route', 'day_of_week', 'weather']
numeric_cols = ['hour', 'passenger_count', 'capacity', 'is_peak', 'is_holiday']

target_col = 'overloaded'
feature_cols = [
    'route', 'hour', 'day_of_week', 'passenger_count',
    'capacity', 'is_peak', 'weather', 'is_holiday',
]

# NOTE(review): the target is defined as passenger_count > capacity, and both
# of those columns are kept as features here, so the label is a direct function
# of the inputs. Confirm that this is intended before reading much into the
# model's scores.
df_model = df[feature_cols + [target_col]].copy()

X = df_model[feature_cols]
y = df_model[target_col]

# 80/20 split, stratified on the target so both parts keep the same class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
    stratify=y,
)


In [None]:
# Build and fit the classification pipeline.
# NOTE: in the saved notebook this cell has no execution count and the cells
# after it recorded NameError for model_pipeline, so it must be run before them.

# Numeric columns are standardised; categorical columns are one-hot encoded
# with the first level of each dropped.
numeric_step = ('num', StandardScaler(), numeric_cols)
categorical_step = ('cat', OneHotEncoder(drop='first', sparse_output=False), categorical_cols)
preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

# class_weight='balanced' reweights the classes by their inverse frequency.
classifier = LogisticRegression(max_iter=1000, class_weight='balanced', random_state=42)

model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('clf', classifier),
])

model_pipeline.fit(X_train, y_train)
print("✅ Model trained successfully!")


In [14]:
# ------------------ Step 7: Model Evaluation ------------------
# Hard class predictions plus the probability column for the second class
# (label 1, i.e. overloaded) on the held-out rows.
y_pred = model_pipeline.predict(X_test)
y_proba = model_pipeline.predict_proba(X_test)[:, 1]

# NOTE: the target distribution printed earlier shows roughly 2% positives, so
# accuracy alone says little; read it alongside AUC and the per-class report.
test_accuracy = accuracy_score(y_test, y_pred)
test_auc = roc_auc_score(y_test, y_proba)
test_confusion = confusion_matrix(y_test, y_pred)
test_report = classification_report(y_test, y_pred)

print("Accuracy:", test_accuracy)
print("AUC:", test_auc)
print("\nConfusion Matrix:\n", test_confusion)
print("\nClassification Report:\n", test_report)


NameError: name 'model_pipeline' is not defined

In [15]:
# ROC Curve
# Compute the curve from the predicted positive-class probabilities; the
# threshold array returned by roc_curve is not needed for the plot.
fpr, tpr, _ = roc_curve(y_test, y_proba)

fig, ax = plt.subplots(figsize=(6, 5))
ax.plot(fpr, tpr, label='ROC curve')
# Dashed diagonal: the reference line of a classifier with no skill.
ax.plot([0, 1], [0, 1], '--', color='gray')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curve')
ax.legend()
ax.grid(True)
plt.show()

NameError: name 'y_proba' is not defined

In [16]:
# Rank the features by the magnitude of their logistic-regression coefficient.
fitted_preprocessor = model_pipeline.named_steps['preprocessor']
ohe = fitted_preprocessor.named_transformers_['cat']

# Column order after preprocessing: numeric columns first, then the one-hot
# columns, matching the order of the transformers in the ColumnTransformer.
num_features = numeric_cols
cat_feature_names = list(ohe.get_feature_names_out(categorical_cols))
feature_names = num_features + cat_feature_names

# coef_ has one row for this binary problem; take it as a flat vector.
coeffs = model_pipeline.named_steps['clf'].coef_[0]

feat_imp = (
    pd.DataFrame({
        'feature': feature_names,
        'coef': coeffs,
        'abs_coef': np.abs(coeffs),
    })
    .sort_values(by='abs_coef', ascending=False)
)
feat_imp.head(20)


NameError: name 'model_pipeline' is not defined

In [17]:
# Print the written summary of the analysis, one line at a time.
# NOTE(review): these statements are hard-coded rather than derived from the
# metrics, and the evaluation cells in the saved notebook recorded NameError
# instead of results. Confirm each claim against a fresh run.
conclusion_lines = [
    "\n# Conclusion",
    "- Logistic Regression model achieved high accuracy.",
    "- Most important features affecting bus overload:",
    "  - passenger_count",
    "  - route (R1–R4)",
    "  - is_peak (rush hour)",
    "  - capacity",
    "- Model can help transport managers predict overloaded buses in advance.",
]
for conclusion_line in conclusion_lines:
    print(conclusion_line)



# Conclusion
- Logistic Regression model achieved high accuracy.
- Most important features affecting bus overload:
  - passenger_count
  - route (R1–R4)
  - is_peak (rush hour)
  - capacity
- Model can help transport managers predict overloaded buses in advance.
