# Train transport_model.ipynb




In [1]:
# Imports
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
import joblib
import os

# Paths
BUS_PATH = "PMPL dataset.xlsx"
METRO_PATH = "pune_metro.xlsx"
ROUTES_PATH = "Pune Routes Cleaned.csv"

# Output directories.
# The model, label encoder and sample predictions are all written under
# "model_samp/" further down, so that directory has to exist before
# joblib.dump / to_csv run on a fresh checkout. "model" is still created so
# anything that relied on it keeps working.
for out_dir in ("model", "model_samp"):
    os.makedirs(out_dir, exist_ok=True)

# Load datasets
# Only routes_df feeds the labeling/training cells shown in this notebook;
# bus_df and metro_df are loaded and reported here for reference.
bus_df = pd.read_excel(BUS_PATH, sheet_name="Sheet1")
metro_df = pd.read_excel(METRO_PATH, sheet_name="pune_metro_full_schedule_with_c")
routes_df = pd.read_csv(ROUTES_PATH)

print("Loaded: bus:", bus_df.shape, "metro:", metro_df.shape, "routes:", routes_df.shape)


Loaded: bus: (21814, 12) metro: (211, 13) routes: (531, 9)


## Improved Labeling


In [2]:
# Make labels
# Work on a copy so routes_df stays exactly as loaded.
df = routes_df.copy()

# Ensure columns
# Any column the labeling / feature steps read but the CSV lacks is created
# as all-NaN so later code can treat it as "missing" instead of raising.
expected_cols = ('Station_Type', 'Station_Distance_km', 'Fare_INR', 'Travel_Time_min')
absent = [name for name in expected_cols if name not in df.columns]
for name in absent:
    df[name] = np.nan

# Label function with eco-bias
def label_row(r, walk_km=0.5):
    """Return the recommended transport mode for one route row.

    Rules are checked in this order:
      1. 'Walk'  - the station distance is at most ``walk_km``.
      2. 'Mixed' - Station_Type contains both 'Metro' and 'Bus'.
      3. 'Metro' - Station_Type contains 'Metro' (eco-bias: metro wins).
      4. 'Bus'   - everything else, including a missing Station_Type.

    Station_Type matching is a case-sensitive substring test.

    Parameters
    ----------
    r : mapping-like row (pandas Series or dict)
        Must provide 'Station_Distance_km'; 'Station_Type' is optional.
    walk_km : float, default 0.5
        Largest station distance (km) that is still labeled 'Walk'.

    Returns
    -------
    str
        One of 'Walk', 'Mixed', 'Metro', 'Bus'.
    """
    raw_dist = r['Station_Distance_km']
    try:
        # Missing distance -> infinitely far, so the row can never be 'Walk'.
        dist = float(raw_dist) if pd.notna(raw_dist) else float('inf')
    except (TypeError, ValueError):
        # Non-numeric text (e.g. "N/A") is treated like a missing value
        # instead of aborting the whole df.apply with a ValueError.
        dist = float('inf')

    # str() turns a NaN Station_Type into 'nan', which falls through to 'Bus'.
    st = str(r.get('Station_Type', 'Bus'))

    if dist <= walk_km:
        return 'Walk'
    if 'Metro' in st and 'Bus' in st:
        return 'Mixed'
    if 'Metro' in st:
        return 'Metro'  # Prefer metro for eco
    return 'Bus'

# Label every route row, then show the class balance of the result.
recommended = df.apply(label_row, axis=1)
df['Recommended_Mode'] = recommended
mode_counts = df['Recommended_Mode'].value_counts()
print(mode_counts)


Recommended_Mode
Bus      324
Metro    158
Walk      49
Name: count, dtype: int64


In [3]:
# Feature engineering: Add eco-feature (CO2 savings g/km, heuristic: metro=high, bus=low)
# Exact-match lookup on Station_Type: 'Metro' -> 50, 'Bus' -> 20, anything
# else (including combined or missing types) -> 0.
station_type = df['Station_Type']
df['CO2_Savings_g'] = np.select(
    [station_type == 'Metro', station_type == 'Bus'],
    [50, 20],
    default=0,
)

# Model inputs, in the column order the classifier is trained on.
features = [
    'Station_Distance_km',
    'Distance_from_End_km',
    'Fare_INR',
    'Travel_Time_min',
    'CO2_Savings_g',
]

# Fill missing
# Coerce each feature to numeric first (unparseable entries become NaN) and
# take the median from that coerced series. Computing the median on the raw
# column breaks when the column is object-typed.
# If a column has no numeric values at all its median is NaN, so the column
# stays NaN after this step.
for f in features:
    numeric_col = pd.to_numeric(df[f], errors='coerce')
    df[f] = numeric_col.fillna(numeric_col.median())

# Feature matrix and string target used by the encode / split step.
X = df[features]
y = df['Recommended_Mode']

# Encode
# Map the string mode labels to integer class ids; `le` is kept so the ids
# can be turned back into labels later.
le = LabelEncoder()
y_enc = le.fit_transform(y)

# Split
# 60/40 train/test split, stratified on the encoded label, fixed seed.
split_kwargs = {"test_size": 0.4, "random_state": 42, "stratify": y_enc}
X_train, X_test, y_train, y_test = train_test_split(X, y_enc, **split_kwargs)

# Train with regularization and class weights
rf_params = {
    "n_estimators": 100,
    "max_depth": 10,
    "class_weight": 'balanced',
    "random_state": 42,
    "n_jobs": -1,
}
model = RandomForestClassifier(**rf_params)
model.fit(X_train, y_train)

# Evaluate with CV
# The 5-fold CV runs on the full X / y_enc (train and test rows together).
# NOTE(review): Recommended_Mode is produced by label_row from
# Station_Distance_km and Station_Type. The first is a feature and the second
# is encoded in CO2_Savings_g, so near-perfect scores are expected here and
# say little about generalization.
cv_scores = cross_val_score(model, X, y_enc, cv=5)
print("CV Scores:", cv_scores.mean())

train_score = model.score(X_train, y_train)
print("Train score:", train_score)
test_score = model.score(X_test, y_test)
print("Test score :", test_score)

# Save
# Persist the fitted classifier and its label encoder side by side so the
# predicted class ids can be decoded wherever the model is loaded.
artifacts = (
    (model, "model_samp/transport_model.pkl"),
    (le, "model_samp/label_encoder.pkl"),
)
for obj, target_path in artifacts:
    joblib.dump(obj, target_path)
print("Saved model and encoder")


CV Scores: 1.0
Train score: 1.0
Test score : 1.0
Saved model and encoder


## Evaluation

In [4]:
# Predict and save sample
# Start from the held-out features and attach the decoded true label first,
# then the decoded prediction, so the CSV columns end with True, Pred.
res = X_test.copy()
res['True'] = le.inverse_transform(y_test)

preds = model.predict(X_test)
preds_labels = le.inverse_transform(preds)
res['Pred'] = preds_labels

sample_path = "model_samp/predictions_sample.csv"
res.to_csv(sample_path, index=False)
print('Saved sample predictions')


Saved sample predictions


In [5]:
# Report mean accuracy on each split, train first, then test.
accuracy_report = (
    ("Train accuracy:", X_train, y_train),
    ("Test accuracy:", X_test, y_test),
)
for caption, split_X, split_y in accuracy_report:
    print(caption, model.score(split_X, split_y))

Train accuracy: 1.0
Test accuracy: 1.0
