In [1]:
# Traffic Volume Estimation — Model Training Notebook

# 📦 Step 1: Import Required Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import joblib

# 📂 Step 2: Load the Dataset
# Read the raw traffic records from the CSV in the working directory and
# preview the first five rows.
df = pd.read_csv(filepath_or_buffer='traffic volume.csv')
df.head(n=5)


Unnamed: 0,holiday,temp,rain,snow,weather,date,Time,traffic_volume
0,,288.28,0.0,0.0,Clouds,02-10-2012,09:00:00,5545
1,,289.36,0.0,0.0,Clouds,02-10-2012,10:00:00,4516
2,,289.58,0.0,0.0,Clouds,02-10-2012,11:00:00,4767
3,,290.13,0.0,0.0,Clouds,02-10-2012,12:00:00,5026
4,,291.14,0.0,0.0,Clouds,02-10-2012,13:00:00,4918


In [2]:
# 🛠 Step 3: Feature Engineering - Extract Hour and Day of Week
# The raw 'date' (DD-MM-YYYY) and 'Time' (HH:MM:SS) strings are combined into a
# single timestamp, from which the hour of day and the day of week are derived.
#
# The guard makes this cell safe to re-run: once 'date'/'Time' have been replaced
# by the derived features, a second execution is a no-op instead of a KeyError.
if {'date', 'Time'}.issubset(df.columns):
    timestamp = pd.to_datetime(df['date'] + ' ' + df['Time'], format='%d-%m-%Y %H:%M:%S')
    df = (
        df
        .assign(hour=timestamp.dt.hour, dayofweek=timestamp.dt.dayofweek)
        # Drop original datetime columns
        .drop(columns=['date', 'Time'])
    )

# Fail loudly here rather than deep inside the pipeline if the features are missing.
assert {'hour', 'dayofweek'}.issubset(df.columns), "expected 'hour' and 'dayofweek' columns"
df.head()


Unnamed: 0,holiday,temp,rain,snow,weather,traffic_volume,hour,dayofweek
0,,288.28,0.0,0.0,Clouds,5545,9,1
1,,289.36,0.0,0.0,Clouds,4516,10,1
2,,289.58,0.0,0.0,Clouds,4767,11,1
3,,290.13,0.0,0.0,Clouds,5026,12,1
4,,291.14,0.0,0.0,Clouds,4918,13,1


In [3]:
# 🎯 Step 4: Define Features (X) and Target (y)

# Identify categorical and numerical features
cat_cols = ['holiday', 'weather']
num_cols = ['temp', 'rain', 'snow', 'hour', 'dayofweek']

# Select the model inputs explicitly instead of "everything except the target".
# The CSV carries a row-index column ('Unnamed: 0') that must not become a
# feature; previously it stayed out of the model only because the
# ColumnTransformer drops columns it was not told about.
X = df[cat_cols + num_cols]
y = df['traffic_volume']


In [4]:
# 🧹 Step 5: Preprocessing + Modeling Pipeline

# Categorical branch: one-hot encode the columns listed in cat_cols
categorical_encoder = OneHotEncoder(handle_unknown='ignore')

# Numeric branch: fill missing values with the column mean, then standardize
numeric_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Route each column group to its branch ('cat' first, then 'num')
preprocessor = ColumnTransformer(transformers=[
    ('cat', categorical_encoder, cat_cols),
    ('num', numeric_pipeline, num_cols),
])

# Chain the preprocessing with the random forest regressor
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', RandomForestRegressor(n_estimators=100, random_state=42)),
])


In [5]:
# 📊 Step 6: Train-Test Split
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [6]:
# 🚂 Step 7: Train the Model
# Fit the preprocessing steps and the regressor on the training split.
pipeline.fit(X=X_train, y=y_train)


In [7]:
# ✅ Step 8: Evaluate the Model
# Score the held-out split: predict once, then report both metrics.
y_pred = pipeline.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print("Mean Squared Error:", mse)
print("R2 Score:", r2)


Mean Squared Error: 243731.72172355326
R2 Score: 0.9383506043216459


In [9]:
# 💾 Step 9: Save Preprocessing Components and Model for Flask

# Extract the fitted transformers from the pipeline
preprocessor = pipeline.named_steps['preprocessor']

# Get the column transformers
encoder = preprocessor.named_transformers_['cat']         # OneHotEncoder
num_pipeline = preprocessor.named_transformers_['num']    # Pipeline with imputer & scaler

# Get imputer and scaler from the numeric pipeline
imputer = num_pipeline.named_steps['imputer']
scaler = num_pipeline.named_steps['scaler']

# Get the trained model
model_only = pipeline.named_steps['model']

# Persist the complete fitted pipeline as a single artifact as well.
# The individual parts below do not record which columns go to which transformer
# or the order in which their outputs are concatenated ('cat' block first, then
# 'num'); a consumer that loads them separately has to rebuild that layout by
# hand. Loading 'pipeline.pkl' and calling .predict() avoids that source of
# train/serve skew.
joblib.dump(pipeline, 'pipeline.pkl')

# Save each part for Flask app (file names unchanged so existing consumers keep working)
joblib.dump(encoder, 'encoder.pkl')
joblib.dump(imputer, 'imputer.pkl')
joblib.dump(scaler, 'scale.pkl')
joblib.dump(model_only, 'model.pkl')


['model.pkl']