In [None]:
import os
import pickle

import numpy as np
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, FunctionTransformer



In [79]:
# Location of the training CSV. The author's original local path stays the
# default so existing runs behave exactly as before; set the SWIGGY_DATA_PATH
# environment variable to run the notebook on another machine without editing
# this cell. An empty value is treated the same as an unset variable.
DATA_PATH = (
    os.environ.get("SWIGGY_DATA_PATH")
    or "D:\\Swiggy_Time_prediction\\Data\\swiggy_demographic.csv"
)

# Load the full dataset into memory.
df = pd.read_csv(DATA_PATH)

# Split the frame into predictors (every column except the target) and the
# "time_taken" target column.
X = df.drop("time_taken", axis=1)
y = df["time_taken"]


In [None]:


# Column the model is trained to predict.
TARGET = "time_taken"

# Numeric features routed through the log1p pipeline.
numeric_log_cols = ["ratings", "order_time_hour"]

# Numeric features routed through the plain (no log1p) numeric pipeline.
numeric_cols = [
    "distance",
    "age",
    "vehicle_condition",
    "multiple_deliveries",
    "order_day",
    "pickup_time_minutes",
]

# Ordinal categorical features, each mapped to its category levels in the
# order handed to the ordinal encoder. Keeping column and levels side by side
# means the two derived lists below can never drift out of alignment.
ordinal_levels = {
    "traffic": ["low", "medium", "high", "jam"],
    "city_type": ["semi-urban", "urban", "metropolitian"],
    "festival": ["no", "yes"],
}
ordinal_cols = list(ordinal_levels.keys())
ordinal_categories = list(ordinal_levels.values())

# Nominal (unordered) categorical feature.
nominal_cols = ["weather"]



def _categorical_pipeline(encoder):
    """Build a pipeline that fills gaps with the most frequent value, then encodes.

    Parameters
    ----------
    encoder : estimator
        The encoder placed in the "encoder" step after imputation.

    Returns
    -------
    Pipeline
        Two-step pipeline with steps named "imputer" and "encoder".
    """
    return Pipeline([
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", encoder),
    ])


# Numeric pipeline without log1p: median imputation only.
numeric_pipeline = Pipeline([("imputer", SimpleImputer(strategy="median"))])

# Numeric pipeline with log1p: median imputation followed by np.log1p.
# feature_names_out="one-to-one" makes the transformer report its input
# column names as its output feature names.
numeric_log_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("log1p", FunctionTransformer(np.log1p, feature_names_out="one-to-one")),
])

# Ordinal encoding pipeline, using the explicit level order per column.
# NOTE(review): this encoder keeps its default unknown-category handling,
# whereas the one-hot encoder below is told to ignore unknowns — confirm the
# difference is intended.
ordinal_pipeline = _categorical_pipeline(
    OrdinalEncoder(categories=ordinal_categories)
)

# Nominal encoding pipeline (fixed weather columns): the category list is
# pinned so the one-hot output columns do not depend on the training data.
nominal_pipeline = _categorical_pipeline(
    OneHotEncoder(
        categories=[["sunny", "stormy", "windy", "sandstorms", "fog"]],
        handle_unknown="ignore",
        sparse_output=False,
    )
)


# Routing table: (transformer name, pipeline, columns it receives).
column_routes = [
    ("num_log", numeric_log_pipeline, numeric_log_cols),
    ("num", numeric_pipeline, numeric_cols),
    ("ord", ordinal_pipeline, ordinal_cols),
    ("nom", nominal_pipeline, nominal_cols),
]

# Apply each pipeline to its column group and concatenate the results.
# remainder="drop" discards any column not listed above, and
# verbose_feature_names_out=False is passed so output feature names are not
# prefixed with the transformer name.
preprocessor = ColumnTransformer(
    column_routes,
    remainder="drop",
    verbose_feature_names_out=False,
)


# Random forest hyperparameters, gathered in one place so they are easy to
# inspect and tune. random_state is fixed so repeated runs use the same seed.
forest_settings = {
    "n_estimators": 200,
    "max_depth": 20,
    "min_samples_split": 5,
    "min_samples_leaf": 2,
    "random_state": 42,
    "n_jobs": -1,
}
model = RandomForestRegressor(**forest_settings)


# End-to-end estimator: raw feature frame -> preprocessing -> regressor.
final_pipeline = Pipeline([
    ("preprocessing", preprocessor),
    ("model", model),
])


# Derive the target and the predictors from the loaded frame via TARGET,
# then train preprocessing and model together on every available row.
y = df[TARGET]
X = df.drop(columns=TARGET)

final_pipeline.fit(X, y)


# Ask the fitted preprocessing step for the names of the columns it emits;
# these are the features the regressor actually receives.
final_features = final_pipeline.named_steps["preprocessing"].get_feature_names_out()

# Print the header followed by one feature name per line.
print("Final features used by model:", *final_features, sep="\n")


# Persist the fitted pipeline (preprocessing + model) as a single pickle in
# the current working directory so it can be reloaded as one object.
MODEL_FILE = "delivery_time_pipeline67.pkl"

with open(MODEL_FILE, "wb") as model_file:
    pickle.dump(final_pipeline, model_file)

print("✅ Fully corrected pipeline trained and saved successfully")


Final features used by model:
ratings
order_time_hour
distance
age
vehicle_condition
multiple_deliveries
order_day
pickup_time_minutes
traffic
city_type
festival
weather_sunny
weather_stormy
weather_windy
weather_sandstorms
weather_fog
✅ Fully corrected pipeline trained and saved successfully
