In [5]:
import pandas as pd
import numpy as np

from sklearn.feature_selection import mutual_info_regression
from sklearn.ensemble import RandomForestRegressor
from sklearn.inspection import permutation_importance
from sklearn.linear_model import Lasso
from sklearn.preprocessing import StandardScaler

from statsmodels.stats.outliers_influence import variance_inflation_factor


In [None]:
df = pd.read_csv(r"D:\Swiggy_Time_prediction\Data\encoded.csv")

In [13]:
df.drop(columns=["Unnamed: 0","city_name"],inplace=True)

In [14]:
df.columns

Index(['age', 'ratings', 'restaurant_latitude', 'restaurant_longitude',
       'delivery_latitude', 'delivery_longitude', 'traffic',
       'vehicle_condition', 'multiple_deliveries', 'festival', 'city_type',
       'time_taken', 'order_day', 'order_month', 'order_day_of_week',
       'is_weekend', 'pickup_time_minutes', 'order_time_hour',
       'order_time_of_day', 'distance', 'weather_fog', 'weather_sandstorms',
       'weather_stormy', 'weather_sunny', 'weather_windy',
       'type_of_order_drinks', 'type_of_order_meal', 'type_of_order_snack',
       'type_of_vehicle_electric_scooter', 'type_of_vehicle_motorcycle',
       'type_of_vehicle_scooter'],
      dtype='str')

In [15]:


X = df.drop(columns=["time_taken"])
y = df["time_taken"]


In [16]:
numerical_features = X.select_dtypes(include=["int64", "float64"]).columns.tolist()
categorical_features = X.select_dtypes(include=["object", "bool"]).columns.tolist()

numerical_features, categorical_features


(['age',
  'ratings',
  'restaurant_latitude',
  'restaurant_longitude',
  'delivery_latitude',
  'delivery_longitude',
  'traffic',
  'vehicle_condition',
  'multiple_deliveries',
  'festival',
  'city_type',
  'order_day',
  'order_month',
  'order_day_of_week',
  'is_weekend',
  'pickup_time_minutes',
  'order_time_hour',
  'order_time_of_day',
  'distance'],
 ['weather_fog',
  'weather_sandstorms',
  'weather_stormy',
  'weather_sunny',
  'weather_windy',
  'type_of_order_drinks',
  'type_of_order_meal',
  'type_of_order_snack',
  'type_of_vehicle_electric_scooter',
  'type_of_vehicle_motorcycle',
  'type_of_vehicle_scooter'])

In [17]:
from sklearn.feature_selection import VarianceThreshold

vt = VarianceThreshold(threshold=0.01)
vt.fit(X)

high_variance_features = X.columns[vt.get_support()].tolist()
low_variance_features = X.columns[~vt.get_support()].tolist()

high_variance_features, low_variance_features


(['age',
  'restaurant_latitude',
  'delivery_latitude',
  'traffic',
  'vehicle_condition',
  'multiple_deliveries',
  'festival',
  'city_type',
  'order_day',
  'order_month',
  'order_day_of_week',
  'is_weekend',
  'pickup_time_minutes',
  'order_time_hour',
  'order_time_of_day',
  'distance',
  'weather_fog',
  'weather_sandstorms',
  'weather_stormy',
  'weather_sunny',
  'weather_windy',
  'type_of_order_drinks',
  'type_of_order_meal',
  'type_of_order_snack',
  'type_of_vehicle_electric_scooter',
  'type_of_vehicle_motorcycle',
  'type_of_vehicle_scooter'],
 ['ratings', 'restaurant_longitude', 'delivery_longitude'])

In [None]:
from sklearn.feature_selection import VarianceThreshold

vt = VarianceThreshold(threshold=0.01)
vt.fit(X)

high_variance_features = X.columns[vt.get_support()].tolist()
low_variance_features = X.columns[~vt.get_support()].tolist()

high_variance_features, low_variance_features
['age',
  'restaurant_latitude',
  'delivery_latitude',
  'traffic',
  'vehicle_condition',
  'multiple_deliveries',
  'festival',
  'city_type',
  'order_day',
  'order_month',
  'order_day_of_week',
  'is_weekend',
  'pickup_time_minutes',
  'order_time_hour',
  'order_time_of_day',
  'distance',
  'weather_fog',
  'weather_sandstorms',
  'weather_stormy',
  'weather_sunny',
  'weather_windy',
  'type_of_order_drinks',
  'type_of_order_meal',
  'type_of_order_snack',
  'type_of_vehicle_electric_scooter',
  'type_of_vehicle_motorcycle',
  'type_of_vehicle_scooter'],
 ['ratings', 'restaurant_longitude', 'delivery_longitude']

In [18]:
from sklearn.feature_selection import mutual_info_regression
import pandas as pd

mi_scores = mutual_info_regression(X, y, random_state=42)

mi_df = pd.DataFrame({
    "feature": X.columns,
    "mutual_info": mi_scores
}).sort_values(by="mutual_info", ascending=False)

mi_df


Unnamed: 0,feature,mutual_info
1,ratings,0.155001
16,order_time_hour,0.129502
6,traffic,0.117512
8,multiple_deliveries,0.117286
18,distance,0.088129
11,order_day,0.083886
0,age,0.076505
4,delivery_latitude,0.063275
7,vehicle_condition,0.060347
17,order_time_of_day,0.050413


In [19]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

X_vif = X[numerical_features]

vif_df = pd.DataFrame()
vif_df["feature"] = X_vif.columns
vif_df["VIF"] = [
    variance_inflation_factor(X_vif.values, i)
    for i in range(X_vif.shape[1])
]

vif_df.sort_values(by="VIF", ascending=False)


Unnamed: 0,feature,VIF
5,delivery_longitude,434406400.0
3,restaurant_longitude,434180600.0
4,delivery_latitude,332944300.0
2,restaurant_latitude,330974300.0
18,distance,4646.27
1,ratings,89.05607
12,order_month,36.72848
16,order_time_hour,33.40808
0,age,29.26983
17,order_time_of_day,23.70224


In [20]:
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(
    n_estimators=200,
    random_state=42,
    n_jobs=-1
)

rf.fit(X, y)

importance_df = pd.DataFrame({
    "feature": X.columns,
    "importance": rf.feature_importances_
}).sort_values(by="importance", ascending=False)

importance_df


Unnamed: 0,feature,importance
1,ratings,0.19799
6,traffic,0.121789
8,multiple_deliveries,0.119466
18,distance,0.097773
0,age,0.095446
22,weather_sunny,0.068822
7,vehicle_condition,0.067327
19,weather_fog,0.031951
20,weather_sandstorms,0.018978
11,order_day,0.018017


In [22]:
from sklearn.inspection import permutation_importance
import pandas as pd

perm = permutation_importance(
    rf,
    X,
    y,
    n_repeats=10,
    random_state=42,
    n_jobs=1   # ðŸ”¥ FIX
)

perm_df = pd.DataFrame({
    "feature": X.columns,
    "perm_importance_mean": perm.importances_mean,
    "perm_importance_std": perm.importances_std
}).sort_values(by="perm_importance_mean", ascending=False)

perm_df



Unnamed: 0,feature,perm_importance_mean,perm_importance_std
6,traffic,0.457049,0.003922
18,distance,0.383883,0.002481
1,ratings,0.367948,0.003067
0,age,0.328231,0.001768
7,vehicle_condition,0.280892,0.002667
22,weather_sunny,0.280273,0.003172
8,multiple_deliveries,0.189307,0.002004
21,weather_stormy,0.099399,0.001297
23,weather_windy,0.097554,0.00116
20,weather_sandstorms,0.096097,0.000925
