In [139]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [140]:
import pandas as pd
import numpy as np
from sklearn.linear_model import RidgeCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.preprocessing import LabelEncoder # Import LabelEncoder


In [141]:
# Load your dataset
df = pd.read_csv('/content/drive/MyDrive/finaldata.csv')

In [142]:
df.head(3)

Unnamed: 0,data,route_type,source_center,destination_center,start_scan_to_end_scan,is_cutoff,actual_time,osrm_time,factor,segment_actual_time,...,creation_weekday,creation_month,start_hour,start_weekday,start_month,actual_vs_osrm_time_ratio,segment_actual_vs_osrm_ratio,delivery_efficiency,avg_segment_actual_time_by_route,avg_segment_osrm_time_by_route
0,training,Carting,IND388121AAA,IND388620AAB,86.0,True,14.0,11.0,1.272727,14.0,...,3,9,3,3,9,1.272612,1.272612,0.127905,24.507724,10.550283
1,training,Carting,IND388121AAA,IND388620AAB,86.0,True,24.0,20.0,1.2,10.0,...,3,9,3,3,9,1.19994,1.110988,0.232555,24.507724,10.550283
2,training,Carting,IND388121AAA,IND388620AAB,86.0,True,40.0,28.0,1.428571,16.0,...,3,9,3,3,9,1.42852,2.285388,0.325578,24.507724,10.550283


In [143]:
# Step 2: Label encode the required columns

label_cols = ['data', 'is_cutoff','route_type']

# Apply label encoding
le = LabelEncoder()
for col in label_cols:
    df[col] = le.fit_transform(df[col])

In [144]:
#colum list
df.columns

Index(['data', 'route_type', 'source_center', 'destination_center',
       'start_scan_to_end_scan', 'is_cutoff', 'actual_time', 'osrm_time',
       'factor', 'segment_actual_time', 'segment_osrm_time', 'segment_factor',
       'creation_hour', 'creation_weekday', 'creation_month', 'start_hour',
       'start_weekday', 'start_month', 'actual_vs_osrm_time_ratio',
       'segment_actual_vs_osrm_ratio', 'delivery_efficiency',
       'avg_segment_actual_time_by_route', 'avg_segment_osrm_time_by_route'],
      dtype='object')

In [145]:
le_source = LabelEncoder()
le_dest = LabelEncoder()

df['source_center'] = le_source.fit_transform(df['source_center'])
df['destination_center'] = le_dest.fit_transform(df['destination_center'])



In [146]:
df.shape

(144316, 23)

In [147]:
df.head(3)

Unnamed: 0,data,route_type,source_center,destination_center,start_scan_to_end_scan,is_cutoff,actual_time,osrm_time,factor,segment_actual_time,...,creation_weekday,creation_month,start_hour,start_weekday,start_month,actual_vs_osrm_time_ratio,segment_actual_vs_osrm_ratio,delivery_efficiency,avg_segment_actual_time_by_route,avg_segment_osrm_time_by_route
0,1,0,492,486,86.0,1,14.0,11.0,1.272727,14.0,...,3,9,3,3,9,1.272612,1.272612,0.127905,24.507724,10.550283
1,1,0,492,486,86.0,1,24.0,20.0,1.2,10.0,...,3,9,3,3,9,1.19994,1.110988,0.232555,24.507724,10.550283
2,1,0,492,486,86.0,1,40.0,28.0,1.428571,16.0,...,3,9,3,3,9,1.42852,2.285388,0.325578,24.507724,10.550283


In [148]:
# Separate features and target
X = df.drop(columns=['actual_time'])  # drop the target
y = df['actual_time']


In [149]:
# Filter train and test data based on 'data' column, then drop 'data' column
X_train = X[df['data'] == 1].drop(columns=['data'])
X_test = X[df['data'] == 0].drop(columns=['data'])
y_train = y[df['data'] == 1]
y_test = y[df['data'] == 0]




In [150]:
# Select only numerical columns
X_train_num = X_train.select_dtypes(include=['int64', 'float64'])
X_test_num = X_test.select_dtypes(include=['int64', 'float64'])

In [151]:
from scipy.stats import zscore
import numpy as np

# Compute Z-scores
z_scores = np.abs(zscore(X_train_num))  # X_train_num: only numerical features

# Threshold typically used: 3
outliers = (z_scores > 3)

# Check rows with any outlier values
outlier_rows = np.where(outliers.any(axis=1))[0]
print("Outlier row indices:", outlier_rows)


Outlier row indices: [    34    117    123 ... 104594 104613 104631]


In [152]:
# Compute Z-scores
z_scores = np.abs(zscore(X_test_num))  # X_train_num: only numerical features

# Threshold typically used: 3
outliers = (z_scores > 3)

# Check rows with any outlier values
outlier_rows = np.where(outliers.any(axis=1))[0]
print("Outlier row indices:", outlier_rows)


Outlier row indices: [   34    39    47 ... 39681 39682 39683]


In [153]:
# Remove outliers from X and y
X_train_cleaned = X_train_num.drop(index=X_train_num.iloc[outlier_rows].index)
y_train_cleaned = y_train.drop(index=y_train.iloc[outlier_rows].index)

print(f"Removed {len(outlier_rows)} outliers")

Removed 2457 outliers


In [154]:
# Remove outliers from X and y
X_test_cleaned = X_test_num.drop(index=X_test_num.iloc[outlier_rows].index)
y_test_cleaned = y_test.drop(index=y_test.iloc[outlier_rows].index)

print(f"Removed {len(outlier_rows)} outliers")

Removed 2457 outliers


In [155]:
df.shape

(144316, 23)

In [156]:
# Apply scaling only on numerical columns
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_cleaned)
X_test_scaled = scaler.transform(X_test_cleaned)

# **Ensure Metric Consistency**

#  Standardize Evaluation Pipeline

In [162]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score


def evaluate_model(name, y_true, y_pred, results=[]):
    mse = mean_squared_error(y_true, y_pred)
    rmse = mse ** 0.5
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)

    results.append({
        'Model': name,
        'MSE': round(mse, 2),
        'RMSE': round(rmse, 2),
        'MAE': round(mae, 2),
        'R2': round(r2, 4)
    })

    return results


In [163]:
from sklearn.linear_model import LassoCV

lasso = LassoCV(cv=5, random_state=42)
lasso.fit(X_train_scaled, y_train_cleaned)
lasso_pred = lasso.predict(X_test_scaled)

r2_lasso = lasso.score(X_test_scaled, y_test_cleaned)
print(f"Lasso R²: {r2_lasso:.4f}")


Lasso R²: 0.9712


In [164]:
# Make predictions
ridge_pred = ridge.predict(X_test_scaled)

# Evaluate the model
results = evaluate_model("Ridge", y_test_cleaned, ridge_pred)
results = evaluate_model("Lasso", y_test_cleaned, lasso_pred) # Removed as Lasso model is not trained yet

# **Model Enhancement Planning**

# Model Enhancement Planning

In [165]:
from sklearn.linear_model import RidgeCV
import numpy as np

# Define the alpha values (regularization strengths)
alphas = np.logspace(-4, 1, 50)

# Initialize RidgeCV with 10-fold cross-validation
ridge_cv = RidgeCV(alphas=alphas, scoring='neg_mean_squared_error', cv=10)

# Fit model on scaled training data
ridge_cv.fit(X_train_scaled, y_train_cleaned)

# Best alpha from cross-validation
print("Best alpha for Ridge:", ridge_cv.alpha_)

# Predict on test set
ridge_pred = ridge_cv.predict(X_test_scaled)

# Evaluate and store results
results = evaluate_model("Ridge (CV Tuned)", y_test_cleaned, ridge_pred, results)



Best alpha for Ridge: 0.0001


In [166]:
from sklearn.linear_model import LassoCV
alphas = np.logspace(-4, 1, 50)
# LassoCV will automatically search for the best alpha
lasso_cv = LassoCV(alphas=alphas, cv=10, random_state=42)

# Fit the model to the training data
lasso_cv.fit(X_train_scaled, y_train_cleaned)

# Get the best alpha
print("Best alpha for Lasso:", lasso_cv.alpha_)

# Predict on the test set
lasso_pred = lasso_cv.predict(X_test_scaled)

# Evaluate and store the results
results = evaluate_model("Lasso (CV Tuned)", y_test_cleaned, lasso_pred, results)


Best alpha for Lasso: 0.5963623316594643


In [167]:
import numpy as np
from sklearn.linear_model import ElasticNetCV

# Define the parameter grid
alphas = np.logspace(-4, 1, 50)  # You can adjust the range based on your problem
l1_ratios = np.linspace(0.1, 0.9, 9)

# Initialize ElasticNetCV
elastic_cv = ElasticNetCV(alphas=alphas, l1_ratio=l1_ratios, cv=10, random_state=42)

# Fit the model
elastic_cv.fit(X_train_scaled, y_train_cleaned)

# Predictions
elastic_pred = elastic_cv.predict(X_test_scaled)

# Print best hyperparameters
print("Best alpha for ElasticNet:", elastic_cv.alpha_)
print("Best l1_ratio for ElasticNet:", elastic_cv.l1_ratio_)

# Evaluate and append to results
results = evaluate_model("ElasticNet (CV Tuned)", y_test_cleaned, elastic_pred, results)



Best alpha for ElasticNet: 0.005428675439323859
Best l1_ratio for ElasticNet: 0.9


In [168]:
results_df = pd.DataFrame(results)

results_df_cleaned = results_df.drop_duplicates()

print(results_df_cleaned)

                   Model      MSE   RMSE    MAE      R2
0                  Ridge  8337.40  91.31  46.01  0.9713
1                  Lasso  8367.80  91.48  45.35  0.9712
2       Ridge (CV Tuned)  8013.62  89.52  45.31  0.9724
3       Lasso (CV Tuned)  8360.85  91.44  45.48  0.9712
4  ElasticNet (CV Tuned)  8350.80  91.38  46.07  0.9712
