In [77]:
# Attach Google Drive to the Colab filesystem so files under MyDrive can be read.
from google.colab import drive

drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [78]:
import pandas as pd
import numpy as np
from sklearn.linear_model import RidgeCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.preprocessing import LabelEncoder # Import LabelEncoder


In [79]:
# Read the dataset CSV from the mounted Drive into a DataFrame.
dataset_path = '/content/drive/MyDrive/finaldata.csv'
df = pd.read_csv(dataset_path)

In [80]:
# Step 2: Label encode the required columns
#
# Every column gets its own fitted LabelEncoder, stored in `label_encoders`, so the
# integer codes can be mapped back to the original labels later
# (e.g. label_encoders['route_type'].classes_). Refitting a single shared encoder
# would leave only the last column's mapping available.
label_cols = ['data', 'route_type', 'is_cutoff']

label_encoders = {}
for col in label_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# NOTE: LabelEncoder assigns codes in sorted order of the labels. The train/test
# split further down filters on data == 1 / data == 0, so check
# label_encoders['data'].classes_ to confirm which original label each code means.

In [81]:
# Encode the source and destination centre IDs as integer codes.
# The two columns use independent encoders, so the same centre ID can receive a
# different code in each column.
le_source = LabelEncoder()
le_dest = LabelEncoder()

df['source_center'] = le_source.fit_transform(df['source_center'])
df['destination_center'] = le_dest.fit_transform(df['destination_center'])

# Optional: see the mappings.
# Only the first few entries are printed; dumping every centre floods the cell
# output. The code for classes_[i] is i, so enumerate() reproduces the mapping.
MAPPING_PREVIEW_ROWS = 10
for column_name, encoder in (('source_center', le_source), ('destination_center', le_dest)):
    preview = {label: code for code, label in enumerate(encoder.classes_[:MAPPING_PREVIEW_ROWS])}
    print(f"{column_name}: {len(encoder.classes_)} classes, first {len(preview)}: {preview}")


{'IND000000AAL': np.int64(0), 'IND000000AAQ': np.int64(1), 'IND000000AAS': np.int64(2), 'IND000000AAZ': np.int64(3), 'IND000000ABA': np.int64(4), 'IND000000ABD': np.int64(5), 'IND000000ABG': np.int64(6), 'IND000000ACA': np.int64(7), 'IND000000ACB': np.int64(8), 'IND000000ACK': np.int64(9), 'IND000000ACN': np.int64(10), 'IND000000ACT': np.int64(11), 'IND000000ADM': np.int64(12), 'IND000000ADV': np.int64(13), 'IND000000AEL': np.int64(14), 'IND000000AEM': np.int64(15), 'IND000000AET': np.int64(16), 'IND000000AFF': np.int64(17), 'IND000000AFG': np.int64(18), 'IND000000AFJ': np.int64(19), 'IND000000AFR': np.int64(20), 'IND000000AFT': np.int64(21), 'IND110014AAA': np.int64(22), 'IND110020AAB': np.int64(23), 'IND110024AAA': np.int64(24), 'IND110030AAD': np.int64(25), 'IND110035AAC': np.int64(26), 'IND110037AAK': np.int64(27), 'IND110037AAM': np.int64(28), 'IND110039AAA': np.int64(29), 'IND110043AAA': np.int64(30), 'IND110043AAC': np.int64(31), 'IND110044AAB': np.int64(32), 'IND110052AAA': np.

In [83]:
# Show the (rows, columns) shape of the DataFrame after the encoding steps.
df.shape

(144316, 23)

In [84]:
# Separate features and target
TARGET_COL = 'actual_time'

# 'Unnamed: 0' is the name pandas gives to a header-less CSV column. In the
# X_train preview it holds 0, 1, 2, ... so it appears to be a saved row index.
# A row id is not a predictor, so it is removed from the feature matrix.
INDEX_ARTIFACT_COLS = ['Unnamed: 0']

y = df[TARGET_COL]
X = (
    df.drop(columns=[TARGET_COL])  # drop the target
      .drop(columns=INDEX_ARTIFACT_COLS, errors='ignore')  # tolerate files saved without the index column
)


In [85]:
# Split rows on the encoded 'data' flag (1 -> training rows, 0 -> test rows), then
# remove the flag column from the feature frames.
train_rows = df['data'] == 1
test_rows = df['data'] == 0

X_train = X[train_rows].drop(columns=['data'])
y_train = y[train_rows]
X_test = X[test_rows].drop(columns=['data'])
y_test = y[test_rows]




In [86]:
# Preview the first rows of the training feature frame.
X_train.head()


Unnamed: 0,route_type,source_center,destination_center,start_scan_to_end_scan,is_cutoff,osrm_time,factor,segment_actual_time,segment_osrm_time,segment_factor,...,creation_weekday,creation_month,start_hour,start_weekday,start_month,actual_vs_osrm_time_ratio,segment_actual_vs_osrm_ratio,delivery_efficiency,avg_segment_actual_time_by_route,avg_segment_osrm_time_by_route
0,0,492,486,86.0,1,11.0,1.272727,14.0,11.0,1.272727,...,3,9,3,3,9,1.272612,1.272612,0.127905,24.507724,10.550283
1,0,492,486,86.0,1,20.0,1.2,10.0,9.0,1.111111,...,3,9,3,3,9,1.19994,1.110988,0.232555,24.507724,10.550283
2,0,492,486,86.0,1,28.0,1.428571,16.0,7.0,2.285714,...,3,9,3,3,9,1.42852,2.285388,0.325578,24.507724,10.550283
3,0,492,486,86.0,1,40.0,1.55,21.0,12.0,1.75,...,3,9,3,3,9,1.549961,1.749854,0.465111,24.507724,10.550283
4,0,492,486,86.0,0,44.0,1.545455,6.0,5.0,1.2,...,3,9,3,3,9,1.545419,1.19976,0.511622,24.507724,10.550283


In [87]:
# Keep only the int64/float64 feature columns. They are standardized in the next
# step, which matters for Ridge because its L2 penalty depends on feature scale.
# The column list is taken from the training split and reused for the test split
# so both matrices always have the same columns in the same order.
numeric_cols = X_train.select_dtypes(include=['int64', 'float64']).columns
X_train_num = X_train[numeric_cols]
X_test_num = X_test[numeric_cols]

In [88]:
# Fit the scaler on the training features only, then apply that same fitted
# transformation to both the training and the test features.
scaler = StandardScaler().fit(X_train_num)
X_train_scaled = scaler.transform(X_train_num)
X_test_scaled = scaler.transform(X_test_num)


In [89]:
# Step 1: Define candidate alpha values
# A search over 0.01 .. 100 selected 0.01, the smallest candidate, which means the
# optimum may lie below that grid. Two smaller decades are added so the search
# can find an interior value; the earlier candidates are all still included.
alphas = [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0]

In [90]:
# Step 3: Fit RidgeCV
# 5-fold cross-validation over the candidate alphas, scored by negative MSE.
ridge_cv_params = {
    'alphas': alphas,
    'scoring': 'neg_mean_squared_error',
    'cv': 5,
}
ridge_cv = RidgeCV(**ridge_cv_params)
ridge_cv.fit(X_train_scaled, y_train)

In [91]:
# Step 4: Get the best alpha
# alpha_ is the candidate chosen by the cross-validated fit.
best_alpha = ridge_cv.alpha_
print("Best alpha:", best_alpha)

Best alpha: 0.01


In [92]:
# Step 5: Evaluate performance on test data
y_pred = ridge_cv.predict(X_test_scaled)

# All three metrics are computed in this cell so it runs on a fresh kernel
# (Restart & Run All) instead of relying on `mae` / `rmse` left over from
# cells that no longer exist.
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mse)  # root of the MSE, i.e. back in the units of actual_time

print("Test MSE:", mse)
print(f"MAE: {mae:.2f}")
print(f"RMSE: {rmse:.2f}")



Test MSE: 11767.591074739279
MAE: 52.80
RMSE: 108.48


In [95]:
# Compare predictions: one row per test sample with the actual value, the
# predicted value and the absolute difference between them.
comparison_df = pd.DataFrame({'Actual': y_test.values, 'Predicted': y_pred})
residuals = comparison_df['Actual'] - comparison_df['Predicted']
comparison_df['Error'] = residuals.abs()
print(comparison_df.head())

   Actual  Predicted      Error
0    20.0 -34.029403  54.029403
1    40.0   8.942454  31.057546
2    51.0  21.077114  29.922886
3    41.0  -0.849718  41.849718
4    61.0  44.315332  16.684668


In [96]:
# Share of test predictions whose relative error |Predicted - Actual| / Actual
# is within REL_TOLERANCE. The printed label is built from the same constant so
# the reported band always matches the threshold that was actually applied.
# Rows with Actual == 0 produce inf/NaN here and therefore count as not correct.
REL_TOLERANCE = 0.30
comparison_df['Relative_Error'] = abs(comparison_df['Predicted'] - comparison_df['Actual']) / comparison_df['Actual']
comparison_df['Correct'] = comparison_df['Relative_Error'] <= REL_TOLERANCE

accuracy = comparison_df['Correct'].mean() * 100
print(f"Relative Accuracy (±{REL_TOLERANCE:.0%} of actual): {accuracy:.2f}%")


Relative Accuracy (±20% of actual): 74.24%


In [97]:
# Flag a prediction as "Correct" when its absolute error is at most `threshold`
# units. This replaces the 'Correct' column written by the relative-error check.
threshold = 20  # You can change this
comparison_df['Correct'] = comparison_df['Error'].le(threshold)

# Tally correct vs. incorrect predictions
correct_counts = comparison_df['Correct'].value_counts()
print(correct_counts)


Correct
False    21077
True     18607
Name: count, dtype: int64
