In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from xgboost import XGBRegressor
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, Flatten, Dense, Concatenate
from tensorflow.keras.models import Model

In [14]:
df = pd.read_csv('data_n4.csv').drop(columns=['Unnamed: 0', 'period_dt'])
df['period'] = pd.to_datetime(df['period'])

In [15]:
df.head()

Unnamed: 0,start_pin,destination_pin,travel_distance,Quantity (In TON),period,nifty_infra_price,amount,distance_per_ton,route_frequency,avg_route_price,year,month,month_sin,month_cos
0,30,2349,1115,75.0,2023-11-01,6585.6,457500.0,14.866667,1,457500.0,2023,11,-0.5,0.866025
1,42,233,367,13.0,2023-10-01,6095.4,81453.13,28.230769,3,49967.87,2023,10,-0.866025,0.5
2,42,257,493,12.0,2023-11-01,6585.6,47120.62,41.083333,1,47120.62,2023,11,-0.5,0.866025
3,42,325,304,40.0,2023-10-01,6095.4,212500.0,7.6,4,226862.3775,2023,10,-0.866025,0.5
4,43,218,306,39.4,2023-10-01,6095.4,233938.0,7.766497,4,221755.25,2023,10,-0.866025,0.5


In [16]:
df['distance_per_ton'] = df['distance_per_ton'].replace([np.inf, -np.inf], np.nan).fillna(df['distance_per_ton'].mean())

In [17]:
df

Unnamed: 0,start_pin,destination_pin,travel_distance,Quantity (In TON),period,nifty_infra_price,amount,distance_per_ton,route_frequency,avg_route_price,year,month,month_sin,month_cos
0,30,2349,1115,75.00,2023-11-01,6585.60,457500.00,14.866667,1,4.575000e+05,2023,11,-5.000000e-01,0.866025
1,42,233,367,13.00,2023-10-01,6095.40,81453.13,28.230769,3,4.996787e+04,2023,10,-8.660254e-01,0.500000
2,42,257,493,12.00,2023-11-01,6585.60,47120.62,41.083333,1,4.712062e+04,2023,11,-5.000000e-01,0.866025
3,42,325,304,40.00,2023-10-01,6095.40,212500.00,7.600000,4,2.268624e+05,2023,10,-8.660254e-01,0.500000
4,43,218,306,39.40,2023-10-01,6095.40,233938.00,7.766497,4,2.217552e+05,2023,10,-8.660254e-01,0.500000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31077,442,2293,625,35.79,2023-06-01,5738.70,216417.84,17.462978,1,2.164178e+05,2023,6,1.224647e-16,-1.000000
31078,442,2319,653,29.75,2023-06-01,5738.70,172921.88,21.949580,1,1.729219e+05,2023,6,1.224647e-16,-1.000000
31079,449,1919,399,42.00,2023-07-01,6115.35,1023758.40,9.500000,1,1.023758e+06,2023,7,-5.000000e-01,-0.866025
31080,464,1877,399,30.00,2023-07-01,6115.35,131250.00,13.300000,3,1.526042e+05,2023,7,-5.000000e-01,-0.866025


In [20]:
num_features = ['travel_distance', 'Quantity (In TON)', 'nifty_infra_price',
                'distance_per_ton', 'route_frequency', 'avg_route_price',
                'year', 'month_sin', 'month_cos']
cat_features = ['start_pin', 'destination_pin']

In [21]:
X = df[num_features + cat_features]
y = df['amount']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=324)

In [22]:
X_train_cat = X_train.copy()
X_test_cat = X_test.copy()
for col in cat_features:
    X_train_cat[col] = X_train_cat[col].astype('category')
    X_test_cat[col] = X_test_cat[col].astype('category')

In [23]:
X_train_cat

Unnamed: 0,travel_distance,Quantity (In TON),nifty_infra_price,distance_per_ton,route_frequency,avg_route_price,year,month_sin,month_cos,start_pin,destination_pin
17007,353,29.81,8085.75,11.841664,34,2.969735e+05,2024,8.660254e-01,0.500000,668,840
18533,332,35.00,7303.40,9.485714,8,6.771220e+05,2023,-2.449294e-16,1.000000,42,372
993,422,2.44,6095.40,172.950820,1,7.808000e+03,2023,-8.660254e-01,0.500000,772,4933
27623,380,20.62,6115.35,18.428710,117,1.173631e+05,2023,-5.000000e-01,-0.866025,953,2622
18899,454,33.00,8085.75,13.757576,1,5.445000e+05,2024,8.660254e-01,0.500000,491,1900
...,...,...,...,...,...,...,...,...,...,...,...
600,316,6.50,6095.40,48.615385,4,5.988990e+05,2023,-8.660254e-01,0.500000,1283,6677
26107,493,43.00,6115.35,11.465116,2,9.139920e+05,2023,-5.000000e-01,-0.866025,358,1698
17292,462,35.46,8085.75,13.028765,8,1.640601e+05,2024,8.660254e-01,0.500000,945,4680
13044,381,39.46,5459.80,9.655347,9,2.274670e+05,2023,5.000000e-01,-0.866025,1029,5332


In [24]:
xgb_cat = XGBRegressor(
    objective='reg:squarederror',
    tree_method='hist',
    enable_categorical=True,
    max_cat_to_onehot=5,
    n_estimators=1500,
    learning_rate=0.05,
    max_depth=8,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)

In [26]:
xgb_cat.fit(X_train_cat, y_train, eval_set=[(X_test_cat, y_test)], verbose=False)

In [27]:
preds_cat = xgb_cat.predict(X_test_cat)
print("Native Categorical Support Results:")
print(f"RMSE: {np.sqrt(mean_squared_error(y_test, preds_cat)):.2f}")
print(f"R²: {r2_score(y_test, preds_cat):.4f}\n")

Native Categorical Support Results:
RMSE: 23229885.53
R²: 0.1275

