In [13]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_squared_error, r2_score
from xgboost import XGBRegressor
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.layers import Input, Embedding, Flatten, Concatenate, Dense
from tensorflow.keras.models import Model

In [14]:
data = pd.read_csv("data_n4.csv")

In [15]:
data['start_pin'] = data['start_pin'].astype('category').cat.codes
data['destination_pin'] = data['destination_pin'].astype('category').cat.codes

In [16]:
data.head()

Unnamed: 0.1,Unnamed: 0,start_pin,destination_pin,travel_distance,Quantity (In TON),period,nifty_infra_price,amount,distance_per_ton,route_frequency,avg_route_price,period_dt,year,month,month_sin,month_cos
0,0,30,2349,1115,75.0,November 2023,6585.6,457500.0,14.866667,1,457500.0,2023-11-01,2023,11,-0.5,0.866025
1,1,42,233,367,13.0,October 2023,6095.4,81453.13,28.230769,3,49967.87,2023-10-01,2023,10,-0.866025,0.5
2,2,42,257,493,12.0,November 2023,6585.6,47120.62,41.083333,1,47120.62,2023-11-01,2023,11,-0.5,0.866025
3,3,42,325,304,40.0,October 2023,6095.4,212500.0,7.6,4,226862.3775,2023-10-01,2023,10,-0.866025,0.5
4,4,43,218,306,39.4,October 2023,6095.4,233938.0,7.766497,4,221755.25,2023-10-01,2023,10,-0.866025,0.5


In [5]:


# Convert 'start_pin' and 'destination_pin' to pandas categorical type
data['start_pin'] = data['start_pin'].astype('category')
data['destination_pin'] = data['destination_pin'].astype('category')

# Define your feature columns (ensure to include the categorical ones)
features = ['start_pin', 'destination_pin', 'travel_distance', 'Quantity (In TON)','nifty_infra_price', 'avg_route_price', 'distance_per_ton', 'route_frequency','month_sin', 'month_cos']

In [7]:
data[features]

Unnamed: 0,start_pin,destination_pin,travel_distance,Quantity (In TON),nifty_infra_price,avg_route_price,distance_per_ton,route_frequency,month_sin,month_cos
0,30,2349,1115,75.00,6585.60,4.575000e+05,14.866667,1,-5.000000e-01,0.866025
1,42,233,367,13.00,6095.40,4.996787e+04,28.230769,3,-8.660254e-01,0.500000
2,42,257,493,12.00,6585.60,4.712062e+04,41.083333,1,-5.000000e-01,0.866025
3,42,325,304,40.00,6095.40,2.268624e+05,7.600000,4,-8.660254e-01,0.500000
4,43,218,306,39.40,6095.40,2.217552e+05,7.766497,4,-8.660254e-01,0.500000
...,...,...,...,...,...,...,...,...,...,...
31077,442,2293,625,35.79,5738.70,2.164178e+05,17.462978,1,1.224647e-16,-1.000000
31078,442,2319,653,29.75,5738.70,1.729219e+05,21.949580,1,1.224647e-16,-1.000000
31079,449,1919,399,42.00,6115.35,1.023758e+06,9.500000,1,-5.000000e-01,-0.866025
31080,464,1877,399,30.00,6115.35,1.526042e+05,13.300000,3,-5.000000e-01,-0.866025


In [None]:
features = [
    'travel_distance', 'Quantity (In TON)', 'nifty_infra_price',
    'distance_per_ton', 'route_count', 'month_sin', 'month_cos'
]
target = 'amount'

# Time-based split (critical for realistic validation)
train = data[data['period'] < '2023-07-01']
test = data[data['period'] >= '2023-07-01']

X_train, y_train = train[features], train[target]
X_test, y_test = test[features], test[target]

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [9]:
# When using xgb.DMatrix, you enable categorical support:
dtrain = xgb.DMatrix(X_train, label=y_train, enable_categorical=True)
dtest  = xgb.DMatrix(X_test, enable_categorical=True)

In [10]:
params = {
    'objective': 'reg:squarederror',
    'tree_method': 'hist',             
    'enable_categorical': True,
    'random_state': 42
}

In [11]:
model = xgb.train(params, dtrain, num_boost_round=100)

Parameters: { "enable_categorical" } are not used.



In [12]:
y_pred = model.predict(dtest)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print('RMSE:', rmse)

RMSE: 9365798.052142503


In [17]:
# Time-based split
train = data[data['period'] < '2023-07-01']
test = data[data['period'] >= '2023-07-01']

# Feature engineering with time-aware validation
def create_features(df, train_set=None):
    # Route frequency (calculate from training data only)
    if train_set is not None:
        route_freq = train_set.groupby(['start_pin', 'destination_pin']).size().reset_index(name='route_freq')
        df = df.merge(route_freq, on=['start_pin', 'destination_pin'], how='left')
        df['route_freq'] = df['route_freq'].fillna(0)
    else:
        df['route_freq'] = 0  # Default for unseen routes
    
    # Other features
    df['distance_per_ton'] = df['travel_distance'] / (df['Quantity (In TON)'] + 1e-6)
    df['month'] = pd.to_datetime(df['period']).dt.month
    df['month_sin'] = np.sin(2 * np.pi * df['month']/12)
    df['month_cos'] = np.cos(2 * np.pi * df['month']/12)
    return df

train = create_features(train)
test = create_features(test, train_set=train)

In [18]:
# Feature columns
num_features = ['travel_distance', 'Quantity (In TON)', 'nifty_infra_price',
                'distance_per_ton', 'route_freq', 'month_sin', 'month_cos']
cat_features = ['start_pin', 'destination_pin']
all_features = num_features + cat_features

# Split data
X_train, y_train = train[all_features], train['amount']
X_test, y_test = test[all_features], test['amount']

In [19]:
# Convert categorical features to proper category type
X_train_native = X_train.copy()
X_test_native = X_test.copy()
for col in cat_features:
    X_train_native[col] = X_train_native[col].astype('category')
    X_test_native[col] = X_test_native[col].astype('category')

# Scale numerical features
scaler = StandardScaler()
X_train_native[num_features] = scaler.fit_transform(X_train_native[num_features])
X_test_native[num_features] = scaler.transform(X_test_native[num_features])

# Build and train XGBoost model
xgb_native = XGBRegressor(
    objective='reg:squarederror',
    tree_method='hist',
    enable_categorical=True,
    max_cat_to_onehot=5,
    n_estimators=1000,
    learning_rate=0.05,
    max_depth=7,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=0.1,
    reg_lambda=0.1,
    random_state=42
)

xgb_native.fit(X_train_native, y_train,
               eval_set=[(X_test_native, y_test)],
               early_stopping_rounds=50,
               verbose=False)

# Evaluate
preds_native = xgb_native.predict(X_test_native)
print("Native Categorical Support Results:")
print(f"RMSE: {np.sqrt(mean_squared_error(y_test, preds_native)):.2f}")
print(f"R²: {r2_score(y_test, preds_native):.4f}\n")

ValueError: Found array with 0 sample(s) (shape=(0, 7)) while a minimum of 1 is required by StandardScaler.