In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, Flatten, Concatenate, Dense # type: ignore
from tensorflow.keras.models import Model #type: ignore
from sklearn.metrics import mean_squared_error, r2_score


In [4]:
df = pd.read_csv('data.csv')
df = df.drop(columns=['Unnamed: 0'])

### I am trying to add new features to the model

1. distance_per_ton - can be used to measure the efficiency of the route
2. route_frequency - define the popularity of a particular route
3. avg_route_price - average price of the route


In [5]:
df['distance_per_ton'] = df['travel_distance'] / df['Quantity (In TON)']
df['distance_per_ton'] = df['distance_per_ton'].replace([np.inf, -np.inf], np.nan)
df['distance_per_ton'] = df['distance_per_ton'].fillna(df['distance_per_ton'].mean())

In [6]:
route_frequency = df.groupby(['start_pin', 'destination_pin']).size().reset_index(name='frequency')
df = df.merge(route_frequency, on=['start_pin', 'destination_pin'], how='left')
df['frequency'] = df['frequency'].fillna(0) 

In [7]:
avg_route_price = df.groupby(['start_pin', 'destination_pin'])['amount'].mean().reset_index(name='avg_route_price')
df = df.merge(avg_route_price, on=['start_pin', 'destination_pin'], how='left')
df['avg_route_price'] = df['avg_route_price'].fillna(df['amount'].mean()) 

In [8]:
df.columns

Index(['start_pin', 'destination_pin', 'travel_distance', 'Quantity (In TON)',
       'amount', 'distance_per_ton', 'frequency', 'avg_route_price'],
      dtype='object')

In [9]:
df.head()

Unnamed: 0,start_pin,destination_pin,travel_distance,Quantity (In TON),amount,distance_per_ton,frequency,avg_route_price
0,110092,392001,1115,75.0,585600.0,14.866667,1,585600.0
1,124106,141015,367,13.0,104260.01,28.230769,3,63958.87
2,124106,143006,493,12.0,60314.4,41.083333,1,60314.4
3,124106,147001,304,40.0,272000.0,7.6,4,290383.8475
4,124146,140306,306,39.4,299440.5,7.766497,4,283846.6725


In [10]:
def check_infinite_values(df, columns):
    for col in columns:
        inf_count = np.isinf(df[col]).sum()
        nan_count = np.isnan(df[col]).sum()
        if inf_count > 0 or nan_count > 0:
            print(f"Column {col}: Inf count = {inf_count}, NaN count = {nan_count}")

In [11]:
le_start = LabelEncoder()
le_dest = LabelEncoder()
df['start_pin'] = le_start.fit_transform(df['start_pin'])
df['destination_pin'] = le_dest.fit_transform(df['destination_pin'])

In [12]:
df = df.rename(columns={'frequency': 'route_frequency'})

In [13]:
df.head()

Unnamed: 0,start_pin,destination_pin,travel_distance,Quantity (In TON),amount,distance_per_ton,route_frequency,avg_route_price
0,28,2338,1115,75.0,585600.0,14.866667,1,585600.0
1,40,231,367,13.0,104260.01,28.230769,3,63958.87
2,40,255,493,12.0,60314.4,41.083333,1,60314.4
3,40,323,304,40.0,272000.0,7.6,4,290383.8475
4,41,216,306,39.4,299440.5,7.766497,4,283846.6725


In [14]:
X = df.drop(columns=['amount'])
y = df['amount']

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=324)

In [16]:
df

Unnamed: 0,start_pin,destination_pin,travel_distance,Quantity (In TON),amount,distance_per_ton,route_frequency,avg_route_price
0,28,2338,1115,75.00,585600.00,14.866667,1,5.856000e+05
1,40,231,367,13.00,104260.01,28.230769,3,6.395887e+04
2,40,255,493,12.00,60314.40,41.083333,1,6.031440e+04
3,40,323,304,40.00,272000.00,7.600000,4,2.903838e+05
4,41,216,306,39.40,299440.50,7.766497,4,2.838467e+05
...,...,...,...,...,...,...,...,...
30857,419,2282,625,35.79,277014.84,17.462978,1,2.770148e+05
30858,419,2308,653,29.75,221340.02,21.949580,1,2.213400e+05
30859,425,1910,399,42.00,1310410.72,9.500000,1,1.310411e+06
30860,439,1868,399,30.00,168000.00,13.300000,3,1.953333e+05


In [17]:
num_features = ['travel_distance', 'Quantity (In TON)', 'distance_per_ton', 'route_frequency', 'avg_route_price']
check_infinite_values(X_train, num_features)

In [18]:
for feature in num_features:
    X_train[feature] = X_train[feature].replace([np.inf, -np.inf], np.nan)
    X_test[feature] = X_test[feature].replace([np.inf, -np.inf], np.nan)
    
    feature_mean = X_train[feature].mean()
    X_train[feature] = X_train[feature].fillna(feature_mean)
    X_test[feature] = X_test[feature].fillna(feature_mean)

In [19]:
scaler = StandardScaler()
X_train[num_features] = scaler.fit_transform(X_train[num_features])
X_test[num_features] = scaler.transform(X_test[num_features])

In [20]:
check_infinite_values(X_train, num_features)

In [21]:
y_scaler = StandardScaler()
y_train_scaled = y_scaler.fit_transform(y_train.values.reshape(-1, 1))
y_test_scaled = y_scaler.transform(y_test.values.reshape(-1, 1))

In [22]:
num_start_pins = df['start_pin'].nunique()
num_dest_pins = df['destination_pin'].nunique()
start_embed_dim = min(8, int(np.power(num_start_pins, 0.25)))
dest_embed_dim = min(16, int(np.power(num_dest_pins, 0.25)))

In [23]:
start_pin_input = Input(shape=(1,), name='start_pin')
dest_pin_input = Input(shape=(1,), name='destination_pin')
numerical_input = Input(shape=(5,), name='numerical_data') 

In [24]:
start_embedding = Embedding(input_dim=num_start_pins + 1, output_dim=start_embed_dim)(start_pin_input)
dest_embedding = Embedding(input_dim=num_dest_pins + 1, output_dim=dest_embed_dim)(dest_pin_input)

In [25]:
start_flat = Flatten()(start_embedding)
dest_flat = Flatten()(dest_embedding)

In [26]:
x = Concatenate()([start_flat, dest_flat, numerical_input])

In [27]:
x = Dense(128, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.01))(x)
x = tf.keras.layers.BatchNormalization()(x)
x = tf.keras.layers.Dropout(0.3)(x)

x = Dense(64, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.01))(x)
x = tf.keras.layers.BatchNormalization()(x)
x = tf.keras.layers.Dropout(0.3)(x)

x = Dense(32, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.01))(x)
x = tf.keras.layers.BatchNormalization()(x)
x = tf.keras.layers.Dropout(0.2)(x)

output = Dense(1)(x)

In [28]:
# Create and compile model
model = Model(inputs=[start_pin_input, dest_pin_input, numerical_input], outputs=output)
optimizer = tf.keras.optimizers.legacy.Adam(learning_rate=0.001)
model.compile(optimizer=optimizer, loss='mse')

# Early stopping callback
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=10,
    restore_best_weights=True
)

In [29]:
train_inputs = [
    X_train['start_pin'].values,
    X_train['destination_pin'].values,
    X_train[num_features].values
]

test_inputs = [
    X_test['start_pin'].values,
    X_test['destination_pin'].values,
    X_test[num_features].values
]

In [30]:
history = model.fit(
    train_inputs,
    y_train_scaled,
    validation_data=(test_inputs, y_test_scaled),
    epochs=100,
    batch_size=32,
    callbacks=[early_stopping]
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100


In [31]:
prediction_train = model.predict(train_inputs)



In [32]:
predictions = model.predict(test_inputs)
predictions_original = y_scaler.inverse_transform(predictions)



In [33]:
predictions_original

array([[ 2245976.5],
       [ 1190771.8],
       [ 2417112.5],
       ...,
       [ 1627378.1],
       [-4213266.5],
       [ 1422090.8]], dtype=float32)

In [None]:
r22 = r2_score(y_train_scaled, prediction_train)
print("The r2 value of the training dataset is: ", r22)

The r2 value of the training dataset is: ${r22} 0.4129390314386717


In [37]:
mse = mean_squared_error(y_test_scaled, predictions)
rmse = np.sqrt(mse)
r2 = r2_score(y_test_scaled, predictions)
print("MSE:", mse)
print("RMSE:", rmse)
print("R^2:", r2)

MSE: 0.582167894691411
RMSE: 0.7629992756821011
R^2: 0.5145338062880062
