In [54]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset

In [76]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight

## Setup Model

In [103]:
class BookingToSale(nn.Module):

  def __init__(self, input_dim, hidden_dim, num_classes=2):

    super(BookingToSale, self).__init__()

    self.fully_connected_1 = nn.Linear(input_dim, hidden_dim)
    self.batch_norm_1 = nn.BatchNorm1d(hidden_dim)

    self.fully_connected_2 = nn.Linear(hidden_dim, hidden_dim)
    self.batch_norm_2 = nn.BatchNorm1d(hidden_dim)

    self.relu = nn.ReLU()
    self.dropout = nn.Dropout(0.3)

    self.regression = nn.Linear(hidden_dim, 1)
    self.classification = nn.Linear(hidden_dim, num_classes)


  def forward(self, x):

    x = self.relu(self.batch_norm_1(self.fully_connected_1(x)))

    x = self.relu(self.batch_norm_2(self.fully_connected_2(x)))

    x = self.dropout(x)

    reg_out = self.regression(x)
    class_out = self.classification(x)

    # print(reg_out.shape, class_out.shape)

    return reg_out.squeeze(1), class_out





## Load dataset

In [5]:
data = pd.read_csv('/content/booking_to_sale_dataset.csv')

In [6]:
data.head()

Unnamed: 0,age,booking_channel,region,is_repeat_customer,days_to_sale,delay_class
0,56,web,East,0,3,0
1,69,phone,South,0,68,2
2,46,web,West,1,10,0
3,32,web,North,1,11,0
4,60,web,West,0,8,0


In [9]:
cat_cols = data.select_dtypes(['object']).columns.tolist()

cat_cols



['booking_channel', 'region']

## Train-Val Split data

In [56]:
target = ['days_to_sale',	'delay_class']
X = data.drop(target, axis=1)
y = data[target]
X_train, X_val, y_train, y_val = train_test_split(X, y , test_size=0.2, stratify=y['delay_class'], random_state=42)

In [57]:
X_train

Unnamed: 0,age,booking_channel,region,is_repeat_customer
16683,48,web,North,0
9609,38,agent,South,0
56784,68,web,East,0
168123,28,web,North,0
197025,41,web,East,0
...,...,...,...,...
127036,56,web,North,0
44619,38,web,North,0
31717,35,agent,South,0
147413,64,agent,South,1


In [58]:
ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

X_train_ohe = ohe.fit_transform(X_train[cat_cols])

X_val_ohe = ohe.transform(X_val[cat_cols])


In [59]:
def get_ohe_cols(ohe, cat_cols):

  ohe_cols = [ cat_cols[i] + '_' + cat for i,cat in enumerate(ohe.categories_)]

  ohe_cols_list = []
  for cols in ohe_cols:
    ohe_cols_list.extend(cols)
  return ohe_cols_list

In [60]:
ohe_cols = get_ohe_cols(ohe, cat_cols)


In [61]:
def append_ohe_features(X, X_ohe, cat_cols, ohe_cols):
  X = X.drop(cat_cols, axis=1)
  X[ohe_cols] = X_ohe
  return X

In [62]:
X_train = append_ohe_features(X_train, X_train_ohe,
                              cat_cols, ohe_cols)

X_val = append_ohe_features(X_val, X_val_ohe,
                              cat_cols, ohe_cols)

In [63]:
X_val.head()

Unnamed: 0,age,is_repeat_customer,booking_channel_agent,booking_channel_phone,booking_channel_web,region_East,region_North,region_South,region_West
3325,51,0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
152515,61,0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
101636,31,0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
27302,52,0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
32372,66,0,0.0,0.0,1.0,0.0,1.0,0.0,0.0


In [69]:
X_train.values

array([[48.,  0.,  0., ...,  1.,  0.,  0.],
       [38.,  0.,  1., ...,  0.,  1.,  0.],
       [68.,  0.,  0., ...,  0.,  0.,  0.],
       ...,
       [35.,  0.,  1., ...,  0.,  1.,  0.],
       [64.,  1.,  1., ...,  0.,  1.,  0.],
       [22.,  1.,  0., ...,  0.,  0.,  0.]])

## Create Data loader and training loop

In [101]:
trainset = TensorDataset(

   torch.tensor(X_train.values).float(),
   torch.tensor(y_train['days_to_sale'].values).float(),
   torch.tensor(y_train['delay_class'].values).long()

)

valset = TensorDataset(

   torch.tensor(X_val.values).float(),
   torch.tensor(y_val['days_to_sale'].values).float(),
   torch.tensor(y_val['delay_class'].values).long()

)

train_loader = DataLoader(trainset, batch_size=512, shuffle=True)
val_loader = DataLoader(valset, batch_size=512)

In [93]:
def loss_func(reg_pred, reg_target, class_pred, class_target, class_weights):
    # Regression loss
    reg_loss = F.mse_loss(reg_pred, reg_target)

    # Classification loss (with class weights for imbalance)
    class_loss = F.cross_entropy(class_pred, class_target, weight=class_weights)

    # Combine (tune alpha to balance)
    total_loss = 0.5 * reg_loss + 0.5 * class_loss
    return total_loss

In [126]:
def train(model, train_loader, val_loader, class_weights,
          epochs=10, lr=1e-3, patience=5, eps = 0.0001, device='cpu'):

  model = model.to(device)

  optimizer = torch.optim.Adam(model.parameters(), lr=lr)

  class_weights = torch.tensor(class_weights).float().to(device)

  best_val_loss = float('inf')
  patience_counter = 0
  best_model_state = None

  for epoch in range(epochs):
    model.train()
    train_loss = 0

    for x_batch, y_reg_batch, y_class_batch in train_loader:
      x_batch = x_batch.to(device)
      y_reg_batch = y_reg_batch.to(device).float()
      y_class_batch = y_class_batch.to(device).long()

      optimizer.zero_grad()
      reg_out, class_out = model(x_batch)

      # reg_out = torch.round(reg_out) # round off num_days

      loss = loss_func(reg_out, y_reg_batch, class_out, y_class_batch, class_weights)
      loss.backward()
      optimizer.step()
      train_loss += loss.item()

    print(f"Epoch {epoch+1}/{epochs}, Training Loss: {train_loss / len(train_loader):.4f}")

    # compute validation loss
    if val_loader:
        model.eval()
        val_loss = 0
        with torch.no_grad():
            for x_val, y_reg_val, y_class_val in val_loader:
                x_val = x_val.to(device)
                y_reg_val = y_reg_val.to(device).float()
                y_class_val = y_class_val.to(device).long()

                reg_out, class_out = model(x_val)
                loss = loss_func(reg_out, y_reg_val, class_out, y_class_val, class_weights)
                val_loss += loss.item()

        avg_val_loss = val_loss/len(val_loader)
        print(f"Validation Loss: {val_loss / len(val_loader):.4f}")

        # Early stopping
        if best_val_loss - avg_val_loss > eps:
          best_val_loss = avg_val_loss
          patience_counter = 0
          best_model_state = model.state_dict()

        else:
          patience_counter += 1
          if patience_counter >= patience:
            print(f"Early stopping at epoch {epoch+1}")
            if best_model_state: # if intermediate model state is better
                model.load_state_dict(best_model_state)
            break

  return model







## Train

In [71]:
y_train['delay_class'].value_counts()

Unnamed: 0_level_0,count
delay_class,Unnamed: 1_level_1
0,112000
1,40000
2,8000


In [80]:
class_weights = compute_class_weight('balanced', classes=y_train['delay_class'].unique(), y=y_train['delay_class'])

In [81]:
class_weights

array([0.47619048, 1.33333333, 6.66666667])

In [99]:
X_train.shape, X_val.shape

((160000, 9), (40000, 9))

In [134]:
## initialize model

model = BookingToSale(input_dim=X_train.shape[1], hidden_dim=128, num_classes=3)

In [135]:
%%time

train(model, train_loader, val_loader, class_weights,
            epochs=30, lr=1e-4, patience=5, eps=0.001, device='cpu')


Epoch 1/30, Training Loss: 169.6567
Validation Loss: 159.3545
Epoch 2/30, Training Loss: 148.0459
Validation Loss: 139.0150
Epoch 3/30, Training Loss: 129.2121
Validation Loss: 120.4097
Epoch 4/30, Training Loss: 113.6496
Validation Loss: 107.8988
Epoch 5/30, Training Loss: 102.6303
Validation Loss: 99.7935
Epoch 6/30, Training Loss: 96.4526
Validation Loss: 94.6462
Epoch 7/30, Training Loss: 93.6737
Validation Loss: 92.8308
Epoch 8/30, Training Loss: 92.7206
Validation Loss: 92.4952
Epoch 9/30, Training Loss: 92.4984
Validation Loss: 92.3945
Epoch 10/30, Training Loss: 92.4780
Validation Loss: 92.2896
Epoch 11/30, Training Loss: 92.3487
Validation Loss: 92.2790
Epoch 12/30, Training Loss: 92.3861
Validation Loss: 92.2492
Epoch 13/30, Training Loss: 92.4589
Validation Loss: 92.2740
Epoch 14/30, Training Loss: 92.3591
Validation Loss: 92.4005
Epoch 15/30, Training Loss: 92.4190
Validation Loss: 92.2643
Epoch 16/30, Training Loss: 92.3543
Validation Loss: 92.2472
Epoch 17/30, Training Lo

BookingToSale(
  (fully_connected_1): Linear(in_features=9, out_features=128, bias=True)
  (batch_norm_1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (fully_connected_2): Linear(in_features=128, out_features=128, bias=True)
  (batch_norm_2): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU()
  (dropout): Dropout(p=0.3, inplace=False)
  (regression): Linear(in_features=128, out_features=1, bias=True)
  (classification): Linear(in_features=128, out_features=3, bias=True)
)

In [139]:
test_idx = 100
x_test_sample = torch.tensor(X_val.values[test_idx]).float()
y_test_sample = y_val.iloc[test_idx]
y_pred_sample = model(x_test_sample.reshape(1,-1))

print(f'Test:{y_test_sample} | Pred: days_to_sale={y_pred_sample[0]}, delay_class = {torch.argmax(y_pred_sample[1], dim=1) }')

Test:days_to_sale    9
delay_class     0
Name: 40445, dtype: int64 | Pred: days_to_sale=tensor([13.1582], grad_fn=<SqueezeBackward1>), delay_class = tensor([2])


In [117]:
# y_val