In [38]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim


class MLP(nn.Module):
    def __init__(self, config):
        super(MLP, self).__init__()
        input_dim = (config["num_id_features"] + config["num_dense_features"]) * config["embedding_dim"]
        hidden_dims = config["hidden_dims"]
        layers = [nn.Linear(input_dim, hidden_dims[0])]
        for i in range(len(hidden_dims) - 1):
            layers.append(nn.Linear(hidden_dims[i], hidden_dims[i+1]))
            layers.append(nn.ReLU())
        self.mlp = nn.Sequential(*layers)
        self.output_layer = nn.Linear(hidden_dims[-1], 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.mlp(x)
        x = self.output_layer(x)
        x = self.sigmoid(x)
        return x
    

class EmbeddingInputLayer(nn.Module):
    def __init__(self, config):
        super(EmbeddingInputLayer, self).__init__()
        
        num_id_features = config["num_id_features"]
        id_spaces = config["id_spaces"]
        embedding_dim = config["embedding_dim"]
        self.embedding_layers = []
        for fea_name, id_space in id_spaces.items():
            embedding_layer = nn.Embedding(id_space+5, embedding_dim, padding_idx=None)
            self.embedding_layers.append(embedding_layer)


    def forward(self, input_ids):
        # input_ids_expand = torch.unsqueeze(input_tensor, axis=-1)
        
        embedded_outputs = []
        for i, embedding_layer in enumerate(self.embedding_layers):
            embedded_output = embedding_layer(input_ids[:, i])
            embedded_outputs.append(embedded_output)

        return torch.stack(embedded_outputs, dim=1)
    

class DenseEmbLayer(nn.Module):
    def __init__(self, config):
        super(DenseEmbLayer, self).__init__()
        
        embedding_dim = config["embedding_dim"]
        num_dense_features = config["num_dense_features"]
        self.dense_embedding_layers = []
        for i in range(num_dense_features):
            linear_layer = nn.Linear(1, embedding_dim)
            self.dense_embedding_layers.append(linear_layer)
                                    
    def forward(self, input_dense):
        
        embedded_outputs = []
        for i, embedding_layer in enumerate(self.dense_embedding_layers):
            embedded_output = embedding_layer(torch.unsqueeze(torch.unsqueeze(input_dense[:, i], axis=-1), axis=-1))
            embedded_outputs.append(embedded_output)

        return torch.concat(embedded_outputs, dim=1)
    

In [39]:
class DNNModel(torch.nn.Module):

    def __init__(self, config):
        super(DNNModel, self).__init__()

        self.emb_layer = EmbeddingInputLayer(config)
        self.dense_emb_layer = DenseEmbLayer(config)
        
        if config["backbone_model"] == "transformer":
            self.emb_dim_map = Linear(config["embedding_dim"], config["transformer_hidden_size"])
            self.backbone_model = nn.Transformer(d_model=config["transformer_hidden_size"])
        else:
            self.backbone_model = MLP(config)
        
        self.config = config

    def forward(self, input_ids, input_dense):
        try:
            sparse_emb = self.emb_layer(input_ids)
        except:
            print("input_ids", input_ids) 
            print(self.emb_layer.weights)
            raise ValueError("Input ids")
            
            
        dense_emb = self.dense_emb_layer(input_dense)
        
        total_emb = torch.concat([sparse_emb, dense_emb], axis=1)
        # print("total_emb:", total_emb.shape)
        
        # If we use simple mlp, flatten
        if self.config["backbone_model"] == "mlp":
            backbone_input_tensor = torch.flatten(total_emb, start_dim = 1)
            
        elif self.config["backbone_model"]  == "transformer":
            backbone_input_tensor = self.emb_dim_map(total_emb)
        
        output_tensor = self.backbone_model(backbone_input_tensor)
            
        return output_tensor

In [40]:
from torch.utils.data import Dataset, DataLoader


class PandasTrainDataset(Dataset):
    def __init__(self, data, config, is_train=True):
        self.data = data
        
        self.num_id_features = config["num_id_features"]
        self.num_dense_features = config["num_dense_features"]
        self.is_train = is_train
        
    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        
        row = self.data.iloc[idx]
        # print(row)
        input_ids = torch.tensor([int(x) for x in row.iloc[:self.num_id_features]], dtype=torch.int32)
        input_numerical = torch.tensor([float(x) for x in row.iloc[self.num_id_features:self.num_id_features+self.num_dense_features]], dtype=torch.float)
        
        if self.is_train:
            target = torch.tensor(float(row.iloc[-1]), dtype=torch.float)
            return (input_ids, input_numerical), target
        else:
            return (input_ids, input_numerical)



# # Custom Dataset class for handling pandas categorical data
# class CsvTrainDataset(Dataset):
#     def __init__(self, filename, config, is_train=True):
#         # self.data = data
#         self.filename = filename
        
#         self.num_id_features = config["num_id_features"]
#         self.num_dense_features = config["num_dense_features"]
#         self.is_train = is_train
        
#         self.size = 0
        
#         with open(self.filename) as f:
#             for line in f:
#                 self.size += 1

#     def __len__(self):
#         return self.size

#     def __getitem__(self, idx):
#         with open(self.filename) as f:
#             for i in range(idx-1):
#                 f.readline()
#             row = f.readline().strip().split(',')
        
        
#         # row = self.data.iloc[idx]
#         # print(row)
#         input_ids = torch.tensor([int(x) for x in row[:self.num_id_features]], dtype=torch.int32)
#         input_numerical = torch.tensor([float(x) for x in row[self.num_id_features:self.num_id_features+self.num_dense_features]], dtype=torch.float)
        
#         if self.is_train:
#             target = torch.tensor(float(row[-1]), dtype=torch.float)
#             return (input_ids, input_numerical), target
#         else:
#             return (input_ids, input_numerical)



## Batch_size should be at least 256 to get a good result, and it needs about 16G~32G memory per worker in the ps-worker parallel mode.

In [48]:

id_features = ['unitDisplayType', 'brandName', 'bundleId',
       'appVersion', 'correctModelName', 'countryCode', 'deviceId',
       'osAndVersion', 'connectionType', 'c1', 'c2', 'c3', 'c4', 'size', 
        'mediationProviderVersion', 'bidFloorPrice']
model_config = {
    "num_id_features":  len(id_features),
    "num_dense_features": 1,
    "id_spaces": {
        'unitDisplayType': 3,
         'brandName': 144,
         'bundleId': 18,
         'appVersion': 101,
         'correctModelName': 2568,
         'countryCode': 168,
         'deviceId': 40176,
         'osAndVersion': 96,
         'connectionType': 3,
         'c1': 50,
         'c2': 9,
         'c3': 4,
         'c4': 9,
         'size': 6,
         'mediationProviderVersion': 35,
         'bidFloorPrice': 4
    },
    "embedding_dim": 8,
    "hidden_dims": [128, 32],
    "batch_size": 32,
    "test_batch_size": 64,
    "backbone_model": "mlp",
    
#     "backbone_model": "transformer",
#     "transformer_hidden_size": 256,
}

## The nrows limitation is because my laptop doesn't have enough memeory. 
## For large dataset, the stream loader should be used. Here I just use an easy way to make it run

In [44]:
import pandas as pd
import numpy as np
df_train = pd.read_csv("train_dataset.csv", header=None, nrows=1000000)
df_test = pd.read_csv("test_dataset.csv", header=None)

In [19]:
# from torch.utils.data import IterableDataset

# f_train = open("train_dataset.csv")
# train_dataset = IterableDataset.from_generator(f_train)

In [45]:
train_dataset = PandasTrainDataset(df_train, model_config, True)
train_dataloader = DataLoader(train_dataset, model_config["batch_size"], shuffle=True)

In [36]:
test_dataset = PandasTrainDataset(df_test, model_config, False)
test_dataloader = DataLoader(test_dataset, model_config["test_batch_size"], shuffle=False)

In [34]:
# import torch
# for i, ((input_ids, input_numerics), y) in enumerate(train_dataloader):
#     print(input_ids.shape)
#     print(input_numerics.shape)
#     print(y.shape)
#     if i == 100:
#         break
# for input_ids, input_numerics in test_dataloader:
#     print(input_ids.shape)
#     print(input_numerics.shape)
#     # print(y.shape)
#     break

## For Output, we can try to test some map function to utilize the fact: SucBid >= SentPrice
## for example SentPrice + K * Sigmoid(score) here. K is a maximum price restriction.

## The lr and weight_decay(L2 Normalization) should be tuned

In [75]:
model = DNNModel(model_config)
loss_fn = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4, weight_decay=1e-3)

training_loader = train_dataloader
def train_one_epoch(epoch_index):
    running_loss = 0.
    last_loss = 0.
    
    # Here, we use enumerate(training_loader) instead of
    # iter(training_loader) so that we can track the batch
    # index and do some intra-epoch reporting
    for i, data in enumerate(training_loader):
        # Every data instance is an input + label pair
        (input_ids, input_dense), labels = data

        # Zero your gradients for every batch!
        optimizer.zero_grad()

        # Make predictions for this batch
        sig = nn.Sigmoid()
        # print(input_dense[:,0].shape, torch.squeeze(model(input_ids, input_dense)).shape, labels.shape)
        outputs = input_dense[:,0] + 20 * sig(torch.squeeze(model(input_ids, input_dense))) 

        # Compute the loss and its gradients
        loss = loss_fn(torch.squeeze(outputs), labels)
        loss.backward()

        # Adjust learning weights
        optimizer.step()

        # Gather data and report
        running_loss += loss.item()
        # if i % 1000 == 999:
        if i % 1000 == 999:
            last_loss = running_loss / 1000 # loss per batch
            print('  batch {} loss: {}'.format(i + 1, last_loss))
            # tb_x = epoch_index * len(training_loader) + i + 1
            # tb_writer.add_scalar('Loss/train', last_loss, tb_x)
            running_loss = 0.

    return last_loss

In [76]:
train_one_epoch(1)

  batch 1000 loss: 477.61000794219973
  batch 2000 loss: 350.8128782539368
  batch 3000 loss: 334.9473261489868
  batch 4000 loss: 228.19869411087035
  batch 5000 loss: 415.52515966796875
  batch 6000 loss: 314.66726699066163
  batch 7000 loss: 458.8694084892273
  batch 8000 loss: 542.8520246429443
  batch 9000 loss: 491.32740203475953
  batch 10000 loss: 327.92970405960085
  batch 11000 loss: 267.48425094985964
  batch 12000 loss: 285.04578519058225
  batch 13000 loss: 341.48575658798217
  batch 14000 loss: 429.9701660423279
  batch 15000 loss: 422.77308324813845
  batch 16000 loss: 262.1191298904419
  batch 17000 loss: 263.84964530563354
  batch 18000 loss: 290.9429146537781
  batch 19000 loss: 313.31634185409547
  batch 20000 loss: 312.93155073165894
  batch 21000 loss: 334.437187297821
  batch 22000 loss: 490.98245753479006
  batch 23000 loss: 566.5439945259094
  batch 24000 loss: 473.8431470413208
  batch 25000 loss: 298.2583898925781
  batch 26000 loss: 440.8435456466675
  batch 

398.8134397659302