In [2]:
# Relative paths of the train/test CSV files that are read with pd.read_csv.
TRAIN_DATA_PATH = "./data/dataset/train.csv"
TEST_DATA_PATH = "./data/dataset/test.csv"

In [3]:
import pandas as pd
import re

# Read the train split first, then the test split, each into its own DataFrame.
train_data, test_data = (
    pd.read_csv(csv_path) for csv_path in (TRAIN_DATA_PATH, TEST_DATA_PATH)
)

def fillMissing(df):
    """Replace missing values in the three text columns with the string "missing".

    The TITLE, BULLET_POINTS and DESCRIPTION columns of ``df`` are updated on
    the frame that was passed in, and that same frame is returned.

    Args:
        df: DataFrame that contains TITLE, BULLET_POINTS and DESCRIPTION columns.

    Returns:
        The same DataFrame object with the three text columns filled.
    """
    # Assign the filled column back rather than calling fillna(inplace=True) on
    # `df.COL`: that is a chained in-place call on an intermediate Series and
    # does not update `df` when pandas Copy-on-Write is active.
    for column in ("TITLE", "BULLET_POINTS", "DESCRIPTION"):
        df[column] = df[column].fillna("missing")
    return df

def normalize_text(text):
    """Clean one text string and collapse runs of spaces.

    Steps, in order:
      1. literal substitutions: '""' -> ' inch', '"' -> '', '/p' -> '',
         '/b' -> '', '-' -> ' to ';
      2. drop every character that is not a letter, digit, '.', ':', '/',
         whitespace, '%', '_' or '"', and drop any '_' that sits between digits;
      3. turn the remaining '_' into spaces;
      4. strip, split on single spaces and re-join the non-empty pieces.
    """
    # The order matters: the doubled quote must be handled before the single one.
    literal_substitutions = (
        ('""', ' inch'),
        ('"', ''),
        ('/p', ''),
        ('/b', ''),
        ('-', ' to '),
    )
    for old, new in literal_substitutions:
        text = text.replace(old, new)

    text = re.sub(r'[^a-zA-Z0-9.:/\s%_"]|(?<=\d)_(?=\d)', '', text)
    text = text.replace('_', ' ')

    # Splitting on ' ' (not on arbitrary whitespace) leaves empty pieces for
    # consecutive spaces; filter(None, ...) discards them.
    pieces = filter(None, text.strip().split(' '))
    return u" ".join(pieces)

def lowercase_text(text):
    """Return ``text`` converted to lower case."""
    lowered = text.lower()
    return lowered

# Fill missing text first so the string transforms below never see NaN values.
train_data = fillMissing(train_data)
test_data = fillMissing(test_data)

# Lower-case and then normalize every text column of both splits.
TEXT_COLUMNS = ("TITLE", "BULLET_POINTS", "DESCRIPTION")
for frame in (train_data, test_data):
    for column in TEXT_COLUMNS:
        frame[column] = frame[column].apply(lowercase_text).apply(normalize_text)

train_data

Unnamed: 0,PRODUCT_ID,TITLE,BULLET_POINTS,DESCRIPTION,PRODUCT_TYPE_ID,PRODUCT_LENGTH
0,1925202,artzfolio tulip flowers blackout curtain for d...,luxurious appealing: beautiful custom to made ...,missing,1650,2125.980000
1,2673191,marks spencer girls pyjama sets t862561c navy ...,harry potter hedwig pyjamas 6 to 16 yrs100% co...,missing,2755,393.700000
2,2765088,priknik horn red electric air horn compressor ...,loud dual tone trumpet horn compatible with sx...,specifications: color: red material: aluminium...,7537,748.031495
3,1594019,alishah womens cotton ankle length leggings co...,made by 95%cotton and 5% lycra which gives you...,aishah womens lycra cotton ankel leggings. bra...,2996,787.401574
4,283658,the united empire loyalists: a chronicle of th...,missing,missing,6112,598.424000
...,...,...,...,...,...,...
2249693,2422167,nike womens as w ny df swsh hn kh bra cz7610 t...,material : polyester,missing,3009,1181.100000
2249694,2766635,3pcs goose game cute cartoon enamel pins funny...,inspiration inspired by the untitled goose gam...,pbbrand: xvieonr pbr pbproduct name: fashion c...,3413,125.984252
2249695,1987786,kangroo sweep movement printed wooden wall clo...,dial size: 12 inches in diameterbig clear repr...,wall clocks are very attractive in looks and e...,1574,1200.000000
2249696,1165754,electro voice ekx to brkt15 wall mount bracket...,missing,missing,592,2900.000000


In [244]:
import torch
import torch.nn as nn

# Pick the compute device: prefer GPU index 1, but fall back to GPU 0 on hosts
# that expose only one CUDA device (where "cuda:1" is an invalid ordinal), and
# to the CPU when CUDA is unavailable.
if torch.cuda.is_available():
    _gpu_index = 1 if torch.cuda.device_count() > 1 else 0
    DEVICE = torch.device("cuda:{}".format(_gpu_index))
else:
    DEVICE = torch.device("cpu")

In [245]:
import torchtext.transforms as T
from torch.hub import load_state_dict_from_url

# Special-token ids. bos_idx / eos_idx are the ids added by the AddToken steps
# below; padding_idx is defined here but not referenced by this pipeline.
padding_idx = 1
bos_idx = 0
eos_idx = 2
# Upper bound on tokens per sequence, including the two added special tokens.
max_seq_len = 256
# Remote locations of the XLM-R vocabulary and SentencePiece model.
xlmr_vocab_path = r"https://download.pytorch.org/models/text/xlmr.vocab.pt"
xlmr_spm_model_path = r"https://download.pytorch.org/models/text/xlmr.sentencepiece.bpe.model"

# Tokenize -> map tokens to vocabulary ids -> truncate -> add the BOS and EOS ids.
# NOTE(review): in the cells shown here the collate functions are given
# `xlmr_transform`, not `text_transform`; this pipeline looks unused — confirm.
text_transform = T.Sequential(
    T.SentencePieceTokenizer(xlmr_spm_model_path),
    T.VocabTransform(load_state_dict_from_url(xlmr_vocab_path)),
    # Leave room for the two special tokens added next.
    T.Truncate(max_seq_len - 2),
    T.AddToken(token=bos_idx, begin=True),
    T.AddToken(token=eos_idx, begin=False),
)

from torch.utils.data import DataLoader, Dataset

In [246]:
class AmazonDataset(Dataset):
    """Row-wise view of a labelled product DataFrame.

    Each item is the tuple
    ``(TITLE, DESCRIPTION, BULLET_POINTS, PRODUCT_TYPE_ID, PRODUCT_LENGTH)``
    taken from one row of the wrapped frame.
    """

    # Column order of the tuple returned by __getitem__.
    _ITEM_COLUMNS = (
        "TITLE",
        "DESCRIPTION",
        "BULLET_POINTS",
        "PRODUCT_TYPE_ID",
        "PRODUCT_LENGTH",
    )

    def __init__(self, meta_df):
        """Keep a reference to ``meta_df`` (the frame is not copied)."""
        self.df = meta_df

    def __len__(self):
        """Number of rows in the wrapped frame."""
        return len(self.df)

    def __getitem__(self, index):
        """Return the row at positional ``index`` as a 5-tuple."""
        row = self.df.iloc[index]
        return tuple(row[column] for column in self._ITEM_COLUMNS)
    
class AmazonTestDataset(Dataset):
    """Row-wise view of an unlabelled product DataFrame.

    Each item is the tuple
    ``(TITLE, DESCRIPTION, BULLET_POINTS, PRODUCT_TYPE_ID, PRODUCT_ID)``
    taken from one row of the wrapped frame.
    """

    # Column order of the tuple returned by __getitem__.
    _ITEM_COLUMNS = (
        "TITLE",
        "DESCRIPTION",
        "BULLET_POINTS",
        "PRODUCT_TYPE_ID",
        "PRODUCT_ID",
    )

    def __init__(self, meta_df):
        """Keep a reference to ``meta_df`` (the frame is not copied)."""
        self.df = meta_df

    def __len__(self):
        """Number of rows in the wrapped frame."""
        return len(self.df)

    def __getitem__(self, index):
        """Return the row at positional ``index`` as a 5-tuple."""
        row = self.df.iloc[index]
        return tuple(row[column] for column in self._ITEM_COLUMNS)
    
dev_dataset = AmazonDataset(train_data)
test_dataset = AmazonTestDataset(test_data)

# 80 % of the labelled rows go to training, the remainder to validation.
train_set_size = int(len(dev_dataset) * 0.8)
valid_set_size = len(dev_dataset) - train_set_size

# Seed the split so the same rows land in train/validation on every run;
# an unseeded random_split produces a different partition each time.
SPLIT_SEED = 42
train_dataset_full, validation_dataset_full = torch.utils.data.random_split(
    dev_dataset,
    [train_set_size, valid_set_size],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

## Dataset subsampling

In [247]:
from torch.utils.data import Subset

# Keep every `sampling_at`-th sample of each split (1 keeps everything).
sampling_at = 1


def _strided_subset(dataset, step):
    """Return (indices, Subset) holding every ``step``-th item of ``dataset``."""
    indices = list(range(0, len(dataset), step))
    return indices, Subset(dataset, indices)


train_indices, train_dataset = _strided_subset(train_dataset_full, sampling_at)
val_indices, validation_dataset = _strided_subset(validation_dataset_full, sampling_at)

In [248]:
import os
import torchtext
from torchtext.functional import to_tensor

# Number of samples per DataLoader batch.
BATCH_SIZE = 32
# NOTE(review): NUM_WORKERS is not passed to any DataLoader in the cells shown
# (the train loader sets num_workers=0 explicitly) — confirm whether it is needed.
NUM_WORKERS = os.cpu_count()

# XLM-R base encoder from torchtext: put in eval mode and moved to DEVICE.
xlmr_base = torchtext.models.XLMR_BASE_ENCODER
xlmr_model = xlmr_base.get_model()
xlmr_model.eval()
xlmr_model = xlmr_model.to(DEVICE)
# Text transform bundled with the encoder; handed to the collate functions.
xlmr_transform = xlmr_base.transform()

def data_collate(data, data_transform):
    """Collate labelled samples into encoder features and length targets.

    Args:
        data: list of samples shaped like ``AmazonDataset`` items, i.e.
            ``(title, description, bullets, product_type, product_length)``.
        data_transform: callable that maps a list of strings to token ids
            accepted by ``to_tensor``.

    Returns:
        ``(features, lengths)``. ``features`` is the concatenation along dim 1
        of the ``xlmr_model`` outputs for titles, descriptions and bullets (in
        that order), left on ``DEVICE``. ``lengths`` is a CPU tensor with one
        row per sample holding the target product length.
    """
    titles = [sample[0] for sample in data]
    descriptions = [sample[1] for sample in data]
    bullets = [sample[2] for sample in data]
    lengths = [sample[4] for sample in data]

    # The encoder is only used as a feature extractor here (its outputs are
    # detached), so run it without autograd instead of building a graph and
    # discarding it afterwards.
    encoded = []
    with torch.no_grad():
        for texts in (titles, descriptions, bullets):
            # padding_value=1 pads shorter sequences in the batch with token id 1.
            token_ids = to_tensor(data_transform(texts), padding_value=1)
            encoded.append(xlmr_model(token_ids.to(DEVICE)).detach())

    features = torch.cat(encoded, dim=1)
    lengths = torch.Tensor(lengths).unsqueeze(dim=1)

    return features, lengths

def testData_collate(data, data_transform):
    """Collate unlabelled samples into encoder features and product ids.

    Args:
        data: list of samples shaped like ``AmazonTestDataset`` items, i.e.
            ``(title, description, bullets, product_type, product_id)``.
        data_transform: callable that maps a list of strings to token ids
            accepted by ``to_tensor``.

    Returns:
        ``(features, product_ids)``. ``features`` is the concatenation along
        dim 1 of the ``xlmr_model`` outputs for titles, descriptions and
        bullets (in that order), left on ``DEVICE``. ``product_ids`` is a plain
        Python list of the ids, in batch order (not converted to a tensor).
    """
    titles = [sample[0] for sample in data]
    descriptions = [sample[1] for sample in data]
    bullets = [sample[2] for sample in data]
    product_ids = [sample[4] for sample in data]

    # The encoder is only used as a feature extractor here (its outputs are
    # detached), so run it without autograd instead of building a graph and
    # discarding it afterwards.
    encoded = []
    with torch.no_grad():
        for texts in (titles, descriptions, bullets):
            # padding_value=1 pads shorter sequences in the batch with token id 1.
            token_ids = to_tensor(data_transform(texts), padding_value=1)
            encoded.append(xlmr_model(token_ids.to(DEVICE)).detach())

    features = torch.cat(encoded, dim=1)

    return features, product_ids

def _collate_labelled(samples):
    """Collate train/validation samples with the XLM-R text transform."""
    return data_collate(samples, xlmr_transform)


def _collate_unlabelled(samples):
    """Collate test samples with the XLM-R text transform."""
    return testData_collate(samples, xlmr_transform)


# Training batches are reshuffled every epoch and loaded in the main process.
train_loader = torch.utils.data.DataLoader(
    dataset=train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    collate_fn=_collate_labelled,
    num_workers=0,
)

# Validation and test batches keep the dataset order.
validation_loader = torch.utils.data.DataLoader(
    dataset=validation_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    collate_fn=_collate_labelled,
)

test_loader = torch.utils.data.DataLoader(
    dataset=test_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    collate_fn=_collate_unlabelled,
)

In [249]:
import torch.nn.functional as F

class Regressor(torch.nn.Module):
    """Attentive-pooling regressor over a sequence of encoder features.

    Every position is projected to ``dim_emb`` and passed through tanh, the
    positions are combined with softmax attention weights taken over dim 1, the
    pooled vector is L2-normalised, and a two-layer head turns it into one
    non-negative estimate per sample.

    Args:
        dim_cell: size of the last dimension of the encoder output.
        dim_emb: size of the pooled embedding.
    """

    def __init__(self, dim_cell=768, dim_emb=512):
        super().__init__()

        # attentive pooling layers
        self.embedding = nn.Linear(dim_cell, dim_emb)
        self.linear = nn.Linear(dim_emb, 1)

        # regression head
        self.linear2 = nn.Linear(dim_emb, 256)
        self.estimateLayer = nn.Linear(256, 1)

    def forward(self, encoder_out):
        """Pool ``encoder_out`` and estimate one value per sample.

        Args:
            encoder_out: tensor of shape (batch, seq_len, dim_cell).

        Returns:
            ``(estimate, embedding)`` with shapes (batch, 1) and
            (batch, dim_emb).
        """
        embeds = torch.tanh(self.embedding(encoder_out))  # (batch, seq_len, dim_emb)
        attn_weights = F.softmax(self.linear(embeds), dim=1)  # (batch, seq_len, 1)
        pooled = torch.sum(embeds * attn_weights, dim=1)  # (batch, dim_emb)

        # F.normalize clamps the norm away from zero, so an all-zero pooled
        # vector yields zeros instead of the NaNs a plain division produces.
        embedding = F.normalize(pooled, p=2, dim=-1)

        out = F.relu(self.linear2(embedding))
        # NOTE(review): the final ReLU keeps estimates non-negative but has zero
        # gradient whenever the pre-activation is negative — confirm it is wanted.
        estimate = F.relu(self.estimateLayer(out))

        return estimate, embedding

In [252]:
from torchinfo import summary

# Build the regressor; dim_emb=256 overrides the class default of 512.
model = Regressor(dim_cell=768, dim_emb=256)
# Print a layer-by-layer summary for a dummy input of shape (BATCH_SIZE, 300, 768).
# NOTE(review): 300 looks like a placeholder sequence length for the summary only — confirm.
print(summary(model, input_size=(BATCH_SIZE, 300, 768)))

# Move the model to DEVICE (the collate functions return features on DEVICE).
model = model.to(DEVICE)

Layer (type:depth-idx)                   Output Shape              Param #
Regressor                                [32, 1]                   --
├─Linear: 1-1                            [32, 300, 256]            196,864
├─Linear: 1-2                            [32, 300, 1]              257
├─Linear: 1-3                            [32, 256]                 65,792
├─Linear: 1-4                            [32, 1]                   257
Total params: 263,170
Trainable params: 263,170
Non-trainable params: 0
Total mult-adds (M): 8.42
Input size (MB): 29.49
Forward/backward pass size (MB): 19.80
Params size (MB): 1.05
Estimated Total Size (MB): 50.35


In [251]:
# Smoke test: pull one test batch, run the regressor on it and show the shape
# of the estimates next to the batch's product ids.
batch = next(iter(test_loader))
# batch[0] holds the features, batch[1] the product ids; [0] selects the estimate.
out = model(batch[0].to(DEVICE))[0].detach()
out.shape, batch[1]

(torch.Size([32, 1]),
 [604373,
  1729783,
  1871949,
  1107571,
  624253,
  2782548,
  1605901,
  938007,
  708128,
  1609597,
  500777,
  2736605,
  2217122,
  914243,
  408556,
  2896817,
  172273,
  1718,
  308161,
  148181,
  2829284,
  1056991,
  1167211,
  466450,
  2195336,
  2517255,
  2913047,
  2794652,
  1995489,
  2140554,
  1938386,
  2485380])

## Training iterations

In [253]:
from torch.optim import AdamW
from sklearn import metrics

# Mean-squared-error loss between the model estimate and the target length.
loss_fn = nn.MSELoss()

learning_rate = 1e-3
# Only the Regressor's parameters are optimised; the XLM-R encoder is not included.
optim = AdamW(model.parameters(), lr=learning_rate)

def train_step(input, target):
    """Run one optimisation step on a single batch.

    Args:
        input: feature tensor passed to ``model``.
        target: tensor of target values compared with the model estimate.

    Returns:
        The loss tensor computed for this batch.
    """
    # Clear gradients left over from the previous step before the new backward pass.
    optim.zero_grad()
    prediction = model(input)[0]
    batch_loss = loss_fn(prediction, target)
    batch_loss.backward()
    optim.step()
    return batch_loss


def eval_step(input, target):
    """Compute the loss and the percentage score for one batch.

    The score is ``max(0, 100 * (1 - MAPE))`` where MAPE is
    ``sklearn.metrics.mean_absolute_percentage_error(target, estimate)``.

    Returns:
        ``(loss, score)`` as Python floats.
    """
    prediction = model(input)[0]
    batch_loss = loss_fn(prediction, target).item()

    # The sklearn metric works on NumPy arrays, so move both tensors to the CPU.
    prediction_np = prediction.cpu().detach().numpy()
    target_np = target.cpu().detach().numpy()

    mape = metrics.mean_absolute_percentage_error(target_np, prediction_np)
    score = max(0, 100 * (1 - mape))
    return float(batch_loss), float(score)


def evaluate():
    """Evaluate ``model`` on ``validation_loader``.

    The model is switched to eval mode for the duration of the pass and put
    back into whatever mode it was in before, so calling this between training
    epochs does not leave the model in eval mode.

    Returns:
        ``(mean_loss, mean_score)``: the per-batch loss and per-batch score from
        ``eval_step`` averaged over the number of batches (every batch carries
        the same weight, whatever its size).

    Raises:
        ValueError: if ``validation_loader`` yields no batches.
    """
    was_training = model.training
    model.eval()
    total_loss = 0.0
    total_score = 0.0
    num_val_batches = 0
    try:
        with torch.no_grad():
            for batch in validation_loader:
                input = batch[0].to(DEVICE)
                target = batch[1].to(DEVICE)
                loss, score = eval_step(input, target)
                total_loss += loss
                total_score += score
                num_val_batches += 1
    finally:
        # Restore the caller's mode even if evaluation fails part-way.
        model.train(was_training)

    if num_val_batches == 0:
        raise ValueError("validation_loader produced no batches; cannot average metrics")

    return total_loss / num_val_batches, total_score / num_val_batches

In [230]:
import os

num_epochs = 10
SAVE_PATH = "./chekpoints/"

# `PATH` was never defined in this notebook, so saving only worked with a value
# left over in the kernel. Define it explicitly and make sure the checkpoint
# directory exists before the first save.
os.makedirs(SAVE_PATH, exist_ok=True)
PATH = os.path.join(SAVE_PATH, "regressor.pt")

for e in range(num_epochs):
    # evaluate() calls model.eval(); re-enter training mode at the start of every epoch.
    model.train()
    for batch in train_loader:
        input = batch[0].to(DEVICE)
        target = batch[1].to(DEVICE)
        train_loss = train_step(input, target)

    loss, score = evaluate()
    print("Epoch = [{}], loss = [{}], Score = [{}]".format(e, loss, score))

    # saving checkpoint (the same file is overwritten after every epoch)
    torch.save(model.state_dict(), PATH)



Epoch = [0], loss = [5886537.129166666], Score = [1.3363508383433025]




Epoch = [1], loss = [5853238.023958334], Score = [4.269537130991618]




Epoch = [2], loss = [5801823.4625], Score = [7.930068174997966]




Epoch = [3], loss = [5726305.68125], Score = [13.017815748850504]




Epoch = [4], loss = [5626282.74375], Score = [18.15937678019206]




Epoch = [5], loss = [5493430.6375], Score = [23.096719185511272]




Epoch = [6], loss = [5374118.044010417], Score = [27.04355796178182]




Epoch = [7], loss = [5269659.445833334], Score = [30.20431935787201]




Epoch = [8], loss = [5169950.800260416], Score = [32.9258394241333]




Epoch = [9], loss = [5081459.9215494795], Score = [33.47242976228396]


## Test submission

In [170]:
# Display the preprocessed test frame (no PRODUCT_LENGTH column; that is what gets predicted).
test_data

Unnamed: 0,PRODUCT_ID,TITLE,BULLET_POINTS,DESCRIPTION,PRODUCT_TYPE_ID
0,604373,manuel dhliogravure et de photogravure en reli...,missing,missing,6142
1,1729783,dcgaring microfiber throw blanket warm fuzzy p...,quality guaranteed: luxury cozy plush polyeste...,bdcgaring throw blanketbrbr bsize chart w x l ...,1622
2,1871949,i to match auto parts front license plate brac...,front license plate bracket made of plasticdir...,replacement for the following vehicles:2020 le...,7540
3,1107571,pinmart gold plated excellence in service 1 ye...,available as a single item or bulk packed. sel...,our excellence in service lapel pins feature a...,12442
4,624253,visual mathematics illustrated by the ti to 92...,missing,missing,6318
...,...,...,...,...,...
734731,921419,casual canine basic hoodie for dogs 16 medium ...,brightly colored pet sweatshirts with authenti...,za6015 16 43 size to see chart below: medium: ...,7073
734732,2456362,dive log book: scuba diving logbook for beginn...,missing,missing,1
734733,841529,axor 39135001 citterio widespread faucet with ...,8 to inch centers1/2 to inch ips inlets9 to in...,39135001 features: to ada compliant. to includ...,10645
734734,1190194,carolines treasures bb1801ds812 halloween bass...,indoor or outdoor aluminum artwork prints8 inc...,features. great for inside or outside these al...,12680


In [238]:
from tqdm import tqdm

# Accumulators for the submission: ids and estimated lengths, in loader order.
product_ids = []
product_lengths = []

model.eval()
with torch.no_grad():
    for batch in tqdm(test_loader):
        input = batch[0].to(DEVICE)
        product_id = batch[1]

        # model(...) returns (estimate, embedding); keep the estimate and drop
        # its trailing dimension so there is one value per sample.
        estimate = model(input)[0]
        estimated_out = list(estimate.squeeze(dim=1).cpu().detach().numpy())

        product_ids += product_id
        product_lengths += estimated_out

 36%|█████████████▋                        | 8270/22961 [24:28<43:43,  5.60it/s]IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)

100%|███████████████████████████████████| 22961/22961 [1:08:00<00:00,  5.63it/s]


In [239]:
# One row per test product: its id and the estimated length.
submission_data = dict(PRODUCT_ID=product_ids, PRODUCT_LENGTH=product_lengths)
submission_df = pd.DataFrame(data=submission_data)

submission_df

Unnamed: 0,PRODUCT_ID,PRODUCT_LENGTH
0,604373,504.020538
1,1729783,528.867554
2,1871949,527.113770
3,1107571,528.101379
4,624253,497.492645
...,...,...
734731,921419,528.607300
734732,2456362,499.727112
734733,841529,528.368958
734734,1190194,528.394165


In [241]:
# Write the submission to the working directory without the DataFrame index column.
submission_df.to_csv('submission_2pm_day2.csv', index=False)

In [242]:
# Sanity check: show the largest and smallest predicted length.
max(product_lengths), min(product_lengths)

(528.9225, 448.62976)