### imports
***

In [1]:
import torch, math
import torch.nn as nn
import torch.nn.functional as F
from torch.utils import data
from torch import optim
from torch.optim.lr_scheduler import LambdaLR

from transformers import AdamW, get_linear_schedule_with_warmup
from transformers import AutoTokenizer, AutoModel

from gensim import corpora, similarities, models
from gensim.matutils import corpus2csc

import pandas as pd 
import numpy as np
from collections import Counter
import base64
from tqdm import tqdm
import pickle, random
import matplotlib.pyplot as plt
from multiprocessing import Pool
import sys, csv, json, os, gc, time

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


### parameters
***

In [2]:
# ---- device -----------------------------------------------------------
no = '0'  # CUDA device ordinal, kept as a string so it can be appended to 'cuda:'
if torch.cuda.is_available():
    device = torch.device('cuda:' + no)
else:
    device = torch.device('cpu')
print(device)

# ---- sampling / optimisation hyper-parameters -------------------------
k = 10              # negative-sampling rounds per query (see get_data)
lr = 1e-5           # learning rate handed to the optimizer
# true batch size = batch_size * grad_step
batch_size = 32     # DataLoader batch size
margin = 8          # not referenced by the code visible in this notebook
grad_step = 2       # batches accumulated before each optimizer step
max_lang_len = 15   # token sequences longer than this are truncated
max_img_len = 30    # at most this many region rows are kept per image
epochs = 1
MOD = 20000         # the running loss is printed every MOD batches

# ---- reproducibility --------------------------------------------------
seed = 7328
random.seed(seed)
torch.manual_seed(seed)

# Order in which the 20 training folds are visited (shuffled once, seeded).
fold_idxs = list(range(20))
random.shuffle(fold_idxs)

cuda:0


In [3]:
class params:
    """Static model / optimiser settings, read as plain class attributes."""

    # Input widths: 2048-d region feature plus the 6 box columns that
    # load_data appends (4 normalised coordinates + 2 derived values).
    IMG_FEAT_SIZE = 2048 + 6
    WORD_EMBED_SIZE = 1024

    # Transformer-style dimensions.
    LAYER = 6
    HIDDEN_SIZE = 1024
    MULTI_HEAD = 8
    DROPOUT_R = 0.1
    FF_SIZE = 4 * HIDDEN_SIZE
    HIDDEN_SIZE_HEAD = HIDDEN_SIZE // MULTI_HEAD

    # Attention-flatten head sizes.
    FLAT_MLP_SIZE = 512
    FLAT_GLIMPSES = 1
    FLAT_OUT_SIZE = 2048

    # Optimiser constants.
    OPT_BETAS = (0.9, 0.98)
    OPT_EPS = 1e-9

    # Used by the training cell to estimate the total number of steps.
    TRAIN_SIZE = 3000000


# Shared configuration instance passed to the model constructor.
__C = params()

### load data
***

In [4]:
# Leading tokens that are dropped when they open a query.
trash = {'!', '$', "'ll", "'s", ',', '&', ':', 'and', 'cut', 'is', 'are', 'was'}
# Literal substrings that are deleted wherever they occur in a query.
trash_replace = ['"hey siri, play some', 'however, ', 'yin and yang, ',
                 'shopping mall/']

def process(x):
    """Clean one query string.

    Steps, in order:
      1. if the first whitespace-separated token is in ``trash``, drop it
         (this also collapses the remaining whitespace to single spaces);
      2. if the first token starts with '-', drop the first character of the
         query;
      3. delete every occurrence of each ``trash_replace`` substring.

    Args:
        x: the raw query string.

    Returns:
        The cleaned query. An empty or whitespace-only query is returned
        unchanged (previously this raised IndexError on ``tmp[0]``).
    """
    tmp = x.split()
    if not tmp:
        # Nothing to inspect; avoids indexing into an empty token list.
        return x
    first = tmp[0]
    if first in trash:
        x = ' '.join(tmp[1:])
    if first[0] == '-':
        x = x[1:]
    for tr in trash_replace:
        x = x.replace(tr, '')
    return x

def normalize(x):
    """Scale a row's boxes by the image size and append two derived columns.

    Args:
        x: a mapping (e.g. a DataFrame row) with 'boxes' (an (n, 4) array),
           'image_h' and 'image_w'.

    Returns:
        An (n, 6) array: the four scaled coordinates (columns 0 and 2 divided
        by image_h, columns 1 and 3 by image_w), then the product and the
        ratio of the two side lengths. The input 'boxes' array is not modified.
    """
    scaled = x['boxes'].copy()  # copy so the caller's array is left untouched
    for col, dim in enumerate(('image_h', 'image_w', 'image_h', 'image_w')):
        scaled[:, col] /= x[dim]

    side_a = scaled[:, 2] - scaled[:, 0]
    side_b = scaled[:, 3] - scaled[:, 1]
    product = side_a * side_b
    ratio = side_a / (side_b + 1e-6)  # epsilon guards against zero-length sides
    return np.column_stack((scaled, product, ratio))

def sort_by_area(x):
    """Return the rows of a 2-D array sorted by their last column, descending.

    Rows with equal keys keep their original relative order.

    NOTE(review): the name says "area", but for arrays produced by
    ``normalize`` the last column is the side ratio and the area-like product
    is the second-to-last column - confirm which key is intended.
    """
    rows = x.tolist()
    rows.sort(key=lambda row: row[-1], reverse=True)
    return np.array(rows)

def load_data(file_name, reset=False, decode=True):
    """Read a tab-separated data file into a DataFrame and prepare its columns.

    Args:
        file_name: path of the TSV file.
        reset: when True, overwrite 'query_id' with a fresh id per distinct
            (cleaned) query string.
        decode: when True, base64-decode the 'boxes', 'features' and
            'class_labels' columns, replace 'boxes' by ``normalize`` output and
            append those box columns to 'features' (truncated to
            ``max_img_len`` rows).

    Returns:
        The DataFrame, with 'query' cleaned by ``process`` in every case.
    """
    def _decode(blob, dtype, width):
        # base64 text -> flat buffer -> (n, width) array
        return np.frombuffer(base64.b64decode(blob), dtype=dtype).reshape(-1, width)

    frame = pd.read_csv(file_name, sep='\t')

    if decode:
        column_specs = (('boxes', np.float32, 4),
                        ('features', np.float32, 2048),
                        ('class_labels', np.int64, 1))
        for column, dtype, width in column_specs:
            frame[column] = frame[column].apply(lambda blob, d=dtype, w=width: _decode(blob, d, w))
        frame['boxes'] = frame.apply(normalize, axis=1)
        frame['features'] = frame.apply(
            lambda row: np.concatenate((row['features'], row['boxes']), axis=1)[:max_img_len],
            axis=1)

    frame['query'] = frame['query'].apply(process)

    # reset query_id
    if reset:
        query2qid = {query: qid for qid, (query, _) in enumerate(tqdm(frame.groupby('query')))}
        frame['query_id'] = frame.apply(lambda row: query2qid[row['query']], axis=1)

    return frame

In [5]:
path = './'
# Validation split, scored against `answers` during training.
test = load_data(path+'valid.tsv')
# NOTE: despite the variable name, this frame is read from testB.tsv; the
# name is kept because later cells refer to `testA`.
testA = load_data(path+'testB.tsv')
# Ground truth for the validation queries; the context manager makes sure the
# file handle is closed (it was previously left open).
with open(path+'valid_answer.json', 'r') as _answer_file:
    answers = json.load(_answer_file)

### preprocess
***

In [6]:
# load pre-trained model
take = 'bert-large-uncased'
emb_size = __C.WORD_EMBED_SIZE
tokenizer = AutoTokenizer.from_pretrained(take)
pretrained_emb = AutoModel.from_pretrained(take)
pad_id = tokenizer.pad_token_id
sep_id = tokenizer.sep_token_id


def _clip_tokens(ids):
    """Truncate a token-id list to max_lang_len, ending it with sep_id."""
    if len(ids) > max_lang_len:
        return ids[:max_lang_len-1] + [sep_id]
    return ids


# Tokenise each distinct query once, then attach the (clipped) ids to every
# row of the frame. The same steps are applied to both evaluation frames.
for _frame in (test, testA):
    qid2token = {qid: tokenizer.encode(group['query'].values[0])
                 for qid, group in tqdm(_frame.groupby('query_id'))}
    _frame['token'] = _frame['query_id'].apply(lambda x: qid2token[x])
    _frame['token'] = _frame['token'].apply(_clip_tokens)

100%|██████████| 496/496 [00:00<00:00, 2808.17it/s]
100%|██████████| 994/994 [00:00<00:00, 2862.14it/s]


### training data
***

In [7]:
def one(x):
    """Return 1 for every element of ``x`` (scalar or array).

    Dividing the shifted value by itself keeps the result the same shape as
    the input; it is used as the ``wlocal`` weight in ``get_negs``.
    """
    shifted = x + 1e-10
    return shifted / shifted

def get_negs(train):
    """Pick, for every query, candidate negative queries by TF-IDF similarity.

    One document is built per 'query_id' from the group's first query string.
    Documents are compared through the product of their TF-IDF vectors (local
    term weight fixed by ``one``), and for each query the ``topk`` most similar
    queries are kept, topped up with random queries when fewer than ``topk``
    have a stored similarity.

    Args:
        train: DataFrame with 'query_id' and 'query' columns.

    Returns:
        (qid2negs, qid2idxs):
          qid2negs - query_id -> list of candidate negative query_ids
                     (the query itself is removed);
          qid2idxs - query_id -> index labels of the rows of that query.
    """
    qid2idxs = {}
    corpus = []
    idx2qid = {}

    for idx, (qid, group) in enumerate(train.groupby('query_id')):
        qid2idxs[qid] = group.index
        corpus.append(group['query'].values[0])
        idx2qid[idx] = qid

    # Three times the largest group, so that get_data can always sample one
    # distinct negative query per positive row.
    topk = len(max(qid2idxs.values(), key=lambda x: len(x)))*3
    corpus = [sent.split() for sent in corpus]
    dictionary = corpora.Dictionary(corpus)
    corpus = [dictionary.doc2bow(text) for text in corpus]
    tfidf_model = models.TfidfModel(corpus, wlocal=one, dictionary=dictionary)
    corpus_tfidf = corpus2csc(tfidf_model[corpus])
    # document-by-document similarity; rows are walked through indptr below
    # (presumably CSR; the matrix is symmetric either way)
    sm = corpus_tfidf.T.dot(corpus_tfidf)
    qid2negs = {}

    for idx, (le, ri) in enumerate(tqdm(zip(sm.indptr[:-1], sm.indptr[1:]), total=sm.shape[0])):
        n_row_pick = min(topk, ri-le)
        if n_row_pick > 0:
            top_n_idx = sm.indices[le+np.argpartition(sm.data[le:ri], -n_row_pick)[-n_row_pick:]].tolist()
        else:
            # Row without stored entries (e.g. a query with no tokens):
            # argpartition cannot be called on an empty slice, so fall
            # through to the random top-up only.
            top_n_idx = []
        if n_row_pick < topk:
            top_n_idx += random.sample(range(sm.shape[0]), topk-n_row_pick)
        qid2negs[idx2qid[idx]] = [idx2qid[neg] for neg in top_n_idx if neg != idx]

    return qid2negs, qid2idxs

In [8]:
def get_data(train, qid2negs, qid2idxs):
    """Build (tokens, positive row, negative row) training triples.

    For every query, ``k`` rounds are run; each round pairs every positive row
    of the query with one row drawn at random from a distinct, randomly chosen
    negative query.

    Args:
        train: DataFrame with 'query_id' and 'token' columns.
        qid2negs: query_id -> candidate negative query_ids (see get_negs).
        qid2idxs: query_id -> index labels of that query's rows.

    Returns:
        A list of ``[token, positive_index, negative_index]`` entries.
    """
    triples = []  # [tokens, feature1, feature2] once indices are resolved

    for qid, group in tqdm(train.groupby('query_id')):
        # positive rows and the (shared) token ids of this query
        pos_rows = group.index
        n_pos = len(pos_rows)
        query_tokens = group['token'].iloc[0]
        # k independent draws of negatives
        for _ in range(k):
            neg_qids = random.sample(qid2negs[qid], n_pos)
            neg_rows = [random.choice(qid2idxs[other]) for other in neg_qids]
            triples.extend([query_tokens, pos_rows[j], neg_rows[j]] for j in range(n_pos))

    print('number of training data:', len(triples))
    return triples

### model
***

In [9]:
class FC(nn.Module):
    """Linear layer optionally followed by ReLU, then dropout.

    Args:
        in_size: input feature size.
        out_size: output feature size.
        dropout_r: dropout probability (0 leaves activations unchanged).
        use_relu: apply ReLU after the linear layer. This flag used to be
            stored but ignored, so ReLU was always applied.
    """

    def __init__(self, in_size, out_size, dropout_r=0., use_relu=True):
        super(FC, self).__init__()
        self.dropout_r = dropout_r
        self.use_relu = use_relu

        self.linear = nn.Linear(in_size, out_size)
        # Both sub-modules are always created (neither has parameters), so the
        # module layout does not depend on the flags.
        self.relu = nn.ReLU(inplace=True)
        self.dropout = nn.Dropout(dropout_r)

    def forward(self, x):
        x = self.linear(x)
        if self.use_relu:
            x = self.relu(x)
        x = self.dropout(x)
        return x


class MLP(nn.Module):
    """Two-layer perceptron: an ``FC`` block followed by a plain linear layer.

    Args:
        in_size: input feature size.
        mid_size: size of the hidden representation produced by ``FC``.
        out_size: output feature size.
        dropout_r, use_relu: forwarded to the ``FC`` block.
    """

    def __init__(self, in_size, mid_size, out_size, dropout_r=0., use_relu=True):
        super().__init__()
        self.fc = FC(in_size, mid_size, dropout_r=dropout_r, use_relu=use_relu)
        self.linear = nn.Linear(mid_size, out_size)

    def forward(self, x):
        hidden = self.fc(x)
        return self.linear(hidden)


class VisualBERT(nn.Module):
    """Scores (query, image) pairs by running text and region features through BERT.

    Query token ids are embedded with BERT's word embeddings, image region
    features are projected to the same width, and the concatenated sequence
    (plus position and token-type embeddings) goes through the BERT encoder.
    The first position's output is mapped to a single score by an MLP.

    Args:
        __C: configuration object providing IMG_FEAT_SIZE and WORD_EMBED_SIZE.
            Python mangles double-underscore names inside a class body, so
            pass this argument positionally.
        bert: a BERT model exposing ``embeddings``, ``encoder``, ``config``,
            ``get_head_mask`` and ``get_extended_attention_mask``.
    """

    def __init__(self, __C, bert):
        super(VisualBERT, self).__init__()

        self.bert = bert
        self.linear_img = nn.Linear(__C.IMG_FEAT_SIZE, __C.WORD_EMBED_SIZE)
        self.out = MLP(__C.WORD_EMBED_SIZE, __C.WORD_EMBED_SIZE//2, 1)

    def forward(self, ques_ix, img_feats):
        """Score the same queries against one or more image batches.

        Args:
            ques_ix: (batch, lang_len) token ids, padded with the global ``pad_id``.
            img_feats: iterable of (batch, img_len, IMG_FEAT_SIZE) tensors;
                all-zero rows are treated as padding.

        Returns:
            A list with one (batch, 1) score tensor per entry of ``img_feats``.
        """
        # Use the device of the inputs instead of the notebook-global `device`.
        dev = ques_ix.device
        proj_feats = []
        for img_feat in img_feats:
            # Make mask & token type ids
            mask = self.make_mask(ques_ix.unsqueeze(2), img_feat, pad_id)
            token = self.get_token_type(ques_ix, img_feat)
            # Preprocess features
            # The word-embedding lookup is deliberately repeated per image
            # batch: the training loop calls backward() separately on each
            # returned score, so the graphs must not share intermediate nodes.
            lang_feat = self.bert.embeddings.word_embeddings(ques_ix)
            img_feat = self.linear_img(img_feat)
            combine_feat = torch.cat((lang_feat, img_feat), dim=1)
            # Token embeddings & position embeddings
            position_ids = torch.arange(token.size(1), dtype=torch.long, device=dev)
            position_ids = position_ids.unsqueeze(0).expand(token.size())
            position_embeddings = self.bert.embeddings.position_embeddings(position_ids)
            token_type_embeddings = self.bert.embeddings.token_type_embeddings(token)
            # Add all
            embeddings = combine_feat+position_embeddings+token_type_embeddings
            embeddings = self.bert.embeddings.dropout(self.bert.embeddings.LayerNorm(embeddings))
            # Go through the rest of BERT
            head_mask = self.bert.get_head_mask(None, self.bert.config.num_hidden_layers)
            extended_attention_mask = self.bert.get_extended_attention_mask(mask, mask.size(), dev)
            encoder_outputs = self.bert.encoder(embeddings,
                                                attention_mask=extended_attention_mask,
                                                head_mask=head_mask,
                                                encoder_hidden_states=None,
                                                encoder_attention_mask=None)
            # CLS embedding & output value
            outputs = encoder_outputs[0][:,0,:]
            proj_feats.append(self.out(outputs))
        return proj_feats

    # Masking
    def make_mask(self, lang_feat, img_feat, target):
        """Build the joint attention mask (1 = attend, 0 = masked).

        Args:
            lang_feat: (batch, lang_len, 1) token ids.
            img_feat: (batch, img_len, feat) region features.
            target: value whose positions in ``lang_feat`` are masked.

        Returns:
            (batch, lang_len + img_len) long tensor.
        """
        # 1 for NOT masked; 0 for masked
        # [batch, len]
        lang_mask = (torch.sum(torch.abs(lang_feat), dim=-1) != target).long()
        img_mask = (torch.sum(torch.abs(img_feat), dim=-1) != 0).long()
        return torch.cat((lang_mask, img_mask), dim=1)

    # Token type ids
    def get_token_type(self, lang_feat, img_feat):
        """Return token-type ids: 0 for every text position, 1 for every image position."""
        #    lang      img
        # 0 0 0 0 0 0 1 1 1 1 1
        dev = lang_feat.device
        lang_token = torch.zeros(lang_feat.size(0), lang_feat.size(1), dtype=torch.long, device=dev)
        img_token = torch.ones(img_feat.size(0), img_feat.size(1), dtype=torch.long, device=dev)
        return torch.cat((lang_token, img_token), dim=1)

### train
***

In [10]:
def predict(model):
    """Rank the candidate products of every validation query.

    Reads the global ``test`` frame; for each 'query_id' group the model
    scores every row and the five highest-scoring product ids are kept.

    Args:
        model: callable as ``model(tokens, (features,))`` returning a sequence
            whose first element holds one score per row.

    Returns:
        dict mapping query_id -> list of up to five product ids, best first.
        The model is switched back to training mode before returning.
    """
    model = model.eval()
    preds = {}

    with torch.no_grad():
        for qid, group in tqdm(test.groupby('query_id')):
            # prepare batch
            tokens, features = group['token'].values.tolist(), group['features'].values.tolist()
            max_len_f = max(len(feature) for feature in features)
            # Zero-pad every feature matrix to the longest one, then stack into
            # ONE array: building a tensor from a list of ndarrays goes through
            # a slow element-by-element path.
            padded = np.stack([
                np.concatenate((feature,
                                np.zeros((max_len_f-feature.shape[0], feature.shape[1]), dtype=feature.dtype)),
                               axis=0)
                for feature in features])
            # # to tensor
            tokens = torch.LongTensor(tokens).to(device)
            features = torch.from_numpy(padded).float().to(device)
            # predict
            out = model(tokens, (features,))[0].view(-1)
            pred = [(pid, val) for pid, val in zip(group['product_id'].values.tolist(), out.tolist())]
            pred.sort(key=lambda x: x[1], reverse=True)
            preds[qid] = [pid for pid, _ in pred[:5]]

    model = model.train()
    return preds

In [11]:
class CustomDataset(data.Dataset):
    """Thin Dataset wrapper around a list of training samples.

    Each sample is indexable, holding tokens at position 0, the positive
    features at position 1 and the negative features at position 2.
    """

    def __init__(self, train_x):
        self.train_x = train_x

    def __len__(self):
        return len(self.train_x)

    def __getitem__(self, index):
        sample = self.train_x[index]
        return [sample[0], sample[1], sample[2]]
    
def collate_fn(batch):
    """Pad and batch a list of (tokens, pos_features, neg_features) samples.

    Token lists are right-padded with the global ``pad_id``; feature matrices
    are padded with all-zero rows up to the longest matrix of their own group
    (positives and negatives are padded independently).

    Args:
        batch: list of samples as returned by ``CustomDataset.__getitem__``.

    Returns:
        (tokens, pos_features, neg_features): a LongTensor of shape
        (batch, max_token_len) and two float32 tensors of shape
        (batch, max_rows, feat).
    """
    def pad_features(matrices):
        # Stack into a single ndarray before converting: creating a tensor
        # from a Python list of ndarrays is very slow.
        longest = max(len(matrix) for matrix in matrices)
        padded = [np.concatenate((matrix,
                                  np.zeros((longest-matrix.shape[0], matrix.shape[1]), dtype=matrix.dtype)),
                                 axis=0)
                  for matrix in matrices]
        return torch.from_numpy(np.stack(padded)).float()

    tokens, pos_features, neg_features = zip(*batch)
    max_len_t = max(len(token) for token in tokens)
    tokens = [token+[pad_id]*(max_len_t-len(token)) for token in tokens]
    return torch.LongTensor(tokens), pad_features(pos_features), pad_features(neg_features)

def custom_schedule(optimizer, num_warmup_steps, num_training_steps, num_cycles=0.5, amplitude=0.1, last_epoch=-1):
    """Linear warm-up followed by a linear decay modulated by a sine wave.

    Args:
        optimizer: optimizer whose learning rate is scheduled.
        num_warmup_steps: steps over which the factor rises from 0 to 1.
        num_training_steps: step at which the linear decay reaches 0.
        num_cycles: number of sine periods over the decay phase.
        amplitude: relative size of the sine modulation.
        last_epoch: forwarded to ``LambdaLR``.

    Returns:
        A ``LambdaLR`` whose factor is ``|linear * (1 + amplitude * sin(phase))|``
        after warm-up. Because of the absolute value, the factor grows again
        once the step count exceeds ``num_training_steps``.
    """
    def lr_lambda(current_step):
        # warm-up phase
        if current_step < num_warmup_steps:
            return float(current_step) / float(max(1, num_warmup_steps))

        # decay phase
        decay_span = float(max(1, num_training_steps-num_warmup_steps))
        steps_done = float(current_step-num_warmup_steps)
        steps_left = float(num_training_steps-current_step)
        phase = 2.0 * math.pi * float(num_cycles) * steps_done / decay_span
        linear = steps_left / decay_span
        wobble = math.sin(phase)*linear*amplitude
        return abs(linear + wobble)

    return LambdaLR(optimizer, lr_lambda, last_epoch)

def shuffle(x):
    """Return the rows of ``x`` in a random order (drawn from ``random``).

    ``x`` must expose ``shape[0]`` and accept a list of positions as index;
    the input itself is not reordered.
    """
    order = list(range(x.shape[0]))
    random.shuffle(order)
    return x[order]

def nDCG_score(preds, answers):
    """Corpus-level nDCG with a log2 rank discount.

    Args:
        preds: dict query_id -> ranked list of predicted ids.
        answers: dict ``str(query_id)`` -> collection of relevant ids.

    Returns:
        Total DCG of the predictions divided by the total ideal DCG, where the
        ideal DCG counts at most five relevant items per entry of ``answers``.
        Returns 0.0 when the ideal DCG is zero (no relevant items at all),
        which previously raised ZeroDivisionError.

    Raises:
        KeyError: if a predicted query id has no entry in ``answers``.
    """
    def discount(rank):
        # gain of a hit at zero-based position `rank`
        return np.log(2)/np.log(rank+2)

    iDCG = sum(sum(discount(i) for i in range(min(len(answer), 5)))
               for answer in answers.values())
    if iDCG == 0:
        return 0.0

    DCG = sum(sum(discount(i) for i, pid in enumerate(preds[qid]) if pid in answers[str(qid)])
              for qid in preds)
    return DCG/iDCG

class FocalLoss(nn.Module):
    """Binary focal loss: ``alpha * (1 - pt) ** gamma * BCE`` with ``pt = exp(-BCE)``.

    Args:
        alpha: constant multiplier of the loss.
        gamma: exponent of the ``(1 - pt)`` modulating factor.
        logits: if True, ``inputs`` are raw logits; otherwise probabilities.
        reduce: if True, return the mean over all elements; otherwise return
            the element-wise loss.
    """

    def __init__(self, alpha=1, gamma=2, logits=False, reduce=True):
        super(FocalLoss, self).__init__()
        self.alpha = alpha
        self.gamma = gamma
        self.logits = logits
        self.reduce = reduce

    def forward(self, inputs, targets):
        # reduction='none' replaces the deprecated reduce=False argument and
        # yields the same element-wise loss.
        if self.logits:
            BCE_loss = F.binary_cross_entropy_with_logits(inputs, targets, reduction='none')
        else:
            BCE_loss = F.binary_cross_entropy(inputs, targets, reduction='none')
        pt = torch.exp(-BCE_loss)
        F_loss = self.alpha * (1-pt)**self.gamma * BCE_loss

        if self.reduce:
            return torch.mean(F_loss)
        else:
            return F_loss

In [12]:
def get_data_merge(idx):
    """Build the training triples for one fold and pickle them to disk.

    Reads ``data/train_{idx}.tsv`` (without decoding the image columns),
    tokenises each distinct query, mines negatives with ``get_negs`` and
    writes the result of ``get_data`` to ``data/train_x_{idx}_{seed}.pkl``.

    Args:
        idx: fold number used in both file names.

    Returns:
        None.
    """
#     train = load_data(path+'train.sample.tsv', reset=True)
    train = load_data(path+'data/train_{}.tsv'.format(idx), reset=True, decode=False)
    # NOTE(review): unlike the evaluation frames, training tokens are not
    # truncated to max_lang_len here - confirm this is intended.
    qid2token = {qid: tokenizer.encode(group['query'].values[0]) for qid, group in train.groupby('query_id')}
    train['token'] = train['query_id'].apply(lambda x: qid2token[x])
    qid2negs, qid2idxs = get_negs(train)
    train_x = get_data(train, qid2negs, qid2idxs)
    # Context manager guarantees the pickle is flushed and the handle closed.
    with open(path+'data/train_x_{}_{}.pkl'.format(idx, seed), 'wb') as out_file:
        pickle.dump(train_x, out_file)
    return None

def load_features(idx):
    """Load one fold's training triples and resolve row indices to features.

    Reads ``data/train_{idx}.tsv`` (with decoding) and the matching
    ``data/train_x_{idx}_{seed}.pkl`` written by ``get_data_merge``; every
    ``[tokens, pos, neg]`` entry has its two indices replaced by the
    'features' value at that row position.

    Args:
        idx: fold number used in both file names.

    Returns:
        list of ``[tokens, pos_features, neg_features]``.
    """
#     train = load_data(path+'train.sample.tsv', reset=True)
    train = load_data(path+'data/train_{}.tsv'.format(idx), reset=True)
    # The pickle is produced locally by get_data_merge; do not point this at
    # untrusted files, since unpickling can execute arbitrary code.
    with open(path+'data/train_x_{}_{}.pkl'.format(idx, seed), 'rb') as in_file:
        train_x = pickle.load(in_file)
    # Stored indices are used positionally, so this presumably relies on the
    # frame having the same row order as when the pickle was written.
    features = train['features']
    train_x = [[t, features.iloc[p], features.iloc[n]] \
               for t, p, n in tqdm(train_x)]
    return train_x

def get_data_chunk(start, end):
    """Load and concatenate the training samples of several folds.

    Args:
        start, end: half-open range of positions into the shuffled
            ``fold_idxs`` list.

    Returns:
        One list holding the output of ``load_features`` for each fold, in
        order. Progress and per-fold loading time are printed.
    """
    collected = []
    for fold in range(start, end):
        began = time.time()
        print('reading fold:', fold)
        collected.extend(load_features(fold_idxs[fold]))
        elapsed = round(time.time() - began)
        minutes, seconds = divmod(elapsed, 60)
        print('time consumed: {} min {} sec'.format(minutes, seconds))
    return collected

In [13]:
# t0 = time.time()
# with Pool(5) as pool:
#     pool.map(get_data_merge, [i for i in range(20)])
# t = round((time.time()-t0)/60)
# print('time consumed: {} hr {} min'.format(t//60, t%60))

In [None]:
print('initializing model...')
nDCGs = []
best_nDCG = 0.0
model = VisualBERT(__C, pretrained_emb).to(device)
# Scheduler horizon: estimated number of optimizer steps over all epochs.
num_training_steps = np.ceil(__C.TRAIN_SIZE*k / (batch_size*grad_step)) * epochs
num_warmup_steps = int(num_training_steps*0.1)
# Evaluate every `eval_steps` batches (in addition to the end of each slice).
eval_steps = num_training_steps//40*grad_step + 100
optimizer = AdamW(model.parameters(), lr=lr, correct_bias=False)
scheduler = custom_schedule(optimizer,
                            num_warmup_steps=num_warmup_steps,
                            num_training_steps=num_training_steps,
                            num_cycles=6,
                            amplitude=0.3)
criterion = FocalLoss()

LOG_PATH = 'log_nDCG_Visual-BERT_pair_box_tfidf-neg_focal_all'
CKPT_PATTERN = path+'models/model_Visual-BERT_pair_box_tfidf-neg_focal_all_{}_{}'


def _evaluate_and_log():
    """Score the model on the validation set, record and persist nDCG@5."""
    preds = predict(model)
    nDCG = nDCG_score(preds, answers)
    nDCGs.append(nDCG)
    # Rewrite the whole history each time; `with` closes the handle.
    with open(LOG_PATH, 'wb') as log_file:
        pickle.dump(nDCGs, log_file)
    print('nDCG@5:', nDCG)
    return preds, nDCG


def _maybe_save_checkpoint():
    """Save the weights once more than 30 evaluations have been recorded."""
    if len(nDCGs) > 30:
        print('saving model...')
        torch.save(model.state_dict(), CKPT_PATTERN.format(seed, len(nDCGs)-1))


# The 20 folds are processed three at a time (the last chunk holds two).
for chunk in range(7):
    train_x_all = get_data_chunk(chunk*3, min((chunk+1)*3, 20))
    random.shuffle(train_x_all)
    # Two training slices per loaded fold.
    steps = (min((chunk+1)*3, 20) - chunk*3) * 2
    step_size = len(train_x_all)//steps

    for step in range(steps):
        train_loader = data.DataLoader(CustomDataset(train_x_all[step_size*step:step_size*(step+1)]),
                                       batch_size=batch_size,
                                       shuffle=True,
                                       collate_fn=collate_fn,
                                       num_workers=2)
        print('train all size:', len(train_loader.dataset))

        print('start training!')
        model = model.train()
        total_loss = 0.0
        # Kept in its own name: the original reused `step`, overwriting the
        # loop variable of the enclosing for-loop.
        n_batches = len(train_loader)
        optimizer.zero_grad()
        pbar = tqdm(enumerate(train_loader), total=n_batches)

        for i, batch in pbar:
            # prepare batch
            tokens, pos_features, neg_features = batch
            # # to device
            tokens = tokens.to(device)
            pos_features = pos_features.to(device)
            neg_features = neg_features.to(device)
            # predict
            pos, neg = model(tokens, (pos_features, neg_features))
            pos = torch.sigmoid(pos).view(-1)
            neg = torch.sigmoid(neg).view(-1)
            # positives are pushed towards 1, negatives towards 0; gradients
            # of both losses accumulate until the next optimizer step
            l = criterion(pos, torch.ones_like(pos))
            l.backward()
            total_loss += l.item()
            l = criterion(neg, torch.zeros_like(neg))
            l.backward()
            total_loss += l.item()
            pbar.set_postfix({'loss': total_loss/(i+1)})
            # optim step
            if (i+1)%grad_step == 0:
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()
            # evaluate
            if (i+1)%eval_steps == 0:
                preds, nDCG = _evaluate_and_log()
                # save models
                _maybe_save_checkpoint()
            # print loss
            if i%MOD == 0: print('Loss:{}'.format(total_loss/(i+1)))

        # Flush gradients left over when the slice length is not a multiple
        # of grad_step (they are zeroed at the start of the next slice).
        if n_batches%grad_step:
            optimizer.step()
            scheduler.step()

        # evaluation
        preds, nDCG = _evaluate_and_log()
        # delete garbage
        del train_loader
        gc.collect()
        # save models
        _maybe_save_checkpoint()

    # delete garbage
    del train_x_all
    gc.collect()

initializing model...
reading fold: 0


100%|██████████| 114132/114132 [00:12<00:00, 8869.85it/s]
100%|██████████| 1500000/1500000 [00:32<00:00, 46699.55it/s]


time consumed: 2 min 46 sec
reading fold: 1


100%|██████████| 114554/114554 [00:13<00:00, 8736.17it/s]
100%|██████████| 1500000/1500000 [00:32<00:00, 46183.66it/s]


time consumed: 2 min 44 sec
reading fold: 2


100%|██████████| 114314/114314 [00:13<00:00, 8775.52it/s]
100%|██████████| 1500000/1500000 [00:31<00:00, 47437.66it/s]


time consumed: 2 min 45 sec
train all size: 750000
start training!


  0%|          | 1/23438 [00:01<7:05:31,  1.09s/it, loss=0.347]

Loss:0.3467542231082916


	add_(Number alpha, Tensor other)
Consider using one of the following signatures instead:
	add_(Tensor other, *, Number alpha)
 12%|█▏        | 2824/23438 [14:59<2:03:43,  2.78it/s, loss=0.348]

In [None]:
# Display the model object created in the training cell as this cell's output.
model

In [1]:
# Show the nDCG@5 history; `nDCGs` only exists after the training cell has
# run in the current kernel (the recorded NameError below is from a kernel
# where it had not).
nDCGs

NameError: name 'nDCGs' is not defined

### prediction
***

In [None]:
def predict(model, test, pad_len):
    """Score every (query, product) row of ``test`` and return padded rankings.

    NOTE: this definition replaces the earlier one-argument ``predict(model)``
    used by the training cell; re-running that cell after this one would call
    this function with the wrong arguments.

    Args:
        model: callable as ``model(tokens, (features,))`` returning a sequence
            whose first element holds one score per row.
        test: DataFrame with 'query_id', 'token', 'features' and 'product_id'.
        pad_len: fixed number of slots per query; no query may have more rows.

    Returns:
        dict query_id -> list of length ``2 * pad_len``: the product ids sorted
        by descending score and padded with NaN, followed by the matching
        scores padded with NaN. The model is left in eval mode.

    Raises:
        AssertionError: if a query has more than ``pad_len`` rows.
    """
    model.eval()
    preds = {}

    with torch.no_grad():
        for qid, group in tqdm(test.groupby('query_id')):
            # prepare batch
            tokens, features = group['token'].values.tolist(), group['features'].values.tolist()
            max_len_f = max(len(feature) for feature in features)
            # Zero-pad to the longest matrix and stack into one array first;
            # converting a list of ndarrays to a tensor directly is very slow.
            padded = np.stack([
                np.concatenate((feature,
                                np.zeros((max_len_f-feature.shape[0], feature.shape[1]), dtype=feature.dtype)),
                               axis=0)
                for feature in features])
            # # to tensor
            tokens = torch.LongTensor(tokens).to(device)
            features = torch.from_numpy(padded).float().to(device)
            # predict
            out = model(tokens, (features,))[0].view(-1)
            pred = [(pid, val) for pid, val in zip(group['product_id'].values.tolist(), out.tolist())]
            pred.sort(key=lambda x: x[1], reverse=True)
            assert len(pred) <= pad_len
            pid, score = [p for p, s in pred], [s for p, s in pred]
            filler = [np.nan]*(pad_len-len(pred))
            preds[qid] = pid+filler+score+filler

    return preds

In [None]:
# Checkpoint numbers to try; the training cell only saves from evaluation
# number 30 onwards.
folds = list(range(30, 40))
pad_len = 30
header = ['qid'] + ['p'+str(i) for i in range(pad_len)] + ['s'+str(i) for i in range(pad_len)]
# Make sure the output directory exists before the first write.
os.makedirs('predictions', exist_ok=True)

for fold in folds:
    print('seed: {}; fold: {}'.format(seed, fold))
    # load model weights
    ckpt_path = path+'models/model_Visual-BERT_pair_box_tfidf-neg_focal_all_{}_{}'.format(seed, fold)
    try:
        state = torch.load(ckpt_path, map_location=device)
    except FileNotFoundError:
        # Only a missing checkpoint is skipped; any other failure (corrupt
        # file, mismatched weights, interrupt) is no longer silently swallowed.
        print('checkpoint not found, skipping:', ckpt_path)
        continue
    model.load_state_dict(state)
    # predict
    preds = predict(model, testA, pad_len)
    # write to file
    with open('predictions/prediction_all_{}_{}.csv'.format(seed, fold), 'w', newline='') as f:
        w = csv.writer(f)
        w.writerow(header)
        for qid in sorted(preds.keys()):
            w.writerow([qid]+preds[qid])