In [34]:
import numpy as np
import pickle
import torch
import torch.nn as nn
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split
from torch.utils.data import TensorDataset, DataLoader

In [35]:
with open('./train.pickle', 'rb') as file:
    dataset = pickle.load(file)

In [36]:
data_by_size_dict = dict()
for qid, docs in dataset.items():
    labels = np.array([doc[0] for doc in docs])
    features = np.vstack([doc[1] for doc in docs])
    sort_ind = np.flip(np.argsort(labels))
    labels = labels[sort_ind].reshape(-1,1)
    if labels.max() == labels.min():
        continue
    features = features[sort_ind]
    if labels.shape[0] in data_by_size_dict.keys():
        data_by_size_dict[labels.shape[0]][0].append(features)
        data_by_size_dict[labels.shape[0]][1].append(labels)
    else:
        data_by_size_dict[labels.shape[0]] = ([features], [labels])
keys = list(data_by_size_dict.keys())
keys.sort()
data_by_size = []
for key in keys:
    value = data_by_size_dict[key]
    data_by_size.append((np.array(value[0]), np.array(value[1])))
del data_by_size_dict
del dataset

In [4]:
n_features = data_by_size[0][0].shape[2]

In [5]:
def tt_split(data_list, test_size):
    train_list = []
    test_list = []
    for data in data_list:
        if len(data[0]) < 10:
            test_list.append((torch.FloatTensor(data[0]).cuda(),
                              torch.LongTensor(data[1]).cuda()))
            continue
        X_train, X_test = train_test_split(data[0], test_size = test_size, train_size=1 - test_size)
        y_train, y_test = train_test_split(data[1], test_size = test_size, train_size=1 - test_size)
        train_list.append((torch.FloatTensor(X_train).cuda(),torch.LongTensor(y_train).cuda()))
        test_list.append((torch.FloatTensor(X_test).cuda(),torch.LongTensor(y_test).cuda()))
    return train_list, test_list

In [6]:
train_list, test_list = tt_split(data_by_size, test_size=0.2)

In [7]:
torch.cuda.memory_allocated()/2**20

935.1435546875

In [8]:
def nDCG(model, data_list):
    model = model.eval()
    log2 = torch.tensor(np.log(2)).cuda()
    acc_list = []
    for batch in data_list:   
        with torch.no_grad():
            features = batch[0]
            labels = batch[1]
            scores = model.forward(features)
            n_docs = labels.shape[1]
            order = 1 + torch.sort(torch.sort(scores, dim=1, descending=True)[1], dim=1)[1]
            true_order = torch.arange(1, labels.shape[1] + 1).reshape(-1,1)\
                .repeat(labels.shape[0],1,1).cuda()
            explabel = torch.exp(labels*log2)
            logorder = 1 / (torch.log(order.float()) + 1)
            DCGmax = ((explabel - 1)/(torch.log(true_order.float()) + 1))\
                .sum(dim=1, keepdim=True)
            DCG = ((explabel - 1)/(torch.log(order.float()) + 1)).sum(dim=1, keepdim=True)
            acc_list.append((DCG.shape[0], ((DCG+0.001)/(DCGmax+0.001)).sum().item()))
    acc = 0
    total_queries = 0
    for n_queries, nDCGsum in acc_list:
        total_queries += n_queries
        acc += nDCGsum
    return acc/total_queries

In [9]:
class Model(torch.nn.Module):
    def __init__(self, in_features, n_layers, layer_size):
        super().__init__()
        self.layers = nn.ModuleList([])
        layer_struct = [layer_size for i in range(n_layers)]
        layer_struct = [in_features] + layer_struct
        for i in range(1, n_layers + 1):
            self.layers.append(nn.Linear(layer_struct[i-1], layer_struct[i]))
        self.out_layer = nn.Linear(layer_struct[-1], 1)
        self.drop = nn.Dropout(p=0.5)
        
    def forward(self, x):
        y_pred = x
        for layer in self.layers:
            y_pred = layer(y_pred)
            y_pred = y_pred + torch.relu(y_pred)
            y_pred = self.drop(y_pred)
        y_pred = self.out_layer(y_pred)
        return y_pred

In [59]:
model = Model(n_features, 10, 500).cuda()
curr_test = 0

In [19]:
print(sum([len(batch[0]) for batch in train_list]))
print(sum([len(batch[0]) for batch in train_list[3:40]]))

14857
12086


In [67]:
optimizer = torch.optim.Adam(model.parameters(), lr=0.000008)

log2 = torch.tensor(np.log(2)).cuda()
epochs = 10

optimizer.zero_grad()
for ep in range(epochs):
    for batch in train_list[3:40]:
        model.train()
        features = batch[0]
        labels = batch[1]
        scores = model.forward(features)
        n_docs = labels.shape[1]
        with torch.no_grad():
            order = 1 + torch.sort(torch.sort(scores, dim=1, descending=True)[1], dim=1)[1]
            true_order = torch.arange(1, labels.shape[1] + 1).reshape(-1,1)\
                .repeat(labels.shape[0],1,1).cuda()
            explabel = torch.exp(labels*log2)
            logorder = 1 / (torch.log(order.float()) + 1)
            DCGmax = ((explabel - 1)/(torch.log(true_order.float()) + 1)).sum(dim=1, keepdim=True)
            dNDCGabs = ((logorder.repeat(1,1,n_docs) - logorder.repeat(1,1,n_docs).transpose(1,2)) * \
                (explabel.repeat(1,1,n_docs) - explabel.repeat(1,1,n_docs).transpose(1,2))).abs()/(0.01 + DCGmax)
            dscores = scores.repeat(1,1,n_docs) - scores.repeat(1,1,n_docs).transpose(1,2)
            smat = torch.sign(labels.repeat(1,1,n_docs) - labels.repeat(1,1,n_docs).transpose(1,2))
            lambdaij = smat*torch.sigmoid(-smat*dscores)*dNDCGabs
            lambdai = lambdaij.sum(dim=1) - lambdaij.sum(dim=2)
        loss = (scores.view(-1,n_docs)*lambdai).sum()
        loss = loss.backward()
        optimizer.step()
        optimizer.zero_grad()
    test_metric = nDCG(model, test_list)
    print(f'Train: {nDCG(model, train_list):.5f}, test: {test_metric:.5f}')
    if test_metric > curr_test:
        curr_test = test_metric
        torch.save(model.state_dict(), 'temp.pt')


Train: 0.82398, test: 0.81869
Train: 0.82394, test: 0.81889
Train: 0.82398, test: 0.81893
Train: 0.82410, test: 0.81901
Train: 0.82408, test: 0.81887
Train: 0.82399, test: 0.81895
Train: 0.82400, test: 0.81887
Train: 0.82395, test: 0.81896
Train: 0.82386, test: 0.81889
Train: 0.82396, test: 0.81889


In [68]:
nDCG(model, test_list)

0.8188916741338881

In [268]:
torch.save(model.state_dict(), '.pt')

In [311]:
model.load_state_dict(torch.load('temp.pt'))

<All keys matched successfully>

----------

In [276]:
with open('./test.pickle', 'rb') as file:
    testset = pickle.load(file)

In [312]:
log2 = np.log(2)
docs_processed = 0
model.eval()
filename = 'netsol_3.txt'
with open('./'+filename, 'w') as file:
    file.write('QueryId,DocumentId\n')
    for qid, docs in testset.items():
        X = torch.tensor(np.vstack(tuple((doc[1] for doc in docs)))).cuda().float()
        doc_ids = []
        for doc in docs:
            docs_processed += 1
            doc_ids.append(docs_processed)
        doc_ids = np.array(doc_ids).reshape(-1)
        scores = model(X).reshape(-1)
#         scores = torch.rand(len(doc_ids))
        ordered_index = torch.sort(scores, descending=True)[1]
        ordered_ids = doc_ids[ordered_index.cpu()].reshape(-1)
        for i in range(len(ordered_ids)):
            file.write(str(qid)+','+str(ordered_ids[i])+'\n')