# Hands-On Learning to Rank (LTR)


### Include required packages

In [6]:
from __future__ import print_function
import sys
import os
import os.path
import csv
import re
import math
import random
import datetime
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from IPython.display import display, HTML

display(HTML(data="""
<style>
    div#notebook-container    { width: 95%; }
    div#menubar-container     { width: 65%; }
    div#maintoolbar-container { width: 99%; }
</style>
"""))

def print_message(s):
    print("[{}] {}".format(datetime.datetime.utcnow().strftime("%b %d, %H:%M:%S"), s), flush=True)

### Define train and test data readers 

In [7]:
class DataUtils:

    @staticmethod
    def parse_line(line):
        tokens                              = line.strip().split(' ')
        qid                                 = -1
        feat                                = []
        label                               = int(tokens[0])
        for i in range(FEAT_COUNT):
            feat.append(0)
        for i in range(1, len(tokens)):
            sub_tokens                      = tokens[i].split(':')
            if sub_tokens[0] == 'qid':
                qid                         = int(sub_tokens[1])
            else:
                feat_idx                    = int(sub_tokens[0])
                feat_val                    = float(sub_tokens[1])
                feat[feat_idx - 1]          = int(feat_val * FEAT_SCALE)
        return qid, label, feat
    
    
class DataReaderTrain():

    def __init__(self, data_file):
        self.data_file                      = data_file
        self.__load_data(self.data_file)

    def __iter__(self):
        self.__allocate_minibatch()
        return self

    def __load_data(self, data_file):
        self.data                           = {}
        with open(data_file, mode='r', encoding="utf-8") as f:
            for line in f:
                qid, label, feat            = DataUtils.parse_line(line)
                if qid not in self.data:
                    self.data[qid]          = {}
                if label not in self.data[qid]:
                    self.data[qid][label]   = []
                self.data[qid][label].append(feat)
        self.data                           = {k: v for k, v in self.data.items() if len(v) > 1}
        self.qids                           = list(self.data.keys())
    
    def __allocate_minibatch(self):
        self.features                       = [np.zeros((MB_SIZE, FEAT_COUNT), dtype=np.float32) for i in range(2)]
        self.labels                         = np.zeros((MB_SIZE), dtype=np.int64)
        
    def __clear_minibatch(self):
        for i in range(2):
            self.features[i].fill(np.float32(0))
            
    def __next__(self):
        self.__clear_minibatch()
        qids                                = random.sample(self.qids, MB_SIZE)
        for i in range(MB_SIZE):
            labels                          = random.sample(self.data[qids[i]].keys(), 2)
            labels.sort(reverse=True)
            for j in range(2):
                feats                       = self.data[qids[i]][labels[j]]
                feat                        = feats[random.randint(0, len(feats) - 1)]
                for k in range(FEAT_COUNT):
                    self.features[j][i, k]  = feat[k] / FEAT_SCALE
        return [torch.from_numpy(self.features[i]).to(DEVICE) for i in range(2)], torch.from_numpy(self.labels).to(DEVICE)
    
    
class DataReaderTest():

    def __init__(self, data_file):
        self.data_file                      = data_file

    def __iter__(self):
        self.reader                         = open(self.data_file, mode='r', encoding="utf-8")
        self.__allocate_minibatch()
        return self
    
    def __allocate_minibatch(self):
        self.features                       = np.zeros((MB_SIZE, FEAT_COUNT), dtype=np.float32)
        self.labels                         = np.zeros((MB_SIZE), dtype=np.int64)
        
    def __clear_minibatch(self):
        self.features.fill(np.float32(0))
            
    def __next__(self):
        self.__clear_minibatch()
        qids                                = []
        labels                              = []
        cnt                                 = 0
        for i in range(MB_SIZE):
            line                            = self.reader.readline()
            if line == '':
                raise StopIteration
                break
            qid, label, feat                = DataUtils.parse_line(line)
            qids.append(qid)
            labels.append(label)
            for j in range(FEAT_COUNT):
                self.features[i, j]         = feat[j] / FEAT_SCALE
            cnt                            += 1
        return torch.from_numpy(self.features).to(DEVICE), qids, labels, cnt

### Define the model

In [8]:
class DNN(torch.nn.Module):
    
    def __init__(self):
        super(DNN, self).__init__()
        layers              = []
        last_dim            = FEAT_COUNT
        for i in range(NUM_HIDDEN_LAYERS):
            layers.append(nn.Linear(last_dim, NUM_HIDDEN_NODES))
            layers.append(nn.ReLU())
            layers.append(nn.LayerNorm(NUM_HIDDEN_NODES))
            layers.append(nn.Dropout(p=DROPOUT_RATE))
            last_dim        = NUM_HIDDEN_NODES
        layers.append(nn.Linear(last_dim, 1))
        layers.append(nn.ReLU())
        self.model          = nn.Sequential(*layers)

    def forward(self, x):
        return self.model(x) * SCALE
    
    def parameter_count(self):
        return sum(p.numel() for p in self.parameters() if p.requires_grad)

### Define data paths and readers

In [10]:
print_message('Starting')
DATA_DIR                    = 'letor/'
DATA_FILE_TRAIN             = os.path.join(DATA_DIR, 'train-small.txt')
DATA_FILE_TEST              = os.path.join(DATA_DIR, 'test.txt')
MODEL_FILE                  = os.path.join(DATA_DIR, "ltr.{}.dnn")
FEAT_COUNT                  = 136
FEAT_SCALE                  = 1000
MB_SIZE                     = 1024
READER_TRAIN                = DataReaderTrain(DATA_FILE_TRAIN)
READER_TRAIN_ITER           = iter(READER_TRAIN)
print_message('Data loaded')

[Jan 27, 07:57:17] Starting
[Jan 27, 07:57:51] Data loaded


### Train and evaluate

In [None]:
#DEVICE                      = torch.device("cuda:1")
DEVICE                      = torch.device("cpu")
NUM_HIDDEN_NODES            = 128
NUM_HIDDEN_LAYERS           = 3
EPOCH_SIZE                  = 8192
NUM_EPOCHS                  = 32
LEARNING_RATE               = 0.0001
DROPOUT_RATE                = 0.5
SCALE                       = torch.tensor([1], dtype=torch.float).to(DEVICE)

def train(net):
    train_loss              = 0.0
    net.train()
    for mb_idx in range(EPOCH_SIZE):
        features, labels    = next(READER_TRAIN_ITER) #Read in a new mini-batch of data!
        out                 = torch.cat(tuple([net(features[i]) for i in range(2)]), 1)
        loss                = criterion(out, labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        train_loss         += loss.item()
    return train_loss / EPOCH_SIZE

def test(net, ep_idx, train_loss):
    net.eval()
    reader_test             = DataReaderTest(DATA_FILE_TEST)
    reader_test_iter        = iter(reader_test)
    results                 = {}
    for features, qids, labels, cnt in reader_test_iter:
        out                 = net(features).data.cpu()
        row_cnt             = len(qids)
        for i in range(row_cnt):
            if qids[i] not in results:
                results[qids[i]] = []
            results[qids[i]].append((labels[i], out[i][0]))
    avgndcg                 = 0
    avgdcg                  = 0
    for qid, docs in results.items():
        dcg                 = 0
        ranked              = sorted(docs, key=lambda x: x[1], reverse=True)
        for i in range(min(10, len(ranked))):
            rank            = i + 1
            label           = ranked[i][0]
            dcg            += ((2**label - 1) / math.log2(rank + 1))
        idcg                = 0
        ranked              = sorted(docs, key=lambda x: x[0], reverse=True)
        for i in range(min(10, len(ranked))):
            rank            = i + 1
            label           = ranked[i][0]
            idcg           += ((2**label - 1) / math.log2(rank + 1))
        avgdcg             += dcg
        if idcg > 0:
            avgndcg        += (dcg / idcg)
    avgdcg                 /= len(results)
    avgndcg                /= len(results)
    print_message('epoch:{}, loss: {}, dcg: {}, ndcg: {}'.format(ep_idx, train_loss, avgdcg, avgndcg))

torch.manual_seed(1)
net                         = DNN().to(DEVICE)
criterion                   = nn.CrossEntropyLoss()
optimizer                   = optim.Adam(net.parameters(), lr=LEARNING_RATE)
print_message('Number of learnable parameters: {}'.format(net.parameter_count()))
print_message('Learning rate: {}'.format(LEARNING_RATE))
test(net, 0, 'n/a')
for ep_idx in range(NUM_EPOCHS):
    train_loss              = train(net)
    test(net, ep_idx + 1, str(train_loss))
print_message('Finished training')

[Jan 27, 08:05:21] Number of learnable parameters: 51457
[Jan 27, 08:05:21] Learning rate: 0.0001
[Jan 27, 08:06:03] epoch:0, loss: n/a, dcg: 3.5303611055939688, ndcg: 0.1742251117653203
