In [1]:
import torch
import torch.nn as nn
import torch.optim as optim

In [None]:
import random

In [2]:
import numpy as np

**CLASSIFIER UTILS**

In [1]:
def train_classifier(model, lines, output, batches, iterations=1):
    '''
    Train a sentence classifier with NLL loss and Adam (lr=0.001).

    model: Pytorch class instance which inherits nn.Module
    lines: list of documents, each document is a list of list of tokens' indices
    output: binary labels for each sentence (a label of 2 is remapped to 0)
    batches: iterable of batches, each a sequence of document indices;
             only batches of exactly 20 documents are trained on
    iterations: number of passes over `batches`

    The per-batch losses collected so far are plotted after every iteration.
    Returns None.

    NOTE(review): `plt` (matplotlib.pyplot) is not imported in the visible part
    of this notebook; it has to be in scope already when this function runs.
    NOTE(review): the class dimension of the model output is assumed to be the
    last one (that is how get_summaries indexes it) -- confirm for your model.
    '''
    errors = []
    lossfunc = nn.NLLLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    # Build the inputs on the model's device instead of hard-coding CUDA;
    # the CUDA fallback keeps the old behaviour for parameter-less models.
    device = next((p.device for p in model.parameters()), torch.device('cuda'))
    for _ in range(iterations):
        # enumerate() replaces batches.index(batch): that lookup is O(n) per
        # batch and raises on array-like batches (ambiguous `==`).
        for batch_idx, batch in enumerate(batches):
            try:
                if len(batch) == 20:
                    batch_input = torch.tensor([lines[n] for n in batch],
                                               dtype=torch.long, device=device)
                    # NLLLoss needs integer class indices; a float target raises.
                    truth = torch.tensor([output[n] for n in batch],
                                         dtype=torch.long, device=device)
                    truth[truth == 2] = 0
                    optimizer.zero_grad()
                    pred = model(batch_input)
                    # Flatten to (n_sentences_total, n_classes) vs (n_sentences_total,)
                    # so NLLLoss scores every sentence against its own label.
                    loss = lossfunc(pred.reshape(-1, pred.size(-1)), truth.reshape(-1))
                    loss.backward()
                    optimizer.step()
                    # Keep plain floats: plottable, and no device tensors are retained.
                    errors.append(loss.item())
            except Exception as exc:
                # Training stays best-effort per batch, but the failure is now
                # reported, and KeyboardInterrupt is no longer swallowed.
                print('skipping batch', batch_idx, 'because of', repr(exc))
            print(batch_idx, end='\r')
        plt.plot(errors)

In [8]:
# Scratch check: element-wise ">= 0.5" thresholding of a float tensor.
(torch.FloatTensor([0.1, 0, 2, 3]) >= 0.5).data

tensor([0, 0, 1, 1], dtype=torch.uint8)

In [3]:
def get_summaries(model, lines_, batches, doc_folder, write_folder, output_dim=1, summary_len='variable'):
    '''
    Run the model on every batch and write an extractive summary per document.

    model: Pytorch class instance which inherits nn.Module
    lines_: list of documents, each document is a list of list of tokens' indices
    batches: indices of documents grouped by batches
    doc_folder: prefix the document id is appended to in order to read the
                source text (plain string concatenation, so include the
                trailing separator)
    write_folder: folder to which to write the summary to (same concatenation)
    output_dim: no. of outputs given by the model for each sentence, can only be 1 or 2
    summary_len: no. of lines to extract as summary. a number or string 'variable'

    Sentence selection:
      * output_dim == 1: the highest scoring sentences are taken, 3 of them
        when summary_len is 'variable', otherwise summary_len of them.
      * otherwise: with 'variable' every sentence whose output [1] exceeds its
        output [0] is taken (document order); with a number the summary_len
        sentences with the largest output [1] are taken (score order).
    Selected indices beyond the number of lines in the source file are dropped.
    Summaries are written for both output modes.
    '''
    # Build the inputs on the model's device instead of hard-coding CUDA;
    # the CUDA fallback keeps the old behaviour for parameter-less models.
    device = next((p.device for p in model.parameters()), torch.device('cuda'))
    for batch in batches:
        batch_input = torch.tensor([lines_[n] for n in batch], dtype=torch.long, device=device)
        # Inference only: no autograd graph is needed.
        with torch.no_grad():
            pred = model(batch_input)

        for doc_pred, docid in zip(pred, batch):
            if output_dim == 1:
                top_k = 3 if summary_len == 'variable' else summary_len
                ranked = torch.sort(doc_pred, descending=True)[1]
                selected = ranked[:top_k].tolist()
            else:
                pos = doc_pred[:, 1].tolist()
                neg = doc_pred[:, 0].tolist()
                if summary_len == 'variable':
                    selected = [i for i in range(len(pos)) if pos[i] > neg[i]]
                else:
                    # Stable sort, so ties keep document order; also safe when
                    # there are no sentences at all.
                    selected = sorted(range(len(pos)), key=lambda i: pos[i],
                                      reverse=True)[:summary_len]

            with open(doc_folder + str(docid)) as f:
                content = f.readlines()
            # The model may score more sentences than the file has lines;
            # skip indices that have no line behind them.
            selected_lines = [content[s] for s in selected if s < len(content)]

            with open(write_folder + str(docid), 'w') as f2:
                f2.writelines(selected_lines)


In [None]:
def get_file(doc_id, write_folder):
    '''
    Print the contents of the file at write_folder + str(doc_id).

    doc_id: document id, appended to write_folder via str()
    write_folder: path prefix (plain string concatenation, so include the
                  trailing separator)
    '''
    # Context manager so the handle is closed right away instead of leaking.
    with open(write_folder + str(doc_id)) as f:
        print(f.read())

In [5]:
def compute_accuracies(model, lines_, output, batches):
    '''
    computes precision, recall, f1 on output labels

    model: Pytorch class instance which inherits nn.Module; its output is
           thresholded at 0.5 to obtain the predicted positives
    lines_: list of documents, each document is a list of list of tokens' indices
    output: per-document list of sentence labels; a label equal to 1 is a
            positive, anything else is a negative (this matches
            train_classifier, which remaps label 2 to 0)
    batches: indices of documents grouped by batches

    Returns a dict with keys 'precision', 'recall' and 'f1' holding 0-dim float
    tensors. A metric whose denominator is zero (including the case of empty
    `batches`) is reported as 0 instead of NaN.
    '''
    # Build the inputs on the model's device instead of hard-coding CUDA;
    # the CUDA fallback keeps the old behaviour for parameter-less models.
    device = next((p.device for p in model.parameters()), torch.device('cuda'))
    tp = torch.zeros((), dtype=torch.long, device=device)
    fn = torch.zeros_like(tp)
    fp = torch.zeros_like(tp)
    # Evaluation only: no autograd graph is needed.
    with torch.no_grad():
        for batch in batches:
            batch_input = torch.tensor([lines_[n] for n in batch], dtype=torch.long, device=device)
            truth = torch.tensor([output[n] for n in batch], device=device)
            is_positive = truth == 1
            is_negative = truth != 1
            predicted = model(batch_input).gt(0.5)
            missed = model_output_le = None  # placeholders removed below
            del missed, model_output_le
            tp += (predicted & is_positive).sum()
            # "not predicted" is expressed as `predicted == 0` so it works for
            # both bool and uint8 comparison results.
            fn += ((predicted == 0) & is_positive).sum()
            fp += (predicted & is_negative).sum()
    tp = tp.float()
    fn = fn.float()
    fp = fp.float()
    zero = torch.zeros((), device=device)
    precision = tp / (tp + fp) if (tp + fp) > 0 else zero
    recall = tp / (tp + fn) if (tp + fn) > 0 else zero
    if (precision + recall) > 0:
        f1 = 2 * precision * recall / (precision + recall)
    else:
        f1 = zero

    return {'precision': precision, 'recall': recall, 'f1': f1}

**REFRESH UTILS**

In [6]:
def compute_reinforce_loss(bceloss, pred_, scores_, max_=5):
    '''
    Score-weighted loss over the first `max_` candidate summaries of each document.

    bceloss: loss callable taking (pred, float target), e.g. nn.BCELoss()
    pred_: per-document model outputs (iterated together with scores_);
           pred.size(0) is used as the number of sentences of a document
    scores_: per-document list of (sentence indices, score) pairs
    max_: number of leading candidates per document that are used

    For every used candidate a 0/1 target marking its sentence indices is
    built, and bceloss(pred, target) * score is accumulated. The sum is
    divided by 20 * max_ and returned as a tensor of shape (1,).

    TODO: differs from compute_refresh_loss_avg_sample only in the target
    dtype (float here, long there); probably merge and remove one of them.
    '''
    total = None
    for pred, scores in zip(pred_, scores_):
        if total is None:
            # Accumulate on the predictions' device instead of hard-coding CUDA.
            total = torch.zeros(1, device=pred.device, requires_grad=True)
        for sentences, score in scores[0:max_]:
            truth = np.zeros(pred.size(0))
            truth[list(sentences)] = 1
            truth = torch.tensor(truth, dtype=torch.float, device=pred.device)
            total = total + bceloss(pred, truth) * float(score)
    if total is None:
        # No documents at all: return a zero loss that still supports backward().
        fallback = 'cuda' if torch.cuda.is_available() else 'cpu'
        total = torch.zeros(1, device=fallback, requires_grad=True)
    # NOTE(review): 20 looks like the fixed batch size the training loops
    # enforce -- confirm before calling this with other batch sizes.
    return total / (20. * max_)

In [None]:
def train_reinforce(model, lines, scores, batches, iterations, max_=5):
    '''
    Train the model with the score-weighted BCE loss of compute_reinforce_loss.

    model: Pytorch class instance which inherits nn.Module
    lines: list of documents, each document is a list of list of tokens' indices
    scores: per-document candidate list, forwarded to compute_reinforce_loss
    batches: iterable of batches, each a sequence of document indices;
             only batches of exactly 20 documents are trained on
    iterations: number of passes over `batches`
    max_: forwarded to compute_reinforce_loss

    The per-batch losses collected so far are plotted after every iteration.
    Returns None.

    NOTE(review): `plt` (matplotlib.pyplot) is not imported in the visible part
    of this notebook; it has to be in scope already when this function runs.
    '''
    errors = []
    lossfunc = nn.BCELoss()
    optimizer = optim.Adam(model.parameters())
    # Build the inputs on the model's device instead of hard-coding CUDA;
    # the CUDA fallback keeps the old behaviour for parameter-less models.
    device = next((p.device for p in model.parameters()), torch.device('cuda'))
    for _ in range(iterations):
        # enumerate() replaces batches.index(batch): that lookup is O(n) per
        # batch and raises on array-like batches (ambiguous `==`).
        for batch_idx, batch in enumerate(batches):
            if len(batch) != 20:
                continue
            optimizer.zero_grad()
            batch_input = torch.tensor([lines[n] for n in batch], dtype=torch.long, device=device)
            scores_ = [scores[i] for i in batch]
            pred = model(batch_input)

            loss = compute_reinforce_loss(lossfunc, pred, scores_, max_)
            loss.backward()
            optimizer.step()
            # Keep plain floats: plottable, and no device tensors are retained.
            loss_value = loss.item()
            errors.append(loss_value)

            print(loss_value, batch_idx, end='\r')
        plt.plot(errors)

In [None]:
def compute_refresh_loss_avg_sample(lossfunc, pred_, scores_, max_):
    '''
    Score-weighted loss over the first `max_` candidate summaries of each document.

    lossfunc: loss callable taking (pred, integer class target), e.g. nn.NLLLoss()
    pred_: per-document model outputs (iterated together with scores_);
           pred.size(0) is used as the number of sentences of a document
    scores_: per-document list of (sentence indices, score) pairs
    max_: number of leading candidates per document that are used

    For every used candidate a 0/1 integer target marking its sentence indices
    is built, and lossfunc(pred, target) * score is accumulated. The sum is
    divided by 20 * max_ and returned as a tensor of shape (1,).
    '''
    total = None
    for pred, scores in zip(pred_, scores_):
        if total is None:
            # Accumulate on the predictions' device instead of hard-coding CUDA.
            total = torch.zeros(1, device=pred.device, requires_grad=True)
        for sentences, score in scores[0:max_]:
            truth = np.zeros(pred.size(0))
            truth[list(sentences)] = 1
            truth = torch.tensor(truth, dtype=torch.long, device=pred.device)
            total = total + lossfunc(pred, truth) * float(score)
    if total is None:
        # No documents at all: return a zero loss that still supports backward().
        fallback = 'cuda' if torch.cuda.is_available() else 'cpu'
        total = torch.zeros(1, device=fallback, requires_grad=True)
    # NOTE(review): 20 looks like the fixed batch size the training loops
    # enforce -- confirm before calling this with other batch sizes.
    return total / (20. * max_)

In [None]:
def compute_refresh_loss_single_sample(lossfunc, pred_, scores_, max_):
    '''
    Score-weighted loss of one randomly drawn candidate summary per document.

    lossfunc: loss callable taking (pred, integer class target), e.g. nn.NLLLoss()
    pred_: per-document model outputs (iterated together with scores_);
           pred.size(0) is used as the number of sentences of a document
    scores_: per-document list of (sentence indices, score) pairs
    max_: the candidate is drawn uniformly from the first `max_` entries, the
          same candidates compute_refresh_loss_avg_sample uses (scores[0:max_])

    For the drawn candidate a 0/1 integer target marking its sentence indices
    is built, and lossfunc(pred, target) * score is accumulated. Documents
    without candidates contribute nothing. The un-normalised sum is returned
    as a tensor of shape (1,).
    '''
    total = None
    for pred, scores in zip(pred_, scores_):
        if total is None:
            # Accumulate on the predictions' device instead of hard-coding CUDA.
            total = torch.zeros(1, device=pred.device, requires_grad=True)
        n_candidates = min(max_, len(scores))
        if n_candidates <= 0:
            continue
        # randrange excludes its upper bound; the previous inclusive randint
        # could pick index max_, one past the intended candidates.
        sentences, score = scores[random.randrange(n_candidates)]
        truth = np.zeros(pred.size(0))
        truth[list(sentences)] = 1
        truth = torch.tensor(truth, dtype=torch.long, device=pred.device)
        total = total + lossfunc(pred, truth) * float(score)
    if total is None:
        # No documents at all: return a zero loss that still supports backward().
        fallback = 'cuda' if torch.cuda.is_available() else 'cpu'
        total = torch.zeros(1, device=fallback, requires_grad=True)
    return total

In [1]:
def train_refresh(model, lines, scores, batches, iterations, max_=5, single_sample=True):
    '''
    Train the model with one of the score-weighted NLL "refresh" losses.

    model: Pytorch class instance which inherits nn.Module
    lines: list of documents, each document is a list of list of tokens' indices
    scores: per-document candidate list, forwarded to the loss function
    batches: iterable of batches, each a sequence of document indices;
             only batches of exactly 20 documents are trained on
    iterations: number of passes over `batches`
    max_: forwarded to the loss function
    single_sample: True uses compute_refresh_loss_single_sample, False uses
                   compute_refresh_loss_avg_sample

    Gradients are clipped element-wise to [-5, 5] before every optimizer step.
    Returns None.
    '''
    errors = []
    lossfunc = nn.NLLLoss()
    optimizer = optim.Adam(model.parameters())
    # Build the inputs on the model's device instead of hard-coding CUDA;
    # the CUDA fallback keeps the old behaviour for parameter-less models.
    device = next((p.device for p in model.parameters()), torch.device('cuda'))
    for _ in range(iterations):
        # enumerate() replaces batches.index(batch): that lookup is O(n) per
        # batch and raises on array-like batches (ambiguous `==`).
        for batch_idx, batch in enumerate(batches):
            if len(batch) != 20:
                continue
            model.zero_grad()

            batch_input = torch.tensor([lines[n] for n in batch], dtype=torch.long, device=device)
            scores_ = [scores[i] for i in batch]
            pred = model(batch_input)
            if single_sample:
                loss = compute_refresh_loss_single_sample(lossfunc, pred, scores_, max_)
            else:
                loss = compute_refresh_loss_avg_sample(lossfunc, pred, scores_, max_)

            loss.backward()
            nn.utils.clip_grad_value_(model.parameters(), 5)
            optimizer.step()
            # Keep plain floats so no device tensors are retained in the history.
            loss_value = loss.item()
            errors.append(loss_value)

            print('\r', 'training: ', loss_value, batch_idx, end='')
        # plt.plot(errors)  # loss-curve plotting is currently disabled