In [1]:
# PyTorch libraries and modules
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from torch.nn import Linear, ReLU, Sequential, Module, BatchNorm2d, Dropout
from torch.optim import Adam

# Run on the GPU when PyTorch can see one; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Fix the global PyTorch RNG seed so repeated runs draw the same random numbers.
torch.manual_seed(999)

<torch._C.Generator at 0x7efed435ebb0>

In [None]:
## data loaders
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
import os
import csv
import numpy as np

class DataCombiner(Dataset):
    """Dataset that serves an id together with the feature entry at the same position.

    Args:
        ids: indexable collection of identifiers; its length is the dataset length.
        features: indexable collection looked up with the same index as ``ids``.

    Each item is the tuple ``(index, ids[index], features[index])``.
    """

    def __init__(self, ids, features):
        self.features = features
        self.ids = ids

    def __len__(self):
        # The id collection alone determines how many items are served.
        return len(self.ids)

    def __getitem__(self, i):
        sample_id = self.ids[i]
        sample_features = self.features[i]
        # The index is returned as well so callers can map a sample back to its position.
        return i, sample_id, sample_features
    
# Input widths handed to the text and column encoders respectively.
text_dim, column_dim = 300, 900
# Leading fraction of the text ids / column ids (and their feature rows) kept for training.
train_text_fraction, train_col_fraction = 0.32, 0.32

## read text list and column list
def read_ids(file_path):
    """Read a list file that holds one id per line.

    Args:
        file_path: path of the text file to read.

    Returns:
        list[str]: the lines in file order, without their trailing newline.
        Empty lines are skipped instead of raising ``IndexError``.

    Lines are taken verbatim: no CSV quote processing is applied, so the result
    does not depend on the csv module accepting a newline as its delimiter.
    """
    with open(file_path, 'r') as f:
        # Text mode already normalises line endings to '\n'.
        stripped = (line.rstrip('\n') for line in f)
        return [value for value in stripped if value]

features_dir = '../features'
datalake = 'mlopen'
gt_path = '../inputs/mlopen-text-tables.gt'
text_id = read_ids(os.path.join(features_dir, datalake + '-textids.list'))
column_id = read_ids(os.path.join(features_dir, datalake + '-colids.list'))
print(f'text_id len: {len(text_id)}, sample: {text_id[:2]}')
print(f'col_id len: {len(column_id)}, sample: {column_id[:2]}')

## load features
# NOTE(review): torch.load unpickles the file, so only load feature files from a trusted source.
text_f = torch.load(os.path.join(features_dir, datalake + '-textfeatures.pt'))
column_f = torch.load(os.path.join(features_dir, datalake + '-columnfeatures.pt'))
print(f'text_f len: {text_f.shape}, column_f len: {column_f.shape}')

# Ids and feature rows are paired purely by position further down, so a length
# mismatch would silently attach features to the wrong id. Fail loudly instead.
assert len(text_id) == text_f.shape[0], \
    f'{len(text_id)} text ids but {text_f.shape[0]} text feature rows'
assert len(column_id) == column_f.shape[0], \
    f'{len(column_id)} column ids but {column_f.shape[0]} column feature rows'

## get only a fraction for training
# One count per modality, shared by the id list and the feature tensor so both slices always agree.
n_train_text = int(train_text_fraction * len(text_id))
n_train_col = int(train_col_fraction * len(column_id))
train_text_id = text_id[:n_train_text]
train_column_id = column_id[:n_train_col]
train_text_f = text_f[:n_train_text, :]
train_column_f = column_f[:n_train_col, :]
print(f'sizes after filtering training data: {len(train_text_id)}, {len(train_column_id)}, {train_text_f.shape}, {train_column_f.shape}')

## read labels
def load_labels(label_path, text_id, column_id):
    """Build a dense (text x column) label matrix from a CSV label file.

    Each CSV row is ``text_key,column_key[,weight]``. The cell addressed by the
    two keys is set to ``float(weight)``, or to 1.0 when the row has no third
    field. When the same pair appears more than once the last row wins.

    Rows are skipped (not errors) when they have fewer than two fields, when
    either key is not present in ``text_id`` / ``column_id``, or when the weight
    cannot be parsed as a float.

    Args:
        label_path: path of the CSV label file.
        text_id: sequence of text keys; position in the sequence is the row index.
        column_id: sequence of column keys; position is the column index.

    Returns:
        torch.Tensor: float32 tensor of shape ``(len(text_id), len(column_id))``,
        zero wherever no label row applied.
    """
    def _first_positions(keys):
        # Mirror list.index(): a duplicated key maps to its first position.
        positions = {}
        for pos, key in enumerate(keys):
            positions.setdefault(key, pos)
        return positions

    # Constant-time lookups instead of a linear list.index() scan per CSV row.
    text_pos = _first_positions(text_id)
    column_pos = _first_positions(column_id)

    mat = torch.zeros(len(text_id), len(column_id), dtype=torch.float32)
    # newline='' is what the csv module expects for files it reads.
    with open(label_path, 'r', newline='') as f:
        for r in csv.reader(f):
            if len(r) < 2:
                continue  # blank or truncated row
            tid = text_pos.get(r[0])
            cid = column_pos.get(r[1])
            if tid is None or cid is None:
                continue  # label refers to an id outside the requested subset
            if len(r) > 2:
                try:
                    weight = float(r[2])
                except ValueError:
                    continue  # unparsable weight: ignore the row
            else:
                weight = 1.
            mat[tid, cid] = weight
    return mat
    
label_dir = '../column-labels'
try:
    # Dense label matrix for the training subset, moved to the training device.
    label_mat = load_labels(os.path.join(label_dir, datalake + '-snorkel.lbl'), train_text_id, train_column_id).to(device)
    print(f'label mat len: {label_mat.shape}, nonzero: {label_mat.count_nonzero()}')
except RuntimeError as e:
    print(e)
    # The CUDA allocator statistics only exist for a CUDA device; querying them with 'cpu' raises.
    if device == 'cuda':
        print(f'Exception: {torch.cuda.memory_allocated(device)}, \
      max allocated: {torch.cuda.max_memory_allocated(device)}, \
      reserved: {torch.cuda.memory_reserved(device)}, \
      max reserved: {torch.cuda.max_memory_reserved(device)}')
    # label_mat was never assigned, so continuing would only fail later with a
    # NameError that hides this error. Surface the real cause instead.
    raise


# Position-aligned (index, id, features) datasets for the two modalities.
train_text = DataCombiner(train_text_id, train_text_f)
train_column = DataCombiner(train_column_id, train_column_f)


In [None]:
## Loss functions
def euclidean_dist(x,y):
    """Pairwise Euclidean distance between the rows of ``x`` and the rows of ``y``.

    Uses the expansion ``|a - b|^2 = |a|^2 + |b|^2 - 2 * a.b`` so the cross term
    is a single matrix product.

    Args:
        x: 2-D tensor with m rows.
        y: 2-D tensor with n rows and the same number of columns as ``x``.

    Returns:
        Tensor of shape ``(m, n)`` whose entry ``[i, j]`` is the distance between
        ``x[i]`` and ``y[j]``. Squared distances are clamped to at least 1e-12
        before the square root, so the smallest value returned is 1e-6.
    """
    m, n = x.size(0), y.size(0)
    # Squared row norms, broadcast to the full (m, n) grid.
    xx = torch.pow(x, 2).sum(dim=1, keepdim=True).expand(m, n)
    yy = torch.pow(y, 2).sum(dim=1, keepdim=True).expand(n, m).t()
    # beta/alpha passed by keyword: the positional (beta, alpha, mat1, mat2) overload is deprecated.
    sq_dist = torch.addmm(xx + yy, x, y.t(), beta=1, alpha=-2)
    # Clamp before sqrt: rounding can push a zero distance slightly negative.
    dist = sq_dist.clamp(min=1e-12).sqrt()
    return dist

class TripletLoss(nn.Module):
    """Cross-modal triplet-style loss with hardest-negative mining.

    For every eligible anchor row, all positives are pulled towards the anchor
    (their label-weighted distances are summed) and the single nearest negative
    is pushed out until it is at least ``margin`` away.

    Args:
        margin: distance the nearest negative should exceed.
        neg_weight: multiplier applied to the negative (margin) term.
        normalize_feature: when true, both embedding sets are passed through a
            sigmoid before distances are computed.
    """

    def __init__(self, margin=0.1, neg_weight=1., normalize_feature=True):
        super(TripletLoss,  self).__init__()
        self.margin = margin
        self.neg_weight = neg_weight
        self.normalize_feature = normalize_feature

    def forward(self, emb1, emb2, label_mat, mask1):
        """Compute the loss and a precision figure for one batch.

        Args:
            emb1: anchor embeddings, one row per text (text_bsz rows).
            emb2: candidate embeddings, one row per column (column_bsz rows).
            label_mat: (text_bsz, column_bsz) matrix; a non-zero entry marks a
                positive pair and its value weights that pair's distance.
            mask1: (text_bsz, 1) tensor, non-zero for rows that take part in the
                loss. Callers are expected to zero out rows whose labels are all
                zero or all non-zero.

        Returns:
            tuple ``(loss, prec)``. ``loss`` is the summed per-row loss divided by
            the number of eligible rows. ``prec`` is the fraction of eligible
            rows whose nearest negative is farther than the summed positive
            distance. When no row is eligible the constants
            ``(tensor(0), tensor(1))`` are returned.
        """
        if self.normalize_feature:
            emb1 = emb1.sigmoid() #F.normalize(emb1)
            emb2 = emb2.sigmoid() #F.normalize(emb2)
        mat_dist = euclidean_dist(emb1, emb2) ## (text_bsz, column_bsz)
        eligible = torch.count_nonzero(mask1)
        if eligible == 0:
            # Imported here so the class does not depend on a later notebook cell having run.
            from tqdm import tqdm
            tqdm.write('Found all zeros mask')
            return (torch.tensor(0), torch.tensor(1))

        # Positives: every labelled pair of an eligible row is pulled towards its anchor.
        positives = mask1 * mat_dist * label_mat ## (text_bsz, column_bsz)
        dist_ap = torch.sum(positives, dim=1) ## (text_bsz,)

        # Hard negative: the nearest unlabelled column of each row. Labelled pairs
        # and every entry of a masked-out row get a large offset so they cannot
        # be the minimum of an eligible row.
        penalised = mat_dist + 100000.0 * (label_mat + 1 - mask1)
        dist_an = torch.min(penalised, dim=1).values ## (text_bsz,)

        # Masked rows add nothing: their dist_ap is 0 and their dist_an is far beyond the margin.
        hinge = torch.max(torch.zeros_like(dist_an), self.margin - dist_an)
        loss = torch.sum(dist_ap + self.neg_weight * hinge) / eligible

        # Precision over eligible rows only. Masked rows always satisfy
        # dist_an > dist_ap because of the offset, so counting them (and dividing
        # by the full batch size) would inflate the metric.
        eligible_rows = mask1.reshape(-1) != 0
        hits = (dist_an.detach() > dist_ap.detach()) & eligible_rows
        prec = hits.sum() * 1.0 / eligible
        return (loss, prec)


In [None]:
## Embedding model

class EncoderNet(nn.Module):
    """Three-layer MLP encoder: Linear-ReLU-BatchNorm twice, then a final Linear.

    Args:
        ip: size of each input feature vector.
        op: size of each output embedding.
        hidden1: width of the first hidden layer.
        hidden2: width of the second hidden layer.
    """

    def __init__(self, ip=1000, op=100, hidden1=200, hidden2=200):
        super(EncoderNet, self).__init__()
        # Reseed the global RNG right before the layers are created, so every
        # instance with the same sizes starts from identical weights.
        torch.manual_seed(0)
        stages = [
            nn.Linear(ip, hidden1),
            nn.ReLU(),
            nn.BatchNorm1d(hidden1),
            nn.Linear(hidden1, hidden2),
            nn.ReLU(),
            nn.BatchNorm1d(hidden2),
            #nn.Dropout(p=0.2),
            nn.Linear(hidden2, op),
        ]
        # Applied strictly in list order.
        self.net = nn.Sequential(*stages)

    def forward(self, X):
        embedding = self.net(X)
        return embedding

# One encoder per modality; each maps its own input width to EncoderNet's default output width.
text_enet = EncoderNet(ip=text_dim).to(device)
column_enet = EncoderNet(ip=column_dim).to(device)


In [None]:
## hyper parameters
text_bsz = 32
column_bsz = 32

epochs = 500
stepsPerEpoch = 26

## triplet loss parameters
triplet_margin = 0.1 # Lower the better, but not below 0.1. Higher values improve results when a large fraction of data is used for training
neg_weight = 1 # Lower values lead to no learning, but can go higher for better recall.

# Optimizer
encoder_learning_rate = 0.0001 # triplet loss doesn't converge with 0.001, formula from Dino: lr=0.0005*bsz/256
encoder_decay_rate = 0.001 # For regularization, use 0.001 when using triplet loss

# Both encoders are trained by a single optimizer. The parameters are kept in an
# ordered list: a set has no stable iteration order between runs, which makes
# the optimizer's parameter/state layout irreproducible.
parameters = list(text_enet.parameters()) + list(column_enet.parameters())
enet_optimizer = torch.optim.Adam(parameters, lr=encoder_learning_rate, weight_decay=encoder_decay_rate, amsgrad=True)

## loss function for semantics transfer across modalities
triplet_loss_fn = TripletLoss(margin=triplet_margin, neg_weight=neg_weight, normalize_feature=True)


In [None]:
## Training and validation module
from tqdm import tqdm

# Dataset load
train_text_loader = DataLoader(train_text, batch_size=text_bsz, shuffle=True, drop_last=False)
train_column_loader = DataLoader(train_column, batch_size=column_bsz, shuffle=True, drop_last=False)

# Training metrics
triplet_losses = []
triplet_accuracies = []

# eval metrics
eval_result_tuples = []

for epoch in tqdm(range(0, epochs)):

    ## validation: need one validation run with random weights, so starting off with it
    if epoch % 10 == 0:
        for _net in text_enet, column_enet:
            _net.eval()
        
        ## write embeddings for all features to disk
        t_emb = text_enet(text_f.to(device)).detach().cpu().numpy()
        c_emb = column_enet(column_f.to(device)).detach().cpu().numpy()
        op_dir = '/tmp'
        np.save(os.path.join(op_dir, datalake + '-' + str(epoch) + '-trainedtext.npy'), t_emb)
        np.save(os.path.join(op_dir, datalake + '-' + str(epoch) + '-trainedcolumns.npy'), c_emb)
        
        ## fetch results
        %run ../evaluate_trained.py -f$op_dir -d $datalake -i$epoch -g$gt_path
        print(eval_results)
        eval_result_tuples.extend(eval_results)
        
        for _net in text_enet, column_enet:
            _net.train()

    ## training
    epoch_loss = 0.
    epoch_acc = 0.

    # start mini-batches
    step_count = 0
    train_text_iterator = iter(train_text_loader)
    train_column_iterator = iter(train_column_loader)
    while (step_count < stepsPerEpoch):
        # read next batch
        try:
            t_idx, t_id, t_f = next(train_text_iterator)
        except StopIteration:
            train_text_iterator = iter(train_text_loader)
            t_idx, t_id, t_f = next(train_text_iterator)
            
        try:
            c_idx, c_id, c_f = next(train_column_iterator)
        except StopIteration:
            train_column_iterator = iter(train_column_loader)
            c_idx, c_id, c_f = next(train_column_iterator)
        
        # to device
        t_idx = t_idx.to(device)
        t_f = t_f.to(device)
        c_idx = c_idx.to(device)
        c_f = c_f.to(device)
        
        # construct label matrix and mask array
        label_batch = label_mat[t_idx, :]
        label_batch = label_batch[:, c_idx]
        mask = torch.count_nonzero(label_batch, dim=1)
        mask = torch.where((mask>0) & (mask<column_bsz), 1, 0).unsqueeze(-1)

        # embeddings
        t_emb = text_enet(t_f)
        c_emb = column_enet(c_f)
        
        # loss
        loss, acc = triplet_loss_fn(t_emb, c_emb, label_batch, mask)
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
        # step
        if loss > 0:
            enet_optimizer.zero_grad()
            loss.backward()
            enet_optimizer.step()

        # increment step count
        step_count += 1
        
    # aggregate metrics
    tqdm.write(f'Epoch {epoch}: loss= {epoch_loss/step_count}, accuracy= {epoch_acc/step_count}')
    triplet_losses.append(epoch_loss/step_count)
    triplet_accuracies.append(epoch_acc/step_count)
    
    

In [None]:
## plot training performance
import matplotlib.pyplot as plt
from IPython.display import set_matplotlib_formats
%matplotlib inline
set_matplotlib_formats('svg')

plt.figure()
plt.plot(triplet_losses, label='Loss')
plt.title('Training losses')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()

plt.figure()
plt.plot(triplet_accuracies, label='Training Accuracy')
plt.title('Training accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()
plt.show()


In [None]:
## validation plots

## save eval result
def write_csv(fp, result):
    """Write ``result`` to ``fp`` as CSV, one row per element.

    Args:
        fp: path of the file to create or overwrite.
        result: iterable of rows, each row an iterable of field values.
    """
    # newline='' lets the csv writer control line endings itself; without it
    # platforms that translate '\n' end up with an extra blank line per row.
    with open(fp, 'w', newline='') as f:
        csv.writer(f).writerows(result)

print('final result: ')
# Training-data fraction prefix; identical for every row, so formatted once.
frac = "{:.2f}".format(train_text_fraction * train_col_fraction)
# NOTE(review): [:-9] prints every tuple EXCEPT the last nine. If the intent is
# "the nine tuples of the final evaluation", this should be [-9:]. Confirm.
for t in eval_result_tuples[:-9]:
    res = ','.join([str(x) for x in t])
    print(frac + ',' + res)

# R-precision
# Keeps field 3 of the tuples whose field 1 equals 8.
eval_rprecision = [t[3] for t in eval_result_tuples if t[1] == 8]
plt.figure()
# The label gives plt.legend() an artist to show; without one the legend is empty and matplotlib warns.
plt.plot(eval_rprecision, label='R-precision')
plt.title('R-precision')
plt.xlabel('Epochs * 10')
plt.ylabel('R-Precision')
plt.legend()
plt.show()

# scatter plot
# One point per result tuple: field 3 on x, field 4 on y, coloured by field 0.
prec_values = [t[3] for t in eval_result_tuples]
rec_values = [t[4] for t in eval_result_tuples]
colors = [t[0] for t in eval_result_tuples]
plt.figure()
sc = plt.scatter(prec_values, rec_values, c=colors, cmap="RdYlGn")
plt.colorbar(sc)
plt.title('P-R trade-offs with training')
plt.xlabel('Precision')
plt.ylabel('Recall')
plt.show()


In [None]:
## export trained vectors to disk
# The training loop leaves both encoders in train mode, where BatchNorm
# normalises with the statistics of the current batch and updates its running
# averages. Switch to eval mode so the exported embeddings use the learned
# running statistics, as the periodic validation exports do.
for _net in text_enet, column_enet:
    _net.eval()

# Inference only: skip autograd bookkeeping for the full feature matrices.
with torch.no_grad():
    t_emb = text_enet(text_f.to(device)).cpu().numpy()
    c_emb = column_enet(column_f.to(device)).cpu().numpy()

print(f'Writing learned features of shape: {t_emb.shape} and {c_emb.shape}')
op_dir = '../features'
np.save(os.path.join(op_dir, datalake + '-trainedtext.npy'), t_emb)
np.save(os.path.join(op_dir, datalake + '-trainedcolumns.npy'), c_emb)


In [None]:
# evaluate
# Evaluate with the encoders in eval mode, so BatchNorm uses its running
# statistics and does not update them, and without autograd, so no graph is kept
# for the full text-by-column distance matrix.
for _net in text_enet, column_enet:
    _net.eval()

with torch.no_grad():
    t_emb = text_enet(text_f.to(device))
    c_emb = column_enet(column_f.to(device))
    dist = euclidean_dist(t_emb, c_emb)
print(dist)

In [None]:
## to help fit label matrix
# The allocator statistics only exist for a CUDA device; on a CPU-only run the queries raise.
if device == 'cuda':
    print(f'Before emptying: memory allocated: {torch.cuda.memory_allocated(device)}, \
      max allocated: {torch.cuda.max_memory_allocated(device)}, \
      reserved: {torch.cuda.memory_reserved(device)}, \
      max reserved: {torch.cuda.max_memory_reserved(device)}')
    torch.cuda.empty_cache()
    print(f'After emptying: memory allocated: {torch.cuda.memory_allocated(device)}, \
      max allocated: {torch.cuda.max_memory_allocated(device)}, \
      reserved: {torch.cuda.memory_reserved(device)}, \
      max reserved: {torch.cuda.max_memory_reserved(device)}')

print(torch.__version__)
# Small CSR tensor as a probe for sparse support (2 rows, 4 stored values).
crow_indices = torch.tensor([0, 2, 4])
col_indices = torch.tensor([0, 1, 0, 1])
values = torch.tensor([1, 2, 3, 4])
# Prefer the public constructor; fall back to the private prototype name only
# on releases that do not provide the public one.
make_csr = getattr(torch, 'sparse_csr_tensor', None) or torch._sparse_csr_tensor
csr = make_csr(crow_indices, col_indices, values, dtype=torch.double)
print(csr)