In [1]:
%config Completer.use_jedi = False
%load_ext autoreload
%autoreload 2

# import 

In [2]:
import os
import numpy as np
import tensorflow as tf

gpu = ""

os.environ["TF_CPP_MIN_LOG_LEVEL"] = '2'
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ['CUDA_VISIBLE_DEVICES'] = gpu

tf_config = tf.ConfigProto()
tf_config.gpu_options.allow_growth = True
tf_config.gpu_options.per_process_gpu_memory_fraction = 0.3

tf.enable_eager_execution(tf_config)


  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [7]:
import argparse
import json
import os
import time
import munch
from tqdm import tqdm
import pickle
import numpy as np
from sklearn import metrics
from sklearn.metrics import f1_score
import torch
import torch.nn as nn
import torch.nn.functional as func

import tensorflow as tf
import numpy as np
import data_loader.data_generator as data_generator
from data_loader.data_generator import load_embeddings, load_vocab
from models.spen import SPEN
from trainers.spen_trainer import SpenTrainer
from utils.config import process_config
from utils.dirs import create_dirs
from utils.logger import get_logger, TFLogger
from utils.utils import get_args
from models.init import weight_init_tf


In [18]:
args = argparse.Namespace()
args.seed = 1
args.config = 'configs/bibtex.json'
np.random.seed(args.seed)
tf.set_random_seed(args.seed)
config = process_config(args.config)
config.summary_dir = 'debug_tf2torch_summary'
config.checkpoint_dir = 'debug_tf2torch_checkpoint'
create_dirs([config.summary_dir, config.checkpoint_dir])

0

# load data

In [10]:
np.random.seed(args.seed)
tf.set_random_seed(args.seed)

generator = eval("data_generator.%s" % config.data.data_generator)

dsplits = config.data.splits
train_data = generator(config, split=dsplits[0])
print("training set loaded :- %d instances", train_data.len)
dev_data = generator(config, split=dsplits[1])
print("dev set loaded :- %d instances", dev_data.len)
test_data = generator(config, split=dsplits[2])
print("test set loaded :- %d instances", test_data.len)

#%%

with open('data/bibtex/train.pickle', "rb") as f:
    temp = pickle.load(f)
    data_x = np.array([instance['feats'] for instance in temp])
    data_y = np.array([instance['types'] for instance in temp])

#%%

with open('data/bibtex/test.pickle', "rb") as f:
    temp = pickle.load(f)
    test_x = np.array([instance['feats'] for instance in temp])
    test_y = np.array([instance['types'] for instance in temp])


training set loaded :- %d instances 4880
dev set loaded :- %d instances 2515
test set loaded :- %d instances 2515


In [15]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

data_x = torch.FloatTensor(data_x).to(device)
data_y = torch.FloatTensor(data_y).to(device)
test_x = torch.FloatTensor(test_x).to(device)
test_y = torch.FloatTensor(test_y).to(device)

  return torch._C._cuda_getDeviceCount() > 0


# load models to pytorch

In [58]:
def tf2torch(checkpoint, feat_net, inf_net, energy_net):
    
    tf_path = os.path.abspath(checkpoint)
    init_vars = tf.train.list_variables(tf_path)

    tf_vars = []
    for name, shape in init_vars:
        # print("Loading TF weight {} with shape {}".format(name, shape))
        array = tf.train.load_variable(tf_path, name)
        tf_vars.append((name, array.squeeze()))
    
    feat_i = 12
    energy_i = 8
    inf_i = 18
    feat_net.layer1.bias.data = torch.from_numpy(tf_vars[feat_i][1].T)
    feat_net.layer1.weight.data = torch.from_numpy(tf_vars[feat_i + 1][1].T)
    feat_net.layer2.bias.data = torch.from_numpy(tf_vars[feat_i + 2][1].T)
    feat_net.layer2.weight.data = torch.from_numpy(tf_vars[feat_i + 3][1].T)
    
    energy_net.C1.bias.data = torch.from_numpy(tf_vars[energy_i][1].T)
    energy_net.C1.weight.data = torch.from_numpy(tf_vars[energy_i + 1][1].T)
    energy_net.c2.weight.data = torch.from_numpy(tf_vars[energy_i + 2][1].T)
    energy_net.linear_wt.weight.data = torch.from_numpy(tf_vars[energy_i + 3][1].T)


    inf_net.layer1.bias.data = torch.from_numpy(tf_vars[inf_i][1].T)
    inf_net.layer1.weight.data = torch.from_numpy(tf_vars[inf_i + 1][1].T)
    inf_net.layer2.bias.data = torch.from_numpy(tf_vars[inf_i + 2][1].T)
    inf_net.layer2.weight.data = torch.from_numpy(tf_vars[inf_i + 3][1].T)
    inf_net.layer3.weight.data = torch.from_numpy(tf_vars[inf_i + 4][1].T)
    
    return feat_net, inf_net, energy_net

feat_net1 = MLP()
inf_net1 = InfNet()
energy_net1 = EnergyNet()
feat_net1, inf_net1, energy_net1 = tf2torch('./copied.ckpt', feat_net1, inf_net1, energy_net1)
with torch.no_grad():
    pred_test = inf_net1(test_x)

f1, mAP = f1_map(test_y, pred_test)
print(f1, mAP)


with torch.no_grad():
    feat = feat_net1(test_x)
    _, pred_test = energy_net1(feat, test_y)

f1, mAP = f1_map(test_y, pred_test)
print(f1, mAP)

0.4105516296229916 0.3304564718587311
0.4105516296229916 0.3304564718587311


0.4084047359374391 0.3278044250686946
0.4105516296229916 0.33045647185873117


# load copied models

In [None]:
checkpoint = './copied.ckpt'
tf_path = os.path.abspath(checkpoint)
init_vars = tf.train.list_variables(tf_path)

tf_vars = []
for name, shape in init_vars:
    print("Loading TF weight {} with shape {}".format(name, shape))
    array = tf.train.load_variable(tf_path, name)
    tf_vars.append((name, array.squeeze()))

In [23]:
for i, (name, shape) in enumerate(init_vars):
    if 'Adam' in name:
        continue
    print(i, name)

0 model/copy_inference_net/layer1/bias
1 model/copy_inference_net/layer1/kernel
2 model/copy_inference_net/layer2/bias
3 model/copy_inference_net/layer2/kernel
4 model/copy_inference_net/layer3/kernel
5 model/cur_epoch/cur_epoch_0
6 model/cur_epoch/cur_epoch_1
7 model/cur_epoch/cur_epoch_2
8 model/energy_net/label_energy1/bias
9 model/energy_net/label_energy1/kernel
10 model/energy_net/label_energy2/kernel
11 model/energy_net/linear_wt
12 model/feature_net/layer1/bias
13 model/feature_net/layer1/kernel
14 model/feature_net/layer2/bias
15 model/feature_net/layer2/kernel
16 model/global_step/global_step
17 model/global_step/global_step_inf
18 model/inference_net/layer1/bias
19 model/inference_net/layer1/kernel
20 model/inference_net/layer2/bias
21 model/inference_net/layer2/kernel
22 model/inference_net/layer3/kernel
59 model/phi_opt/beta1_power
60 model/phi_opt/beta2_power
61 model/pretrain_feats/beta1_power
62 model/pretrain_feats/beta2_power
63 model/psi_opt/beta1_power
64 model/psi_o

In [17]:
def f1_map(y, pred, threshold=None):
    if threshold is None:
        threshold = [0.05, 0.10, 0.15, 0.2, 0.25, 0.30, 0.35, 0.4, 0.45, 0.5, 0.55, 0.60, 0.65, 0.70, 0.75]
    else:
        threshold = [0.5]
    best_f1 = 0
    for t in threshold:
        local_pred = pred > t
        local_f1 = f1_score(y.data.cpu().numpy(), local_pred.data.cpu().numpy(), average='samples')
        if local_f1 > best_f1:
            best_f1 = local_f1
    precision = np.mean(metrics.average_precision_score(
        y.data.cpu().numpy(), pred.data.cpu().numpy(), average=None
    ))

    return best_f1, precision

In [85]:
class MLP(nn.Module):
    def __init__(self):
        super().__init__()
        self.layer1 = nn.Linear(1836, 150)
        self.layer2 = nn.Linear(150, 150)      
    
    def forward(self, x):
        out = func.relu(self.layer1(x))
        out = func.relu(self.layer2(out))
        return out

class EnergyNet(nn.Module):
    def __init__(self, weights_last_layer_mlp=150, feature_dim=150, label_dim=159,
                 num_pairwise=16, non_linearity=nn.Softplus()):
        super().__init__()

        self.non_linearity = non_linearity

        self.linear_wt = nn.Linear(150, label_dim, bias=False) 

        # Label energy terms, C1/c2  in equation 5 of SPEN paper
        self.C1 = nn.Linear(label_dim, num_pairwise)

        self.c2 = nn.Linear(num_pairwise, 1, bias=False)

    def forward(self, x, y):
        # Local energy
        negative_logits = self.linear_wt(x)
        feat_probs = torch.sigmoid(-1 * negative_logits)
        
        # element-wise product
        e_local = torch.mul(negative_logits, y)
        e_local = torch.sum(e_local, dim=1)

        # Label energy
        e_label = self.non_linearity(self.C1(y))
        e_label = self.c2(e_label)
        e_global = torch.add(e_label, e_local)

        return e_global, feat_probs
    

    
class InfNet(nn.Module):
    def __init__(self):
        super().__init__()
        self.layer1 = nn.Linear(1836, 150)
        self.layer2 = nn.Linear(150, 150)
        self.layer3 = nn.Linear(150, 159, bias=False) 
    
    def forward(self, x):
        out = func.relu(self.layer1(x))
        out = func.relu(self.layer2(out))
        out = self.layer3(out)
        return torch.sigmoid(out)
    
class SPEN():
    def __init__(self, feature_net, energy_net, inf_net, n_steps_inf=1, input_dim=1836, label_dim=159):
        self.feature_extractor = feature_net
        self.feature_extractor.eval()
        self.energy_net = energy_net
        self.inf_net = inf_net
        
        self.phi0 = InfNet().to(device)
        self.phi0.load_state_dict(inf_net.state_dict())
        
    def _compute_energy(self, inputs, targets):
        f_x = self.feature_extractor(inputs)
        
        # Energy ground truth
        gt_energy, _ = self.energy_net(f_x, targets)
        
        # Cost-augmented inference network
        pred_probs = self.inf_net(inputs)
        
        pred_energy, _ = self.energy_net(f_x, pred_probs)
        
        return pred_probs, pred_energy, gt_energy
    
    def compute_loss(self, inputs, targets):
        
        pred_probs, pred_energy, gt_energy = self._compute_energy(inputs, targets)
        # Max-margin Loss
        delta = torch.sum((pred_probs - targets)**2, dim=1)
        pre_loss_real = delta - pred_energy + gt_energy
        
        energy_loss = torch.relu(pre_loss_real)
        pre_loss_real = torch.mean(pre_loss_real)
        energy_loss = torch.mean(energy_loss)

        entropy_loss = nn.BCELoss()(pred_probs, pred_probs.detach())
        
        reg_losses_phi = 0.5 * sum(p.pow(2.0).sum() for p in self.inf_net.parameters())
        pretrain_bias = 0.5 * sum((x - y).pow(2.0).sum() for x, y in zip(self.inf_net.state_dict().values(), self.phi0.state_dict().values()))
        reg_losses_theta = sum(p.pow(2.0).sum() for p in self.energy_net.parameters())
        
        inf_net_loss = energy_loss \
                       - 0.001 * reg_losses_phi\
                       - 1 * pretrain_bias \
                       - 1 * entropy_loss
        inf_net_loss = -inf_net_loss
        
        e_net_loss = energy_loss + 0.001 * reg_losses_theta
        
        summaries = {
            'base_objective': energy_loss.item(),
            'base_obj_real': pre_loss_real.item(),
            'energy_inf_net': pred_energy.mean().item(),
            'energy_ground_truth': gt_energy.mean().item(),
            'reg_losses_theta': reg_losses_theta.item(),
            'reg_losses_phi': reg_losses_phi.item(),
            'reg_losses_entropy': entropy_loss.item(),
            'pretrain_bias': pretrain_bias.item()
        }
        print(summaries)
        
        return pred_probs, e_net_loss, inf_net_loss

    def pred(self, x):
        with torch.no_grad():
            y_pred = self.inf_net(x)
        return y_pred
    
    def inference(self, x, training=False, n_steps=1):
        
        sd = self.inf_net.state_dict()
        inf_net2 = MLP()
        inf_net2.load_state_dict(sd)
        self.inf_net.eval()
        optimizer = optim.SGD(self.inf_net.parameters(), lr=lr, momentum=momentum, weight_decay=weight_decay)
        with torch.no_grad():
            y_pred = self.inf_net(x)
        
        self.inf_net.train()
        
        return y_pred

feat_net1 = MLP()
inf_net1 = InfNet()
energy_net1 = EnergyNet()
feat_net1, inf_net1, energy_net1 = tf2torch('./copied.ckpt', feat_net1, inf_net1, energy_net1)
f1, mAP = f1_map(test_y, pred_test)
print(f1, mAP)

spen = SPEN(feat_net1, energy_net1, inf_net1)
preds, e_loss, inf_loss = spen.compute_loss(data_x[:32], data_y[:32])

0.4105516296229916 0.3304564718587311
{'base_objective': 0.8605382442474365, 'base_obj_real': -1.0311719179153442, 'energy_inf_net': -3.121706962585449, 'energy_ground_truth': -4.839284420013428, 'reg_losses_theta': 373.61114501953125, 'reg_losses_phi': 1181.28125, 'reg_losses_entropy': 0.028994059190154076, 'pretrain_bias': 0.0}


In [81]:
sum(p.pow(2.0).sum() for p in inf_net1.parameters())

tensor(2362.5625, grad_fn=<AddBackward0>)

In [73]:

class MLP(nn.Module):
    def __init__(self):
        super().__init__()
        self.layer1 = nn.Linear(1836, 150)
        self.layer2 = nn.Linear(150, 150)      
    
    def forward(self, x):
        out = func.relu(self.layer1(x))
        out = func.relu(self.layer2(out))
        return out

class EnergyNet(nn.Module):
    def __init__(self, weights_last_layer_mlp=150, feature_dim=150, label_dim=159,
                 num_pairwise=16, non_linearity=nn.Softplus()):
        super().__init__()

        self.non_linearity = non_linearity

        self.linear_wt = nn.Linear(150, label_dim, bias=False) 

        # Label energy terms, C1/c2  in equation 5 of SPEN paper
        self.C1 = nn.Linear(label_dim, num_pairwise)

        self.c2 = nn.Linear(num_pairwise, 1, bias=False)

    def forward(self, x, y):
        # Local energy
        negative_logits = self.linear_wt(x)
        feat_probs = torch.sigmoid(-1 * negative_logits)
        
        # element-wise product
        e_local = torch.mul(negative_logits, y)
        print(e_local.shape)
        e_local = torch.sum(e_local, dim=1)
        print(e_local.shape)

        # Label energy
        e_label = self.non_linearity(self.C1(y))
        print(e_label.shape)
        e_label = self.c2(e_label)
        print(e_label.shape)
        e_global = torch.add(e_label, e_local)
        print(e_global.shape)

        return e_global, feat_probs
    
feat_net1 = MLP()
inf_net1 = InfNet()
energy_net = EnergyNet()
feat_net1, inf_net1, energy_net = tf2torch('./copied.ckpt', feat_net1, inf_net1, energy_net)
with torch.no_grad():
    pred_test = inf_net1(test_x)

f1, mAP = f1_map(test_y, pred_test)
print(f1, mAP)


with torch.no_grad():
    feat = feat_net1(test_x)
    _, pred_test = energy_net(feat, test_y)

f1, mAP = f1_map(test_y, pred_test)
print(f1, mAP)

0.4105516296229916 0.3304564718587311
torch.Size([2515, 159])
torch.Size([2515])
torch.Size([2515, 15])
torch.Size([2515])
torch.Size([2515])
0.4105516296229916 0.3304564718587311


In [60]:
feat_net1 = MLP()
inf_net1 = InfNet()
energy_net1 = EnergyNet()
feat_net1, inf_net1, energy_net1 = tf2torch('./copied.ckpt', feat_net1, inf_net1, energy_net1)
with torch.no_grad():
    pred_test = inf_net1(test_x)

f1, mAP = f1_map(test_y, pred_test)
print(f1, mAP)


with torch.no_grad():
    feat = feat_net1(test_x)
    _, pred_test = energy_net1(feat, test_y)

f1, mAP = f1_map(test_y, pred_test)
print(f1, mAP)

0.4105516296229916 0.3304564718587311
0.4105516296229916 0.3304564718587311


In [62]:
spen = SPEN(feat_net1, energy_net1, inf_net1)
preds, e_loss, inf_loss = spen.compute_loss(data, label)
pred_test = spen.pred(test_x)
best_f1, mAP = f1_map(test_y, pred_test)
print(best_f1, mAP)

0.4105516296229916 0.3304564718587311


# cost trained model

In [51]:
feat_net2 = MLP()
inf_net2 = MLP()
feat_net2, inf_net2, _ = tf2torch('./cost.ckpt', feat_net2, inf_net2, None)
with torch.no_grad():
    pred_test = inf_net2(test_x, pretrain=False)

f1, mAP = f1_map(test_y, pred_test)
print(f1, mAP)

0.4084047359374391 0.3278044250686946


# best model

In [49]:
feat_net2 = MLP()
inf_net2 = MLP()
feat_net2, inf_net2, _ = tf2torch('./best.ckpt', feat_net2, inf_net2, None)
with torch.no_grad():
    pred_test = inf_net2(test_x, pretrain=False)

f1, mAP = f1_map(test_y, pred_test)
print(f1, mAP)


with torch.no_grad():
    pred_test = feat_net2(test_x)

f1, mAP = f1_map(test_y, pred_test)
print(f1, mAP)

0.4186186497693001 0.3267193167443796
0.4105516296229916 0.33045647185873117


# Final model

In [43]:
feat_net1 = MLP()
inf_net1 = MLP()
feat_net1, inf_net1, _ = tf2torch('./final.ckpt', feat_net1, inf_net1, None)
with torch.no_grad():
    pred_test = inf_net1(test_x)

f1, mAP = f1_map(test_y, pred_test)
print(f1, mAP)

0.39316385186990854 0.32959706166709757
