In [1]:
import json
from abc import ABCMeta, abstractmethod

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
import sys
sys.path.append('..')

In [4]:
from train.train_debug import *

In [5]:
Z, X1, X2, Y1, Y2, params, preprocess_info = load_training_data('../training_data/example_query/','q0')

In [6]:
feature_generator = FeatureGenerator()

In [7]:
feature_generator.fit(X1 + X2)

RelType :  {'Hash Join', 'Seq Scan', 'Index Scan', 'Nested Loop', 'Hash'}


In [8]:
vars(feature_generator.normalizer)

{'_mins': {'Startup Cost': 0.0,
  'Total Cost': 0.371563556432483,
  'Plan Rows': 0.6931471805599453},
 '_maxs': {'Startup Cost': 13.564519206060003,
  'Total Cost': 13.673290407169015,
  'Plan Rows': 17.404826908902354}}

In [9]:
vars(feature_generator.feature_parser)

{'normalizer': <model.feature.Normalizer at 0x2520590dd08>,
 'input_relations': ['title', 'name', 'cast_info', 'aka_title', 'char_name']}

In [10]:
vars(feature_generator.feature_parser.normalizer)

{'_mins': {'Startup Cost': 0.0,
  'Total Cost': 0.371563556432483,
  'Plan Rows': 0.6931471805599453},
 '_maxs': {'Startup Cost': 13.564519206060003,
  'Total Cost': 13.673290407169015,
  'Plan Rows': 17.404826908902354}}

In [11]:
len(X1), len(Z)

(9, 9)

In [12]:
list(zip(*Z))

[('2014', '2014', '2014', '2013', '2013', '2013', '2013', '2013', '2013'),
 ('%John%',
  '%John%',
  '%John%',
  '%smith%',
  '%smith%',
  '%smith%',
  '%John%',
  '%John%',
  '%John%')]

In [13]:
X1_ = feature_generator.transform(X1)
X2_ = feature_generator.transform(X2)
Z_ = feature_generator.transform_z(Z, params, preprocess_info)

In [14]:
Z_

array([[34,  0],
       [34,  0],
       [34,  0],
       [33,  1],
       [33,  1],
       [33,  1],
       [33,  0],
       [33,  0],
       [33,  0]])

In [15]:
Y1_ = np.array(Y1).reshape(-1, 1)
Y2_ = np.array(Y2).reshape(-1, 1)

In [16]:
input_feature_dim = len(X1_[0].get_feature())
input_feature_dim

23

In [17]:
from model.model import *
plan_net = PlanEmbeddingNet(input_feature_dim)
parameter_net = ParameterEmbeddingNet('q0', preprocess_info)

In [18]:
pairs = []
for i in range(len(X1_)):
    pairs.append((X1_[i], X2_[i], Z_[i], 1.0 if Y1[i] <= Y2[i] else 0.0))

In [19]:
from torch.utils.data import Dataset
from model.TreeConvolution.util import *
class PairDataset(Dataset):
    def __init__(self, X1, X2, Y1, Y2, Z):
        self.X1 = X1
        self.X2 = X2
        self.Y = []
        for y1, y2 in zip(Y1, Y2):
            if y1 <= y2:
                self.Y.append(1.)
            else:
                self.Y.append(0.)
        self.Z = Z

    def __len__(self):
        return len(self.X1)

    def __getitem__(self, idx):
        return self.X1[idx], self.X2[idx], self.Z[idx], self.Y[idx]

In [20]:
class Batch():
    def __init__(self, X1, X2, Z, Y):
        self.X1 = X1
        self.X2 = X2
        self.Y = Y
        self.Z = Z

    def to(self,dev):
        self.X1 = X1.to(dev)
        self.X2 = X2.to(dev)
        self.Y = Y.to(dev)
        self.Z = Z.to(dev)
        return self

In [21]:
def collate_pairwise_fn(x):
    trees1 = []
    trees2 = []
    parameters2 = []
    labels = []

    for tree1, tree2, parameter2, label in x:
        trees1.append(tree1)
        trees2.append(tree2)
        parameters2.append(parameter2)
        labels.append(label)
    return trees1, trees2, torch.FloatTensor(parameters2), torch.FloatTensor(np.array(labels)).reshape(-1, 1)

In [22]:
ds = PairDataset(X1_, X2_, Y1_, Y2_, Z_)

In [23]:
from torch.utils.data import DataLoader
ld = DataLoader(ds, batch_size=2, shuffle=True, collate_fn=collate_pairwise_fn)

In [24]:
ds[1]

(<model.feature.SampleEntity at 0x25205939308>,
 <model.feature.SampleEntity at 0x25205948f48>,
 array([34,  0]),
 1.0)

In [25]:
Z_

array([[34,  0],
       [34,  0],
       [34,  0],
       [33,  1],
       [33,  1],
       [33,  1],
       [33,  0],
       [33,  0],
       [33,  0]])

In [26]:
a = next(iter(ld))
a

  if sys.path[0] == "":


([<model.feature.SampleEntity at 0x25205944088>,
  <model.feature.SampleEntity at 0x25205939308>],
 [<model.feature.SampleEntity at 0x25205963f88>,
  <model.feature.SampleEntity at 0x25205948f48>],
 tensor([[33.,  0.],
         [34.,  0.]], dtype=torch.float32),
 tensor([[1.],
         [1.]], dtype=torch.float32))

In [27]:
plan_optimizer = torch.optim.Adam(plan_net.parameters())
parameter_optimizer = torch.optim.Adam(parameter_net.parameters())
bce_loss_fn = torch.nn.BCELoss()

In [28]:
x1, x2, z, label = a

In [29]:
tree_x1 = plan_net.build_trees(x1)
tree_x2 = plan_net.build_trees(x2)

In [30]:
y_pred_1 = plan_net(tree_x1)
y_pred_2 = plan_net(tree_x2)
z_pred = parameter_net(z)

In [74]:
y_pred_1.dtype, z_pred.dtype

(torch.float64, torch.float64)

In [47]:
y_pred_1.shape, z_pred.shape

(torch.Size([2, 32]), torch.Size([2, 32]))

In [32]:
batch_size = z.size(0)
batch_size

2

In [33]:
x_l = torch.split(z, 1, dim=-1) 
x_l

(tensor([[33.],
         [34.]], dtype=torch.float32),
 tensor([[0.],
         [0.]], dtype=torch.float32))

In [83]:
embedded = []
for x_i, e in zip(x_l, parameter_net.embed_layers):
    print(x_i)
    if not isinstance(e, nn.Identity):
        embedded.append(e(x_i.long()).view(batch_size, -1))
    else:
        embedded.append(e(x_i))

tensor([[33.],
        [34.]], dtype=torch.float32)
tensor([[0.],
        [0.]], dtype=torch.float32)


In [82]:
e(x_i.long()).dtype

torch.float64

In [80]:
embedded

[tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]]),
 tensor([[-0.2701, -0.8241, -0.8562, -1.9137, -0.0558, -0.9335, -0.4713, -0.9412,
          -0.4025, -0.1374,  0.4396, -1.2399,  0.4492,  0.2594, -0.6063, -1.3823,
          -1.2451,  0.5592, -0.4497, -0.9276,  0.3364, -1.8150,  2.3229, -0.4331,
          -1.4283,  0.6416,  1.7311, -0.2279,  0.2552, -0.0767, -0.9744,  2.1503],
         [-0.2701, -0.8241, -0.8562, -1.9137, -0.0558, -0.9335, -0.4713, -0.9412,
          -0.4025, -0.1374,  0.4396, -1.2399,  0.4492,  0.2594, -0.6063, -1.3823,
          -1.2451,  0.5592, -0.4497, -0.927

In [75]:
torch.concat(embedded, -1).dtype

torch.float64

In [36]:
parameter_net.embed_len

82

In [48]:
distance_1 = torch.norm(y_pred_1 - z_pred, dim = 1)
distance_2 = torch.norm(y_pred_2 - z_pred, dim = 1)
prob_y = 1.0 if distance_1 <= distance_2 else 0.0

RuntimeError: Boolean value of Tensor with more than one value is ambiguous

In [63]:
prob_y = torch.sigmoid(distance_1 - distance_2)

In [64]:
distance_1

tensor([5.0797, 5.0919], grad_fn=<CopyBackwards>)

In [65]:
prob_y

tensor([0.5382, 0.4848], grad_fn=<SigmoidBackward0>)

In [85]:
loss = bce_loss_fn(prob_y.view(-1,1).float(), label)

In [86]:
loss

tensor(0.6717, dtype=torch.float32, grad_fn=<BinaryCrossEntropyBackward0>)

In [70]:
prob_y.dtype

torch.float64

In [69]:
distance_1

tensor([5.0797, 5.0919], grad_fn=<CopyBackwards>)

In [88]:
loss.backward()

In [89]:
plan_optimizer.step()
parameter_optimizer.step()