In [18]:
import torch
import pandas as pd
import numpy as np

In [19]:
# constants:
k_dim = 20
gamma = 2

In [20]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


We are working with several categories of objects. First of all we have the database:
$$D = \{(e^l, r, e^r)\} \subseteq E \times R \times E =: \Omega$$
where $e^l, e^r$'s are entities and $r$'s are relations, $E, R$ are sets of them. Second of all we are training our model that is concerned with vectors:
$$\overline{e^l}, \overline{e^r}, \overline{r} \in \mathbb{R}^k.$$

Our main object is a triple $s = (e^l, r, e^r)$ for which we define its loss to be
$$l(s) = relu (\gamma + \mathbb{1}_D\cdot ||\overline{e^l}+ \overline{e^r}- \overline{r}|| - \mathbb{1}_{\Omega \backslash D}\cdot ||\overline{e^l}+ \overline{e^r}- \overline{r}||)$$

(where $\gamma$ is some pre-defined margin).

If we put $\overline{D} = D \cup \{(e^l, r, e^r) \; : \; e^l \in D \vee e^r \in D\}$ the overall loss will be 
$$\mathcal{L} = \sum_{s \in \overline{D}} l(s)$$

In [21]:
# reading dataset and extracting sets of entities, relations. 
''' 
datastructure here is following:
we store rel-s in a dictionary with keys being "name1 name2" and 
values being sets of relations between name1 and name2

here we also initialize values that we'll use later for the model
'''


f = open('/content/drive/MyDrive/MLDL/Release/train.txt', "r")

# f = open('/train.txt', 'r')

dataset = {}
ent_vect = {}
rel_vect = {}

while True:
  line = f.readline()
  mas = line.split('\t')
  if len(mas) ==3 :
    ent_l = line.split('\t')[0]
    rel = line.split('\t')[1]

    r = line.split('\t')[2]
    ent_r = r[:len(r)-1]
    # print(ent_l + " " + ent_r)

    val_dat = dataset.get(ent_l + " " + ent_r, -100)
    val_ent_l = ent_vect.get(ent_l, "lol")
    val_ent_r = ent_vect.get(ent_r, "lol")
    val_rel = rel_vect.get(rel, "lol")

    if val_dat != -100:
      dataset[ent_l + " " + ent_r].add(rel)
    else:
      dataset.update({ent_l + " " + ent_r : {rel} })

    if val_ent_l == "lol":
      a = np.random.rand(k_dim)
      ent_vect[ent_l] = a

    if val_ent_r == "lol":
      a = np.random.rand(k_dim)
      ent_vect[ent_r] = a

    if val_rel == "lol":
      a = np.random.rand(k_dim)
      rel_vect[rel] = a
  else:
      break
# print(rel_vect['/tv/tv_program/regular_cast./tv/regular_tv_appearance/actor']) #checking that everything's OK

  if val_rel == "lol":
  if val_ent_r == "lol":
  if val_ent_l == "lol":


In [22]:
import random
import math


''' 
dataset = {} -- consists of      [num_1, num_2, relations(num_1, num_2)    ]

ent_vect = {} -- consists of      [entity, vector],   vectors initialized to be random 

rel_vect = {} -- consists of      [relation, vector],   vectors initialized to be random 
'''

' \ndataset = {} -- consists of      [num_1, num_2, relations(num_1, num_2)    ]\n\nent_vect = {} -- consists of      [entity, vector],   vectors initialized to be random \n\nrel_vect = {} -- consists of      [relation, vector],   vectors initialized to be random \n'

In [23]:
def relu(x):
  return max(x, 0)
def dist(x,y):
  return math.sqrt((x-y)@np.transpose(x-y))

In [24]:
def batch_true(size_b):
  out = []
  for _ in range(size_b):
    name, rels = random.choice(list(dataset.items()))
    rel = random.choice(list(rels))
    [ent1, ent2] = name.split()
    out.append((ent1, rel, ent2))
  return out

def error(e1, r, e2):
  return dist(ent_vect[e1]+rel_vect[r], ent_vect[e2])

def find_cor(e1, r, e2, flag): #another function that samples a corrupted triplet for a given one
  beacon = True
  counter = 0
  while (beacon):
    ent, _ = random.choice(list(ent_vect.items()))
    # print(ent)
    if flag == 1:
      v = (ent + " " + e2)
    else:
      v = (e1 + " " + ent)
    
    if dataset.get(v, "lol") != "lol":
      if not (r in  dataset[v]):
        if flag == 1:
          return (ent, r, e2)
        else:
          return (e1, r, ent)
      else:
        print(ent, r, e2)
    
# print(find_cor("/m/017dcd", "/tv/tv_program/regular_cast./tv/regular_tv_appearance/actor", "/m/06v8s0", 2)) #testing
  
def corrupted_loss(e1, r, e2): #loss for a single point

  a = random.randint(1,2) #use it here to make "corruption"
                          #on one of two ends without decision making
  corr = find_cor(e1,r,e2,a)
  el_ = corr[0]
  r_ = corr[1]
  er_ = corr[2]
  return relu(gamma + error(e1, r, e2) - error(el_, r_, er_))

In [30]:
#here are some additional functions to utilize while training

def norming():
  for name in ent_vect:
    x = ent_vect[name]
    ent_vect[name] = x / np.linalg.norm(x)

def update(batch):
  for true_val in batch:
    a = random.randint(1,2)

    pair = find_cor(true_val[0], true_val[1], true_val[2], a)

    ent_vect[true_val[0]] += 2*(ent_vect[true_val[0]] + rel_vect[true_val[1]] - ent_vect[true_val[2]])
    ent_vect[true_val[2]] -= 2*(ent_vect[true_val[0]] + rel_vect[true_val[1]] - ent_vect[true_val[2]])
    rel_vect[true_val[1]] += 2*(ent_vect[true_val[0]] + rel_vect[true_val[1]] - ent_vect[true_val[2]])

    ent_vect[true_val[0]] -= 2*(ent_vect[pair[0]] + rel_vect[pair[1]] - ent_vect[pair[2]])
    rel_vect[true_val[1]] += 2*(ent_vect[pair[0]] + rel_vect[pair[1]] - ent_vect[pair[2]])
    ent_vect[true_val[2]] -= 2*(ent_vect[pair[0]] + rel_vect[pair[1]] - ent_vect[pair[2]])

In [32]:
#now training:

n_iter = 10
size_b = 10

for _ in range(n_iter):
  norming()
  batch = batch_true(size_b)
  update(batch)

/m/040981l /award/award_nominee/award_nominations./award/award_nomination/award_nominee /m/02l6dy
/m/01wy5m /award/award_nominee/award_nominations./award/award_nomination/award_nominee /m/02l6dy
/m/026v437 /award/award_nominee/award_nominations./award/award_nomination/award_nominee /m/02l6dy
/m/02lfns /award/award_nominee/award_nominations./award/award_nomination/award_nominee /m/02l6dy
/m/0dyztm /award/award_nominee/award_nominations./award/award_nomination/award_nominee /m/02l6dy
/m/02sb1w /award/award_nominee/award_nominations./award/award_nomination/award_nominee /m/02l6dy
/m/03yj_0n /award/award_nominee/award_nominations./award/award_nomination/award_nominee /m/02l6dy
/m/03yj_0n /award/award_nominee/award_nominations./award/award_nomination/award_nominee /m/02l6dy
/m/034np8 /film/actor/film./film/performance/film /m/02qzh2
/m/0f7hw /film/film/language /m/02h40lc
/m/063fh9 /film/film/language /m/02h40lc
/m/04nnpw /film/film/language /m/02h40lc
/m/07_fj54 /film/film/language /m/02h4

KeyboardInterrupt: ignored