# Installing pykeen

In [None]:
cd "/content/drive/MyDrive/thesis_project/pykeen"

/content/drive/MyDrive/thesis_project/pykeen


In [None]:
!pip install -e .
!pip install sentence-transformers

Obtaining file:///content/drive/My%20Drive/thesis_project/pykeen
Collecting dataclasses-json
  Downloading https://files.pythonhosted.org/packages/c3/89/390710a3ad24833f2ad11295caddd37f1bc4e60a278de1c9059727a92482/dataclasses_json-0.5.4-py3-none-any.whl
Collecting scipy>=1.5.0
[?25l  Downloading https://files.pythonhosted.org/packages/7d/e8/43ffca541d2f208d516296950b25fe1084b35c2881f4d444c1346ca75815/scipy-1.6.3-cp37-cp37m-manylinux1_x86_64.whl (27.4MB)
[K     |████████████████████████████████| 27.4MB 2.0MB/s 
Collecting click_default_group
  Downloading https://files.pythonhosted.org/packages/22/3a/e9feb3435bd4b002d183fcb9ee08fb369a7e570831ab1407bc73f079948f/click-default-group-1.2.2.tar.gz
Collecting optuna>=2.0.0
[?25l  Downloading https://files.pythonhosted.org/packages/1a/18/b49ca91cf592747e19f2d333c2a86cd7c81895b922a5a09adf6335471576/optuna-2.8.0-py3-none-any.whl (301kB)
[K     |████████████████████████████████| 307kB 44.4MB/s 
Collecting more_click
  Downloading https://file

In [None]:
cd "/content/drive/MyDrive/thesis_project"

/content/drive/MyDrive/thesis_project


# Extracting embeddings for authors

In [None]:
import pykeen
import torch
from typing import List
import pykeen.nn

# Load the saved model onto the CPU. torch.load unpickles the file, which can
# execute arbitrary code, so only load checkpoints from a trusted source.
model = torch.load('DistMultText/trained_model.pkl', map_location=torch.device('cpu'))
# The model exposes its entity representation modules as a list; the first one
# is taken as the entity embedding table used in the rest of the notebook.
entity_representation_modules: List['pykeen.nn.RepresentationModule'] = model.entity_representations
entity_embeddings: pykeen.nn.Embedding = entity_representation_modules[0]

In [None]:
# Show the structure of the entity embedding module as a quick sanity check.
print(entity_embeddings)

Embedding(
  (_embeddings): Embedding(87942, 200)
)


In [None]:
# Read the author list, one entry per line. splitlines() is used instead of
# split("\n") so that a trailing newline does not produce an empty entry.
with open('OC-197K/authors_lst.txt', 'r') as f:
  authors_lst = f.read().splitlines()

In [None]:
import json
# Load the entity -> id mapping from its JSON file.
with open('entity_to_id.json') as mapping_file:
    entity_to_ids = json.load(mapping_file)

**Note**: The entity_to_id file was truncated

In [None]:
# Translate every author entry into its entity id; entries that are missing
# from entity_to_ids are skipped.
entity_idx = torch.tensor(
    [entity_to_ids[author] for author in authors_lst if author in entity_to_ids],
    dtype=torch.long,
)

In [None]:
# Look up the embedding vectors for the selected entity ids (one row per id).
entity_embedding_tensor: torch.FloatTensor = entity_embeddings(indices=entity_idx)

# Indexing embeddings

In [None]:
# Map each entity id to its embedding vector (as a NumPy array) so individual
# authors can be looked up by id later on.
idx_to_embedding = {
  idx.item(): emb.detach().numpy()
  for idx, emb in zip(entity_idx, entity_embedding_tensor)
}

In [None]:
# All looked-up embeddings as a single NumPy matrix, detached from autograd.
x = entity_embedding_tensor.detach().numpy()

[[-0.00094988  0.05520177 -0.00016945 ...  0.01039909  0.07833451
   0.04485578]
 [-0.03078685 -0.04847596 -0.02065092 ...  0.02299498 -0.02178585
  -0.00049295]
 [-0.00469111 -0.03530772 -0.04240413 ... -0.06444816 -0.02982111
   0.00555742]
 ...
 [-0.05155705 -0.01332404 -0.07428204 ...  0.00253052 -0.01312999
  -0.01214385]
 [-0.05157559  0.06485499 -0.06110098 ... -0.02816361 -0.02230935
   0.0004066 ]
 [-0.07957371 -0.02647611 -0.05301774 ...  0.00108645 -0.00355948
   0.00784788]]


# A little experiment: M Bonitz

In [None]:
# Author URIs used for the M Bonitz experiment.
m_bonitz_lst = [
    "https://github.com/arcangelo7/time_agnostic/ar/4293",
    "https://github.com/arcangelo7/time_agnostic/ar/21727",
    "https://github.com/arcangelo7/time_agnostic/ar/79750",
    "https://github.com/arcangelo7/time_agnostic/ar/91541",
    "https://github.com/arcangelo7/time_agnostic/ar/45318",
    "https://github.com/arcangelo7/time_agnostic/ar/40867",
    "https://github.com/arcangelo7/time_agnostic/ar/4604",
    "https://github.com/arcangelo7/time_agnostic/ar/47672",
    "https://github.com/arcangelo7/time_agnostic/ar/3109",
    "https://github.com/arcangelo7/time_agnostic/ar/3935",
    "https://github.com/arcangelo7/time_agnostic/ar/236021",
    "https://github.com/arcangelo7/time_agnostic/ar/4561",
    "https://github.com/arcangelo7/time_agnostic/ar/91538",
    "https://github.com/arcangelo7/time_agnostic/ar/13251",
    "https://github.com/arcangelo7/time_agnostic/ar/91536",
    "https://github.com/arcangelo7/time_agnostic/ar/91550",
    "https://github.com/arcangelo7/time_agnostic/ar/58499",
    "https://github.com/arcangelo7/time_agnostic/ar/4239",
    "https://github.com/arcangelo7/time_agnostic/ar/4096",
    "https://github.com/arcangelo7/time_agnostic/ar/81527",
]
# Entity ids of the URIs above; URIs that are not in entity_to_ids are skipped.
m_bonitz_idx = [entity_to_ids[uri] for uri in m_bonitz_lst if uri in entity_to_ids]

In [None]:
# Number of M Bonitz URIs that were found in entity_to_ids.
print(len(m_bonitz_idx))

20


In [None]:
import numpy as np
# Stack the embeddings of the selected entity ids into one matrix
# (one row per id) and report its shape.
emb_lst = np.array(
    [idx_to_embedding[entity_id] for entity_id in m_bonitz_idx]
)
print(emb_lst.shape)

(20, 200)


## Cosine similarity

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
# Pairwise cosine similarity between all rows of emb_lst (row i vs. row j).
similarities = cosine_similarity(emb_lst)
# Print the full similarity matrix for inspection.
print('pairwise dense output:\n {}\n'.format(similarities))

pairwise dense output:
 [[ 9.99999762e-01  5.35902560e-01  6.12160861e-01  1.61125839e-01
   7.79641211e-01  6.84487045e-01  7.29715347e-01  2.91978538e-01
   7.35030115e-01  2.89619654e-01  5.58801174e-01  6.14904463e-01
  -3.98614034e-02  5.74889541e-01  4.60767485e-02  2.12426946e-01
   6.80274248e-01  2.95095503e-01  5.62077165e-01  8.17941189e-01]
 [ 5.35902560e-01  9.99999881e-01  3.98378193e-01  1.17634416e-01
   1.97014198e-01  5.15861690e-01  5.04631639e-01 -1.97399899e-01
   1.29650459e-01 -3.44665125e-02  8.71362686e-01  3.69338661e-01
  -5.33155084e-01  5.56704521e-01  1.19720241e-02 -2.84949754e-04
   3.63798380e-01 -7.26283193e-02  4.16106403e-01  5.02500176e-01]
 [ 6.12160861e-01  3.98378193e-01  1.00000000e+00  7.79682919e-02
   5.48287451e-01  5.05101204e-01  4.77649659e-01  1.01972856e-01
   5.44034958e-01  2.97589421e-01  3.01570773e-01  3.14630300e-01
   2.53922462e-01  8.75451267e-01  1.64603382e-01  2.30208308e-01
   2.87186384e-01  8.46683532e-02  4.77678716e-01 

It seems that cosine similarity is not very representative here.

## Euclidean distance

In [None]:
from sklearn.metrics.pairwise import euclidean_distances
# Pairwise Euclidean distances between all rows of emb_lst.
dist = euclidean_distances(emb_lst)

In [None]:
# Print the pairwise distance matrix for inspection.
print(dist)

[[0.         0.6281145  0.5408023  0.7386723  0.41561425 0.479495
  0.43847334 0.7488421  0.43512672 0.76312774 0.57025224 0.5139085
  0.81341946 0.5702995  0.8022864  0.81494147 0.5075528  0.7633103
  0.5625248  0.3770957 ]
 [0.6281145  0.         0.6963335  0.7865924  0.81847644 0.6149767
  0.61168456 1.0040653  0.8049446  0.94862795 0.3289317  0.6777561
  1.0178614  0.6028369  0.84618664 0.94522244 0.73818153 0.9696655
  0.6726895  0.58492935]
 [0.5408023  0.6963335  0.         0.74471545 0.57734907 0.5774744
  0.57978827 0.8184075  0.5438936  0.7376496  0.6928211  0.65214485
  0.6660623  0.29914448 0.7248121  0.7838768  0.7360931  0.84517795
  0.5920884  0.4746785 ]
 [0.7386723  0.7865924  0.74471545 0.         0.6924959  0.69489706
  0.77837026 0.8664932  0.6210057  0.93438834 0.7145766  0.76543623
  0.67649907 0.80289704 0.6033485  0.59888756 0.9020459  0.89563584
  0.7183959  0.60770684]
 [0.41561425 0.81847644 0.57734907 0.6924959  0.         0.4239142
  0.5993507  0.7355778  0

A distance of about 0.8 seems to be a good threshold.

# Hierarchical clustering with euclidean distance

Let's cluster some blocks

In [None]:
from sklearn.cluster import AgglomerativeClustering
# Agglomerative clustering cut at a distance threshold instead of a fixed
# number of clusters (n_clusters=None requires the full tree).
# The `affinity` keyword is omitted on purpose: Euclidean is already the
# default, and the keyword was renamed to `metric` in newer scikit-learn
# releases, so passing it explicitly fails there.
# NOTE(review): linkage is left at the library default, so distance_threshold
# is compared against the linkage distance, not the raw pairwise distances
# printed earlier — confirm 0.85 is the intended cut-off.
result = AgglomerativeClustering(n_clusters=None, compute_full_tree=True, distance_threshold=0.85).fit(emb_lst)

In [None]:
# Cluster label assigned to each row of emb_lst, in input order.
result.labels_

array([1, 0, 1, 3, 1, 1, 0, 2, 1, 2, 0, 0, 3, 1, 3, 3, 0, 2, 1, 1])

In [None]:
# Print each author URI next to its cluster label.
# Labels exist only for the rows of emb_lst, i.e. for URIs found in
# entity_to_ids, so the same filter is applied here to keep URIs and labels
# aligned. Descriptive loop names also avoid overwriting the module-level `x`
# (the embedding matrix) with a string.
clustered_uris = [uri for uri in m_bonitz_lst if uri in entity_to_ids]
for author_uri, cluster_label in zip(clustered_uris, result.labels_):
  print(author_uri, cluster_label)

https://github.com/arcangelo7/time_agnostic/ar/4293 1
https://github.com/arcangelo7/time_agnostic/ar/21727 0
https://github.com/arcangelo7/time_agnostic/ar/79750 1
https://github.com/arcangelo7/time_agnostic/ar/91541 3
https://github.com/arcangelo7/time_agnostic/ar/45318 1
https://github.com/arcangelo7/time_agnostic/ar/40867 1
https://github.com/arcangelo7/time_agnostic/ar/4604 0
https://github.com/arcangelo7/time_agnostic/ar/47672 2
https://github.com/arcangelo7/time_agnostic/ar/3109 1
https://github.com/arcangelo7/time_agnostic/ar/3935 2
https://github.com/arcangelo7/time_agnostic/ar/236021 0
https://github.com/arcangelo7/time_agnostic/ar/4561 0
https://github.com/arcangelo7/time_agnostic/ar/91538 3
https://github.com/arcangelo7/time_agnostic/ar/13251 1
https://github.com/arcangelo7/time_agnostic/ar/91536 3
https://github.com/arcangelo7/time_agnostic/ar/91550 3
https://github.com/arcangelo7/time_agnostic/ar/58499 0
https://github.com/arcangelo7/time_agnostic/ar/4239 2
https://github.

In [None]:
from sklearn.cluster import AgglomerativeClustering
# Same clustering with a looser distance threshold (1.4).
# The `affinity` keyword is omitted on purpose: Euclidean is already the
# default, and the keyword was renamed to `metric` in newer scikit-learn
# releases, so passing it explicitly fails there.
result2 = AgglomerativeClustering(n_clusters=None, compute_full_tree=True, distance_threshold=1.4).fit(emb_lst)
# Labels exist only for the rows of emb_lst, i.e. for URIs found in
# entity_to_ids, so the same filter is applied here to keep URIs and labels
# aligned. Descriptive loop names also avoid overwriting the module-level `x`
# (the embedding matrix) with a string.
clustered_uris = [uri for uri in m_bonitz_lst if uri in entity_to_ids]
for author_uri, cluster_label in zip(clustered_uris, result2.labels_):
  print(author_uri, cluster_label)

https://github.com/arcangelo7/time_agnostic/ar/4293 0
https://github.com/arcangelo7/time_agnostic/ar/21727 0
https://github.com/arcangelo7/time_agnostic/ar/79750 0
https://github.com/arcangelo7/time_agnostic/ar/91541 1
https://github.com/arcangelo7/time_agnostic/ar/45318 0
https://github.com/arcangelo7/time_agnostic/ar/40867 0
https://github.com/arcangelo7/time_agnostic/ar/4604 0
https://github.com/arcangelo7/time_agnostic/ar/47672 0
https://github.com/arcangelo7/time_agnostic/ar/3109 0
https://github.com/arcangelo7/time_agnostic/ar/3935 0
https://github.com/arcangelo7/time_agnostic/ar/236021 0
https://github.com/arcangelo7/time_agnostic/ar/4561 0
https://github.com/arcangelo7/time_agnostic/ar/91538 1
https://github.com/arcangelo7/time_agnostic/ar/13251 0
https://github.com/arcangelo7/time_agnostic/ar/91536 1
https://github.com/arcangelo7/time_agnostic/ar/91550 1
https://github.com/arcangelo7/time_agnostic/ar/58499 0
https://github.com/arcangelo7/time_agnostic/ar/4239 0
https://github.

1.4 is the threshold that perfectly splits the author's block.