# UMAP of multiple embeddings

This notebook is for running UMAP on different embeddings of _the same_ set of papers

NB: in our previous iterations of UMAP, each embedding had a different set of sample papers

In [1]:
import os
import pandas as pd
import umap

# Load sample

In [2]:
SAMPLE_PATH = "data/derived/umap/sample_aps_all_2010_abstract_doc2vec_umap.csv" # replace with data folder

In [3]:
sample_df = pd.read_csv(SAMPLE_PATH).drop_duplicates(subset="DOI").set_index("DOI")
sample_df["PACS_CODE_1"] = sample_df["PACS_CODE_1"].astype(str)
sample_df = sample_df[["PACS_CODE_1"]] # this is all we really need
sample_df = sample_df.sample(n=25000)

In [4]:
len(sample_df)

25000

# Load embeddings, run UMAP

In [5]:
EMB_SPEC = EMB_SPEC = ["undirected_leigenmap", "undirected_node2vec", "undirected_residual2vec", "title_doc2vec", "title_scibert", "title_sentencebert", "abstract_doc2vec", "abstract_scibert", "abstract_sentencebert"]
EMB_PATH = "data/embedding/vectors/aps_all_2010_{}_vector.gz" # replace with data folder
umap_dict = {}

for e in EMB_SPEC:
    emb = pd.read_csv(EMB_PATH.format(e), header=0, index_col="DOI").sort_index()
    DIM = emb.shape[1]
    emb.columns = list(range(0,DIM))
    emb = emb.merge(sample_df, left_index=True, right_index=True)
    
    reducer = umap.UMAP()
    umap_embedding = reducer.fit_transform(emb[list(range(0,DIM))].values)
    umap_df = pd.DataFrame(umap_embedding)
    umap_df.index = emb.index
    umap_df["PACS_CODE_1"] = emb["PACS_CODE_1"]
    umap_df = umap_df.rename(columns={0:"x", 1:"y"}).sort_index()
    umap_dict[e] = umap_df

The keyword argument 'parallel=True' was specified but no transformation for parallel execution was possible.

To find out why, try turning on parallel diagnostics, see http://numba.pydata.org/numba-doc/latest/user/parallel.html#diagnostics for help.

File "../../opt/miniconda3/envs/embed-sci-disc/lib/python3.7/site-packages/umap/rp_tree.py", line 135:
@numba.njit(fastmath=True, nogil=True, parallel=True)
def euclidean_random_projection_split(data, indices, rng_state):
^

  state.func_ir.loc))
The keyword argument 'parallel=True' was specified but no transformation for parallel execution was possible.

To find out why, try turning on parallel diagnostics, see http://numba.pydata.org/numba-doc/latest/user/parallel.html#diagnostics for help.

File "../../opt/miniconda3/envs/embed-sci-disc/lib/python3.7/site-packages/umap/utils.py", line 409:
@numba.njit(parallel=True)
def build_candidates(current_graph, n_vertices, n_neighbors, max_candidates, rng_state):
^

  current_graph, n_vertices, 

In [6]:
umap_dict["title_scibert"].columns

Index(['x', 'y', 'PACS_CODE_1'], dtype='object')

In [7]:
umap_dict.keys()

dict_keys(['undirected_leigenmap', 'undirected_node2vec', 'undirected_residual2vec', 'title_doc2vec', 'title_scibert', 'title_sentencebert', 'abstract_doc2vec', 'abstract_scibert', 'abstract_sentencebert'])

In [8]:
[len(v) for v in umap_dict.values()]

[24959, 24959, 24959, 25000, 25000, 25000, 25009, 25009, 25009]

In [9]:
umap_all = umap_dict["undirected_leigenmap"].merge(umap_dict["undirected_node2vec"][["x","y"]], left_index=True, right_index=True, suffixes=["_undirected_leigenmap", "_undirected_node2vec"])
umap_all = umap_all.merge(umap_dict["undirected_residual2vec"][["x","y"]], left_index=True, right_index=True).rename(columns={"x":"x_undirected_residual2vec", "y":"y_undirected_residual2vec"})
umap_all = umap_all.merge(umap_dict["title_doc2vec"][["x","y"]], left_index=True, right_index=True).rename(columns={"x":"x_title_doc2vec", "y":"y_title_doc2vec"})
umap_all = umap_all.merge(umap_dict["title_scibert"][["x","y"]], left_index=True, right_index=True).rename(columns={"x":"x_title_scibert", "y":"y_title_scibert"})
umap_all = umap_all.merge(umap_dict["title_sentencebert"][["x","y"]], left_index=True, right_index=True).rename(columns={"x":"x_title_sentencebert", "y":"y_title_sentencebert"})
umap_all = umap_all.merge(umap_dict["abstract_doc2vec"][["x","y"]], left_index=True, right_index=True).rename(columns={"x":"x_abstract_doc2vec", "y":"y_abstract_doc2vec"})
umap_all = umap_all.merge(umap_dict["abstract_scibert"][["x","y"]], left_index=True, right_index=True).rename(columns={"x":"x_abstract_scibert", "y":"y_abstract_scibert"})
umap_all = umap_all.merge(umap_dict["abstract_sentencebert"][["x","y"]], left_index=True, right_index=True).rename(columns={"x":"x_abstract_sentencebert", "y":"y_abstract_sentencebert"})

In [10]:
umap_all

Unnamed: 0_level_0,x_undirected_leigenmap,y_undirected_leigenmap,PACS_CODE_1,x_undirected_node2vec,y_undirected_node2vec,x_undirected_residual2vec,y_undirected_residual2vec,x_title_doc2vec,y_title_doc2vec,x_title_scibert,y_title_scibert,x_title_sentencebert,y_title_sentencebert,x_abstract_doc2vec,y_abstract_doc2vec,x_abstract_scibert,y_abstract_scibert,x_abstract_sentencebert,y_abstract_sentencebert
DOI,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
10.1103/PhysRevA.43.1395,2.771799,0.302231,3,3.440551,2.434962,5.056444,4.631864,0.411668,-1.935952,2.689642,9.388043,2.015839,0.917281,1.764385,-6.343216,-3.837918,4.514092,0.513185,-2.827545
10.1103/PhysRevA.43.1940,-2.386303,-2.660287,6,-7.101846,-3.803667,0.148662,5.150470,0.775899,-2.014493,1.339793,8.091738,-2.205557,-0.826935,4.670600,0.629041,0.521890,-3.152586,-1.042538,-0.942555
10.1103/PhysRevA.43.2416,2.669961,1.065399,4,0.438891,-2.820272,-0.372632,-6.245992,1.739064,-1.980143,-9.822905,2.783962,5.014445,-1.796701,1.157394,-6.454245,7.548762,-2.296276,-4.567186,1.654765
10.1103/PhysRevA.43.2910,4.013923,-1.494711,6,-6.417593,-4.043773,0.732498,4.582602,-0.641927,-1.890028,-6.002483,6.039698,0.868945,-4.649810,0.043193,-3.371896,-0.724192,-1.167021,-0.006781,-0.639512
10.1103/PhysRevA.43.2943,2.272212,-0.722318,6,-7.313387,-3.739641,0.388965,5.180825,1.618954,-2.053705,3.921473,-0.870297,0.435673,-1.051684,2.807083,-5.496268,3.681046,-3.348199,-0.733416,-0.859349
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10.1103/RevModPhys.82.2053,4.512861,1.854595,9,10.875419,-1.288938,11.012065,-1.954075,-4.520244,0.414247,4.896106,-5.800956,2.202550,-0.047699,2.881742,-5.488807,5.633334,-2.552500,0.046990,-2.497333
10.1103/RevModPhys.82.2489,3.580409,-0.488596,1,10.065157,0.762524,10.341672,1.117400,-3.219315,1.346294,4.586003,-5.395843,-0.049698,2.956926,3.179528,-5.184672,3.884021,-2.922952,2.628636,-1.227247
10.1103/RevModPhys.82.53,5.488572,1.991396,7,-4.153357,4.329761,-9.765863,-0.448298,-0.747235,1.909193,4.906669,-6.688659,3.792815,-3.440102,1.506912,-6.145723,0.263464,0.039308,-4.088583,-1.680792
10.1103/RevModPhys.82.603,1.720691,-2.673748,9,-0.380794,-6.431448,3.694945,-0.239124,-3.991476,0.434010,3.839941,-7.401066,1.333655,-3.808608,1.739916,-5.758469,1.318917,-1.855923,-1.365756,-0.187496


# Save

In [11]:
umap_all.to_csv("sample_umap_all_embs.csv", index=True, header=True)