# Add Embeddings for Publications
[work in progress]

In [1]:
import os
import shutil
import pandas as pd
from typing import Dict, List
import json
import requests

In [2]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
#pd.set_option('display.max_colwidth', None)

In [3]:
KG_PATH = "../kg/data"

## Calculate Embeddings for Publications
A joint embedding of the publication title and abstract is calculated using the [SPECTER](https://github.com/allenai/specter) [public API](https://github.com/allenai/paper-embedding-public-apis).

In [4]:
data_file_primary = os.path.join(KG_PATH, "nodes", "Publication_primary.csv")

In [5]:
publications_primary = pd.read_csv(data_file_primary, dtype=str)
publications_primary.head()

Unnamed: 0,id,name,abstract,journal,year,type,doi,pmId,pmcId,url,embedding
0,doi:10.3390/bios12110938,An Experimental Framework for Developing Point...,Biolayer interferometry (BLI) is a well-establ...,Biosensors,2022,primary,doi:10.3390/bios12110938,36354449,PMC9688365,https://doi.org/doi:10.3390/bios12110938,-1.9157365560531616|-4.975357532501221|-1.2034...
1,doi:10.3389/fsens.2022.917380,Development of a Biosensor Based on Angiotensi...,Severe acute respiratory syndrome coronavirus ...,Frontiers in sensors,2022,primary,doi:10.3389/fsens.2022.917380,35992634,PMC9386735,https://doi.org/doi:10.3389/fsens.2022.917380,1.454939603805542|-4.036255359649658|1.2001737...
2,doi:10.3390/bios12020101,Context-Aware Diagnostic Specificity (CADS).,Rapid detection of proteins is critical in a v...,Biosensors,2022,primary,doi:10.3390/bios12020101,35200361,PMC8869940,https://doi.org/doi:10.3390/bios12020101,2.5686216354370117|-6.731689453125|2.812827825...
3,doi:10.1021/acssensors.3c00512,Rapid Direct Detection of SARS-CoV-2 Aerosols ...,Airborne transmission via virus-laden aerosols...,ACS sensors,2023,primary,doi:10.1021/acssensors.3c00512,37498298,PMC10463275,https://doi.org/doi:10.1021/acssensors.3c00512,-4.168727874755859|-4.673065662384033|-0.37046...
4,doi:10.1038/s41467-023-39419-z,Real-time environmental surveillance of SARS-C...,Real-time surveillance of airborne SARS-CoV-2 ...,Nature communications,2023,primary,doi:10.1038/s41467-023-39419-z,37429842,PMC10333287,https://doi.org/doi:10.1038/s41467-023-39419-z,-0.7667260766029358|-2.832000970840454|1.14964...


In [6]:
# create a JSON object with the fields required by the SPECTER API
publications_part = publications_primary[["id", "name", "abstract"]].copy()
publications_part.rename(columns={"id": "paper_id", "name": "title"}, inplace=True)
publications_dict = json.loads(publications_part.to_json(orient="records"))

In [7]:
URL = "https://model-apis.semanticscholar.org/specter/v1/invoke"
MAX_BATCH_SIZE = 16

In [8]:
def chunks(lst, chunk_size=MAX_BATCH_SIZE):
    """Splits a longer list to respect batch size"""
    for i in range(0, len(lst), chunk_size):
        yield lst[i : i + chunk_size]

In [9]:
def embed(papers):
    embeddings_by_paper_id: Dict[str, List[float]] = {}

    for chunk in chunks(papers):
        # Allow Python requests to convert the data above to JSON
        response = requests.post(URL, json=chunk)

        if response.status_code != 200:
            raise RuntimeError("Sorry, something went wrong, please try later!")

        for paper in response.json()["preds"]:
            embeddings_by_paper_id[paper["paper_id"]] = paper["embedding"]

    return embeddings_by_paper_id

In [10]:
# reformat embeddings into a vertical bar separate array

In [11]:
embeddings = pd.DataFrame(embed(publications_dict)).transpose()
embeddings["embedding"] = [[e for e in row if isinstance(e, float) ] for row in embeddings.values.tolist()]
embeddings["embedding"] = embeddings["embedding"].astype(str)
# remove brackets from Python list
embeddings["embedding"] = embeddings["embedding"].str.replace("[", "")
embeddings["embedding"] = embeddings["embedding"].str.replace("]", "")
# replace comma with vertical bar delimiter
embeddings["embedding"] = embeddings["embedding"].str.replace(", ", "|")
embeddings = embeddings.reset_index(names="id")
embeddings = embeddings[["id", "embedding"]].copy()

In [12]:
embeddings.head()

Unnamed: 0,id,embedding
0,doi:10.3390/bios12110938,-1.9157365560531616|-4.975357532501221|-1.2034...
1,doi:10.3389/fsens.2022.917380,1.454939603805542|-4.036255359649658|1.2001737...
2,doi:10.3390/bios12020101,2.5686216354370117|-6.731689453125|2.812827825...
3,doi:10.1021/acssensors.3c00512,-4.168727874755859|-4.673065662384033|-0.37046...
4,doi:10.1038/s41467-023-39419-z,-0.7667260766029358|-2.832000970840454|1.14964...


In [13]:
# append embeddings to publication nodes
publications_primary = publications_primary.merge(embeddings, on="id")

In [14]:
publications_primary.head()

Unnamed: 0,id,name,abstract,journal,year,type,doi,pmId,pmcId,url,embedding_x,embedding_y
0,doi:10.3390/bios12110938,An Experimental Framework for Developing Point...,Biolayer interferometry (BLI) is a well-establ...,Biosensors,2022,primary,doi:10.3390/bios12110938,36354449,PMC9688365,https://doi.org/doi:10.3390/bios12110938,-1.9157365560531616|-4.975357532501221|-1.2034...,-1.9157365560531616|-4.975357532501221|-1.2034...
1,doi:10.3389/fsens.2022.917380,Development of a Biosensor Based on Angiotensi...,Severe acute respiratory syndrome coronavirus ...,Frontiers in sensors,2022,primary,doi:10.3389/fsens.2022.917380,35992634,PMC9386735,https://doi.org/doi:10.3389/fsens.2022.917380,1.454939603805542|-4.036255359649658|1.2001737...,1.454939603805542|-4.036255359649658|1.2001737...
2,doi:10.3390/bios12020101,Context-Aware Diagnostic Specificity (CADS).,Rapid detection of proteins is critical in a v...,Biosensors,2022,primary,doi:10.3390/bios12020101,35200361,PMC8869940,https://doi.org/doi:10.3390/bios12020101,2.5686216354370117|-6.731689453125|2.812827825...,2.5686216354370117|-6.731689453125|2.812827825...
3,doi:10.1021/acssensors.3c00512,Rapid Direct Detection of SARS-CoV-2 Aerosols ...,Airborne transmission via virus-laden aerosols...,ACS sensors,2023,primary,doi:10.1021/acssensors.3c00512,37498298,PMC10463275,https://doi.org/doi:10.1021/acssensors.3c00512,-4.168727874755859|-4.673065662384033|-0.37046...,-4.168727874755859|-4.673065662384033|-0.37046...
4,doi:10.1038/s41467-023-39419-z,Real-time environmental surveillance of SARS-C...,Real-time surveillance of airborne SARS-CoV-2 ...,Nature communications,2023,primary,doi:10.1038/s41467-023-39419-z,37429842,PMC10333287,https://doi.org/doi:10.1038/s41467-023-39419-z,-0.7667260766029358|-2.832000970840454|1.14964...,-0.7667260766029358|-2.832000970840454|1.14964...


In [15]:
# save publications with embedding added.
publications_primary.to_csv(data_file_primary, index=False)

In [16]:
# don't calculate embeddings for secondary publications for now
data_file_secondary = os.path.join(KG_PATH, "nodes", "Publication_secondary.csv")
publications_secondary = pd.read_csv(data_file_secondary, dtype=str)
publications_secondary["embedding"] = ""
# save publications with embedding added.
publications_secondary.to_csv(data_file_secondary, index=False)