# Add abstracts to ORKG data

This notebook adds abstracts to the ORKG data and saves the result to `data/orkg_with_abstracts.csv`.
It uses the following sources:
- [Crossref](https://www.crossref.org/)
- [OpenAlex](https://openalex.org/)
- [Semantic Scholar](https://www.semanticscholar.org/)

In [None]:
import json

def print_json(tag: str, data: object) -> None:
    """Pretty-print JSON-serialisable data to the console.

    Args:
        tag: Label printed in front of the JSON text.
        data: Value to serialise; it is dumped with a two-space indent and
            with dictionary keys sorted.
    """
    # `object` replaces the former `any` annotation, which referred to the
    # builtin function any() rather than to a type.
    print(tag, json.dumps(data, indent=2, sort_keys=True))

In [None]:
import sqlite3

# Connect to the SQLite database file used by this notebook
db = sqlite3.connect("data/datalake.db")

# A single cursor is shared by every statement issued below
cursor = db.cursor()

# Make sure the table for each metadata source exists before it is queried
for create_table_sql in (
    "CREATE TABLE IF NOT EXISTS crossrefs (id INTEGER PRIMARY KEY, doi TEXT, title Text, data JSON)",
    "CREATE TABLE IF NOT EXISTS openalex (id INTEGER PRIMARY KEY, title TEXT, data JSON)",
    "CREATE TABLE IF NOT EXISTS semantics (id INTEGER PRIMARY KEY, doi TEXT, data JSON)",
):
    cursor.execute(create_table_sql)

In [None]:
def get_crossref_data(doi: str, title: str):
    """Fetch a stored Crossref record by DOI and/or title.

    Args:
        doi: DOI to match against the ``doi`` column; an empty value is ignored.
        title: Title to match against the ``title`` column; an empty value is
            ignored.

    Returns:
        The decoded JSON of the ``data`` column of the first matching row, or
        ``None`` when no row matches or both keys are empty.
    """
    # Only filter on keys that were actually supplied: comparing against ""
    # would also match unrelated rows whose doi/title is stored as an empty
    # string (the title-only lookup passes doi="").
    conditions = []
    params = []
    if doi:
        conditions.append("doi = ?")
        params.append(doi)
    if title:
        conditions.append("title = ?")
        params.append(title)
    if not conditions:
        return None

    # The SQL text is assembled from fixed fragments only; the values remain
    # bound parameters.
    cursor.execute(
        "SELECT data FROM crossrefs WHERE " + " OR ".join(conditions), params
    )
    row = cursor.fetchone()
    return json.loads(row[0]) if row else None

def get_openalex_data(title: str):
    """Look up the stored OpenAlex record whose title equals ``title``.

    Returns the decoded JSON of the ``data`` column of the first matching
    row, or ``None`` when no row matches.
    """
    row = cursor.execute(
        "SELECT data FROM openalex WHERE title = ?", (title,)
    ).fetchone()
    if row is None:
        return None
    return json.loads(row[0])

def get_semantics_data(doi: str):
    """Look up the stored Semantic Scholar record for ``doi``.

    Returns the decoded JSON of the ``data`` column of the first matching
    row, or ``None`` when no row matches.
    """
    query = "SELECT data FROM semantics WHERE doi = ?"
    match = cursor.execute(query, (doi,)).fetchone()
    return None if match is None else json.loads(match[0])

In [None]:
import re

def clean_abstract(text: str) -> str:
    """Strip markup tags and normalise whitespace in an abstract.

    Args:
        text: Raw abstract, possibly containing XML/HTML tags, line breaks
            and tabs. ``None`` or an empty string yields ``""``.

    Returns:
        The abstract without ``<...>`` tags, with every run of whitespace
        collapsed to a single space and no leading or trailing whitespace.
    """
    if not text:
        return ""

    # Remove xml tags (non-greedy, so each tag is matched on its own)
    text = re.sub(r"<.*?>", "", text)

    # Collapse any run of whitespace into one space. A single
    # replace("  ", " ") pass would still leave double spaces behind
    # whenever three or more whitespace characters were adjacent.
    text = re.sub(r"\s+", " ", text)

    return text.strip()

In [None]:
import ast

import pandas as pd

df = pd.read_csv("data/orkg.csv")

# The list-valued columns are stored in the CSV as Python literal strings.
# ast.literal_eval parses literals only, so unlike eval() it cannot execute
# arbitrary code that happens to be embedded in the file.
df["doi"] = df["doi"].apply(ast.literal_eval).apply(list)  # convert string to array
df["subfields"] = df["subfields"].apply(ast.literal_eval).apply(list)  # convert string to array
df

In [None]:
from tqdm import tqdm

def _abstract_from(record) -> str:
    """Return the cleaned abstract of a source record, or "" if it has none."""
    # .get() tolerates records without an "abstract" key as well as null values
    raw_abstract = record.get("abstract") if record else None
    return clean_abstract(raw_abstract) if raw_abstract else ""


def _find_abstract(title: str, dois) -> str:
    """Return the first non-empty abstract found for a paper, or "".

    For each DOI in turn, Semantic Scholar is tried first and Crossref
    (matched by DOI or title) second. If no DOI yields an abstract, Crossref
    is queried one more time by title only.
    """
    for doi in dois:
        # `or` short-circuits: Crossref is only queried when Semantic Scholar
        # produced no usable abstract for this DOI.
        abstract = _abstract_from(get_semantics_data(doi)) or _abstract_from(
            get_crossref_data(doi, title)
        )
        if abstract:
            return abstract

    # Get data from crossref with title
    return _abstract_from(get_crossref_data("", title))


# One abstract per row of df, in row order ("" when nothing was found)
abstracts = [
    _find_abstract(row["title"], row["doi"])
    for _, row in tqdm(df.iterrows(), total=len(df))
]

In [None]:
# Attach the collected abstracts as a new column (same row order as df)
df["abstract"] = abstracts

# Blank out every abstract that is shorter than 10 characters
is_too_short = df["abstract"].str.len().lt(10)
df["abstract"] = df["abstract"].mask(is_too_short, "")

# Write the enriched table without the index column, then display it
df.to_csv("data/orkg_with_abstracts.csv", index=False)
df