# Download OpenAlex data

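This notebook looks up each paper title from the ORKG metadata file (`meta_infos.json`) in OpenAlex and stores the returned records in a local SQLite database (`datalake.db`).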

In [2]:
import os

# Directory that holds the input file (meta_infos.json) and the SQLite
# database (datalake.db). The path is relative, so it is resolved against the
# working directory the notebook is run from.
base_dir = "data/orkg/"

In [3]:
import json

# Read and parse the JSON file at the given path
def read_json(filename: str):
    """Load and parse a JSON file.

    Args:
        filename: Path of the JSON file to read.

    Returns:
        The decoded JSON content, i.e. whatever ``json.load`` produces for the
        file (typically a dict or a list).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # Decode explicitly as UTF-8 rather than with the platform's locale
    # encoding, so non-ASCII text is read the same way on every machine.
    with open(filename, encoding="utf-8") as f:
        return json.load(f)


# Pretty print json data to console
def print_json(tag: str, data: object) -> None:
    """Print ``data`` to stdout as pretty-printed JSON, preceded by a label.

    The JSON text is indented by two spaces and object keys are sorted.

    Args:
        tag: Label printed before the JSON text, separated by one space.
        data: Any value ``json.dumps`` can serialise. Annotated as ``object``
            because the previous annotation, ``any``, is the builtin function
            and not a type.

    Raises:
        TypeError: If ``data`` contains a value that is not JSON-serialisable.
    """
    print(tag, json.dumps(data, indent=2, sort_keys=True))


# Pretty print json data to file
def write_json(filename: str, data: object) -> None:
    """Write ``data`` to ``filename`` as pretty-printed JSON.

    The file is created or overwritten. The JSON text is indented by two
    spaces and object keys are sorted.

    Args:
        filename: Path of the file to write.
        data: Any value ``json.dump`` can serialise. Annotated as ``object``
            because the previous annotation, ``any``, is the builtin function
            and not a type.

    Raises:
        OSError: If the file cannot be opened for writing.
        TypeError: If ``data`` contains a value that is not JSON-serialisable.
    """
    # Pin the encoding so the result does not depend on the platform's locale.
    with open(filename, "w", encoding="utf-8") as file:
        json.dump(data, file, indent=2, sort_keys=True)

In [4]:
import pandas as pd

# Load meta_infos.json from base_dir and turn the parsed JSON into a DataFrame.
meta_infos_raw = read_json(os.path.join(base_dir, "meta_infos.json"))
meta_infos = pd.DataFrame.from_dict(meta_infos_raw)
# Bare last expression: the notebook renders the frame as the cell output.
meta_infos

Unnamed: 0,author,doi,paper_ids,publication month,publication year,publisher,title,url
0,"[J Wu, H Huang]",10.1109/vetecf.2005.1558414,R26672,,2005,,A probabilistic clustering algorithm in wirele...,
1,"[William A Sethares, Chih-Yu Wen]",10.1155/wcn.2005.686,R26676,12,2005,,Automatic Decentralized Clustering for Wireles...,
2,"[A William, Sethares, Chih-Yu Wen]",10.1109/jsen.2013.2249659,R26679,,2013,,Distributed clustering with directional antenn...,
3,"[Xiaozong Yang, Yoohwan Kim, Ling Wang, Yan Jin]",10.1016/j.comnet.2007.10.005,R26682,2,2008,,EEMC: An energy-efficient multi-level clusteri...,
4,"[A Savvides, R Virrankoski]",10.1109/mahss.2005.1542850,R26687,,,,TASC: topology adaptive spatial clustering for...,
...,...,...,...,...,...,...,...,...
26838,David Nolte,10.1026/2191-9186/a000176,R576013,10,2014,Frühe Bildung,Eine Frage der Medienkompetenz?: Bedingungen m...,http://dx.doi.org/10.1026/2191-9186/a000176
26839,"[Rath, M, Marci-Boehncke, G. ]",,R576016,,2013,,Kinder-Medien-Bildung: eine Studie zu Medienko...,
26840,"[Anja Pielsticker, K. Keller, Henrike Friedric...",,R576018,,2012,,Chancen und Potenziale digitaler Medien zur Um...,
26841,"[Niels Brüggen, Valerie Jochim, A. Oberlinner,...",,R576020,,2018,,Digitale Medien in Kindertageseinrichtungen: M...,


In [5]:
import sqlite3

# The SQLite database file lives next to the input data in base_dir.
database_path = os.path.join(base_dir, "datalake.db")

# Open a connection to the SQLite database file
# (sqlite3.connect creates the file if it does not exist yet).
db = sqlite3.connect(database_path)

# Create a cursor object to execute SQL statements.
# This one module-level cursor is what openalex_exists() and
# store_openalex_data() use for all of their queries.
cursor = db.cursor()

In [6]:
# One row per stored lookup: the title that was searched for and the
# JSON-encoded result. IF NOT EXISTS makes the cell safe to re-run.
cursor.execute("CREATE TABLE IF NOT EXISTS openalex (id INTEGER PRIMARY KEY, title TEXT, data JSON)")
# Rows are looked up by title (see openalex_exists). Without an index each
# lookup scans the whole table, and the download loop does one lookup per paper.
cursor.execute("CREATE INDEX IF NOT EXISTS idx_openalex_title ON openalex (title)")

def openalex_exists(title: str) -> bool:
    """Return whether the ``openalex`` table already has a row for ``title``.

    Uses the module-level ``cursor``.

    Args:
        title: Title to look up; compared for exact equality with the stored
            ``title`` column.

    Returns:
        ``True`` if at least one row with this title exists, else ``False``.
    """
    # Only existence matters: select a constant and stop at the first match
    # rather than loading every matching row together with its JSON payload.
    cursor.execute("SELECT 1 FROM openalex WHERE title = ? LIMIT 1",
                   (title,))
    return cursor.fetchone() is not None


def store_openalex_data(title: str, data: dict):
    """Insert one OpenAlex result into the ``openalex`` table and commit.

    Uses the module-level ``cursor`` and ``db``.

    Args:
        title: Title to store the result under.
        data: Result to store; it is serialised with ``json.dumps`` first.
    """
    serialized = json.dumps(data)
    insert_sql = "INSERT INTO openalex (title, data) VALUES (?, ?)"
    cursor.execute(insert_sql, (title, serialized))
    # Commit right away so every stored row is durable on its own.
    db.commit()

In [7]:
from pyalex import Works

# Try the lookup on a single known title before running it for every paper.
title = "Automatic Decentralized Clustering for Wireless Sensor Networks"
# A title that contains ',' makes the request below fail, for example:
# title = "Purification, Characterization, and Gene Analysis of a Chitosanase"

# Search OpenAlex works by title; the printed result is a JSON list of records.
openalex_data = Works().search_filter(title=title).get()
print_json("openalex_data", openalex_data)

openalex_data [
  {
    "abstract_inverted_index": {
      "Each": [
        14
      ],
      "Simplified": [
        62
      ],
      "The": [
        38,
        82
      ],
      "We": [
        0
      ],
      "a": [
        2,
        17,
        29,
        35,
        42,
        60
      ],
      "ad": [
        8
      ],
      "algorithm": [
        4,
        39,
        79,
        86
      ],
      "an": [
        7
      ],
      "analytically": [
        89
      ],
      "and": [
        21,
        48,
        73,
        90
      ],
      "are": [
        64,
        80
      ],
      "asynchronously,": [
        47
      ],
      "be": [
        58
      ],
      "centralized": [
        43
      ],
      "cluster": [
        31
      ],
      "cluster.": [
        37
      ],
      "clusters": [
        71
      ],
      "clusters.": [
        13
      ],
      "controller,": [
        44
      ],
      "criteria": [
        23
      ],
      "current": [
       

In [8]:
from tqdm import tqdm

def _clean_title(raw_title: str) -> str:
    """Return ``raw_title`` prepared for the OpenAlex title search.

    Commas and double quotes are removed and '&' is replaced by a space.
    A comma in the title is known to make the search request fail; the other
    two characters are presumably handled for the same reason.
    """
    return raw_title.replace(",", "").replace("&", " ").replace("\"", "")


skip_count = 0    # rows without a usable title
missed_count = 0  # titles for which nothing was stored (no match or failed request)
error_count = 0   # subset of missed_count where the request raised an exception

# Only the title column is read, so iterate over it directly instead of
# building a Series for every row with iterrows().
for title in tqdm(meta_infos["title"], total=len(meta_infos)):
    # Skip rows whose title is not a string (e.g. a missing value) or is empty.
    if not isinstance(title, str) or len(title) == 0:
        skip_count += 1
        continue

    # A row for this title is already in the database: nothing to download.
    if openalex_exists(title):
        continue

    try:
        data = Works().search_filter(title=_clean_title(title)).get()
    except Exception as e:
        # Best effort: one failing request must not abort the whole run, but
        # report what went wrong instead of discarding the exception.
        data = None
        error_count += 1
        print(f"Error with title: {title} ({type(e).__name__}: {e})")

    if data is not None and len(data) > 0:
        # Store under the original title so openalex_exists() finds it again.
        store_openalex_data(title, data)
    else:
        missed_count += 1

print(f"Skipped {skip_count} entries due to missing title.")
print(f"Could not find {missed_count} entries.")
print(f"Of those, {error_count} failed with a request error.")

 95%|█████████▍| 25433/26843 [30:01<10:49:02, 27.62s/it]

Error with title: ProFlow: Learning to Predict Optical Flow


100%|██████████| 26843/26843 [41:03<00:00, 10.90it/s]   

Skipped 8 entries due to missing title.
Could not find 1872 entries.





In [9]:
# Close the SQLite connection. store_openalex_data() commits after every
# insert, so no stored row is left uncommitted at this point.
db.close()