# Download Crossref Data

This notebook downloads data from the Crossref API and stores it in a SQLite database.

In [2]:
import os

base_dir = "data/orkg/"

In [3]:
import json

# Read name.json data in directory
def read_json(filename: str):
    json_file = os.path.join(filename)
    with open(json_file) as f:
        img_annotations = json.load(f)

    return img_annotations


# Pretty print json data to console
def print_json(tag: str, data: any):
    print(tag, json.dumps(data, indent=2, sort_keys=True))


# Pretty print json data to file
def write_json(filename: str, data: any):
    with open(filename, "w") as file:
        json.dump(data, file, indent=2, sort_keys=True)

In [4]:
import pandas as pd

meta_infos_raw = read_json(os.path.join(base_dir, "meta_infos.json"))
meta_infos = pd.DataFrame.from_dict(meta_infos_raw)
meta_infos

Unnamed: 0,author,doi,paper_ids,publication month,publication year,publisher,title,url
0,"[J Wu, H Huang]",10.1109/vetecf.2005.1558414,R26672,,2005,,A probabilistic clustering algorithm in wirele...,
1,"[William A Sethares, Chih-Yu Wen]",10.1155/wcn.2005.686,R26676,12,2005,,Automatic Decentralized Clustering for Wireles...,
2,"[A William, Sethares, Chih-Yu Wen]",10.1109/jsen.2013.2249659,R26679,,2013,,Distributed clustering with directional antenn...,
3,"[Xiaozong Yang, Yoohwan Kim, Ling Wang, Yan Jin]",10.1016/j.comnet.2007.10.005,R26682,2,2008,,EEMC: An energy-efficient multi-level clusteri...,
4,"[A Savvides, R Virrankoski]",10.1109/mahss.2005.1542850,R26687,,,,TASC: topology adaptive spatial clustering for...,
...,...,...,...,...,...,...,...,...
26838,David Nolte,10.1026/2191-9186/a000176,R576013,10,2014,Frühe Bildung,Eine Frage der Medienkompetenz?: Bedingungen m...,http://dx.doi.org/10.1026/2191-9186/a000176
26839,"[Rath, M, Marci-Boehncke, G. ]",,R576016,,2013,,Kinder-Medien-Bildung: eine Studie zu Medienko...,
26840,"[Anja Pielsticker, K. Keller, Henrike Friedric...",,R576018,,2012,,Chancen und Potenziale digitaler Medien zur Um...,
26841,"[Niels Brüggen, Valerie Jochim, A. Oberlinner,...",,R576020,,2018,,Digitale Medien in Kindertageseinrichtungen: M...,


In [5]:
import sqlite3

database_path = os.path.join(base_dir, "datalake.db")

# Open a connection to the SQLite database file
db = sqlite3.connect(database_path)

# Create a cursor object to execute SQL statements
cursor = db.cursor()
cursor.execute("CREATE TABLE IF NOT EXISTS crossrefs (id INTEGER PRIMARY KEY, doi TEXT, title Text, data JSON)")

# cursor.execute("SELECT * FROM users")
# rows = cursor.fetchall()
#
# for row in rows:
#     id, name = row
#     # Process the retrieved data
#
# # Commit the changes and close the connection
# conn.commit()
# conn.close()

<sqlite3.Cursor at 0x7f2df931c8f0>

In [6]:
import requests
from fuzzywuzzy import fuzz

def crossref_by_doi(doi: str):
    url = "https://api.crossref.org/works/" + doi
    response = requests.get(url, timeout=5)

    if not response.ok:
        return None

    message = response.json()["message"]
    return message

def crossref_by_title(title: str):
    url = "https://api.crossref.org/works?query.bibliographic=" + title
    response = requests.get(url, timeout=5)

    if not response.ok:
        return None

    message = response.json()["message"]

    for item in message.get('items', []):
        if not item.get("title", ""):
            continue

        if title.lower() == item.get('title', '')[0].lower():
            return item
        elif fuzz.ratio(title.lower(), item.get('title', '')[0].lower()) > 95:
            return item

    return None

In [7]:
def crossref_exists(doi: str, title: str):
    cursor.execute("SELECT * FROM crossrefs WHERE doi = ? OR title = ?",
                   (doi, title))
    rows = cursor.fetchall()
    return len(rows) > 0

def store_crossref_data(doi: str, title: str, data: dict):
    cursor.execute("INSERT INTO crossrefs (doi, title, data) VALUES (?, ?, ?)", (doi, title, json.dumps(data)))
    db.commit()

In [10]:
from tqdm import tqdm

for index, row in tqdm(meta_infos.iterrows(), total=len(meta_infos)):
    doi = row["doi"]
    title = row["title"]

    # If doi is not a string, skip
    if not isinstance(doi, str):
        print(f"Skipping {index}: {title} ({doi})")
        continue

    if crossref_exists(doi, title):
        continue

    # print(f"Processing {index}: {title} ({doi})")

    if doi is not None:
        data = crossref_by_doi(doi)
    else:
        data = crossref_by_title(title)

    if data is not None:
        store_crossref_data(doi, title, data)
    else:
        print(f"Could not find {index}: {title} ({doi})")

    # break

  0%|          | 119/26843 [00:01<07:59, 55.73it/s]

Could not find 109: A clustering method for energy efficient routing in wireless sensor networks (10.5555/1353572.1353596)


  2%|▏         | 596/26843 [00:06<07:08, 61.28it/s] 

Could not find 579: Wie weit fliegt ein Skispringer, der mit 72 km/h abspringt? (10.5446/40456)
Could not find 601: Implementing LOINC – Current Status and Ongoing Work at a Medical University (10.3233/SHTI190806)


  2%|▏         | 606/26843 [00:07<15:47, 27.70it/s]

Could not find 602: Selected Approaches Ranking Contextual Term for the BioASQ Multi-Label Classification (Task6a and 7a) (10.13140/rg.2.2.31648.33288)


  2%|▏         | 625/26843 [00:07<13:41, 31.92it/s]

Could not find 608: Chemical and mineral compositions of sediments from ODP Site 127-797, supplement to: Irino, Tomohisa; Tada, Ryuji (2000): Quantification of aeolian dust (Kosa) contribution to the Japan Sea sediments and its variation during the last 200 ky. Geochemical Journal, 34(1), 59-93 (10.1594/pangaea.726855)


  3%|▎         | 875/26843 [00:09<03:45, 115.06it/s]

Skipping 856: Twin-Control. A Digital Twin Approach to Improve Machine Tools Lifecycle (['10.1007/978-3-030-02203-7', 'https://doi.org/10.1007/978-3-030-02203-7'])


  3%|▎         | 911/26843 [00:10<03:46, 114.40it/s]

Could not find 920: Politik der kleinen Form (10.18452/19931)


  3%|▎         | 933/26843 [00:11<10:56, 39.49it/s] 

Could not find 921: Politik der kleinen Form (10.18452/19931)


  6%|▋         | 1711/26843 [00:18<08:03, 52.00it/s] 

Could not find 1701: Enriching Knowledge Bases with Interesting Negative Statements (10.24432/C5101K)


  7%|▋         | 1843/26843 [00:20<08:08, 51.18it/s] 

Could not find 1833: Disturbance Facilitates Invasion: The Effects Are Stronger Abroad than at Home (10.2307/3844721)


  9%|▊         | 2285/26843 [00:24<07:55, 51.66it/s] 

Could not find 2282: More Readers in More Places: The benefits of open access for scholarly books (10.5281/zenodo.4014905)


  9%|▊         | 2304/26843 [00:25<09:27, 43.22it/s]

Could not find 2289: Let's ROR Together: Building an Open Registry of Research Organizations (10.5281/ZENODO.2691307)


  9%|▊         | 2312/26843 [00:25<13:03, 31.32it/s]

Could not find 2310: N,N-Dimethylformamide. MAK Value Documentation, supplement – Translation of the German version from 2019  (10.34865/MB6812E5_1)
Could not find 2312:  1,4-Dioxane. MAK Value Documentation, supplement – Translation of the German version from 2019  (10.34865/MB12391E5_1)
Could not find 2313: Polytetrafluoroethene. MAK Value Documentation – Translation of the German version from 2019  (10.34865/MB900284E5_1)
Could not find 2314: Furan. MAK Value Documentation, supplement – Translation of the German version from 2017 (10.34865/MB11000E5_2AD)
Could not find 2315: Titanium dioxide (respirable fraction). MAK Value Documentation, supplement – Translation of the German version from 2019  (10.34865/MB1346367E5_1)


  9%|▊         | 2327/26843 [00:27<28:34, 14.30it/s]

Could not find 2317: 2,3-Pentanedione. MAK Value Documentation – Translation of the German version from 2017 (10.34865/MB60014E5_2AD)


  9%|▊         | 2339/26843 [00:27<19:22, 21.08it/s]

Could not find 2340: Antimony and its inorganic compounds - Addendum for evaluation of a BAR. Assessment Values in Biological Material – Translation of the German version from 2020 (10.34865/BB744036E5_1)
Could not find 2342: Chlorinated biphenyls –Addendum for evaluation of a concentration corresponding to an assignment of Pregnancy Risk Group C. Assessment Values in Biological Material – Translation of the German version from 2019 (10.34865/BB133636E5_1)
Could not find 2343: Antimony and its compounds – Determination of antimony species in urine by HPLC-ICP-MS. Biomonitoring Method – Translation of the German version from 2020 (10.34865/BI744036E5_1)


  9%|▊         | 2347/26843 [00:29<36:33, 11.17it/s]

Could not find 2344: Aluminium – Addendum for evaluation of a BAR. Assessment Values in Biological Material – Translation of the German version from 2019 (10.34865/BB742990E5_1)
Could not find 2347: Butylated hydroxytoluene (BHT) – Determination of 3,5 di-tert-butyl-4-hydroxybenzoic acid (BHT acid) in urine by LC-MS/MS. Biomonitoring Method – Translation of the German version from 2020 (10.34865/BI12837E5_1)
Could not find 2348: Nickel and sparingly soluble nickel compounds (Nickel as nickel metal, nickel sulphide, sulphidic ores, nickel oxide, nickel carbonate) - Addendum for re-evaluation of a EKA. Assessment Values in Biological Material – Translation of the German version from 2019  (10.34865/BB744002E5_1)
Could not find 2349: Citric acid – Method for the determination of citric acid in workplace air using high performance liquid chromatography (HPLC). AirMonitoring Method – Translationof the German version from 2019 (10.34865/AM7792E5_1)
Could not find 2350: 1,4-Dioxane. MAK Value

  9%|▉         | 2353/26843 [00:31<59:02,  6.91it/s]

Could not find 2352: 1,4-Dichlorobenzene – Addendum for evaluation of BAT value, BAR and EKA. Assessment Values in Biological Material – Translation of the German version from 2020 (10.34865/BB10646E5_2AD)
Could not find 2353: 1,4-Dioxane – Addendum for re-evaluation of the BAT value. Assessment Values in Biological Material – Translation of the German version from 2020 (10.34865/BB12391E5_2AD)
Could not find 2354: Bisphenol S – Evaluation of a BAR. Assessment Values in Biological Material – Translation of the German version from 2020 (10.34865/BB8009E5_2OR)


  9%|▉         | 2367/26843 [00:33<45:22,  8.99it/s]  

Could not find 2355: Hydrogen cyanide, cyanides and cyanide-releasing compounds – determination of thiocyanate in plasma/serum, urine and saliva by GC-MS. Biomonitoring Method – Translation of the German version from 2020 (10.34865/BI30204E5_2OR)


  9%|▉         | 2385/26843 [00:33<29:02, 14.04it/s]

Could not find 2384: Alcohols, ketones and ethers – Determination of alcohols, ketones and ethers in urine by headspace GC-MS. Biomonitoring Method – Translation of the German version from 2020 (10.34865/BI6756E5_2OR)
Could not find 2385: Polyethylene glycols (PEGs) (average molar mass 200–600). MAK Value Documentation, supplement - Translation of the German version from 2019 (10.34865/MB0PE1KSKE5_3AD)
Could not find 2386:  Glutaric acid. MAK Value Documentation - Translation of the German version from 2019 (10.34865/MB11094E5_3OR)
Could not find 2387:  α-Aluminium oxide (corundum) (respirable fraction). MAK Value Documentation, supplement - Translation of the German version from 2019 (10.34865/MB742990VERE5_3AD)
Could not find 2388: o-Phenylphenol (OPP) and sodium o-phenylphenol (OPP-Na). MAK Value Documentation, supplement - Translation of the German version from 2016 (10.34865/MB9043VERE5_3AD)
Could not find 2389: Azinphos-methyl. MAK Value Documentation, supplement - Translation of

  9%|▉         | 2391/26843 [00:36<1:02:28,  6.52it/s]

Could not find 2390: N,N-Dimethylacetamide – Addendum for re-evaluation of the BAT value. Assessment Values in Biological Material – Translation of the German version from 2020 (10.34865/BB12719E5_3AD)
Could not find 2391: Chlorobenzene – Addendum for re-evaluation of the BAT value. Assessment Values in Biological Material – Translation of the German version from 2019 (10.34865/BB10890E5_3AD)
Could not find 2392: N,N-Dimethylformamide – Addendum for re-evaluation of the BAT value. Assessment Values in Biological Material – Translation of the German version from 2019 (10.34865/BB6812E5_3AD)
Could not find 2393: Pentachlorophenol – Addendum for re-evaluation of EKA. Assessment Values in Biological Material – Translation of the German version from 2019 (10.34865/BB8786E5_3AD)


  9%|▉         | 2395/26843 [00:37<1:23:31,  4.88it/s]

Could not find 2394:  Lindane – Addendum for re-evaluation of the BAT value. Assessment Values in Biological Material – Translation of the German version from 2019 (10.34865/BB5889E5_3AD)
Could not find 2395: 1,1,1-Trichloroethane – Addendum for re-evaluation of the BAT value. Assessment Values in Biological Material – Translation of the German version from 2019 (10.34865/BB7155E5_3AD)
Could not find 2396: Method for the determination of aromatic amines in workplace air using gas chromatography.  Air Monitoring Method  (10.34865/AM9553E5_3)


  9%|▉         | 2398/26843 [00:39<1:37:12,  4.19it/s]

Could not find 2397: 2-Mercaptobenzothiazole – Determination of 2-mercaptobenzothiazole in urine by LC-MS/MS. Biomonitoring Method – Translation of the German version from 2020 (10.34865/BI14930E5_3OR)
Could not find 2398: Method for the determination of cobalt and its compounds in workplace air using atomic absorption spectrometry with the graphite furnace technique (GFAAS) after high-pressure microwave digestion. Air Monitoring Method (10.34865/AM744048E5_3)


  9%|▉         | 2400/26843 [00:39<1:47:26,  3.79it/s]

Could not find 2399: 1,2,5,6,9,10-Hexabromocyclododecane (HBCDD) – Determination of α-HBCDD, β-HBCDD and γ-HBCDD in plasma by LC-MS/MS. Biomonitoring Method – Translation of the German version from 2020 (10.34865/BI319455E5_3OR)


  9%|▉         | 2407/26843 [00:40<1:15:35,  5.39it/s]

Could not find 2406: Bisphenol S – Ableitung eines BAR. Beurteilungswerte in biologischem Material (10.34865/BB8009D5_2OR)


  9%|▉         | 2413/26843 [00:40<1:01:04,  6.67it/s]

Could not find 2412: 1,4-Dioxan – Addendum zur Reevaluierung des BAT-Wertes. Beurteilungswerte in biologischem Material (10.34865/BB12391D5_2AD)
Could not find 2413: Antimon und seine anorganischen Verbindungen – Addendum zur Ableitung eines BAR. Beurteilungswerte in biologischem Material  (10.34865/BB744036D5_1)


  9%|▉         | 2415/26843 [00:41<1:15:32,  5.39it/s]

Could not find 2414: Antimon und seine Verbindungen – Bestimmung von Antimonspezies in Urin mittels HPLC-ICP-MS. Biomonitoring-Methode (10.34865/BI744036D5_1)
Could not find 2415: Butylhydroxytoluol (BHT) – Bestimmung von 3,5 Di-tert-butyl-4-hydroxybenzoesäure (BHT-Säure) im Urin mittels LC-MS/MS. Biomonitoring-Methode (10.34865/BI12837D5_1)


  9%|▉         | 2417/26843 [00:42<1:30:06,  4.52it/s]

Could not find 2416: 1,4-Dichlorbenzol – Addendum zur Ableitung von BAT-Wert, BAR und EKA. Beurteilungswerte in biologischem Material (10.34865/BB10646D5_2AD)


  9%|▉         | 2418/26843 [00:42<1:37:44,  4.17it/s]

Could not find 2417: Alkohole, Ketone und Ether – Bestimmung von Alkoholen, Ketonen und Ethern in Urin mittels Headspace-GC-MS. Biomonitoring-Methode (10.34865/BI6756D5_2OR)


  9%|▉         | 2419/26843 [00:43<1:46:32,  3.82it/s]

Could not find 2418: N,N-Dimethylacetamid – Addendum zur Reevaluierung des BAT-Wertes. Beurteilungswerte in biologischem Material (10.34865/BB12719D5_3AD)


  9%|▉         | 2422/26843 [00:43<1:29:27,  4.55it/s]

Could not find 2421: Cyanwasserstoff, Cyanide und Cyanidbildner – Bestimmung von Thiocyanat in Plasma/Serum, Urin und Speichel mittels GC-MS. Biomonitoring-Methode (10.34865/BI30204D5_2OR)


  9%|▉         | 2424/26843 [00:44<1:28:42,  4.59it/s]

Could not find 2423: Triphenylphosphat, isopropyliert (isopropylierte Phenylphosphate, IPPhP) – Bestimmung von isopropylierten Phenylphosphaten in der Luft am Arbeitsplatz mittels Gaschromatographie (GC-MS). Luftanalysen-Methode (10.34865/AM6893741D5_3OR)


  9%|▉         | 2436/26843 [00:44<34:05, 11.93it/s]  

Could not find 2426: 1,2,5,6,9,10-Hexabromcyclododecan (HBCDD) – Bestimmung von α-HBCDD, β-HBCDD und γ-HBCDD in Plasma mittels LC-MS/MS. Biomonitoring-Methode (10.34865/BI319455D5_3OR)


  9%|▉         | 2441/26843 [00:45<34:36, 11.75it/s]

Could not find 2440: 2-Butoxyethylacetat. MAK-Begründung, Nachtrag  ( 10.34865/mb11207d5_1)


  9%|▉         | 2445/26843 [00:45<37:53, 10.73it/s]

Could not find 2444: Di-n-butylphosphat und seine technischen Gemische. MAK-Begründung, Nachtrag ( 10.34865/mb10766kskd5_1)


  9%|▉         | 2451/26843 [00:46<35:58, 11.30it/s]

Could not find 2450: Dichloressigsäure und ihre Salze. MAK-Begründung, Nachtrag ( 10.34865/mb7943verd5_1)


  9%|▉         | 2455/26843 [00:46<39:28, 10.30it/s]

Could not find 2454: 4-Nitroanilin. MAK-Begründung, Nachtrag ( 10.34865/mb10001d5_1)
Could not find 2455: Cyanurchlorid. MAK-Begründung (10.34865/mb10877d5_2or)


  9%|▉         | 2457/26843 [00:47<59:25,  6.84it/s]

Could not find 2456: (4-Nonylphenoxy)essigsäure. MAK-Begründung ( 10.34865/mb311549d5_1)


  9%|▉         | 2459/26843 [00:47<1:04:46,  6.27it/s]

Could not find 2458: Graphit (alveolengängige Fraktion). MAK-Begründung, Nachtrag ( 10.34865/mb0228fstd5_2ad)


  9%|▉         | 2460/26843 [00:48<1:16:57,  5.28it/s]

Could not find 2459: Hexachlorethan. MAK-Begründung, Nachtrag ( 10.34865/mb6772d5_2ad)


  9%|▉         | 2461/26843 [00:48<1:29:31,  4.54it/s]

Could not find 2460: Methylamin. MAK-Begründung, Nachtrag (10.34865/mb7489d5_2ad)


  9%|▉         | 2462/26843 [00:49<1:43:46,  3.92it/s]

Could not find 2461: Tantal (alveolengängige Fraktion). MAK-Begründung, Nachtrag ( 10.34865/mb744025stad5_2ad)


  9%|▉         | 2463/26843 [00:49<1:56:41,  3.48it/s]

Could not find 2462: Xylidin, Isomere (2,3-Xylidin, 2,5-Xylidin, 3,4-Xylidin, 3,5-Xylidin). MAK-Begründung, Nachtrag ( 10.34865/mb8759ismd5_2ad)


  9%|▉         | 2464/26843 [00:49<2:07:38,  3.18it/s]

Could not find 2463: Xylol (alle Isomere). MAK-Begründung, Nachtrag (10.34865/mb133020d5_2ad)


  9%|▉         | 2470/26843 [00:50<1:05:12,  6.23it/s]

Could not find 2469: Bisphenol-A-diglycidylether. MAK-Begründung, Nachtrag (10.34865/mb167554d5_3ad)


  9%|▉         | 2471/26843 [00:50<1:18:29,  5.17it/s]

Could not find 2470: Trikresylphosphat, Isomere, „frei von o-Isomeren". MAK-Begründung ( 10.34865/mb133078d5_3or)


  9%|▉         | 2472/26843 [00:51<1:38:43,  4.11it/s]

Could not find 2471: Trikresylphosphat, Summe aller o-Isomere. MAK-Begründung (10.34865/mb7830d5_3or)


  9%|▉         | 2473/26843 [00:51<1:52:11,  3.62it/s]

Could not find 2472: Kresylglycidylether (o-Isomer, Isomerengemisch). MAK-Begründung (10.34865/mb221079d5_3or)


  9%|▉         | 2486/26843 [00:52<32:21, 12.55it/s]  

Could not find 2476: Der Chimäre auf der Spur: Forschungsdaten in den Geisteswissenschaften (10.5282/O-BIB/2018H2S142-162)


  9%|▉         | 2497/26843 [00:52<25:07, 16.15it/s]

Could not find 2496: Kerosin (Erdöl). MAK-Begründung (10.34865/mb800820d5_2or)


  9%|▉         | 2502/26843 [00:53<28:06, 14.44it/s]

Could not find 2501: Vinylacetat. MAK-Begründung, Nachtrag (10.34865/mb10805d5_2ad)


  9%|▉         | 2505/26843 [00:53<33:32, 12.09it/s]

Could not find 2503: 1,1,2,2-Tetrachlorethan. MAK-Begründung, Nachtrag ( 10.34865/mb7934d5_2ad)


  9%|▉         | 2517/26843 [00:54<22:24, 18.10it/s]

Could not find 2505: Ammoniak. MAK-Begründung, Nachtrag (10.34865/mb766441d5_1)


 11%|█         | 2900/26843 [00:58<07:46, 51.28it/s] 

Could not find 2890: Co-integration and causality relationship between energy consumption and economic growth: further empirical evidence for Nigeria (10.3846/jbem.202010.05)


 12%|█▏        | 3330/26843 [01:02<06:47, 57.70it/s] 

Could not find 3312: Model-Driven Architecture Based Software Development for Epidemiological Surveillance Systems (10.3233/SHTI190279)


 14%|█▎        | 3654/26843 [01:05<07:32, 51.21it/s] 

Could not find 3643: A stochastic model for designing last mile relief networks (10.1287/trsc,2015.0621)


 14%|█▎        | 3674/26843 [01:06<08:36, 44.87it/s]

Could not find 3656: Stochastic network models for logistics planning in disaster relief (10.1061/j.ejor.2016.04.041)


 15%|█▍        | 3960/26843 [01:08<06:37, 57.54it/s] 


KeyboardInterrupt: 

In [11]:
db.close()