# ORKG Data preparation

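This notebook prepares a dataframe from the ORKG data: it reads the raw ORKG statements from the local SQLite database (`data/datalake.db`), groups the object labels of every entry by predicate label, and exports the result as `data/orkg.json` and as a flat table in `data/orkg.csv`.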

In [2]:
import sqlite3

# Open a connection to the SQLite database file (the path is relative to the
# directory the notebook is run from).
db = sqlite3.connect("data/datalake.db")

# Create a cursor object to execute SQL statements; the aggregation cell
# further down iterates over query results through this same cursor.
cursor = db.cursor()
# Make sure the `orkg_subjects` table exists (two columns: `id` as primary key
# and `data`).
# NOTE(review): the rest of this notebook reads from `orkg_statements`, not
# `orkg_subjects` -- confirm whether this table is still needed here.
cursor.execute("CREATE TABLE IF NOT EXISTS orkg_subjects (id TEXT PRIMARY KEY, data JSON)")

<sqlite3.Cursor at 0x7f6551110f80>

# Label Mapping

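The ORKG data is inconsistent in the naming of labels. For reference, the mapping below lists the identifier that stands behind each of the common label names: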

```json
{
  "author": "P27",
  "doi": "P26",
  "publication month": "P28",
  "publication year": "P29",
  "publisher": "HAS_VENUE",
  "title": "rdf-schema#label",
  "url": "url"
}
```

In [3]:
import json
from src.utils.utils_json import write_json, print_json

# One record per ORKG entry: {"id": ..., "title": ..., "labels": {...}}, where
# "labels" maps a predicate label to the distinct, non-empty object labels
# found for that predicate.
data = []

# `entry_id` instead of `id`, so the builtin `id()` is not shadowed for the
# rest of the notebook session.
for entry_id, raw in cursor.execute("SELECT * FROM orkg_statements"):
    statements = json.loads(raw)["statements"]

    # An entry without statements has neither a title nor labels to export;
    # skip it instead of failing on `statements[0]`.
    if not statements:
        continue

    # NOTE(review): the title is the subject label of the *first* statement.
    # Several exported rows end up titled "Contribution 1", so the first
    # statement is apparently not always about the paper itself -- confirm.
    title = statements[0]["subject"]["label"]

    # Group the whitespace-trimmed object labels by predicate label.
    grouped = {}
    for statement in statements:
        label_name = statement["predicate"]["label"]
        grouped.setdefault(label_name, []).append(statement["object"]["label"].strip())

    # Drop duplicates and empty strings. `dict.fromkeys` keeps the order of
    # first occurrence, so the result is identical on every run; going through
    # `set()` would reorder the values depending on the per-process string
    # hash seed, which makes "take the first value" downstream arbitrary.
    labels = {}
    for label_name, values in grouped.items():
        unique_values = [value for value in dict.fromkeys(values) if value]
        if unique_values:
            labels[label_name] = unique_values

    data.append({
        "id": entry_id,
        "title": title,
        "labels": labels,
    })

In [4]:
# Hand the collected records to the project helper `write_json` together with
# the target path (presumably serialises `data` to that file -- helper source
# is not part of this notebook).
write_json("data/orkg.json", data)

In [5]:
# Preview only the first five records to keep the cell output short.
print_json(data[:5])

[
  {
    "id": "R209491",
    "labels": {
      "author": [
        "R. Scherer",
        "R. Windisch",
        "M. Kadolsky"
      ],
      "contribution": [
        "Contribution 1"
      ],
      "description": [
        "Covers systems organization and hardware architecture. Roughly includes material in ACM Subject Classes C.0, C.1, and C.5."
      ],
      "doi": [
        "10.1109/EESMS.2015.7175848"
      ],
      "has subfield": [
        "Digital Communications and Networking",
        "Data Storage Systems",
        "Computer and Systems Architecture",
        "Hardware Systems",
        "Digital Circuits",
        "Robotics"
      ],
      "publication year": [
        "2015"
      ],
      "research field": [
        "Computer Engineering"
      ],
      "same as": [
        "Arxiv ID: cs.RO",
        "Arxiv ID: cs.AR"
      ],
      "url": [
        "https://ieeexplore.ieee.org/abstract/document/7175848"
      ],
      "venue": [
        "2015 IEEE Workshop on Environmen

In [6]:
# Column-oriented input for the dataframe: one list per column, all aligned
# with the order of `data`.
#   - "doi" and "subfields" keep the full list of values (empty list if absent)
#   - "research field" keeps only the first value, or "" when there is none
df_input = {
    "id": [entry["id"] for entry in data],
    "title": [entry["title"] for entry in data],
    "doi": [entry["labels"].get("doi", []) for entry in data],
    "research field": [
        (entry["labels"].get("research field") or [""])[0] for entry in data
    ],
    "subfields": [entry["labels"].get("has subfield", []) for entry in data],
}

In [7]:
import pandas as pd

# Build the dataframe from the column lists prepared in `df_input`.
df = pd.DataFrame(df_input)
# Export without the row index.
# NOTE(review): the "doi" and "subfields" columns hold Python lists, which CSV
# can only store as their string representation -- consumers of this file
# have to parse them back into lists.
df.to_csv("data/orkg.csv", index=False)
# Show the frame as the cell result.
df

Unnamed: 0,id,title,doi,research field,subfields
0,R209491,Knowledge management framework for monitoring ...,[10.1109/EESMS.2015.7175848],Computer Engineering,"[Digital Communications and Networking, Data S..."
1,R585639,Contribution 1,[],Theoretical Computer Science,[]
2,R504922,A Strong Baseline for Fashion Retrieval with P...,[],Computer Sciences,[Software Engineering and Programming Language...
3,R146959,Contribution 1,[10.1109/tbme.2018.2880927],Control Theory,[]
4,R191366,Contribution 1,[10.1145/3289600.3290994],Science and Technology Studies,[]
...,...,...,...,...,...
26740,R44879,Virology,[10.1016/j.ijid.2020.02.033],Virology,[]
26741,R521270,Comprehensive Attention Self-Distillation for ...,[],Computer Sciences,[Software Engineering and Programming Language...
26742,R554286,How Do Graph Networks Generalize to Large and ...,[],Computer Sciences,[Software Engineering and Programming Language...
26743,R532777,Improving the Gating Mechanism of Recurrent Ne...,[],Computer Sciences,[Software Engineering and Programming Language...


In [8]:
# Every distinct subfield name that occurs in any row, in sorted order.
# Rows whose subfield list is empty contribute nothing.
subfields = sorted({name for names in df["subfields"] for name in names})

print("Subfields:", len(subfields))
print_json(subfields)

Subfields: 692
[
  "Accelerator Physics",
  "Acoustics, Dynamics, and Controls",
  "Adaptation and Self-Organizing Systems",
  "Aerodynamics and Fluid Mechanics",
  "Aeronautical Vehicles",
  "Aerospace Engineering",
  "African",
  "African Languages and Societies",
  "African Studies",
  "Agricultural Economics, Agricultural Policy, Agricultural Sociology",
  "Agricultural and Resource Economics",
  "Agriculture, Forestry and Veterinary Medicine",
  "Agronomy and Crop Sciences",
  "Algebra",
  "Algebraic Geometry",
  "Algebraic Topology",
  "American Art and Architecture",
  "American Film Studies",
  "American Literature",
  "American Material Culture",
  "American Politics",
  "American Popular Culture",
  "American Studies",
  "Anaesthesiology",
  "Analysis",
  "Analytical Chemistry",
  "Anatomy",
  "Ancient History (Greek and Roman through Late Antiquity)",
  "Ancient Philosophy",
  "Ancient, Medieval, Renaissance and Baroque Art and Architecture",
  "Animal Breeding, Animal Nutri

In [9]:
# Number of rows in which each subfield name appears
subfield_count = {}

for names in df["subfields"]:
    for name in names:
        # Start at zero the first time a name is seen, then add one.
        subfield_count[name] = subfield_count.get(name, 0) + 1

print_json(subfield_count)

{
  "Accelerator Physics": 3997,
  "Acoustics, Dynamics, and Controls": 4114,
  "Adaptation and Self-Organizing Systems": 3997,
  "Aerodynamics and Fluid Mechanics": 4126,
  "Aeronautical Vehicles": 4126,
  "Aerospace Engineering": 4112,
  "African": 4014,
  "African Languages and Societies": 4014,
  "African Studies": 4060,
  "Agricultural Economics, Agricultural Policy, Agricultural Sociology": 4025,
  "Agricultural and Resource Economics": 4058,
  "Agriculture, Forestry and Veterinary Medicine": 4024,
  "Agronomy and Crop Sciences": 4024,
  "Algebra": 3994,
  "Algebraic Geometry": 3994,
  "Algebraic Topology": 3994,
  "American Art and Architecture": 4014,
  "American Film Studies": 4015,
  "American Literature": 4015,
  "American Material Culture": 4015,
  "American Politics": 4058,
  "American Popular Culture": 4015,
  "American Studies": 4087,
  "Anaesthesiology": 4038,
  "Analysis": 3994,
  "Analytical Chemistry": 4090,
  "Anatomy": 4024,
  "Ancient History (Greek and Roman thro