##### This notebook processes the data from the OpenAlex climate change dataset and then selects the fields needed as nodes for the graph database.

In [None]:
import pandas as pd
import os
import string
import json_lines
import fasttext
from unidecode import unidecode
from langdetect import detect, DetectorFactory
from langdetect import LangDetectException

In [8]:
import json

# JSON is UTF-8 by specification; passing the encoding explicitly keeps the
# load independent of the platform's default text encoding.
with open("../data/open_alex/OA_2000.json", "r", encoding="utf-8") as f:
    oa_2000 = json.load(f)

In [10]:
# Look at the first record only; `w` stays bound afterwards and is inspected
# by the following cell.
for w in oa_2000:
    print(type(w))
    break

<class 'dict'>


In [11]:
# List the fields available on a single record (`w` comes from the loop above).
w.keys()

dict_keys(['id', 'doi', 'title', 'display_name', 'publication_year', 'publication_date', 'ids', 'language', 'primary_location', 'type', 'open_access', 'authorships', 'corresponding_author_ids', 'corresponding_institution_ids', 'apc_payment', 'cited_by_count', 'biblio', 'is_retracted', 'is_paratext', 'concepts', 'mesh', 'locations', 'best_oa_location', 'grants', 'referenced_works', 'related_works', 'ngrams_url', 'abstract_inverted_index', 'cited_by_api_url', 'counts_by_year', 'updated_date', 'created_date'])

In [None]:
class ReadOAJson:
    """Load one OpenAlex dump file into a pandas DataFrame.

    The first JSON-lines record of the file is expected to be a list of
    pages, each page being a list of work dictionaries.
    """

    def __init__(self, filename):
        """Remember the path of the file to read.

        Args:
            filename: Path of the JSON-lines file.
        """
        self.filename = filename

    def get_data(self):
        """Flatten the pages of the first record into one DataFrame.

        Returns:
            pandas.DataFrame: One row per work; an empty frame when the file
            contains no record at all.
        """
        with open(self.filename) as f:
            # Only the first record is used, so the rest of the file does not
            # need to be materialised. ``None`` marks an empty file, which
            # previously failed with IndexError.
            first_record = next(iter(json_lines.reader(f)), None)
        # NOTE(review): records after the first one are ignored, exactly as
        # before -- confirm that every dump stores all pages in one record.
        pages = first_record or []
        fulldata = [work for page in pages for work in page]
        return pd.DataFrame(fulldata)

In [None]:
foldername = "../data/OpenAlex_Climate_Change/"
# os.listdir returns entries in arbitrary order; sorting makes the row order of
# the combined frame reproducible between runs.
allfiles = sorted(os.listdir(foldername))

# DataFrame.append was removed in pandas 2.0 and copied the whole frame on
# every iteration; collect the per-file frames and concatenate them once.
frames = []
for eachfile in allfiles:
    print("Processing . . ", eachfile)
    frames.append(ReadOAJson(os.path.join(foldername, eachfile)).get_data())

# pd.concat rejects an empty list, so keep the previous "empty frame" result
# for an empty folder.
df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

In [None]:
# Inspect which columns the combined frame exposes.
df.columns

A fastText model is used to detect the language each abstract is written in. I also tried langdetect and langid; so far fastText is the fastest of the three. It requires the pretrained model to be downloaded before it can be loaded: [lid.176.bin (126 MB)](https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin)

In [None]:
# Load the pretrained language-identification model; the relative path means
# lid.176.bin must be present in the current working directory.
fmodel = fasttext.load_model("lid.176.bin")

In [None]:
def get_abstract(abs_idx):
    """Rebuild an abstract from its inverted index and detect its language.

    Args:
        abs_idx: Mapping of word -> list of positions at which the word
            occurs. Anything that is not a non-empty dict (``None``, NaN,
            ``{}``) is treated as "no abstract".

    Returns:
        tuple: ``(text, language)`` where ``language`` is the label predicted
        by the module-level ``fmodel`` without its ``__label__`` prefix, or
        ``(None, None)`` when there is no abstract or it holds (almost) only
        punctuation.
    """
    label_prefix = "__label__"

    # The isinstance check also rejects NaN placeholders for missing cells,
    # which are truthy and would crash on .items().
    if not isinstance(abs_idx, dict) or not abs_idx:
        return None, None

    positioned = [
        (word, position)
        for word, positions in abs_idx.items()
        for position in positions
    ]
    para = " ".join(
        word for word, _ in sorted(positioned, key=lambda item: item[1])
    )
    # Put a space after every full stop (same result as the former
    # split(".") / ". ".join round trip).
    para = para.replace(".", ". ")

    # the len is to filter those abstracts with just punctuations
    if len(para.translate(str.maketrans("", "", string.punctuation)).strip()) <= 5:
        return None, None

    # the fasttext model predicts only without new lines
    label = fmodel.predict(para.replace("\n", ""))[0][0]
    # Strip the prefix instead of slicing the last two characters: some
    # language ids are three letters long (e.g. "ceb") and were truncated.
    if label.startswith(label_prefix):
        label = label[len(label_prefix):]
    return para, label

In [None]:
# get_abstract returns one (text, language) pair per row. Splitting the pairs
# with comprehensions instead of zip(*...) also works on an empty frame, where
# zip() of nothing left no values to unpack and raised ValueError.
abstract_pairs = df["abstract_inverted_index"].apply(get_abstract)
df["PublicationAbstract"] = [text for text, _ in abstract_pairs]
df["Language"] = [language for _, language in abstract_pairs]

In [None]:
# Only the first listed authorship of each work is kept; works whose
# authorships value is falsy (e.g. an empty list) get None.
df["Author"] = df["authorships"].apply(
    lambda x: x[0]["author"]["display_name"] if x else None
)

In [None]:
# Transliterate author names with unidecode; falsy names (None or "") become None.
df["Author"] = df["Author"].apply(lambda x: unidecode(x) if x else None)

In [None]:
# Keep only the display name of every concept attached to a work.
# Assumes each cell holds an iterable of dicts; a missing value would raise -- TODO confirm.
df["OpenAlexConcept"] = df["concepts"].apply(lambda x: [i["display_name"] for i in x])

In [None]:
# Despite its name, "Publication" stores the publisher of the hosting venue.
# NOTE(review): relies on a "host_venue" column; the record keys printed
# earlier in this notebook list "primary_location" but no "host_venue" --
# confirm that the climate-change dump still provides it.
df["Publication"] = df["host_venue"].apply(lambda x: x["publisher"])

In [None]:
# Display name of the hosting venue.
# NOTE(review): assumes every row has a dict under "host_venue"; a missing
# value would raise -- confirm against the dump.
df["Venue"] = df["host_venue"].apply(lambda x: x["display_name"])

In [None]:
# Copy the title under the capitalised name used in the exported column list.
df["Title"] = df["title"]

In [None]:
# The publication date is exported under the name "timestamp".
df["timestamp"] = df["publication_date"]

In [None]:
# NOTE(review): annotated as bool but initialised to None, and not read by any
# visible cell (the downloader call below passes has_abstract=True directly).
has_abstract: bool = None

In [None]:
# Columns of `df` that are exported as nodes for the graph database.
nodes = [
    "timestamp",
    "Title",
    "Author",
    "OpenAlexConcept",
    "PublicationAbstract",
    "Publication",
    "Language",
    "Venue",
]

In [None]:
# Restrict the frame to the columns listed in `nodes`.
df_nodes = df[nodes]

In [None]:
# Write one JSON object per row (orient="records").
# NOTE(review): the ../data/generated directory presumably has to exist already -- confirm.
df_nodes.to_json("../data/generated/openalex_climate_nodes.json", orient="records")

In [None]:
# Reading back from the file. This rebinds `df`, so the full in-memory frame
# built above is replaced by the exported columns only.
df = pd.read_json("../data/generated/openalex_climate_nodes.json")

In [None]:
# Preview the first rows instead of rendering the whole frame.
df.head()

In [None]:
# NOTE(review): repeats an earlier cell; annotated as bool but initialised to
# None, and the downloader call below passes has_abstract=True directly.
has_abstract: bool = None

In [None]:
from oa_tool.downloader import OpenAlexDownloader

# NOTE(review): 1950/1952 look like a year range and "C132651083" like an
# OpenAlex concept id -- confirm against oa_tool; the meaning of the 1900
# passed to build_request_str is not visible from this notebook either.
# NOTE(review): a personal e-mail address is hard-coded below; consider reading
# it from configuration or an environment variable before sharing the notebook.
OpenAlexDownloader(
    1950,
    1952,
    concepts=["C132651083"],
    email="roxanne.elbaff@dlr.de",
    has_abstract=True,
).build_request_str(1900)

In [None]:
# Sample inverted index: each key is a word and each value lists the positions
# at which that word occurs in the abstract.
# NOTE(review): not referenced by any other visible cell; presumably a manual
# test input for from_inverted_to_text -- confirm.
abstractInvertedIndex = {
    "Abstract": [0],
    "An": [1],
    "extended": [2],
    "period": [3],
    "numerical": [4],
    "integration": [5],
    "of": [6, 21, 24, 79, 97, 118, 138],
    "a": [7, 62, 71, 77, 95, 114],
    "baroclinic": [8],
    "primitive": [9],
    "equation": [10],
    "model": [11],
    "has": [12],
    "been": [13],
    "made": [14],
    "for": [15, 104],
    "the": [16, 19, 22, 25, 40, 54, 66, 84, 88, 98, 102, 107, 129],
    "simulation": [17],
    "and": [18, 61, 87, 136, 147],
    "study": [20],
    "dynamics": [23],
    "atmosphere's": [26],
    "general": [27],
    "circulation.": [28],
    "The": [29, 47, 73, 116, 151],
    "solution": [30],
    "corresponding": [31, 82],
    "to": [32, 44, 83, 93, 142],
    "external": [33, 134],
    "gravitational": [34],
    "propagation": [35],
    "is": [36, 76, 91, 111, 120],
    "filtered": [37],
    "by": [38],
    "requiring": [39],
    "vertically": [41],
    "integrated": [42],
    "divergence": [43],
    "vanish": [45],
    "identically.": [46],
    "vertical": [48],
    "structure": [49],
    "permits": [50],
    "as": [51, 70, 113, 125],
    "dependent": [52],
    "variables": [53],
    "horizontal": [55],
    "wind": [56],
    "at": [57],
    "two": [58],
    "internal": [59],
    "levels": [60],
    "single": [63],
    "temperature,": [64],
    "with": [65],
    "static": [67, 130],
    "stability": [68],
    "entering": [69],
    "parameter.": [72, 115],
    "incoming": [74],
    "radiation": [75, 90],
    "function": [78, 96],
    "latitude": [80],
    "only": [81, 124],
    "annual": [85],
    "mean,": [86],
    "outgoing": [89],
    "taken": [92, 121],
    "be": [94],
    "local": [99],
    "temperature.": [100],
    "With": [101],
    "requirement": [103],
    "thermal": [105],
    "equilibrium,": [106],
    "domain": [108],
    "mean": [109],
    "temperature": [110],
    "specified": [112],
    "role": [117],
    "condensation": [119],
    "into": [122],
    "account": [123],
    "it": [126],
    "effectively": [127],
    "reduces": [128],
    "stability.": [131],
    "All": [132],
    "other": [133, 145],
    "sources": [135],
    "sinks": [137],
    "heat": [139],
    "are": [140, 148, 153],
    "assumed": [141],
    "balance": [143],
    "each": [144],
    "locally,": [146],
    "thus": [149],
    "omitted.": [150],
    "kinematics": [152],
    "th...": [154],
}

In [None]:
def from_inverted_to_text(abstracted_inverted_index):
    """Turn an inverted index (word -> positions) back into running text.

    Args:
        abstracted_inverted_index: Mapping whose keys are words and whose
            values are the positions at which each word occurs.

    Returns:
        str: The words joined by single spaces in ascending position order.
    """
    placed_words = [
        (position, word)
        for word, positions in abstracted_inverted_index.items()
        for position in positions
    ]
    # Stable sort on the position alone, so words that share a position keep
    # the order in which the mapping lists them.
    placed_words.sort(key=lambda placed: placed[0])
    return " ".join(word for _, word in placed_words)

In [None]:
import pandas as pd
import json

# Load the JSON data
# JSON is UTF-8 by specification; the explicit encoding keeps the load
# independent of the platform default. The file is closed as soon as it has
# been parsed -- nothing below needs the handle.
with open("oa_climate_change.json", "r", encoding="utf-8") as f:
    data = json.load(f)

print(data.keys())
print(data["meta"])

# Replace each inverted index by the reconstructed plain-text abstract.
# Records whose index is absent or None are passed through unchanged.
works = []
for res in data["results"]:
    if res.get("abstract_inverted_index") is not None:
        res["abstract_text"] = from_inverted_to_text(res["abstract_inverted_index"])
        del res["abstract_inverted_index"]
    works.append(res)


# Normalize the JSON data to a Pandas DataFrame
df = pd.json_normalize(works, max_level=3)

In [None]:
# Inspect the flattened column names produced by json_normalize.
df.columns