In [1]:
import warnings
from collections import defaultdict
from os.path import join as j
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [2]:
# The dataset folders are listed one per line in ../data_dirs.txt; each line
# is stripped of surrounding whitespace (including the trailing newline).
with open("../data_dirs.txt", "r") as f:
    DATA_DIRS = [line.strip() for line in f]

In [3]:
# Location of cleaned_pacs.csv under the first configured data directory.
PACS_PATH = j(DATA_DIRS[0], "raw", "APS", "cleaned_pacs.csv")

In [4]:
# Load the PACS table indexed by DOI. Every column is read as a string
# (presumably so that PACS codes are not coerced to numbers -- confirm).
pacs = pd.read_csv(PACS_PATH, header=0, index_col="DOI", dtype="str")

In [5]:
# PROPERTY and EMBEDDING_METHOD select which vector file is loaded (and are
# reused further down to name the ROG file).
PROPERTY = "abstract"
EMBEDDING_METHOD = "sentencebert"
EMBEDDING_PATH = j(
    DATA_DIRS[0],
    "embedding",
    "vectors",
    "aps_all_2010_{}_{}_vector.gz".format(PROPERTY, EMBEDDING_METHOD),
)

vector = pd.read_csv(EMBEDDING_PATH, header=0)
# TODO: some vector files contain repeated DOIs (cause unknown); only the
# first row for each DOI is kept.
vector = vector.drop_duplicates(subset="DOI").set_index("DOI")

# Every remaining column is treated as one embedding dimension and relabelled
# with its integer position 0..DIM-1.
DIM = len(vector.columns)
vector.columns = list(range(DIM))

# Inner join on DOI: papers lacking either an embedding or a PACS row are
# ignored for this analysis.
vector = pd.merge(vector, pacs, left_index=True, right_index=True)
    
# Calculate ROG per PACS level


def _rog_for_level(vector, k, dim_cols):
    """Compute the ROG table for one PACS level.

    Parameters
    ----------
    vector : pandas.DataFrame
        Frame indexed by "DOI" holding the embedding columns ``dim_cols`` and
        a "PACS_CODE_<k>" column.
    k : int
        PACS level; selects the "PACS_CODE_<k>" column.
    dim_cols : list
        Labels of the embedding columns in ``vector``.

    Returns
    -------
    pandas.DataFrame
        Indexed by ("PACS_LEVEL", "PACS_CODE") with columns "PAPER_CT"
        (number of distinct DOIs carrying the code) and "ROG" (mean squared
        cosine distance between each paper and the centroid of its code).
    """
    code_col = "PACS_CODE_{}".format(k)

    # One row per distinct (embedding, code, DOI) combination. Rows without a
    # code at this level cannot be assigned a centroid and are dropped here
    # explicitly; groupby would skip them anyway, which previously left the
    # paper and centroid arrays with different lengths.
    papers = (
        vector.reset_index()[dim_cols + [code_col, "DOI"]]
        .drop_duplicates()
        .dropna(subset=[code_col])
        .sort_values([code_col, "DOI"])
        .reset_index(drop=True)
    )

    # Centroid per code. Only the embedding columns are aggregated: running
    # the lambda over the string PACS columns as well raises on pandas
    # versions that no longer silently drop failing columns.
    # NOTE(review): the centroid is the mean of the *unique values* of each
    # dimension within the code -- presumably a way of not double counting
    # DOIs that appear on several rows; confirm this is intended.
    centroid_by_code = vector.groupby(code_col)[dim_cols].agg(
        lambda x: x.unique().mean()
    )

    # Look up each paper's centroid through a left join so that row i of the
    # centroid matrix always belongs to row i of the paper matrix.
    centroids = papers[[code_col]].join(centroid_by_code, on=code_col)[dim_cols]

    paper_vecs = papers[dim_cols].values
    centroid_vecs = centroids.values
    # Row-wise cosine distance: 1 - (u . c) / (|u| |c|).
    cos_dist = 1 - (
        np.einsum("ij,ij->i", paper_vecs, centroid_vecs)
        / (
            np.linalg.norm(paper_vecs, axis=1)
            * np.linalg.norm(centroid_vecs, axis=1)
        )
    )

    per_paper = pd.DataFrame(
        {
            "PACS_LEVEL": str(k),  # stored as a string, as before
            "PACS_CODE": papers[code_col].values,
            "DOI": papers["DOI"].values,
            "SQ_DIST": cos_dist ** 2,
        }
    )
    return (
        per_paper.groupby(["PACS_LEVEL", "PACS_CODE"])
        .agg({"DOI": "nunique", "SQ_DIST": "mean"})
        .rename(columns={"SQ_DIST": "ROG", "DOI": "PAPER_CT"})
    )


levels = [5]  # [1, 2, 3, 5]
rogs = pd.concat(
    [_rog_for_level(vector, k, list(range(0, DIM))) for k in levels]
).reset_index()

In [6]:
# Previously computed ROG table. Note that it is read from the *second*
# configured data directory, whereas the combined table is later written
# under the first one.
ROG_PATH = j(
    DATA_DIRS[1],
    "derived",
    "rog",
    "aps_all_2010_{}_{}_rog.csv".format(PROPERTY, EMBEDDING_METHOD),
)
rogs_all = pd.read_csv(ROG_PATH, header=0)

In [7]:
# Append the freshly computed rows to the existing table and renumber the
# rows 0..n-1. Caution: re-running this cell appends the new rows again.
rogs = pd.concat([rogs_all, rogs], ignore_index=True)

In [8]:
# Display the combined ROG table.
rogs

Unnamed: 0,PACS_LEVEL,PACS_CODE,PAPER_CT,ROG
0,1,0,42499,0.175244
1,1,1,22220,0.128093
2,1,2,11180,0.126183
3,1,3,14275,0.150754
4,1,4,23095,0.175533
...,...,...,...,...
6343,5,9880Qc,208,0.085817
6344,5,9880–k,31,0.102841
6345,5,9880−k,430,0.110970
6346,5,9890+s,3,0.064531


In [9]:
# Write the combined table under the first configured data directory, with a
# header row and without the row index.
ROG_PATH = j(
    DATA_DIRS[0],
    "derived",
    "rog",
    "aps_all_2010_{}_{}_rog.csv".format(PROPERTY, EMBEDDING_METHOD),
)
rogs.to_csv(ROG_PATH, index=False)