In [None]:
# Load the paper identifiers, one per line.
# The encoding is pinned so the result does not depend on the platform's
# default locale encoding.
with open("../data/modeling_paper_ids.txt", encoding="utf-8") as f:
    # Strip surrounding whitespace and drop blank lines so that stray spaces
    # or a trailing empty line never end up in a request URL.
    paper_ids = [line.strip() for line in f.read().splitlines() if line.strip()]

len(paper_ids)

In [None]:
import requests
import re

# ref https://api.medrxiv.org/
MEDRXIV_API_DETAILS_URL = "https://api.medrxiv.org/details/{server}/{doi}/na/{format}"

# An identifier is treated as a DOI when it starts with "10.", 4-9 digits and
# a slash. Raw string: "\." and "\d" are invalid escapes in a normal literal.
DOI_PREFIX_PATTERN = re.compile(r"10\.\d{4,9}/")

# Upper bound (seconds) for a single API call; without a timeout a stalled
# connection would block the loop indefinitely.
REQUEST_TIMEOUT_SECONDS = 30

# Raw JSON payloads of every successful lookup, in the order of `paper_ids`.
results = []

for doi in paper_ids:
    # Identifiers that are not DOIs are reported and skipped here.
    if not DOI_PREFIX_PATTERN.match(doi):
        print(f"not doi: {doi}")
        continue

    url = MEDRXIV_API_DETAILS_URL.format(server="medrxiv", doi=doi, format="json")

    # A transport error, HTTP error status or non-JSON body is reported for
    # this DOI only, so one bad request does not abort the whole collection run.
    try:
        resp = requests.get(url=url, timeout=REQUEST_TIMEOUT_SECONDS)
        resp.raise_for_status()
        data = resp.json()
    except (requests.RequestException, ValueError) as exc:
        print(f"error: {doi} ({exc})")
        continue

    # The payload is kept only when the first message reports "ok" and there
    # is at least one record in "collection" to read afterwards.
    messages = data.get("messages") or [{}]
    if messages[0].get("status") != "ok" or not data.get("collection"):
        print(f"error: {doi}")
        continue

    results.append(data)

len(results)

In [None]:
# Keep only the first record of each response's "collection" list.
data = []
for response in results:
    first_record = response["collection"][0]
    data.append(first_record)

len(data)

## arXiv

Retrieve information for the papers whose IDs are arXiv identifiers.

In [None]:
%pip install arxiv

In [None]:
# Identifiers of the papers to look up on arXiv; each one carries an explicit
# version suffix ("v1", "v2").
arxiv_ids = [
    "2006.01754v1",
    "2007.06541v1",
    "2004.12799v1",
    "2005.00106v1",
    "2004.02605v2",
]

In [None]:
import arxiv

client = arxiv.Client()

# Look up each identifier separately and append one record per paper to
# `data`. The loop variable is not called `id`, which would shadow the
# builtin for every later cell of the notebook.
for arxiv_id in arxiv_ids:
    search = arxiv.Search(id_list=[arxiv_id])

    # An empty result set yields None instead of raising a bare StopIteration.
    result = next(client.results(search), None)
    if result is None:
        print(f"not found: {arxiv_id}")
        continue

    authors = [str(x) for x in result.authors]

    paper = {
        # The arXiv identifier is stored under the "doi" key.
        "doi": arxiv_id,
        "title": result.title,
        "abstract": result.summary,
        "date": str(result.published.date()),
        "authors": "; ".join(authors),
        "server": "arxiv",
    }

    data.append(paper)

len(data)

## manual

Manually add information for the remaining papers.

In [None]:
# Hand-entered records for the remaining papers.
# Each record has only "doi", "title" and "abstract"; every abstract is kept
# on a single source line so the string literal stays valid Python.
# NOTE(review): the records built in the arXiv cell also carry "date",
# "authors" and "server" -- confirm that consumers tolerate their absence here.
other_papers = [
    {
        "doi": "10.1063/5.0008834",
        "title": "Asymptotic estimates of SARS-CoV-2 infection counts and their sensitivity to stochastic perturbation",
        "abstract": "Despite the importance of having robust estimates of the time-asymptotic total number of infections, early estimates of COVID-19 show enormous fluctuations. Using COVID-19 data from different countries, we show that predictions are extremely sensitive to the reporting protocol and crucially depend on the last available data point before the maximum number of daily infections is reached. We propose a physical explanation for this sensitivity, using a susceptible–exposed–infected–recovered model, where the parameters are stochastically perturbed to simulate the difficulty in detecting patients, different confinement measures taken by different countries, as well as changes in the virus characteristics. Our results suggest that there are physical and statistical reasons to assign low confidence to statistical and dynamical fits, despite their apparently good statistical scores. These considerations are general and can be applied to other epidemics. COVID-19 is currently affecting over 180 countries worldwide and poses serious threats to public health as well as economic and social stability of many countries. Modeling and extrapolating in near real-time the evolution of COVID-19 epidemics is a scientific challenge, which requires a deep understanding of the non-linearities undermining the dynamics of the epidemics. Here, we show that real-time predictions of COVID-19 infections are extremely sensitive to errors in data collection and crucially depend on the last available data point. We test these ideas in both statistical (logistic) and dynamical (susceptible–exposed–infected–recovered) models that are currently used to forecast the evolution of the COVID-19 epidemic. Our goal is to show how uncertainties arising from both poor data quality and inadequate estimations of model parameters (incubation, infection, and recovery rates) propagate to long-term extrapolations of infection counts. We provide guidelines for reporting those uncertainties to the scientific community and the general public.",
    },
    {
        "doi": "10.1080/17513758.2020.1795285",
        "title": "Effects of age-targeted sequestration for COVID-19",
        "abstract": "We model the extent to which age-targeted protective sequestration can be used to reduce ICU admissions caused by novel coronavirus COVID-19. Using demographic data from New Zealand, we demonstrate that lowering the age threshold to 50 years of age reduces ICU admissions drastically and show that for sufficiently strict isolation protocols, sequestering one-third of the countries population for a total of 8 months is sufficient to avoid overwhelming ICU capacity throughout the entire course of the epidemic. Similar results are expected to hold for other countries, though some minor adaption will be required based on local age demographics and hospital facilities.",
    },
    {
        "doi": "10.1016/j.mjafi.2020.03.022",
        "title": "Healthcare impact of COVID-19 epidemic in India: A stochastic mathematical model",
        "abstract": "Background: In India, the SARS-CoV-2 COVID-19 epidemic has grown to 1251 cases and 32 deaths as on 30 Mar 2020. The healthcare impact of the epidemic in India was studied using a stochastic mathematical model. Methods: A compartmental SEIR model was developed, in which the flow of individuals through compartments is modeled using a set of differential equations. Different scenarios were modeled with 1000 runs of Monte Carlo simulation each using MATLAB. Hospitalization, intensive care unit (ICU) requirements, and deaths were modeled on SimVoi software. The impact of nonpharmacological interventions (NPIs) including social distancing and lockdown on checking the epidemic was estimated. Results: Uninterrupted epidemic in India would have resulted in more than 364 million cases and 1.56 million deaths with peak by mid-July. As per the model, at current growth rate of 1.15, India is likely to reach approximately 3 million cases by 25 May, implying 125,455 (±18,034) hospitalizations, 26,130 (±3298) ICU admissions, and 13,447 (±1819) deaths. This would overwhelm India's healthcare system. The model shows that with immediate institution of NPIs, the epidemic might still be checked by mid-April 2020. It would then result in 241,974 (±33,735) total infections, 10,214 (±1649) hospitalizations, 2121 (±334) ICU admissions, and 1081 (±169) deaths. Conclusion: At the current growth rate of epidemic, India's healthcare resources will be overwhelmed by the end of May. With the immediate institution of NPIs, total cases, hospitalizations, ICU requirements, and deaths can be reduced by almost 90%.",
    },
    {
        "doi": "10.3934/publichealth.2020026",
        "title": "Parameter estimation and prediction for coronavirus disease outbreak 2019 (COVID-19) in Algeria.",
        "abstract": "Background: The wave of the coronavirus disease outbreak in 2019 (COVID-19) has spread all over the world. In Algeria, the first case of COVID-19 was reported on 25 February, 2020, and the number of confirmed cases of it has increased day after day. To overcome this difficult period and a catastrophic scenario, a model-based prediction of the possible epidemic peak and size of COVID-19 in Algeria is required. Methods: We are concerned with a classical epidemic model of susceptible, exposed, infected and removed (SEIR) population dynamics. By using the method of least squares and the best fit curve that minimizes the sum of squared residuals, we estimate the epidemic parameter and the basic reproduction number R0. Moreover, we discuss the effect of intervention in a certain period by numerical simulation. Results: We find that R0= 4.1, which implies that the epidemic in Algeria could occur in a strong way. Moreover, we obtain the following epidemiological insights: the intervention has a positive effect on the time delay of the epidemic peak; the epidemic size is almost the same for a short intervention; a large epidemic can occur even if the intervention is long and sufficiently effective. Conclusion: Algeria must implement the strict measures as shown in this study, which could be similar to the one that China has finally adopted.",
    },
    {
        "doi": "10.1371/journal.pone.0234763",
        "title": "Prediction of COVID-19 spreading profiles in South Korea, Italy and Iran by data-driven coding",
        "abstract": "This work applies a data-driven coding method for prediction of the COVID-19 spreading profile in any given population that shows an initial phase of epidemic progression. Based on the historical data collected for COVID-19 spreading in 367 cities in China and the set of parameters of the augmented Susceptible-Exposed-Infected-Removed (SEIR) model obtained for each city, a set of profile codes representing a variety of transmission mechanisms and contact topologies is formed. By comparing the data of an early outbreak of a given population with the complete set of historical profiles, the best fit profiles are selected and the corresponding sets of profile codes are used for prediction of the future progression of the epidemic in that population. Application of the method to the data collected for South Korea, Italy and Iran shows that peaks of infection cases are expected to occur before mid April, the end of March and the end of May 2020, and that the percentage of population infected in each city or region will be less than 0.01%, 0.5% and 0.5%, for South Korea, Italy and Iran, respectively.",
    },
    {
        "doi": "10.1016/j.annepidem.2020.07.007",
        "title": "Risk for COVID-19 infection and death among Latinos in the United States: examining heterogeneity in transmission dynamics.",
        "abstract": "Purpose: The purpose of this study was to ascertain COVID-19 transmission dynamics among Latino communities nationally. Methods: We compared predictors of COVID-19 cases and deaths between disproportionally Latino counties (≥17.8% Latino population) and all other counties through May 11, 2020. Adjusted rate ratios (aRRs) were estimated using COVID-19 cases and deaths via zero-inflated binomial regression models. Results: COVID-19 diagnoses rates were greater in Latino counties nationally (90.9 vs. 82.0 per 100,000). In multivariable analysis, COVID-19 cases were greater in Northeastern and Midwestern Latino counties (aRR: 1.42, 95% CI: 1.11-1.84, and aRR: 1.70, 95% CI: 1.57-1.85, respectively). COVID-19 deaths were greater in Midwestern Latino counties (aRR: 1.17, 95% CI: 1.04-1.34). COVID-19 diagnoses were associated with counties with greater monolingual Spanish speakers, employment rates, heart disease deaths, less social distancing, and days since the first reported case. COVID-19 deaths were associated with household occupancy density, air pollution, employment, days since the first reported case, and age (fewer <35 yo). Conclusions: COVID-19 risks and deaths among Latino populations differ by region. Structural factors place Latino populations and particularly monolingual Spanish speakers at elevated risk for COVID-19 acquisition.",
    },
    {
        "doi": "10.12688/wellcomeopenres.15788.1",
        "title": "The contribution of pre-symptomatic infection to the transmission dynamics of COVID-2019",
        "abstract": "Background: Pre-symptomatic transmission can be a key determinant of the effectiveness of containment and mitigation strategies for infectious diseases, particularly if interventions rely on syndromic case finding. For COVID-19, infections in the absence of apparent symptoms have been reported frequently alongside circumstantial evidence for asymptomatic or pre-symptomatic transmission. We estimated the potential contribution of pre-symptomatic cases to COVID-19 transmission. Methods: Using the probability for symptom onset on a given day inferred from the incubation period, we attributed the serial interval reported from Shenzen, China, into likely pre-symptomatic and symptomatic transmission. We used the serial interval derived for cases isolated more than 6 days after symptom onset as the no active case finding scenario and the unrestricted serial interval as the active case finding scenario. We reported the estimate assuming no correlation between the incubation period and the serial interval alongside a range indicating alternative assumptions of positive and negative correlation. Results: We estimated that 23% (range accounting for correlation: 12 - 28%) of transmissions in Shenzen may have originated from presymptomatic infections. Through accelerated case isolation following symptom onset, this percentage increased to 46% (21 - 46%), implying that about 35% of secondary infections among symptomatic cases have been prevented. These results were robust to using reported incubation periods and serial intervals from other settings. Conclusions: Pre-symptomatic transmission may be essential to consider for containment and mitigation strategies for COVID-19.",
    },
]

In [None]:
# Build a new list holding the fetched records followed by the hand-entered
# ones, and rebind `data` to it.
data = [*data, *other_papers]

len(data)

## store the paper information

In [None]:
import json

# Serialize all collected records as a single JSON document on disk.
with open("../data/modeling_papers.json", "w") as out_file:
    out_file.write(json.dumps(data))

## scholarly

Attempt to use scholarly to retrieve the information for the remaining papers.

NOTE: Google Scholar limits access from bots, so using scholarly is NOT feasible. This section is therefore incomplete and kept for reference only.

In [None]:
%pip install scholarly

In [None]:
# DOIs to look up through scholarly; these are the same DOIs as the
# hand-entered `other_papers` records.
other_ids = [
    "10.1063/5.0008834",
    "10.1080/17513758.2020.1795285",
    "10.1016/j.mjafi.2020.03.022",
    "10.3934/publichealth.2020026",
    "10.1371/journal.pone.0234763",
    "10.1016/j.annepidem.2020.07.007",
    "10.12688/wellcomeopenres.15788.1",
]

In [None]:
from scholarly import scholarly

# Search for each DOI and print the first hit together with its abstract.
# The loop variable is not called `id`, which would shadow the builtin for
# every later cell of the notebook.
for pub_id in other_ids:
    search_query = scholarly.search_pubs(pub_id)

    # An empty search yields None instead of raising a bare StopIteration.
    result = next(search_query, None)
    if result is None:
        print(f"not found: {pub_id}")
        continue

    scholarly.pprint(result)

    print(result["bib"]["abstract"])

## download non-disease modeling papers, for negative test cases

In [6]:
import arxiv

# Fetch the 20 most relevant arXiv hits for a generic query and turn each
# into a record with the same keys as the arXiv records built earlier.
# NOTE(review): "doi" holds `result.entry_id` here, whereas the earlier arXiv
# cell stores the bare arXiv id under the same key -- confirm this is intended.
client = arxiv.Client()

search = arxiv.Search(
    query="modeling techniques",
    max_results=20,
    sort_by=arxiv.SortCriterion.Relevance,
)

data = [
    {
        "doi": hit.entry_id,
        "title": hit.title,
        "abstract": hit.summary,
        "date": str(hit.published.date()),
        "authors": "; ".join(str(author) for author in hit.authors),
        "server": "arxiv",
    }
    for hit in client.results(search)
]

In [7]:
import json

# Serialize the negative-example records as a single JSON document on disk.
with open("../data/non_modeling_papers.json", "w") as out_file:
    out_file.write(json.dumps(data))