# PDF mining

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
import os
from dotenv import load_dotenv, find_dotenv
import pymongo
import time

# load_dotenv(find_dotenv)
%load_ext dotenv
%dotenv

In [2]:
base_url = "https://ml4physicalsciences.github.io/"

data = {
    "title": [],
    "authors": []
}

In [3]:
for y in range(2017,2021):
    print(f"{base_url}{y}")
    url = f"{base_url}{y}"
    r = requests.get(url)
    print(r.status_code)
    if r.status_code == 200:
        soup = BeautifulSoup(r.content, 'html.parser')
        html_table = soup.select_one("section#papers").select_one("div.table-wrapper")
        for el in html_table.select("td"):
            if not el.text.isdigit():
                title, *_, authors = re.split(r'\[(pdf|poster|video)\]', el.text.strip())
                data["title"].append(title.strip())
                data["authors"].append(authors.strip())
    else:
        print("PAGE NOT FOUND")
        
df = pd.DataFrame(data)
df.head()

https://ml4physicalsciences.github.io/2017
200
https://ml4physicalsciences.github.io/2018
404
PAGE NOT FOUND
https://ml4physicalsciences.github.io/2019
200
https://ml4physicalsciences.github.io/2020
200


Unnamed: 0,title,authors
0,Adversarial learning to eliminate systematic e...,"Victor Estrade, Cecile Germain, Isabelle Guyon..."
1,Variational Inference over Non-differentiable ...,"Adam McCarthy, Blanca Rodriguez and Ana Minchole"
2,Deep topology classifiers for a more efficient...,"Daniel Weitekamp III, Thong Q. Nguyen, Dustin ..."
3,Nanophotonic Particle Simulation and Inverse D...,"John Peurifoy, Yichen Shen, Li Jing, Yi Yang, ..."
4,FlareNet: A Deep Learning Framework for Solar ...,"Sean McGregor, Dattaraj Dhuri, Anamaria Berea ..."


In [4]:
os.environ.get("CORE_API_KEY")

'Cb0xi51dUWja8ZkNmegJYFvt64Dslznq'

In [5]:
print(df.iloc[1].title)
print(df.iloc[1].authors)

Variational Inference over Non-differentiable Cardiac Simulators using Bayesian Optimization
Adam McCarthy, Blanca Rodriguez and Ana Minchole


In [6]:
API_KEY = os.environ.get("CORE_API_KEY")
q_params = {'apiKey': API_KEY}
query="Variational%20Inference%20over%20Non-differentiable%20Cardiac%20Simulators%20using%20Bayesian%20Optimization%20Adam%20McCarthy%2C%20Blanca%20Rodriguez%20and%20Ana%20Minchole"
r = requests.get(f"https://core.ac.uk:443/api-v2/search/{query}", params=q_params)
print(r.status_code)

200


In [7]:
r.json()['data'][0]

{'_index': 'articles_2021_04_23',
 '_type': 'article',
 '_id': '141537424',
 '_score': 93.81691,
 '_source': {'id': '141537424',
  'authors': ['McCarthy, Adam', 'Rodriguez, Blanca', 'Minchole, Ana'],
  'citations': [],
  'contributors': [],
  'datePublished': '2017-12-09T00:00:00',
  'deleted': 'ALLOWED',
  'description': 'Performing inference over simulators is generally intractable as their\nruntime means we cannot compute a marginal likelihood. We develop a\nlikelihood-free inference method to infer parameters for a cardiac simulator,\nwhich replicates electrical flow through the heart to the body surface. We\nimprove the fit of a state-of-the-art simulator to an electrocardiogram (ECG)\nrecorded from a real patient.Comment: Workshops on Deep Learning for Physical Sciences and Machine Learning\n  4 Health, NIPS 201',
  'fullText': 'Variational Inference over Non-differentiable Cardiac\nSimulators using Bayesian Optimization\nAdam McCarthy1, Blanca Rodriguez1, and Ana Mincholé1\n1 De

In [8]:
user = os.environ.get('MONGO_DB_USERNAME')
password = os.environ.get('MONGO_DB_PASSWORD')

client = pymongo.MongoClient(f"mongodb+srv://{user}:{password}@maincluster.otbuf.mongodb.net/myFirstDatabase?retryWrites=true&w=majority")

print(client.list_database_names())
db = client.neurips

print(db.list_collection_names())

collection = db.ml4physics

['neurips', 'sample_airbnb', 'sample_analytics', 'sample_geospatial', 'sample_mflix', 'sample_restaurants', 'sample_supplies', 'sample_training', 'sample_weatherdata', 'admin', 'local']
['ml4physics']


In [10]:
NEURIPS_URL = "https://papers.nips.cc/"
NEURIPS_URL

'https://papers.nips.cc/'

In [21]:
paper_url_hashs = {
    "hash": [],
    "year": []
}

for y in range(1987, 2021):
    r = requests.get(f"{NEURIPS_URL}paper/{y}")
    if r.status_code == 200:
        soup = BeautifulSoup(r.content, 'html.parser')
        for p in soup.select("div.col li a"):
            url = p['href']
            *_, abstract = url.split("/")
            hash_url, *_ = abstract.split("-")
            paper_url_hashs["hash"].append(hash_url)
            paper_url_hashs["year"].append(y)
    else:
        print("Something went wrong:", r.status_code)
            
paper_refs = pd.DataFrame(paper_url_hashs)
paper_refs.head()

Unnamed: 0,hash,year
0,02e74f10e0327ad868d138f2b4fdd6f0,1987
1,03afdbd66e7929b125f8597834fa83a4,1987
2,072b030ba126b2f4b2374f342be9ed44,1987
3,093f65e080a295f8076b1c5722a46aa2,1987
4,14bfa6bb14875e45bba028a21ed38046,1987


In [19]:
ex_url = "https://papers.nips.cc/paper/1987/file/02e74f10e0327ad868d138f2b4fdd6f0-Metadata.json"

r = requests.get(ex_url)
r.json()

{'title': 'Bit-Serial Neural Networks',
 'book': 'Neural Information Processing Systems',
 'page_first': 573,
 'page_last': 583,
 'abstract': None,
 'full_text': '573 \n\nBIT - SERIAL NEURAL  NETWORKS \n\nAlan F.  Murray,  Anthony V . W.  Smith  and Zoe F.  Butler. \n\nDepartment of Electrical Engineering,  University of Edinburgh, \n\nThe King\'s Buildings, Mayfield Road,  Edinburgh, \n\nScotland,  EH93JL. \n\nABSTRACT \n\nA  bit  - serial  VLSI  neural  network  is  described  from  an  initial  architecture  for  a \nsynapse array through to silicon layout and board design.  The issues surrounding bit \n- serial  computation,  and  analog/digital  arithmetic  are  discussed  and  the  parallel \ndevelopment  of  a  hybrid  analog/digital  neural  network  is  outlined.  Learning  and \nrecall  capabilities  are  reported  for  the  bit  - serial  network  along  with  a  projected \nspecification  for  a  64  - neuron,  bit  - serial  board  operating  at 20 MHz.  This tech(cid:173)

In [29]:
from pydantic import BaseModel, NoneStr
from typing import List, Tuple, Optional, Union

In [30]:
class DataMl4Physics(BaseModel):
    title: List[NoneStr] = []
    authors: List[NoneStr] = []


class HashYearDataFrame(BaseModel):
    hash: List[NoneStr] = []
    year: List[Optional[int]] = []

In [35]:
data = DataMl4Physics()
data.title.append("apple")
data.dict()

{'title': ['apple'], 'authors': []}

In [42]:
type("https://papers.nips.cc/paper/2019/file/021e1ea77bd91aaa0fc4d01a943a654e-Bibtex.bib") is str

True

In [41]:
# OperationFailure for auth failed

In [46]:
from pathlib import Path
type(Path()) is Path

False

In [51]:
import click
dir(click.Path(path_type=Path))

['__call__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 'allow_dash',
 'coerce_path_result',
 'convert',
 'dir_okay',
 'envvar_list_splitter',
 'exists',
 'fail',
 'file_okay',
 'get_metavar',
 'get_missing_message',
 'is_composite',
 'name',
 'path_type',
 'readable',
 'resolve_path',
 'split_envvar_value',
 'type',
 'writable']

In [49]:
dir(click.types)

['BOOL',
 'BadParameter',
 'BoolParamType',
 'Choice',
 'CompositeParamType',
 'DateTime',
 'FLOAT',
 'File',
 'FloatParamType',
 'FloatRange',
 'FuncParamType',
 'INT',
 'IntParamType',
 'IntRange',
 'LazyFile',
 'PY2',
 'ParamType',
 'Path',
 'STRING',
 'StringParamType',
 'Tuple',
 'UNPROCESSED',
 'UUID',
 'UUIDParameterType',
 'UnprocessedParamType',
 '__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__spec__',
 '_get_argv_encoding',
 'convert_type',
 'datetime',
 'filename_to_ui',
 'get_filesystem_encoding',
 'get_streerror',
 'open_stream',
 'os',
 'safecall',
 'stat',
 'text_type']

In [5]:
def load_mongo_client(mongo_username: str, mongo_password: str) -> pymongo.MongoClient:
    """Loads MongoDB's client from Mongo credentials

    Parameters
    ----------
    mongo_username : str
        MongoDB's database username
    mongo_password : str
        MongoDB's database password

    Returns
    -------
    pymongo.MongoClient
        Database client of MongoDB
    """
    mongo_uri = f"mongodb+srv://{mongo_username}:{mongo_password}@maincluster.otbuf.mongodb.net/myFirstDatabase?retryWrites=true&w=majority"
    print(mongo_uri)
    client = pymongo.MongoClient(mongo_uri)
    return client

def execute_before_any_test():
    mongo_username = os.environ.get("MONGO_USERNAME")
    mongo_password = os.environ.get("MONGO_PASSWORD")
    client = load_mongo_client(mongo_username, mongo_password)
    db = client["test_neurips"]
    for collection in db.list_collection_names():
        db["collection"].drop()
    client.close()
    
    
execute_before_any_test()

mongodb+srv://None:None@maincluster.otbuf.mongodb.net/myFirstDatabase?retryWrites=true&w=majority


OperationFailure: Authentication failed., full error: {'ok': 0, 'errmsg': 'Authentication failed.', 'code': 8000, 'codeName': 'AtlasError'}