In [1]:
import pandas as pd
from pathlib import Path

In [2]:
# Locate the project's "data" directory by scanning the ancestors of the
# current working directory, then list the files under data/raw.
data_path = None

# `Path.parents` yields every ancestor (nearest first), so no manual index
# counter or IndexError handling is needed to know when the root is reached.
for current_path in Path().resolve().parents:
    print(current_path)
    if any(fn.name == "data" for fn in current_path.iterdir()):
        # No early exit on purpose: if several ancestors contain an entry
        # named "data", the outermost one is the one that is kept.
        data_path = current_path / "data"

if data_path is None:
    # Fail with an actionable message instead of a TypeError on `None / "raw"`.
    raise FileNotFoundError(
        f'No "data" entry found in any parent directory of {Path().resolve()}'
    )

print(data_path)

data_path /= "raw"

for fn in data_path.iterdir():
    print(fn)

D:\shadi\PythonProjects\science-trend\analysis
D:\shadi\PythonProjects\science-trend
D:\shadi\PythonProjects
D:\shadi
D:\
D:\shadi\PythonProjects\science-trend\analysis\data
D:\shadi\PythonProjects\science-trend\analysis\data\raw\.gitkeep
D:\shadi\PythonProjects\science-trend\analysis\data\raw\ml4physics.csv
D:\shadi\PythonProjects\science-trend\analysis\data\raw\neurips.csv


In [3]:
# `Path.iterdir()` yields entries in arbitrary order, so picking frames by
# position (csvs[0] / csvs[1]) can silently swap the two datasets. Sort for a
# stable listing and select each file by its stem instead.
csvs = sorted(fn for fn in data_path.iterdir() if fn.suffix == ".csv")
csv_by_stem = {fn.stem: fn for fn in csvs}

# A missing file now raises a KeyError naming the dataset that was expected.
df_ml4 = pd.read_csv(csv_by_stem["ml4physics"])
df_neurips = pd.read_csv(csv_by_stem["neurips"])

df_neurips.head()

Unnamed: 0,hash,year
0,02e74f10e0327ad868d138f2b4fdd6f0,1987
1,03afdbd66e7929b125f8597834fa83a4,1987
2,072b030ba126b2f4b2374f342be9ed44,1987
3,093f65e080a295f8076b1c5722a46aa2,1987
4,14bfa6bb14875e45bba028a21ed38046,1987


## Fetching abstracts

In [19]:
from bs4 import BeautifulSoup
import requests

In [20]:
# Base URL of the NeurIPS proceedings site; paper page URLs are built from it.
NEURIPS_URL = "https://papers.nips.cc/"

# One sample row (positional index 25) used to try out the URL pattern.
ex = df_neurips.iloc[25]

In [21]:
# Abstract page URL pattern: <base>paper/<year>/hash/<hash>-Abstract.html
url = f"{NEURIPS_URL}paper/{ex.year}/hash/{ex.hash}-Abstract.html"
print(url)

https://papers.nips.cc/paper/1987/hash/45c48cce2e2d7fbdea1afc51c7c6ad26-Abstract.html


In [22]:
# Fetch one known abstract page (a hard-coded 2019 paper) to inspect its HTML.
url = "https://papers.nips.cc/paper/2019/hash/0118a063b4aae95277f0bc1752c75abf-Abstract.html"
# requests has no default timeout; without one an unresponsive server would
# block this cell (and the kernel) indefinitely.
r = requests.get(url, timeout=30)

print(r)

<Response [200]>


In [23]:
# Parse the raw response bytes with the "html.parser" backend.
soup =  BeautifulSoup(r.content, "html.parser")
# Display the indented markup to find the elements worth selecting.
soup.prettify()

'<!DOCTYPE html>\n<html lang="en">\n <head>\n  <!-- Required meta tags -->\n  <meta charset="utf-8"/>\n  <meta content="width=device-width, initial-scale=1, shrink-to-fit=no" name="viewport"/>\n  <title>\n   Multi-resolution Multi-task Gaussian Processes\n  </title>\n  <meta content="Multi-resolution Multi-task Gaussian Processes" name="citation_title"/>\n  <meta content="Hamelijnck, Oliver" name="citation_author"/>\n  <meta content="Damoulas, Theodoros" name="citation_author"/>\n  <meta content="Wang, Kangrui" name="citation_author"/>\n  <meta content="Girolami, Mark" name="citation_author"/>\n  <meta content="Advances in Neural Information Processing Systems" name="citation_journal_title"/>\n  <meta content="32" name="citation_volume"/>\n  <meta content="https://papers.nips.cc/paper/2019/file/0118a063b4aae95277f0bc1752c75abf-Paper.pdf" name="citation_pdf_url"/>\n  <meta content="2019" name="citation_publication_date"/>\n  <!-- Bootstrap CSS -->\n  <!-- https://codepen.io/surjithctly/

In [9]:
# First <div class="col"> on the page; the output shows it holds the title heading and paper links.
soup.select_one("div.col")

<div class="col">
<h4>Multi-resolution Multi-task Gaussian Processes</h4>
<p>Part of <a href="/paper/2019">Advances in Neural Information Processing Systems 32  (NeurIPS 2019)</a></p>
<div><a class="btn btn-light btn-sm btn-spacer" href="/paper/2019/file/0118a063b4aae95277f0bc1752c75abf-AuthorFeedback.pdf">AuthorFeedback »</a><a class="btn btn-light btn-sm btn-spacer" download="" href="/paper/2019/file/0118a063b4aae95277f0bc1752c75abf-Bibtex.bib">Bibtex »</a><a class="btn btn-light btn-sm btn-spacer" download="" href="/paper/2019/file/0118a063b4aae95277f0bc1752c75abf-Bibtex.bib">Bibtex »</a><a class="btn btn-light btn-sm btn-spacer" href="/paper/2019/file/0118a063b4aae95277f0bc1752c75abf-MetaReview.html">MetaReview »</a><a class="btn btn-light btn-sm btn-spacer" href="/paper/2019/file/0118a063b4aae95277f0bc1752c75abf-Metadata.json">Metadata »</a><a class="btn btn-light btn-sm btn-spacer" href="/paper/2019/file/0118a063b4aae95277f0bc1752c75abf-Paper.pdf">Paper »</a><a class="btn btn-lig

In [16]:
from sqlalchemy import create_engine, Column, Integer, String, Text, ForeignKey
from sqlalchemy.orm import declarative_base, sessionmaker, relationship

# Declarative base class that the ORM models in this notebook inherit from.
Base = declarative_base()
# Engine for the SQLite database file `neurips_info.db`.
engine = create_engine('sqlite:///neurips_info.db')
# Session factory bound to that engine; call `Session()` to open a session.
Session = sessionmaker(engine)

class Papers(Base):
    """ORM model for the ``papers`` table: a paper's hash, year, title, abstract and authors."""
    __tablename__ = 'papers'

    # Integer primary key.
    id = Column(Integer, primary_key=True)
    # `hash` and `year` are required (NOT NULL); `title` and `abstract` may be NULL.
    # NOTE(review): `hash` is not declared unique, so the schema itself does
    # not prevent two rows with the same hash.
    hash = Column(String, nullable=False)
    year = Column(Integer, nullable=False)
    title = Column(String)
    # Related `Authors` objects; the link is the `authors.paper` foreign key.
    authors = relationship("Authors")
    abstract = Column(Text)

    def __repr__(self):
        """Return a short representation showing only hash and year."""
        return f"Paper(hash={self.hash}, year={self.year})"
    
class Authors(Base):
    """ORM model for the ``authors`` table: one author name attached to one paper."""
    __tablename__ = 'authors'
    # Integer primary key.
    id = Column(Integer, primary_key=True)
    # Both name parts are NOT NULL (an empty string is still accepted).
    firstname = Column(String, nullable=False)
    lastname = Column(String, nullable=False)
    # Foreign key to `papers.id`; nullable, so an author row may exist without a paper.
    paper = Column(Integer, ForeignKey('papers.id'))

    def __repr__(self):
        """Return the author's name as "<firstname> <lastname>"."""
        return f"{self.firstname} {self.lastname}"

# Create the tables declared on `Base` in the database behind `engine`.
Base.metadata.create_all(engine)

In [17]:
from bs4.element import Tag
from typing import Optional, Dict
from tqdm.notebook import trange, tqdm
from collections import namedtuple 

def create_paper_dict() -> Dict:
    """Return a new record whose ``title``, ``authors`` and ``abstract`` are empty strings."""
    return dict.fromkeys(("title", "authors", "abstract"), "")

def extract_text(el: Optional[Tag]) -> str:
    """Return the element's text with surrounding whitespace removed.

    A falsy ``el`` (for example ``None`` when a selector matched nothing)
    yields an empty string.
    """
    return el.text.strip() if el else ""

def author_entry(author: str) -> Authors:
    """Build an ``Authors`` row from one author's full name.

    The last whitespace-separated token becomes ``lastname``; everything
    before it, joined with single spaces, becomes ``firstname`` (an empty
    string for one-word names).

    Args:
        author: Full name; surrounding and repeated whitespace is ignored.

    Returns:
        A new, unsaved ``Authors`` instance.

    Raises:
        ValueError: If ``author`` is empty or contains only whitespace
            (previously this surfaced as an IndexError from ``split()[-1]``).
    """
    # split() with no argument already drops leading/trailing whitespace.
    parts = author.split()
    if not parts:
        raise ValueError(f"Cannot build an author entry from a blank name: {author!r}")

    *given_names, lastname = parts
    return Authors(firstname=" ".join(given_names), lastname=lastname)

In [56]:
# Try the parsing on the single page currently held in `soup` and build one
# `Papers` object from it (nothing is written to the database here).
data = {
    "title": [],
    "authors": [],
    "abstract": []
}

paper_dict = create_paper_dict()
# Pair each <h4> heading inside div.col with the <p> that directly follows an <h4>.
for title, content in zip(soup.select("div.col h4"), soup.select("div.col h4 + p")):

    title, content = extract_text(title), extract_text(content)

    if "part of" in content.lower():
        # The paragraph under the paper title starts with "Part of ...",
        # so this heading is the title itself.
        paper_dict['title'] = title
    elif "authors" in title.lower():
        paper_dict['authors'] = content
    elif "abstract" in title.lower():
        paper_dict['abstract'] = content

# Skip blank fragments: "".split(",") yields [""], which would otherwise crash
# author_entry when the page has no authors (or the list has a stray comma).
paper = Papers(hash="0118a063b4aae95277f0bc1752c75abf",
               year=2019,
               title=paper_dict['title'],
               abstract=paper_dict['abstract'],
               authors=[author_entry(author)
                        for author in paper_dict['authors'].split(",")
                        if author.strip()])
print(paper)

data['title'].append(paper_dict['title'])
data['authors'].append(paper_dict['authors'])
data['abstract'].append(paper_dict['abstract'])

print(data)
print(len(data['title']))

Paper(hash=0118a063b4aae95277f0bc1752c75abf, year=2019)
{'title': ['Limits to Depth Efficiencies of Self-Attention'], 'authors': ['Yoav Levine, Noam Wies, Or Sharir, Hofit Bata, Amnon Shashua'], 'abstract': ['Self-attention architectures, which are rapidly pushing the frontier in natural language processing, demonstrate a surprising depth-inefficient behavior: Empirical signals indicate that increasing the internal representation (network width) is just as useful as increasing the number of self-attention layers (network depth). In this paper, we theoretically study the interplay between depth and width in self-attention. We shed light on the root of the above phenomenon, and establish two distinct parameter regimes of depth efficiency and inefficiency in self-attention. We invalidate the seemingly plausible hypothesis by which widening is as effective as deepening for self-attention, and show that in fact stacking self-attention layers is so effective that it quickly saturates a capac

In [30]:
# Scrape the abstract page of every paper in `df_neurips` that is not yet in
# the database and store the result as a `Papers` row (with its `Authors`).
REQUEST_TIMEOUT_S = 30  # seconds to wait for the server before giving up on a page
COMMIT_EVERY = 100      # persist progress regularly so a crash does not lose everything

session = Session()

# Load the stored hashes once. The previous per-row check tested the Query
# object itself, which is always truthy, so `if not paper` never fired and no
# paper was ever scraped.
known_hashes = {stored_hash for (stored_hash,) in session.query(Papers.hash)}
pending = 0

for _, row in tqdm(df_neurips.iterrows(), desc="Scraping abstracts", total=df_neurips.shape[0], unit="hash"):
    if row.hash in known_hashes:
        continue

    paper_dict = create_paper_dict()
    url = f"{NEURIPS_URL}paper/{row.year}/hash/{row.hash}-Abstract.html"
    try:
        r = requests.get(url, timeout=REQUEST_TIMEOUT_S)
    except requests.RequestException as exc:
        # Network-level failure: store nothing so the paper is retried on the next run.
        print(f"ERROR. Request failed for paper {row.hash} (url = {url}): {exc}")
        continue

    if r.status_code == 200:
        soup = BeautifulSoup(r.content, "html.parser")
        # Pair each <h4> heading inside div.col with the <p> that directly follows an <h4>.
        for title, content in zip(soup.select("div.col h4"), soup.select("div.col h4 + p")):
            title, content = extract_text(title), extract_text(content)

            if "part of" in content.lower():
                # The paragraph under the paper title starts with "Part of ...".
                paper_dict['title'] = title
            elif "authors" in title.lower():
                paper_dict['authors'] = content
            elif "abstract" in title.lower():
                paper_dict['abstract'] = content

        # SQL part
        # Blank fragments (no authors at all, stray commas) are skipped so
        # author_entry never receives an empty name.
        paper = Papers(hash=row.hash,
                       year=int(row.year),
                       title=paper_dict['title'],
                       abstract=paper_dict['abstract'],
                       authors=[author_entry(author)
                                for author in paper_dict['authors'].split(",")
                                if author.strip()])
    else:
        print(f"ERROR. Something went wrong with paper {row.hash} (url = {url}, error code = {r.status_code})")
        # Keep a placeholder row (hash and year only) for pages that could not be fetched.
        paper = Papers(hash=row.hash,
                       year=int(row.year))

    session.add(paper)
    known_hashes.add(row.hash)  # also guards against duplicate hashes in the CSV

    pending += 1
    if pending >= COMMIT_EVERY:
        session.commit()
        pending = 0

session.commit()
# df = pd.DataFrame(data)
# df.to_csv('neurips_info.csv', index=None)

Scraping abstracts:   0%|          | 0/11578 [00:00<?, ?hash/s]

In [25]:
# Debug check: show the authors string left in `paper_dict` by the last cell that filled it.
paper_dict['authors']

''

In [14]:
# WARNING: destructive. Drops the tables declared on `Base` from the database
# behind `engine`, including any rows they contain.
Base.metadata.drop_all(engine)

In [13]:
# Count rows whose abstract is exactly the text "Abstract Unavailable".
# NOTE(review): `df` is not defined in any visible cell; it presumably comes
# from the commented-out `pd.DataFrame(data)` export — confirm before re-running.
(df.abstract == "Abstract Unavailable").sum()

3318

In [28]:
# Commit whatever is still pending in the open session.
session.commit()