In [1]:
from pathlib import Path
from requests import get
from requests.exceptions import RequestException
from contextlib import closing
from bs4 import BeautifulSoup
import urllib
import pandas as pd

In [2]:
def simple_get(url, timeout=30):
    """
    Attempts to get the content at `url` by making an HTTP GET request.

    Parameters
    ----------
    url : str
        Address to request.
    timeout : float or tuple or None, optional
        Passed unchanged to `requests.get`. Defaults to 30 seconds so that
        an unresponsive server cannot block the notebook indefinitely;
        pass None to wait without a limit.

    Returns
    -------
    bytes or None
        The raw response body (`resp.content`) when `is_good_response`
        accepts the response, otherwise None. None is also returned, after
        the problem is reported through `log_error`, when the request
        raises a `RequestException` (timeouts included).
    """
    try:
        # closing() releases the streamed connection on every exit path,
        # including a rejected response.
        with closing(get(url, stream=True, timeout=timeout)) as resp:
            if is_good_response(resp):
                return resp.content
            return None

    except RequestException as e:
        log_error('Error during requests to {0} : {1}'.format(url, str(e)))
        return None


def is_good_response(resp):
    """
    Returns True if the response seems to be HTML, False otherwise.

    A response qualifies when its status code is 200 and its Content-Type
    header contains 'html' (compared case-insensitively). A response whose
    Content-Type header is missing or empty is reported as not HTML
    instead of raising.
    """
    # .get() avoids a KeyError when no Content-Type header was sent;
    # `or ''` additionally covers a header that is present but None.
    content_type = (resp.headers.get('Content-Type') or '').lower()
    return resp.status_code == 200 and 'html' in content_type


def log_error(e):
    """
    Report an error.

    The error is simply written to standard output with `print`;
    replace the body to send it anywhere else.
    """
    message = e
    print(message)

In [4]:
# Download the page that lists the archives and parse it into a soup.
raw_html = simple_get('https://patents.reedtech.com/pgyb.php#15874')
if raw_html is None:
    # simple_get returns None on a request error or a non-HTML response;
    # stop here with a clear message instead of handing None to the parser.
    raise RuntimeError('Could not download the archive listing page')
html = BeautifulSoup(raw_html, 'html.parser')
# To keep a local copy of the downloaded page, uncomment:
# Path("reedtech-downloads.html").write_bytes(raw_html)

In [5]:
# Collect one (link text, second cell text, third cell text, href) tuple for
# every three-cell table row whose first cell links to a .tar file.
entries = []

trs = html.select('tr')
for tr in trs:
    tds = tr.select('td')
    if len(tds) != 3:
        continue
    td1, td2, td3 = tds
    link = td1.a
    # .get() yields None for an <a> without an href attribute, where
    # indexing link['href'] would raise KeyError.
    href = link.get('href') if link is not None else None
    if href and href.endswith('.tar'):
        entries.append((td1.text, td2.text, td3.text, href))

In [10]:
# The second field of each entry is a size written with thousands
# separators (e.g. '10,051,976,704'); strip the commas before converting.
num_bytes = [
    int(size_text.replace(",", ""))
    for size_text in (row[1] for row in entries)
]
print(f"size (TB): {sum(num_bytes)/1e12}")

size (TB): 9.600990700032


In [11]:
# Preview at most the first ten entries. Slicing copes with a shorter
# list, where indexing entries[i] for i in range(10) would raise IndexError.
for entry in entries[:10]:
    print(entry)

('grant_yb2_20200114.tar', '10,051,976,704', '01/14/2020', 'downloads/GrantYellowBook/2020/grant_yb2_20200114.tar')
('grant_yb2_20200107.tar', '13,574,686,208', '01/07/2020', 'downloads/GrantYellowBook/2020/grant_yb2_20200107.tar')
('grant_yb2_20191231.tar', '13,850,871,808', '12/31/2019', 'downloads/GrantYellowBook/2019/grant_yb2_20191231.tar')
('grant_yb2_20191224.tar', '9,346,679,296', '12/24/2019', 'downloads/GrantYellowBook/2019/grant_yb2_20191224.tar')
('grant_yb2_20191217.tar', '9,820,823,040', '12/17/2019', 'downloads/GrantYellowBook/2019/grant_yb2_20191217.tar')
('grant_yb2_20191210.tar', '13,638,831,616', '12/10/2019', 'downloads/GrantYellowBook/2019/grant_yb2_20191210.tar')
('grant_yb2_20191203.tar', '13,144,777,216', '12/03/2019', 'downloads/GrantYellowBook/2019/grant_yb2_20191203.tar')
('grant_yb2_20191126.tar', '12,359,827,968', '11/26/2019', 'downloads/GrantYellowBook/2019/grant_yb2_20191126.tar')
('grant_yb2_20191119.tar', '12,700,339,200', '11/19/2019', 'downloads/Gran

In [14]:
# Tabulate the scraped tuples; the column order matches the tuple layout
# built when `entries` was filled.
df = pd.DataFrame(
    data=entries,
    columns=['filename', 'size', 'date', 'url'],
)

In [15]:
# Bare expression as the cell's last line so the notebook renders the frame.
df

Unnamed: 0,filename,size,date,url
0,grant_yb2_20200114.tar,10051976704,01/14/2020,downloads/GrantYellowBook/2020/grant_yb2_20200...
1,grant_yb2_20200107.tar,13574686208,01/07/2020,downloads/GrantYellowBook/2020/grant_yb2_20200...
2,grant_yb2_20191231.tar,13850871808,12/31/2019,downloads/GrantYellowBook/2019/grant_yb2_20191...
3,grant_yb2_20191224.tar,9346679296,12/24/2019,downloads/GrantYellowBook/2019/grant_yb2_20191...
4,grant_yb2_20191217.tar,9820823040,12/17/2019,downloads/GrantYellowBook/2019/grant_yb2_20191...
...,...,...,...,...
3460,B34193.tar,66723840,01/01/2013,downloads/GrantYellowBook/2013/B34193.tar
3461,B34192.tar,66160640,01/01/2013,downloads/GrantYellowBook/2013/B34192.tar
3462,B34189.tar,13393920,01/01/2013,downloads/GrantYellowBook/2013/B34189.tar
3463,B34182.tar,76728320,01/01/2013,downloads/GrantYellowBook/2013/B34182.tar
