In [13]:
import numpy as np
import re
import requests
import io
import yaml

from glob import glob
from os.path import exists, join, basename
from bs4 import BeautifulSoup
from shutil import copyfileobj, copyfile
from PIL import Image
from time import sleep
from requests_ip_rotator import ApiGateway, EXTRA_REGIONS, ALL_REGIONS
from tqdm import tqdm

## Package Screenshot

In [46]:
# Read the saved HTML page into memory; parsing happens after the file is closed.
with open('./Interesting.html', 'r') as html_file:
    content = html_file.read()

soup = BeautifulSoup(content, 'html.parser')

# Keep every <tr> of the first <table> found in the document.
table = soup.find('table')
rows = table.find_all('tr')


In [58]:
# Requests to the image host are sent through a requests_ip_rotator ApiGateway
# mounted on a dedicated session.
image_host = "https://lh3.googleusercontent.com"

session = requests.Session()

gateway = ApiGateway(image_host)
gateway.start()

session.mount(image_host, gateway)

Starting API gateways in 10 regions.
Using 10 endpoints with name 'https://lh3.googleusercontent.com - IP Rotate API' (10 new).


In [60]:
# Tear down the gateway endpoints created by gateway.start().
# NOTE(review): this cell sits above the thumbnail download loop but has a later
# execution count (In [60] vs. In [59]). On a top-to-bottom run it would shut the
# gateway down before the loop uses `session`; consider moving it below the loop.
gateway.shutdown()

Deleting gateways for site 'https://lh3.googleusercontent.com'.
Deleted 10 endpoints with for site 'https://lh3.googleusercontent.com'.


['gscpqvepih',
 'qjtiexfvmg',
 '15c4spf8qa',
 'np9jcvzv66',
 'bz71fik6fk',
 '00k9n7p6qg',
 'rsheulakgl',
 'dsntxv5jae',
 'g5bad9h7d7',
 'ndevqvh507']

In [59]:
# Stems of the thumbnails already on disk, so a re-run only fetches what is missing.
existing_files = {basename(f).replace('.webp', '') for f in glob('./thumbnails/*.webp')}

# Iterating the list directly (instead of enumerate) lets tqdm infer the total.
for row in tqdm(rows):

    name_cell = row.find('td', class_='s2')
    if name_cell is None:
        continue

    # Turn the cell text into a file stem: lower-case, dashes for spaces,
    # and everything from the first '.' onwards dropped.
    name = name_cell.text.lower().replace(' ', '-')
    name = name.split('.')[0]

    if name in existing_files:
        continue

    # Skip rows that have no image cell, no <img> tag or no src attribute
    # instead of failing with AttributeError on None.
    image_cell = row.find('td', class_='s6')
    image_tag = image_cell.find('img') if image_cell is not None else None
    image_src = image_tag.get('src') if image_tag is not None else None
    if not image_src:
        continue

    # Remove the size query from image_src
    image_src = re.sub(r'^(.*)=w\d+-h\d+$', r'\1', image_src)

    # The full body is read below, so streaming gains nothing; a streamed
    # response would also stay checked out of the connection pool on the
    # error path, where the body is never consumed.
    response = session.get(image_src)

    if response.status_code == 200:
        # Save the image as webp
        image = Image.open(io.BytesIO(response.content))
        image_file_path = f'./thumbnails/{name}.webp'
        image.save(image_file_path, lossless=False, quality=80)
        # Pause between successful downloads.
        sleep(0.5)
    else:
        print(response.status_code, response, name)

101it [02:01,  1.21s/it]


## Paper Screenshot

In [63]:
# Read the saved HTML page into memory; parsing happens after the file is closed.
with open('./Papers.html', 'r') as html_file:
    content = html_file.read()

soup = BeautifulSoup(content, 'html.parser')

# Keep every <tr> of the first <table> found in the document.
table = soup.find('table')
rows = table.find_all('tr')

In [70]:
# Requests to the image host are sent through a requests_ip_rotator ApiGateway
# mounted on a dedicated session.
image_host = "https://lh3.googleusercontent.com"

session = requests.Session()

gateway = ApiGateway(image_host)
gateway.start()

session.mount(image_host, gateway)

Starting API gateways in 10 regions.
Using 10 endpoints with name 'https://lh3.googleusercontent.com - IP Rotate API' (10 new).


In [73]:
# Stems of the thumbnails already on disk, so a re-run only fetches what is missing.
existing_files = {basename(f).replace('.webp', '') for f in glob('./thumbnails/*.webp')}

# tqdm infers the total from the list itself, so enumerate/total are not needed.
for row in tqdm(rows):
    name_cell = row.find('td', class_='s3')
    if name_cell is None:
        continue

    # Turn the cell text into a file stem: lower-case, dashes for spaces,
    # and everything from the first '.' onwards dropped.
    name = name_cell.text.lower().replace(' ', '-')
    name = name.split('.')[0]

    if name in existing_files:
        continue

    image_cell = row.find('td', class_='s7')
    if image_cell is None:
        continue

    # A cell without an <img> tag (or without a src) is skipped rather than
    # failing with AttributeError on None.
    image_tag = image_cell.find('img')
    image_src = image_tag.get('src') if image_tag is not None else None
    if not image_src:
        continue

    # Remove the size query from image_src
    image_src = re.sub(r'^(.*)=w\d+-h\d+$', r'\1', image_src)

    # The full body is read below, so streaming gains nothing; a streamed
    # response would also stay checked out of the connection pool on the
    # error path, where the body is never consumed.
    response = session.get(image_src)

    if response.status_code == 200:
        # Save the image as webp
        image = Image.open(io.BytesIO(response.content))
        image_file_path = f'./thumbnails/{name}.webp'
        image.save(image_file_path, lossless=False, quality=80)
        # Pause between successful downloads.
        sleep(0.5)
    else:
        print(response.status_code, response, name)


100%|██████████| 66/66 [00:48<00:00,  1.36it/s]


In [74]:
# Tear down the gateway endpoints created by gateway.start(); the call's return
# value is left as the cell's last expression so it is displayed.
gateway.shutdown()

Deleting gateways for site 'https://lh3.googleusercontent.com'.
Deleted 10 endpoints with for site 'https://lh3.googleusercontent.com'.


['znn1mqmwa5',
 '2bxtdqbpgl',
 '74belsv5cf',
 '8p2sdd9uxc',
 '8g9644cxgh',
 'itjp6b1urd',
 'ihopk6donc',
 'koqcgyqr3h',
 'hiv6m06fi5',
 'ls5zppy2he']

## Parse BibTeX

In [3]:
# Read the bibliography; entries are expected to be separated by a blank line.
with open('./notebook-va.bib', 'r') as fp:
    lines = fp.readlines()

# Group consecutive non-blank lines into one string per entry.
bibtex_entries = []
cur_entry = ''
for line in lines:
    if line == '\n':
        # Consecutive or trailing blank lines would otherwise produce empty
        # entries, which have no key and break the year parsing below.
        if cur_entry:
            bibtex_entries.append(cur_entry)
        cur_entry = ''
    else:
        cur_entry += line

if cur_entry:
    bibtex_entries.append(cur_entry)

# Create a dictionary that maps bibtex key to the entry
bibtex_dict = {}

for entry in bibtex_entries:
    # The key is taken from the entry's first line, e.g. "@type{key,".
    first_line = entry.split('\n')[0]
    key = re.sub(r'^@.+{(.+),.*$', r'\1', first_line)

    # Also parse the year: the last four digits of the key, optionally
    # followed by a single lower-case letter.
    year_match = re.match(r'^.*(\d{4})[a-z]?$', key)
    if year_match is None:
        raise ValueError(
            f'BibTeX key {key!r} does not end with a four-digit year'
        )
    year = int(year_match.group(1))

    bibtex_dict[key] = {
        'bibtex': entry,
        'year': year
    }

## Parse Metadata (Packages + Papers)

In [15]:
def _load_table(html_path):
    """Parse a saved HTML page and return its first table with that table's rows.

    Args:
        html_path: Path of the HTML file to read.

    Returns:
        tuple: ``(table, rows)`` where ``table`` is the first ``<table>``
        element and ``rows`` are all ``<tr>`` elements inside it.

    Raises:
        ValueError: If the document contains no ``<table>`` element.
    """
    with open(html_path, 'r') as fp:
        soup = BeautifulSoup(fp.read(), 'html.parser')

    table = soup.find('table')
    if table is None:
        # Fail with the offending path instead of an AttributeError on None.
        raise ValueError(f'No <table> element found in {html_path}')

    return table, table.find_all('tr')


package_table, package_rows = _load_table('./Interesting.html')
paper_table, paper_rows = _load_table('./Papers.html')

def create_new_entry():
    """Return a fresh, empty metadata record.

    A new dict (with new list objects) is built on every call, so records
    never share mutable state.

    Returns:
        dict: String fields default to ``''``, list fields to ``[]`` and
        ``'releaseYear'`` to ``0``.
    """
    return {
        'name': '',
        'githubURL': '',
        'paperURL': '',
        'otherURLs': [],
        'description': '',
        'bibtex': '',
        'sourceType': '',
        'releaseYear': 0,
        'communication': '',
        'materials': [],
        'layouts': [],
        'supportedNotebooks': [],
        # 'modularity' was listed twice in the literal; one occurrence is enough.
        'modularity': '',
        'user': '',
        'implementation': '',
    }

In [19]:
entries = []

# Parse papers first
for row in tqdm(paper_rows, disable=True):

    columns = row.find_all('td')
    if len(columns) == 0:
        continue

    # Get the paper name: lower-case, dashes for spaces, cut at the first '.'
    name = columns[0].text.lower().replace(' ', '-')
    name = name.split('.')[0]

    # Skip rows with an empty first cell and the row whose first cell reads
    # "name" (presumably the table header).
    if name == '' or name == 'name':
        continue

    entry = create_new_entry()
    entry['sourceType'] = 'paper'
    entry['name'] = name

    # Get the paper url: the first link in the second column that actually
    # carries a non-empty href (an <a> without href yields None and is ignored).
    hrefs = [tag.get('href') for tag in columns[1].find_all('a', recursive=True)]
    entry['paperURL'] = next((href for href in hrefs if href), '')

    # Get the description
    entry['description'] = columns[3].text

    # Get the github url (left as '' when the cell has no usable link)
    github_tag = columns[4].find('a', recursive=True)
    if github_tag is not None:
        github_href = github_tag.get('href')
        if github_href:
            entry['githubURL'] = github_href

    # Get the bibtex
    bibtex_key = columns[2].text

    try:
        bibtex_record = bibtex_dict[bibtex_key]
    except KeyError:
        # Only a missing key is expected here; report it and keep the entry
        # without citation data. Other errors are allowed to surface.
        print(bibtex_key)
    else:
        entry['bibtex'] = bibtex_record['bibtex']
        entry['releaseYear'] = bibtex_record['year']

    entries.append(entry)

In [20]:
# Parse packages
existing_names = {entry['name'] for entry in entries}

for row in tqdm(package_rows, disable=True):

    columns = row.find_all('td')
    if len(columns) == 0:
        continue

    # Get the package name: lower-case, dashes for spaces, cut at the first '.'
    name = columns[0].text.lower().replace(' ', '-')
    name = name.split('.')[0]

    # skip duplicates from papers
    if name in existing_names:
        print('Duplicate: ', name)
        continue

    # Skip rows with an empty first cell and the row whose first cell reads
    # "module" (presumably the table header).
    if name == '' or name == 'module':
        continue

    entry = create_new_entry()
    entry['sourceType'] = 'package'
    entry['name'] = name

    # Get the package urls
    hrefs = [tag.get('href') for tag in columns[2].find_all('a', recursive=True)]

    for url in hrefs:
        if not url:
            # An <a> without href yields None, which would raise TypeError in
            # the substring test below; such links carry no URL to record.
            continue

        # The first GitHub link becomes githubURL; every other link is kept
        # in otherURLs.
        if 'github.com' in url and entry['githubURL'] == '':
            entry['githubURL'] = url
        else:
            entry['otherURLs'].append(url)

    # Get the description
    entry['description'] = columns[3].text

    # Get the bibtex
    bibtex_key = columns[4].text

    try:
        bibtex_record = bibtex_dict[bibtex_key]
    except KeyError:
        # Only a missing key is expected here; report it and keep the entry
        # without citation data. Other errors are allowed to surface.
        print(bibtex_key)
    else:
        entry['bibtex'] = bibtex_record['bibtex']
        entry['releaseYear'] = bibtex_record['year']

    entries.append(entry)

Duplicate:  interpretml
Duplicate:  visual-auditor
Duplicate:  pipelineprofiler
Duplicate:  vizseq
Duplicate:  what-if-tool


In [22]:
# Each entry is dumped as its own one-item YAML list, followed by an extra
# newline so consecutive entries are separated by a blank line.
with open('./resources/supernova.yaml', 'w+') as yaml_file:
    yaml_file.writelines(yaml.dump([record]) + '\n' for record in entries)