In [1]:
import requests
from bs4 import BeautifulSoup
import json
from os import path
import datetime

In [2]:
# Load the existing link records. The context manager closes the file handle
# as soon as parsing finishes instead of leaving it to the garbage collector.
with open('website/content/links.json') as links_file:
    LINKS = json.load(links_file)

In [3]:
# Lookup table from each record's 'url_more' value to its 'uid'.
URL_TO_UID = {entry['url_more']: entry['uid'] for entry in LINKS}

In [4]:
def get_description(page):
    """Return the page's description text, or '' when none is declared.

    Looks at ``<meta name="description">`` first and then at
    ``<meta property="og:description">``. Tags that lack a ``content``
    attribute, or whose content is empty, are skipped so that the next
    candidate is used instead of raising ``KeyError`` or returning a blank.

    Args:
        page: parsed document exposing ``find_all(name, attrs)`` (a
            BeautifulSoup object in this notebook).

    Returns:
        str: the first non-empty description found, otherwise ''.
    """
    selectors = ({"name": "description"}, {"property": "og:description"})
    for selector in selectors:
        for tag in page.find_all('meta', selector):
            content = tag.attrs.get('content')
            if content:
                return content
    return ''

def get_time(page):
    """Return the page's publication time formatted as ``YYYY-MM-DDTHH:MM:SSZ``.

    Reads the first ``<meta property="article:published_time">`` tag and
    parses its ``content`` as an ISO-8601 timestamp. Timestamps carrying a
    UTC offset are converted to UTC before formatting, so the trailing ``Z``
    is truthful; timestamps without an offset are formatted unchanged. When
    the tag is absent, has no content, or cannot be parsed, the current UTC
    time is used instead.

    Args:
        page: parsed document exposing ``find_all(name, attrs)`` (a
            BeautifulSoup object in this notebook).

    Returns:
        str: the timestamp string.
    """
    published = None
    meta = page.find_all('meta', {"property": "article:published_time"})
    if meta:
        time_str = (meta[0].attrs.get('content') or '').strip()
        # fromisoformat() rejects a trailing 'Z' before Python 3.11, so spell
        # the UTC designator as an explicit offset.
        if time_str.endswith('Z'):
            time_str = time_str[:-1] + '+00:00'
        try:
            published = datetime.datetime.fromisoformat(time_str)
        except ValueError:
            # Unparseable (or empty) value: fall back to "now" below.
            published = None

    if published is None:
        published = datetime.datetime.now(datetime.timezone.utc)
    elif published.tzinfo is not None:
        published = published.astimezone(datetime.timezone.utc)
    return published.strftime('%Y-%m-%dT%H:%M:%SZ')

def get_title(page):
    """Return the text of ``page.title``, or '' when ``page.title`` is falsy (e.g. ``None``)."""
    title_tag = page.title
    return title_tag.text if title_tag else ''

In [5]:
# Fetch the locally served article and collect its anchors.
url = 'http://localhost:8081/on-my-website-2018'
# A timeout keeps the cell from hanging indefinitely if the local server is
# not responding. The parser is named explicitly so the result does not depend
# on which optional parser libraries bs4 happens to find installed.
soup = BeautifulSoup(requests.get(url, timeout=5).content, 'lxml')
# The first anchor on the page is skipped -- presumably site navigation; TODO confirm.
links = soup.find_all('a')[1:]
print(len(links))

17


In [7]:
for link in links:
    url = link.attrs['href']
    text = link.text
    if not (url.startswith('//') or url.startswith('http')):
        print('Link is internal, skipping')
        continue
    if url in URL_TO_UID:
        print('Link already exists, skipping')
        continue
        
    try:
        resp = requests.get(url, timeout=5)
    except:
        print('Could not connect to ', url)
        continue
        
    page = BeautifulSoup(resp.content)
    print(url, text, get_title(page))
    print(get_description(page))
    uid = input("Unique identifier")
    if uid == 'q':
        break
    
    if not path.exists('website/static/thumbnail/' + uid + '.png'):
        ! node screenshot.js --url "$url" --id $uid --path website/static/thumbnail
    else:
        print('Screenshot already exists, skipping')
    
    link_json = {
        '__class__': 'Link',
        'uid': uid,
        'date': get_time(page),
        'title': get_title(page),
        'description': get_description(page),
        'thumbnail': f'/thumbnail/{uid}.png',
        'url_more': url
    }
    
    print(link_json)
    
    print('\n\n\n')
    
    LINKS += [link_json]
    URL_TO_UID[url] = uid
    
# Dump the links out
with open('website/content/links.json', 'w') as out:
    json.dump(LINKS, out, indent=4, sort_keys=True)

Could not connect to  https://pentandra.com/blog/putting-the-pieces-together-technology/
Could not connect to  https://pentandra.com/blog/putting-the-pieces-together-technology/
http://jupyterhub.readthedocs.io/en/latest/ JupyterHub JupyterHub — JupyterHub 1.0.1dev documentation



Unique identifier jupyterhub


Screenshot created of  http://jupyterhub.readthedocs.io/en/latest/
Thumbnail created.
{'__class__': 'Link', 'uid': 'jupyterhub', 'date': '2019-11-14T15:12:04Z', 'title': 'JupyterHub — JupyterHub 1.0.1dev documentation', 'description': '', 'thumbnail': '/thumbnail/jupyterhub.png', 'url_more': 'http://jupyterhub.readthedocs.io/en/latest/'}




https://research.google.com/colaboratory will even be "free" 



Unique identifier colaboratory


Screenshot created of  https://research.google.com/colaboratory
Thumbnail created.
{'__class__': 'Link', 'uid': 'colaboratory', 'date': '2019-11-14T15:12:11Z', 'title': '', 'description': '', 'thumbnail': '/thumbnail/colaboratory.png', 'url_more': 'https://research.google.com/colaboratory'}




https://em.geosci.xyz/content/maxwell2_static/fields_from_grounded_sources_dcr/electrostatic_sphere.html figures Conducting sphere in a uniform electric field — Electromagnetic Geophysics
An open source textbook on applied electromagnetic geophysics. Aimed at providing background and physical understanding for steady state Maxwell equations as they apply to geoscience problems.


Unique identifier geosci-electrostatic-sphere


Screenshot created of  https://em.geosci.xyz/content/maxwell2_static/fields_from_grounded_sources_dcr/electrostatic_sphere.html
Thumbnail created.
{'__class__': 'Link', 'uid': 'geosci-electrostatic-sphere', 'date': '2019-11-14T15:12:28Z', 'title': 'Conducting sphere in a uniform electric field — Electromagnetic Geophysics', 'description': 'An open source textbook on applied electromagnetic geophysics. Aimed at providing background and physical understanding for steady state Maxwell equations as they apply to geoscience problems.', 'thumbnail': '/thumbnail/geosci-electrostatic-sphere.png', 'url_more': 'https://em.geosci.xyz/content/maxwell2_static/fields_from_grounded_sources_dcr/electrostatic_sphere.html'}




https://blog.getpelican.com/ static site generator Pelican Static Site Generator, Powered by Python



Unique identifier python-pelican


Screenshot created of  https://blog.getpelican.com/
Thumbnail created.
{'__class__': 'Link', 'uid': 'python-pelican', 'date': '2019-11-14T15:12:45Z', 'title': 'Pelican Static Site Generator, Powered by Python', 'description': '', 'thumbnail': '/thumbnail/python-pelican.png', 'url_more': 'https://blog.getpelican.com/'}




https://pages.github.com/ GitHub Pages GitHub Pages | Websites for you and your projects, hosted directly from your GitHub repository. Just edit, push, and your changes are live.
Websites for you and your projects, hosted directly from your GitHub repository. Just edit, push, and your changes are live.


Unique identifier github-pages


Screenshot created of  https://pages.github.com/
Thumbnail created.
{'__class__': 'Link', 'uid': 'github-pages', 'date': '2019-11-14T15:12:57Z', 'title': 'GitHub Pages | Websites for you and your projects, hosted directly from your GitHub repository. Just edit, push, and your changes are live.', 'description': 'Websites for you and your projects, hosted directly from your GitHub repository. Just edit, push, and your changes are live.', 'thumbnail': '/thumbnail/github-pages.png', 'url_more': 'https://pages.github.com/'}




https://vimeo.com/232230096 Chalk Talk Ken Perlin:​ Chalktalk in Augmented Reality​ on Vimeo
Chalktalk is now open source - http://frl.nyu.edu/chalktalk-is-now-open-source/


Unique identifier chalk-talk


Screenshot created of  https://vimeo.com/232230096
Thumbnail created.
{'__class__': 'Link', 'uid': 'chalk-talk', 'date': '2019-11-14T15:13:09Z', 'title': 'Ken Perlin:\u200b Chalktalk in Augmented Reality\u200b on Vimeo', 'description': 'Chalktalk is now open source - http://frl.nyu.edu/chalktalk-is-now-open-source/', 'thumbnail': '/thumbnail/chalk-talk.png', 'url_more': 'https://vimeo.com/232230096'}




http://mrl.nyu.edu/~perlin/ Ken Perlin Ken Perlin's homepage



Unique identifier ken-perlin


Screenshot created of  http://mrl.nyu.edu/~perlin/
Thumbnail created.
{'__class__': 'Link', 'uid': 'ken-perlin', 'date': '2019-11-14T15:13:20Z', 'title': "Ken Perlin's homepage", 'description': '', 'thumbnail': '/thumbnail/ken-perlin.png', 'url_more': 'http://mrl.nyu.edu/~perlin/'}




http://schema.org/CreativeWork rigorously defining CreativeWork - schema.org Type
Schema.org Type: CreativeWork - The most generic kind of creative work, including books, movies, photographs, software programs, etc.


Unique identifier schema-creative-work


Screenshot created of  http://schema.org/CreativeWork
Thumbnail created.
{'__class__': 'Link', 'uid': 'schema-creative-work', 'date': '2019-11-14T15:13:35Z', 'title': 'CreativeWork - schema.org Type', 'description': 'Schema.org Type: CreativeWork - The most generic kind of creative work, including books, movies, photographs, software programs, etc.', 'thumbnail': '/thumbnail/schema-creative-work.png', 'url_more': 'http://schema.org/CreativeWork'}




Link already exists, skipping
https://www.force11.org/community/members-directory commoners Community Members | FORCE11
FORCE11 is a community of scholars, librarians, archivists, publishers and research funders that has arisen organically to help facilitate the change toward improved knowledge creation and sharing. Individually and collectively, we aim to bring about a change in modern scholarly communications through the effective use of information technology.


Unique identifier force11-members


Screenshot created of  https://www.force11.org/community/members-directory
Thumbnail created.
{'__class__': 'Link', 'uid': 'force11-members', 'date': '2019-11-14T15:13:54Z', 'title': 'Community Members | FORCE11', 'description': 'FORCE11 is a community of scholars, librarians, archivists, publishers and research funders that has arisen organically to help facilitate the change toward improved knowledge creation and sharing. Individually and collectively, we aim to bring about a change in modern scholarly communications through the effective use of information technology.', 'thumbnail': '/thumbnail/force11-members.png', 'url_more': 'https://www.force11.org/community/members-directory'}




https://dynamicland.org/ Chapter 4 Dynamicland
incubating a humane dynamic medium


Unique identifier dynamicland


Screenshot created of  https://dynamicland.org/
Thumbnail created.
{'__class__': 'Link', 'uid': 'dynamicland', 'date': '2019-11-14T15:14:06Z', 'title': 'Dynamicland', 'description': 'incubating a humane dynamic medium', 'thumbnail': '/thumbnail/dynamicland.png', 'url_more': 'https://dynamicland.org/'}




https://www.mendeley.com/guides/web/04-complete-profile communicate who you are and what you've done as a researcher 03. Complete your profile | Mendeley



Unique identifier mendeley-profile


Screenshot created of  https://www.mendeley.com/guides/web/04-complete-profile
Thumbnail created.
{'__class__': 'Link', 'uid': 'mendeley-profile', 'date': '2019-11-14T15:14:19Z', 'title': '03. Complete your profile | Mendeley', 'description': '', 'thumbnail': '/thumbnail/mendeley-profile.png', 'url_more': 'https://www.mendeley.com/guides/web/04-complete-profile'}




https://lindseyjh.ca/ Lindsey Heagy Lindsey J Heagy
Lindsey Heagy's personal website.


Unique identifier lindsey-heagy


Screenshot created of  https://lindseyjh.ca/
Thumbnail created.
{'__class__': 'Link', 'uid': 'lindsey-heagy', 'date': '2019-11-14T15:14:34Z', 'title': 'Lindsey J Heagy', 'description': "Lindsey Heagy's personal website.", 'thumbnail': '/thumbnail/lindsey-heagy.png', 'url_more': 'https://lindseyjh.ca/'}




Could not connect to  https://pentandra.com
https://twitter.com/EvanBianco Evan Bianco Evan Bianco (@EvanBianco) | Twitter
The latest Tweets from Evan Bianco (@EvanBianco). Agile* | Earth | Subsurface | Geoscience | Machine Learning |. Nova Scotia, Canada


Unique identifier evan-bianco


Screenshot created of  https://twitter.com/EvanBianco
Thumbnail created.
{'__class__': 'Link', 'uid': 'evan-bianco', 'date': '2019-11-14T15:14:51Z', 'title': 'Evan Bianco (@EvanBianco) | Twitter', 'description': 'The latest Tweets from Evan Bianco (@EvanBianco). Agile* | Earth | Subsurface | Geoscience | Machine Learning |. Nova Scotia, Canada', 'thumbnail': '/thumbnail/evan-bianco.png', 'url_more': 'https://twitter.com/EvanBianco'}






In [7]:

# Write the accumulated link records back to disk as key-sorted, indented JSON.
with open('website/content/links.json', mode='w') as links_file:
    json.dump(LINKS, fp=links_file, sort_keys=True, indent=4)

In [12]:
# Manually run the screenshot script for whatever values `url` and `uid`
# currently hold in the kernel (IPython expands `$url` / `$uid` into the
# shell command before it is executed).
! node screenshot.js --url "$url" --id $uid --path website/static/thumbnail

Screenshot created of  https://scholar.google.ca/citations?user=k8DpDMMAAAAJ&hl=en
Thumbnail created.


In [67]:
# Prompt twice with 'hello'; the entered values are discarded.
for attempt in range(2):
    input('hello')

hello asdf
hello asdf


In [18]:
! node screenshot.js --url $url --id $uid --path website/static/thumbnail

{ _: [],
  url: 'https://www.theguardian.com/science/2017/jun/27/profitable-business-scientific-publishing-bad-for-science',
  id: 'guardian-publishing',
  path: 'website/static/thumbnail',
  '$0': 'screenshot.js' }
Screenshot created of  https://www.theguardian.com/science/2017/jun/27/profitable-business-scientific-publishing-bad-for-science
Thumbnail created.


# Rewrite the original document

In [8]:
def process_links(line):
    """Rewrite the ``<a>`` tags in one line of HTML as ``<ink-a>`` tags.

    Internal links (href not starting with ``//`` or ``http``) use the href
    with surrounding slashes stripped as their uid. External links are
    rewritten only when their href is a key of ``URL_TO_UID``. A rewritten tag
    loses its ``href`` and ``target`` attributes and gains ``src="/<uid>"``.
    Anchors without an href and unknown external links keep their tag name
    and attributes.

    Args:
        line (str): a single line of the content file, indentation included.

    Returns:
        str: the re-serialised line, with its leading whitespace replaced by
        the same number of spaces.
    """
    # lxml is named explicitly because the wrapper stripping at the end relies
    # on the <html><body>[<p>] scaffolding that parser puts around a fragment;
    # a different guessed parser would leave stray wrapper markup in the output.
    soup = BeautifulSoup(line, 'lxml')

    stripped = line.lstrip()
    indent = ' ' * (len(line) - len(stripped))

    for link in soup.find_all('a'):
        url = link.attrs.get('href')
        if url is None:
            # e.g. <a name="..."> anchors: nothing to resolve, leave untouched.
            continue
        if not (url.startswith('//') or url.startswith('http')):
            uid = url.strip('/')
        elif url in URL_TO_UID:
            uid = URL_TO_UID[url]
        else:
            continue
        link.name = 'ink-a'
        link.attrs.pop('target', None)
        link.attrs.pop('href', None)
        link.attrs['src'] = '/' + uid

    html = str(soup)
    # startswith() is safe on an all-whitespace line, unlike indexing [0].
    if stripped.startswith('<'):
        body = html.replace('<html><body>', '').replace('</body></html>', '')
    else:
        # A line that starts with bare text also has a <p> wrapper to remove.
        body = html.replace('<html><body><p>', '').replace('</p></body></html>', '')

    return indent + body

In [9]:
# Rewrite the article in place. Every line is read (and converted if it
# contains an anchor tag) before the same path is reopened for writing.
# Context managers guarantee both handles are closed, even on error.
CONTENT_PATH = 'website/content/on-my-website-2018/content.html'

fout = []
with open(CONTENT_PATH) as source:
    for line in source:
        if '<a' in line:
            line = process_links(line)
        fout.append(line)

with open(CONTENT_PATH, 'w') as f:
    f.writelines(fout)