Web crawling is a common task. Here I'm using some newer libraries to speed up the job while still keeping the code simple.

A comparison shows these libraries can download 32 videos in 00:20, compared to about 02:00 using a synchronous, single-threaded pipeline.

In [None]:
%%bash

# Remove the standalone `typing` package before installing; it is noted
# here as causing trouble for gazpacho.
pip uninstall -y typing  # trouble for gazpacho
# asks (async HTTP client), trio (async runtime), gazpacho (HTML scraping).
pip install asks trio gazpacho

In [None]:
from pathlib import Path

import asks
import trio
import gazpacho as gzp
from tqdm.notebook import tqdm

## Check for existing videos.

In [None]:
# Make sure the download directory exists. parents=True also creates any
# missing ancestor directory instead of raising FileNotFoundError.
Path('/home/vids').mkdir(parents=True, exist_ok=True)

# File names of every .mp4 found recursively under the input dataset;
# get_links() consults this list to skip videos that are already present.
path_gen = Path('../input/parler').glob('**/*.mp4')
existing = [p.name for p in path_gen]
print(f'{len(existing)} videos in the dataset.')
      

## Use Gazpacho to get links from the index page.

In [None]:
def get_links(start_page, limit=32):
    """Scrape the index page and return the video links still to download.

    Args:
        start_page: URL of the index page that links to the videos.
        limit: Maximum number of links to return. Defaults to 32, the
            batch size used for testing; pass ``None`` for no cap.

    Returns:
        A list of ``href`` values, in page order, that contain ``.mp4``
        and are not in the module-level ``existing`` list.
    """
    html = gzp.get(start_page)
    soup = gzp.Soup(html)
    # mode='all' always yields a list. The default mode may hand back a
    # single Soup object (one match) or None (no match), which would break
    # the iteration below.
    ancs = soup.find('a', mode='all') or []
    # Anchors without an href are skipped rather than raising KeyError.
    refs = [a.attrs.get('href') for a in ancs if a.attrs]
    known = set(existing)  # constant-time membership tests
    vids_new = [r for r in refs if r and '.mp4' in r and r not in known]
    vids_new = vids_new[:limit]
    print(f'Getting {len(vids_new)} videos.')
    return vids_new

## Asynchronous, concurrent way to download

In [None]:
async def fetch_vid(s, vid):
    """Download one video and return its raw bytes.

    Args:
        s: Session object whose awaitable ``get(url)`` performs the request
            (an ``asks`` session in this notebook).
        vid: File name of the video, appended to the site's base URL.

    Returns:
        The response body (``r.content``).

    Raises:
        Whatever ``r.raise_for_status()`` raises for an error status
        (``asks.errors.BadStatus`` for an asks response).
    """
    url = f"https://www.tommycarstensen.com/terrorism/{vid}"
    r = await s.get(url)
    # Fail loudly on HTTP errors so an error page is never saved as a video.
    r.raise_for_status()
    return r.content


async def save_vid(s, vid):
    """Download one video and write it to ``/home/vids/<vid>``.

    Args:
        s: Session passed through to ``fetch_vid``.
        vid: File name of the video; also used as the local file name.
    """
    content = await fetch_vid(s, vid)
    filename = f"/home/vids/{vid}"
    # trio.Path performs the blocking write in a worker thread, so writing
    # a large file does not stall the other downloads on the event loop.
    await trio.Path(filename).write_bytes(content)

    
async def main(start_page):
    """Download every new video linked from ``start_page`` concurrently.

    Args:
        start_page: URL of the index page; its scheme and host are also
            used as the session's base location.
    """
    links = get_links(start_page)
    # Keep only 'scheme://host' from the index URL.
    base_url = '/'.join(start_page.split('/', 3)[:3])
    session = asks.sessions.Session(base_url, connections=16)
    # One task per video; the nursery block exits once all have finished.
    async with trio.open_nursery() as nursery:
        for name in links:
            nursery.start_soon(save_vid, session, name)


In [None]:
%%time

# Index page that main() scrapes for video links.
start_page = 'https://www.tommycarstensen.com/terrorism/index.html'
# Run the async main() to completion; the %%time magic times the whole cell.
trio.run(main, start_page)

## Regular way to download

In [None]:
import requests

def fetch_vid(vid, timeout=30):
    """Download one video synchronously and return its raw bytes.

    Args:
        vid: File name of the video, appended to the site's base URL.
        timeout: Seconds to wait for the connection and for each chunk of
            the response before giving up. Without a timeout a stalled
            server would block the loop forever.

    Returns:
        The response body as bytes.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    url = f"https://www.tommycarstensen.com/terrorism/{vid}"
    # stream=True is dropped: the body is read in full via .content right
    # away, so deferring the download gained nothing.
    r = requests.get(url, timeout=timeout)
    # Fail loudly on HTTP errors so an error page is never saved as a video.
    r.raise_for_status()
    return r.content


def save_vid(vid):
    """Download one video and write it to ``/home/vids/<vid>``.

    Args:
        vid: File name of the video; also used as the local file name.
    """
    payload = fetch_vid(vid)
    target = Path(f"/home/vids/{vid}")
    # Creates or truncates the file and writes the bytes in one call.
    target.write_bytes(payload)

    
def main(start_page):
    """Download every new video linked from ``start_page``, one at a time.

    Args:
        start_page: URL of the index page listing the videos.
    """
    # tqdm wraps the link list to show a progress bar in the notebook.
    for name in tqdm(get_links(start_page)):
        save_vid(name)


In [None]:
# Same index page as the async run.
start_page = 'https://www.tommycarstensen.com/terrorism/index.html'
# NOTE(review): `main` is defined twice in this notebook (async and sync);
# this call assumes the synchronous definition was the last one executed.
main(start_page)

In [None]:
%%bash

# Optional: bundle the downloaded videos into a single archive (left disabled).
# zip -r capitol_vids3.zip /home/vids/*.mp4
# Show the first 10 entries of the download directory; -U skips sorting.
ls -U  /home/vids | head -10