# UbuWeb Film Scrape

## *Breaking it Down*

In [1]:
## First, let's create a list of URLs for individual films on UbuWeb

from bs4 import BeautifulSoup
from urllib.request import urlopen

item_url = 'http://www.ubu.com/film/'

page_html = urlopen(item_url).read().decode('utf8')

soup = BeautifulSoup(page_html, 'lxml')

In [2]:
all_links = soup.find_all('a')

all_links

[<a href="http://www.ubu.com">UbuWeb</a>,
 <a href="./40.html">40jahrevideokunst.de</a>,
 <a href="./abu-ali.html">Mustafa Abu Ali</a>,
 <a href="./abramovic.html">Marina Abramoviç</a>,
 <a href="./acconci.html">Vito Acconci</a>,
 <a href="./abdoh.html">Reza Abdoh</a>,
 <a href="./adachi.html">Masao Adachi</a>,
 <a href="./adbusters.html">Adbusters</a>,
 <a href="./ader.html">Bas Jan Ader</a>,
 <a href="./adkins.html">Terry Adkins</a>,
 <a href="./ahwesh.html">Peggy Ahwesh</a>,
 <a href="./ahtila.html">Eija-Liisa Ahtila</a>,
 <a href="./aitken.html">Doug Aitken</a>,
 <a href="./aleinikov.html">Igor and Gleb Aleinikov</a>,
 <a href="./alive.html">Alive from Off Center</a>,
 <a href="./allora.html">Allora &amp; Calzadilla</a>,
 <a href="./althamer.html">Paweł Althamer</a>,
 <a href="./alys.html">Francis Alÿs</a>,
 <a href="./amiralay.html">Omar Amiralay</a>,
 <a href="./aaf.html">An American Family</a>,
 <a href="./andersen.html">Erik Anderson</a>,
 <a href="./anderson_l.html">Laurie And

In [3]:
all_urls = [item['href'] for item in all_links]

all_urls

['http://www.ubu.com',
 './40.html',
 './abu-ali.html',
 './abramovic.html',
 './acconci.html',
 './abdoh.html',
 './adachi.html',
 './adbusters.html',
 './ader.html',
 './adkins.html',
 './ahwesh.html',
 './ahtila.html',
 './aitken.html',
 './aleinikov.html',
 './alive.html',
 './allora.html',
 './althamer.html',
 './alys.html',
 './amiralay.html',
 './aaf.html',
 './andersen.html',
 './anderson_l.html',
 './andriessen.html',
 './anger.html',
 './angerame.html',
 './ant_farm.html',
 './antin.html',
 './elsken_appel.html',
 './applebroog.html',
 './arakawa.html',
 './arcangel.html',
 './arnold.html',
 './art-language.html',
 './artensemble.html',
 './artaud.html',
 './avaf.html',
 './ashley.html',
 './assayas.html',
 './astaire_bojangles.html',
 './atkins.html',
 './attar.html',
 './auder.html',
 './ayari.html',
 './b.html',
 './bacon.html',
 './bag.html',
 './bailey.html',
 './baillie.html',
 './bal-blanc.html',
 './baldessari.html',
 './baldwin.html',
 './balerdi.html',
 './ballard.h

In [4]:
## Converting relative links to absolute links

cleaned_urls = []

for url in all_urls:
    if 'http' not in url:
        url = url.replace('./', '')
        url = 'http://ubu.com/film/' + url
    cleaned_urls.append(url)
        
cleaned_urls

['http://www.ubu.com',
 'http://ubu.com/film/40.html',
 'http://ubu.com/film/abu-ali.html',
 'http://ubu.com/film/abramovic.html',
 'http://ubu.com/film/acconci.html',
 'http://ubu.com/film/abdoh.html',
 'http://ubu.com/film/adachi.html',
 'http://ubu.com/film/adbusters.html',
 'http://ubu.com/film/ader.html',
 'http://ubu.com/film/adkins.html',
 'http://ubu.com/film/ahwesh.html',
 'http://ubu.com/film/ahtila.html',
 'http://ubu.com/film/aitken.html',
 'http://ubu.com/film/aleinikov.html',
 'http://ubu.com/film/alive.html',
 'http://ubu.com/film/allora.html',
 'http://ubu.com/film/althamer.html',
 'http://ubu.com/film/alys.html',
 'http://ubu.com/film/amiralay.html',
 'http://ubu.com/film/aaf.html',
 'http://ubu.com/film/andersen.html',
 'http://ubu.com/film/anderson_l.html',
 'http://ubu.com/film/andriessen.html',
 'http://ubu.com/film/anger.html',
 'http://ubu.com/film/angerame.html',
 'http://ubu.com/film/ant_farm.html',
 'http://ubu.com/film/antin.html',
 'http://ubu.com/film/elske

In [5]:
creator_urls = [item for item in cleaned_urls if item!='http://www.ubu.com']

creator_urls

['http://ubu.com/film/40.html',
 'http://ubu.com/film/abu-ali.html',
 'http://ubu.com/film/abramovic.html',
 'http://ubu.com/film/acconci.html',
 'http://ubu.com/film/abdoh.html',
 'http://ubu.com/film/adachi.html',
 'http://ubu.com/film/adbusters.html',
 'http://ubu.com/film/ader.html',
 'http://ubu.com/film/adkins.html',
 'http://ubu.com/film/ahwesh.html',
 'http://ubu.com/film/ahtila.html',
 'http://ubu.com/film/aitken.html',
 'http://ubu.com/film/aleinikov.html',
 'http://ubu.com/film/alive.html',
 'http://ubu.com/film/allora.html',
 'http://ubu.com/film/althamer.html',
 'http://ubu.com/film/alys.html',
 'http://ubu.com/film/amiralay.html',
 'http://ubu.com/film/aaf.html',
 'http://ubu.com/film/andersen.html',
 'http://ubu.com/film/anderson_l.html',
 'http://ubu.com/film/andriessen.html',
 'http://ubu.com/film/anger.html',
 'http://ubu.com/film/angerame.html',
 'http://ubu.com/film/ant_farm.html',
 'http://ubu.com/film/antin.html',
 'http://ubu.com/film/elsken_appel.html',
 'http:/

In [6]:
len(creator_urls)

844

## *Parsing a creator page*

In [7]:
## Now let's figure out how to parse the page of an individual filmmaker
# Choosing an artist url at random:

import random

creator_url = random.choice(creator_urls)

creator_url

'http://ubu.com/film/chang.html'

In [8]:
## Downloading the page and creating a BeautifulSoup object

page_html = urlopen(creator_url).read().decode('utf8')

creator_soup = BeautifulSoup(page_html, 'lxml')

In [9]:
page_title = creator_soup.find('title').get_text()

page_title

'UbuWeb Film & Video - Rob Pruitt'

In [10]:
creator = page_title.replace('UbuWeb Film & Video: ', '')

creator

'UbuWeb Film & Video - Rob Pruitt'

In [11]:
item_links = creator_soup.find_all('a')

item_links

[<a href="http://www.ubu.com/film/index.html"><font color="F00">Film</font></a>,
 <a href="http://www.ubu.com/"><font color="F00">UbuWeb</font></a>,
 <a href="chang_invocation.html">Invocation for a Wandering Lake, Part 1 (2014)</a>,
 <a href="chang_current.html">Current (2012)</a>,
 <a href="chang_minor.html">Minor (2011) </a>,
 <a href="chang_rather.html">Rather to Potentialities (2009)</a>,
 <a href="chang_flotsam.html">Flotsam Jetsam (2007) with David Kelley</a>,
 <a href="chang_shangri.html">Shangri-La (2005) </a>]

In [12]:
## Let's try that agian, filtering out the irrelevant links
# (Note that all links to individual works on UbuWeb Film appear to be relative links.)

item_urls = ['http://www.ubu.com/film/' + item['href'] for item in item_links if 'http' not in item['href']]

item_urls

['http://www.ubu.com/film/chang_invocation.html',
 'http://www.ubu.com/film/chang_current.html',
 'http://www.ubu.com/film/chang_minor.html',
 'http://www.ubu.com/film/chang_rather.html',
 'http://www.ubu.com/film/chang_flotsam.html',
 'http://www.ubu.com/film/chang_shangri.html']

## *Parsing an item page*

In [13]:
## Now let's choose a single film's URL at random and figure out how to parse an individual film's page

item_url = random.choice(item_urls)

item_url

'http://www.ubu.com/film/chang_flotsam.html'

In [14]:
## Downloading the page and creating a BeautifulSoup object

page_html = urlopen(item_url).read().decode('utf8')

item_soup = BeautifulSoup(page_html, 'lxml')

# (This might return an error from time to time, due to charcater encoding issues. 
# We'll deal with this issue in the next cell.

In [15]:
## Using try/except blocks to neatly handle text encoding issues

try:
        page_html = urlopen(item_url).read().decode('utf8')
except:
        page_html = str(urlopen(item_url).read(), errors='replace')

item_soup = BeautifulSoup(page_html, 'lxml')

In [16]:
## Creating a recipe to locate the film's title, using the contents of the HTML `<title></title>` tags

film_title = ':'.join(item_soup.find('title').get_text().split(':')[1:]).split(' - ')[1]

film_title

'Flotsam Jetsam (2007) with David Kelley'

In [17]:
## Extracting one or more links to videos, as identified by the HTML `<span class="worknav"></span>` tags

video_links = item_soup.find_all('a', {"class": "navmovie"})

video_links

[<a class="navmovie" href="http://ubuvideo.memoryoftheworld.org/Chang-Patty_Flotsam-Jetsam.mp4" id="moviename">
 <span class="worknav"><span id="ubuwork">Flotsam Jetsam (2007) with David Kelley</span></span>
 </a>]

In [18]:
## Creating a list of URLs for video files
# (This will usually just be one URL, pointing to an MP4, M4V, or AVI video.)

video_file_urls = [item['href'] for item in video_links]

video_file_urls

['http://ubuvideo.memoryoftheworld.org/Chang-Patty_Flotsam-Jetsam.mp4']

## *Running the Metadata Scrape*

In [None]:
## Now let's create a function that takes the URL for a creator's UbuWeb Film
# page and returns a list of lists containing metadata for each title

from bs4 import BeautifulSoup
from urllib.request import urlopen
import csv
import time
import os

def get_creator_metadata(creator_url):
    temp_metadata_lol = []
    try:
        page_html = urlopen(creator_url).read().decode('utf8')
    except:
        page_html = unicode(urlopen(creator_url).read(), errors='replace')
    creator_soup = BeautifulSoup(page_html, 'lxml')
    page_title = creator_soup.find('title').get_text()
    creator = page_title.replace('UbuWeb Film & Video: ', '')
    item_links = creator_soup.find_all('a')
    item_urls = ['http://www.ubu.com/film/' + item['href'] for item in item_links if 'http' not in item['href']]
    for item_url in item_urls:
        try:
            page_html = urlopen(item_url).read().decode('utf8')
        except:
            page_html = unicode(urlopen(creator_url).read(), errors='replace')
        item_soup = BeautifulSoup(page_html, 'lxml')
        try:
            film_title = ':'.join(item_soup.find('title').get_text().split(':')[1:]).split(' - ')[1]
        except:
            film_title = item_soup.find('title').get_text()
        video_links = item_soup.find_all('a', {"class": "navmovie"})
        video_file_urls = [item['href'] for item in video_links]
        row = [creator, film_title, '|'.join(video_file_urls), item_url]  ## If a page links to multiple video files, we'll separate them with the pipe character.
        temp_metadata_lol.append(row)
        time.sleep(0.1)
    return temp_metadata_lol

In [None]:
## Grabbing film metadta for a randomly chosen creator URL

creator_url = random.choice(creator_urls)

get_creator_metadata(creator_url)

In [None]:
## Running the batch metadata scrape and writing to a CSV as we go

try: os.mkdir('/sharedfolder/ubu_film/')
except: pass

os.chdir('/sharedfolder/ubu_film/')

master_film_metadata_lol = []

header = ["Creator", "Title", "Video File URL(s)", "Item Page URL"]

for url in creator_urls:
    try:
        master_film_metadata_lol += get_creator_metadata(creator_url)
        with open('ubu_film_metadata.csv', 'w') as file_out:           ## Writing a new CSV file every time 
            csv_writer = csv.writer(file_out)                          # we add a new batch of rows to our table.
            csv_writer.writerow(header)                                # (This makes it possible to check our  
            csv_writer.writerows(master_film_metadata_lol)                 # progress and nip problems in the bud.)
        print('DONE: ' + url)
    except Exception as e:
        error_row = ['PARSE ERROR: ' + url ,'', '', '']
        master_film_metadata_lol.append(error_row)
        print('** ERROR: ' + url + ' **')
        print(e)
    time.sleep(0.3)

In [None]:
## Looking at 5 randomly selected rows in our master film metadata table

random.sample(master_film_metadata_lol, 5)

## *Downloading videos*

In [19]:
import csv

csv_path = 'ubu_film_metadata.csv'

video_table = []

with open(csv_path) as file_in:
    csv_reader = csv.reader(file_in)
    for row in csv_reader:
        video_table.append(row)


In [20]:
## Checking the table length and viewing a random row

import random

print(len(video_table))

print(random.choice(video_table))

2527
['Steve Reich', 'A New Musical Language (documentary, 1987)', 'http://ubuvideo.memoryoftheworld.org/Reich-Steve_New-Musical-Language_1987.iphone.m4v', 'http://www.ubu.com/film/reich_new.html']


In [21]:
## Moving the header row to a separate variable

header_row = video_table[0]

video_table = video_table[1:]

header_row

['Creator', 'Title', 'Video File URL(s)', 'Item Page URL']

In [22]:
## Converting the 'Video File URL(s)' column in our table to a list,
# then checking its length and 

video_url_list = [row[2] for row in video_table]

len(video_url_list)

2526

In [23]:
## Viewing 5 randomly chosen video URLs

from pprint import pprint

pprint(random.sample(video_url_list, 5))

['http://ubuvideo.memoryoftheworld.org/Logue-Joan_ReichSpot.mp4',
 'http://ubuvideo.memoryoftheworld.org/Logue-Joan_ReichSpot.mp4',
 'http://ubuvideo.memoryoftheworld.org/Logue-Joan_ReichSpot.mp4',
 'http://ubuvideo.memoryoftheworld.org/Reich-Steve-Ensemble-Modern-in-Tokyo.iphone.m4v',
 'http://ubuvideo.memoryoftheworld.org/Reich-Steve-Ensemble-Modern-in-Tokyo.iphone.m4v']


In [28]:
## Shuffling the order of our URL list

random.shuffle(video_url_list)

In [26]:
import os

try: os.mkdir('/sharedfolder/ubu_film/')
except: pass

try: os.mkdir('/sharedfolder/ubu_film/ubu_video_files')
except: pass

os.chdir('/sharedfolder/ubu_film/ubu_video_files')

In [None]:
## Downloading the first 10 videos in our shuffled list

import subprocess
import time

for url in video_url_list[:10]:
    try:
        subprocess.call(['wget', url])
        print('DONE: ' + url)
    except Exception as e:
        print('** ERROR: ' + url+ " **")
        print(3)
    time.sleep(0.4)