# UbuWeb Film Scrape

## *Breaking it Down*

In [None]:
## First, let's create a list of URLs for the individual creator pages on UbuWeb Film

from bs4 import BeautifulSoup
from urllib.request import urlopen

# URL of the UbuWeb Film index page; the links found on it are treated below
# as candidate creator pages
item_url = 'http://www.ubu.com/film/'

# Use a context manager so the HTTP response is closed as soon as it is read
with urlopen(item_url) as response:
    page_html = response.read().decode('utf8')

# Parse the downloaded HTML with the lxml parser
soup = BeautifulSoup(page_html, 'lxml')

In [None]:
# Collect every anchor (<a>) tag found in the parsed index page
all_links = soup.find_all('a')

# Display the tags for inspection
all_links

In [None]:
# Pull the link target out of every anchor. Anchors without an href attribute
# (e.g. named anchors) are skipped, because item['href'] raises KeyError on them.
all_urls = [item['href'] for item in all_links if item.has_attr('href')]

all_urls

In [None]:
## Converting relative links to absolute links

from urllib.parse import urljoin

# The page the links were scraped from; relative links are resolved against it.
# This is the same 'www.ubu.com' base used for item URLs elsewhere in the notebook.
index_url = 'http://www.ubu.com/film/'

# urljoin leaves absolute URLs untouched and resolves './x' and '../x' correctly.
# Stripping './' by hand would turn '../x' into '.x', and a substring test for
# 'http' would misclassify relative links that merely contain those letters.
cleaned_urls = [urljoin(index_url, url) for url in all_urls]

cleaned_urls

In [None]:
# Keep every cleaned URL except the bare link back to the site root
creator_urls = [
    link
    for link in cleaned_urls
    if link != 'http://www.ubu.com'
]

creator_urls

In [None]:
# Number of candidate creator-page URLs collected above
len(creator_urls)

In [None]:
## Now let's figure out how to parse the page of an individual artist
# Choosing an artist url at random:

import random

# random.choice picks one element of creator_urls; re-running this cell may
# select a different page
creator_url = random.choice(creator_urls)

creator_url

In [None]:
## Downloading the page and creating a BeautifulSoup object

# The context manager closes the HTTP response once the body has been read
with urlopen(creator_url) as response:
    page_html = response.read().decode('utf8')

creator_soup = BeautifulSoup(page_html, 'lxml')

In [None]:
# Text of the page's <title> element (tag-name access is shorthand for find('title'))
page_title = creator_soup.title.get_text()

page_title

In [None]:
# Remove the 'UbuWeb Film & Video: ' prefix from the page title; what is left
# is presumably just the creator's name
creator = page_title.replace('UbuWeb Film & Video: ', '')

creator

In [None]:
# Collect every anchor (<a>) tag on the creator page, relevant or not
item_links = creator_soup.find_all('a')

item_links

In [None]:
## Let's try that again, filtering out the irrelevant links
# (Note that all links to individual works on UbuWeb Film appear to be relative links.)

# Anchors without an href attribute are skipped: item['href'] would raise
# KeyError on them. Links containing 'http' are treated as absolute and dropped.
item_urls = [
    'http://www.ubu.com/film/' + item['href']
    for item in item_links
    if item.has_attr('href') and 'http' not in item['href']
]

item_urls

In [None]:
## Now let's choose a single film's URL at random and figure out how to parse the page

# Rebinds item_url (previously the index page URL) to one randomly chosen work
item_url = random.choice(item_urls)

item_url

In [None]:
## Downloading the page and creating a BeautifulSoup object

# The context manager closes the HTTP response once the body has been read
with urlopen(item_url) as response:
    page_html = response.read().decode('utf8')

item_soup = BeautifulSoup(page_html, 'lxml')

In [None]:
# Take everything after the first ':' in the <title> text, then keep the second
# ' - '-separated piece (presumably the page title reads 'prefix: creator - title')
title_text = item_soup.find('title').get_text()
film_title = title_text.partition(':')[2].split(' - ')[1]

film_title

In [None]:
# Anchors carrying the CSS class "navmovie"; their href values are read as the
# video file URLs in the next cell
video_links = item_soup.find_all('a', {"class": "navmovie"})

video_links

In [None]:
# Link target of each "navmovie" anchor found on the item page
video_file_urls = [
    anchor['href']
    for anchor in video_links
]

video_file_urls

## *Running the Scrape*

In [None]:
## Now let's create a function that takes the URL for a creator's UbuWeb Film
# page and returns a list of lists containing metadata for each title

from bs4 import BeautifulSoup
from urllib.request import urlopen
import csv
import time

# Pick a random creator page to try the function on (it is called right after
# the definition below)
creator_url = random.choice(creator_urls)

def _fetch_html(url):
    """Download *url* and return its body decoded as UTF-8 text.

    Bytes that are not valid UTF-8 are dropped instead of raising, so a single
    badly encoded page does not abort the scrape.
    """
    # The context manager closes the HTTP response as soon as the body is read.
    with urlopen(url) as response:
        return response.read().decode('utf8', errors='ignore')


def get_creator_metadata(creator_url):
    """Scrape one creator page and every work page it links to.

    Args:
        creator_url: URL of a creator's page on UbuWeb Film.

    Returns:
        A list of rows, one per relative link found on the creator page. Each
        row is ``[creator, film_title, video_file_urls, item_url]``, where
        ``video_file_urls`` is a ``'|'``-joined string of the ``href`` values
        of the item page's ``a.navmovie`` links.

    Raises:
        AttributeError: if a downloaded page has no ``<title>`` element.
        Any error raised by ``urlopen`` while downloading a page propagates
        to the caller.
    """
    creator_soup = BeautifulSoup(_fetch_html(creator_url), 'lxml')
    page_title = creator_soup.find('title').get_text()
    creator = page_title.replace('UbuWeb Film & Video: ', '')

    # Links to individual works are relative, so anything containing 'http' is
    # dropped. Anchors without an href are skipped as well, since link['href']
    # would raise KeyError on them.
    item_urls = [
        'http://www.ubu.com/film/' + link['href']
        for link in creator_soup.find_all('a')
        if link.has_attr('href') and 'http' not in link['href']
    ]

    temp_metadata_lol = []
    for item_url in item_urls:
        item_soup = BeautifulSoup(_fetch_html(item_url), 'lxml')
        title_text = item_soup.find('title').get_text()
        try:
            # Text after the first ':' and then after the first ' - '
            film_title = ':'.join(title_text.split(':')[1:]).split(' - ')[1]
        except IndexError:
            # Title does not follow the expected pattern; keep it whole.
            film_title = title_text
        video_links = item_soup.find_all('a', {"class": "navmovie"})
        video_file_urls = [link['href'] for link in video_links if link.has_attr('href')]
        row = [creator, film_title, '|'.join(video_file_urls), item_url]
        temp_metadata_lol.append(row)
        # Short pause between requests to go easy on the server
        time.sleep(0.1)
    return temp_metadata_lol

# Try the function on the randomly chosen creator page and display the rows
get_creator_metadata(creator_url)

In [None]:
# Accumulates one row per film (or one error row per failed creator page)
master_film_metadata_lol = []

header = ["Creator", "Title", "Video File URLs", "Item Page URL"]

for url in creator_urls:
    try:
        # Scrape the creator page for *this* iteration (not the leftover
        # global creator_url from the earlier test cell)
        master_film_metadata_lol += get_creator_metadata(url)
    except Exception as e:
        # Best-effort scrape: record the failure and move on to the next creator
        error_row = ['PARSE ERROR: ' + url, '', '', '']
        master_film_metadata_lol.append(error_row)
        print('** ERROR: ' + url + ' **')
        print(e)
    else:
        print('DONE: ' + url)

    ## Writing a new CSV file every time we add a new batch of rows to our table,
    #  so a crash part-way through still leaves everything collected so far on disk.
    #  All writes happen inside the `with` block, while the file is still open;
    #  newline='' is what the csv module expects for files it writes to.
    with open('ubu_film_metadata.csv', 'w', newline='', encoding='utf-8') as file_out:
        csv_writer = csv.writer(file_out)
        csv_writer.writerow(header)
        csv_writer.writerows(master_film_metadata_lol)

    # Pause between creators to go easy on the server
    time.sleep(0.3)

In [None]:
## Looking at up to 5 randomly selected rows

# random.sample raises ValueError when asked for more items than the list
# holds, so cap the sample size at the number of rows actually collected
random.sample(master_film_metadata_lol, min(5, len(master_film_metadata_lol)))