# UbuWeb Film Scrape

## *Breaking it Down*

In [None]:
## First, let's create a list of URLs for individual films on UbuWeb

from bs4 import BeautifulSoup
from urllib.request import urlopen

item_url = 'http://www.ubu.com/film/'

# Read the index page inside a `with` block so the HTTP response is closed
# as soon as its body has been downloaded.
with urlopen(item_url) as response:
    page_html = response.read().decode('utf8')

soup = BeautifulSoup(page_html, 'lxml')

In [None]:
## Collecting every <a> tag found on the index page

all_links = soup.find_all(name='a')

all_links

In [None]:
## Using a list comprehension to extract the URL from each link
# (Anchors without an `href` attribute are skipped; indexing them with
# item['href'] would raise a KeyError.)

all_urls = [item['href'] for item in all_links if item.has_attr('href')]

all_urls

In [None]:
## Converting relative links to absolute links
# urljoin() resolves './' and '../' segments against the index page URL and
# leaves links that are already absolute as they are.

from urllib.parse import urljoin

base_url = 'http://www.ubu.com/film/'   # same host the rest of the notebook uses

cleaned_urls = [urljoin(base_url, url) for url in all_urls]

cleaned_urls

In [None]:
## Dropping every link that is exactly 'http://www.ubu.com'

creator_urls = [link for link in cleaned_urls if link != 'http://www.ubu.com']

creator_urls

In [None]:
## Removing duplicates
# dict.fromkeys() keeps the first occurrence of each URL and preserves the
# order in which the links were collected, so the list is the same from run
# to run (the order produced by list(set(...)) is not).

creator_urls = list(dict.fromkeys(creator_urls))

len(creator_urls)

## *Parsing a creator page*

In [None]:
## Now let's figure out how to parse the page of an individual filmmaker

# Choosing an artist url at random:

import random

# random.choice() returns one element of creator_urls; it raises IndexError if the list is empty.
creator_url = random.choice(creator_urls)

creator_url

In [None]:
## Downloading the page and creating a BeautifulSoup object
# The response is closed once its body has been read, and bytes that are not
# valid UTF-8 become U+FFFD instead of raising UnicodeDecodeError.

with urlopen(creator_url) as response:
    page_html = response.read().decode('utf8', errors='replace')

creator_soup = BeautifulSoup(page_html, 'lxml')

In [None]:
## Reading the text inside the page's <title></title> element

title_tag = creator_soup.find('title')
page_title = title_tag.get_text()

page_title

In [None]:
## Extracting the creator's name from the page title
# Every occurrence of the text 'UbuWeb Film & Video: ' is removed;
# a title that does not contain it is left unchanged.

creator = page_title.replace('UbuWeb Film & Video: ', '')

creator

In [None]:
## Collecting every <a> tag found on the creator's page

item_links = creator_soup.find_all(name='a')

item_links

In [None]:
## Let's try that again, filtering out the irrelevant links
# (Note that all links to individual works on UbuWeb Film appear to be relative links.)
# Anchors without an `href` are skipped (item['href'] would raise a KeyError),
# and a link counts as absolute only when it *starts* with 'http'.

item_urls = ['http://www.ubu.com/film/' + item['href']
             for item in item_links
             if item.has_attr('href') and not item['href'].startswith('http')]

item_urls

## *Parsing an item page*

In [None]:
## Now let's choose a single film's URL at random and figure out how to parse an individual film's page
# (random.choice() raises IndexError if item_urls is empty.)

item_url = random.choice(item_urls)

item_url

In [None]:
## Downloading the page and creating a BeautifulSoup object

page_html = urlopen(item_url).read().decode('utf8')

item_soup = BeautifulSoup(page_html, 'lxml')

# (This might raise an error from time to time, due to character encoding issues.
# We'll deal with this issue in the next cell.)

In [None]:
## Using try/except blocks to neatly handle text encoding issues
# The page is downloaded once; only the decoding step is retried.

with urlopen(item_url) as response:
    page_bytes = response.read()

try:
    page_html = page_bytes.decode('utf8')
except UnicodeDecodeError:
    # Substitute U+FFFD for any bytes that are not valid UTF-8.
    page_html = page_bytes.decode('utf8', errors='replace')

item_soup = BeautifulSoup(page_html, 'lxml')

In [None]:
## Building a recipe that pulls the film's title out of the HTML `<title></title>` text

title_text = item_soup.find('title').get_text()
text_after_colon = title_text.partition(':')[2]     # everything following the first ':'
film_title = text_after_colon.split(' - ')[1]       # second ' - '-separated piece

film_title

In [None]:
## Extracting one or more links to videos, as identified by `<a class="navmovie"></a>` tags

video_links = item_soup.find_all('a', class_='navmovie')

video_links

In [None]:
## Creating a list of URLs for video files
# (This will usually just be one URL, pointing to an MP4, M4V, or AVI video.)
# Each URL is the link's `href` value, copied without modification.

video_file_urls = [item['href'] for item in video_links]

video_file_urls

## *Running the Metadata Scrape*

In [None]:
## Now let's create a function that takes the URL for a creator's UbuWeb Film
# page and returns a list of lists containing metadata for each title

from bs4 import BeautifulSoup
from urllib.request import urlopen
import csv
import time
import os

def _fetch_html(url):
    """Download *url* and return its body as text.

    The body is decoded as UTF-8; bytes that are not valid UTF-8 are replaced
    with U+FFFD rather than raising ``UnicodeDecodeError``.  Errors raised by
    ``urlopen`` propagate to the caller.
    """
    with urlopen(url) as response:
        return response.read().decode('utf8', errors='replace')


def _parse_film_title(page_title):
    """Return the film title embedded in an item page's ``<title>`` text.

    Keeps everything after the first ':' and returns the second
    ' - '-separated piece of it.  When there is no such piece, the whole
    title text is returned unchanged.
    """
    pieces = page_title.partition(':')[2].split(' - ')
    return pieces[1] if len(pieces) > 1 else page_title


def get_creator_metadata(creator_url):
    """Scrape one creator's UbuWeb Film page and the item pages it links to.

    Args:
        creator_url: URL of the creator's page.

    Returns:
        A list of rows, one per item page that could be downloaded, each of
        the form ``[creator, film_title, video_file_urls, item_url]``.  When
        an item page links to several video files, their URLs are joined
        with the pipe character ('|').

    Raises:
        OSError: if the creator page itself cannot be downloaded
            (``urllib.error.URLError`` is a subclass of ``OSError``).
        AttributeError: if a downloaded page has no ``<title>`` element.
    """
    creator_soup = BeautifulSoup(_fetch_html(creator_url), 'lxml')
    creator = creator_soup.find('title').get_text().replace('UbuWeb Film & Video: ', '')

    # Only relative links are followed; anchors without an href are ignored.
    item_urls = [
        'http://www.ubu.com/film/' + link['href']
        for link in creator_soup.find_all('a')
        if link.has_attr('href') and 'http' not in link['href']
    ]

    temp_metadata_lol = []
    for item_url in item_urls:
        try:
            item_html = _fetch_html(item_url)
        except OSError as err:
            # Skip an item page that cannot be downloaded instead of parsing
            # some other page in its place.
            print('** SKIPPED: ' + item_url + ' **')
            print(err)
        else:
            item_soup = BeautifulSoup(item_html, 'lxml')
            film_title = _parse_film_title(item_soup.find('title').get_text())
            video_links = item_soup.find_all('a', {"class": "navmovie"})
            video_file_urls = [link['href'] for link in video_links]
            ## If a page links to multiple video files, we'll separate them with the pipe character.
            temp_metadata_lol.append([creator, film_title, '|'.join(video_file_urls), item_url])
        time.sleep(0.1)   # brief pause between requests
    return temp_metadata_lol

In [None]:
## Grabbing film metadata for a randomly chosen creator URL
# (This downloads the creator's page plus every item page it links to.)

example_creator_url = random.choice(creator_urls)

get_creator_metadata(example_creator_url)

In [None]:
## Running the batch metadata scrape and writing to a CSV as we go

os.makedirs('/sharedfolder/ubu_film/', exist_ok=True)   # no error if the directory already exists

os.chdir('/sharedfolder/ubu_film/')

master_film_metadata_lol = []

header = ["Creator", "Title", "Video File URL(s)", "Item Page URL"]

for url in creator_urls:
    try:
        master_film_metadata_lol.extend(get_creator_metadata(url))
        print('DONE: ' + url)
    except Exception as e:
        error_row = ['PARSE ERROR: ' + url ,'', '', '']
        master_film_metadata_lol.append(error_row)
        print('** ERROR: ' + url + ' **')
        print(e)
    ## Writing a new CSV file every time we add a new batch of rows to our table.
    # (This makes it possible to check our progress and nip problems in the bud.)
    # The file is rewritten after failures too, so error rows always reach the CSV.
    # newline='' is what the csv module asks for, to avoid stray blank lines.
    with open('ubu_film_metadata.csv', 'w', newline='') as file_out:
        csv_writer = csv.writer(file_out)
        csv_writer.writerow(header)
        csv_writer.writerows(master_film_metadata_lol)
    time.sleep(0.3)

In [None]:
## Looking at (up to) 5 randomly selected rows in our master film metadata table
# random.sample() raises ValueError when asked for more items than the list
# holds, so the sample size is capped at the table length.

random.sample(master_film_metadata_lol, min(5, len(master_film_metadata_lol)))

## *Downloading videos*

In [None]:
## Loading video metadata table from the CSV file we just created

import os
import csv

os.makedirs('/sharedfolder/ubu_film/', exist_ok=True)   # no error if the directory already exists

os.chdir('/sharedfolder/ubu_film/')

csv_path = 'ubu_film_metadata.csv'

# newline='' lets the csv module handle line endings itself, including
# newlines embedded in quoted fields.
with open(csv_path, newline='') as file_in:
    video_table = list(csv.reader(file_in))


In [None]:
## Checking the table length and viewing a random row

import random

# Number of rows currently in the table, followed by one row picked at random.
print(len(video_table))

print(random.choice(video_table))

In [None]:
## Splitting the table into its header row and its data rows

header_row, video_table = video_table[0], video_table[1:]

header_row

In [None]:
## Converting the 'Video File URL(s)' column in our table to a list,
# then removing duplicates and checking its length
# A cell may hold several URLs joined with '|' (or none at all, e.g. in
# 'PARSE ERROR' rows), so each cell is split and empty entries are dropped.

video_url_list = []

for row in video_table:
    if len(row) > 2:
        video_url_list.extend(url for url in row[2].split('|') if url)

video_url_list = list(set(video_url_list))

len(video_url_list)

In [None]:
## Viewing 3 randomly chosen video URLs

from pprint import pprint

pprint(random.sample(video_url_list, 3))

In [None]:
## Shuffling the order of our URL list
# (random.shuffle() reorders the list in place.)

random.shuffle(video_url_list)

# Dropping every URL that contains '.iphone.'
# NOTE(review): presumably these are iPhone-specific copies of videos -- confirm.
video_url_list = [item for item in video_url_list if '.iphone.' not in item]

len(video_url_list)

In [None]:
## Creating a new directory for the files

import os

# os.makedirs() creates any missing parent directories too, and exist_ok=True
# makes it a no-op when the directory is already there.
os.makedirs('/sharedfolder/ubu_film/ubu_video_files', exist_ok=True)

os.chdir('/sharedfolder/ubu_film/ubu_video_files')

In [None]:
## Downloading the first 10 videos in our shuffled list

import subprocess
import time

for url in video_url_list[:10]:
    try:
        # subprocess.call() returns wget's exit status; it does not raise when
        # the download fails, so the status has to be checked explicitly.
        exit_status = subprocess.call(['wget', url])
        if exit_status == 0:
            print('DONE: ' + url)
        else:
            print('** ERROR: ' + url + " **")
            print('wget exit status: ' + str(exit_status))
    except Exception as e:
        # Reached when wget itself cannot be started (e.g. it is not installed).
        print('** ERROR: ' + url+ " **")
        print(e)
    time.sleep(0.4)

# This will probably run fairly slowly.

In [None]:
## Downloading the next 90 videos

import subprocess
import time

for url in video_url_list[10:100]:
    try:
        # subprocess.call() returns wget's exit status; it does not raise when
        # the download fails, so the status has to be checked explicitly.
        exit_status = subprocess.call(['wget', url])
        if exit_status == 0:
            print('DONE: ' + url)
        else:
            print('** ERROR: ' + url + " **")
            print('wget exit status: ' + str(exit_status))
    except Exception as e:
        # Reached when wget itself cannot be started (e.g. it is not installed).
        print('** ERROR: ' + url+ " **")
        print(e)
    time.sleep(0.4)