# Publishing the Unpublishable Scrape

In [None]:
from bs4 import BeautifulSoup
from urllib.request import urlopen

## Fetching the page source and parsing it into a BeautifulSoup tree

url = 'http://www.ubu.com/ubu/unpub.html'

# Using `with` closes the HTTP response as soon as the body has been read,
# instead of leaving the connection open for the life of the notebook.
with urlopen(url) as response:
    page_html = response.read().decode('utf8')

# `page_html` is kept as well: a later cell splits the raw source text directly.
soup = BeautifulSoup(page_html, 'lxml')

In [None]:
## Collecting every <a> (anchor) element found in the parsed page

all_links = soup.find_all('a')

# Displaying the result so we can eyeball what was found
all_links

In [None]:
## Pulling the link target (href) out of each <a> tag

# Anchors that carry no href attribute (e.g. <a name="top">) would raise a
# KeyError on item['href'], so they are skipped.
all_urls = [item['href'] for item in all_links if item.has_attr('href')]

all_urls

In [None]:
## Converting relative links to absolute links

from urllib.parse import urljoin

# The address that './' is resolved against.
base_url = 'http://ubu.com/ubu/'

# urljoin resolves './x', '../x' and bare 'x' against base_url and returns
# already-absolute URLs unchanged. A plain substring replace of './' is not
# safe here: it also matches the './' inside '../', which would produce
# '.http://ubu.com/ubu/...'.
# The comprehension variable is named `href` so that the page address stored
# in `url` is not overwritten.
cleaned_urls = [urljoin(base_url, href) for href in all_urls]

cleaned_urls

In [None]:
## Keeping only the links that point at PDFs

# The comparison is done on a lower-cased copy so that '.PDF' is kept too;
# the URL itself is stored unchanged.
pdf_urls = list(filter(lambda link: '.pdf' in link.lower(), cleaned_urls))

pdf_urls

In [None]:
## **Downloading our list of PDFs**
# (This might take a minute or two.)

import os
import subprocess
import time

download_dir = '/sharedfolder/ubu_unpub/'

# exist_ok=True tolerates a folder that already exists (so the cell can be
# re-run) while still surfacing real problems such as a permissions error.
os.makedirs(download_dir, exist_ok=True)

# wget saves into the current working directory; the CSV written at the end
# of the notebook uses a relative path and therefore lands here as well.
os.chdir(download_dir)

failed_urls = []

for pdf_url in pdf_urls:
    # subprocess.call returns wget's exit status; non-zero means failure.
    if subprocess.call(['wget', pdf_url]) != 0:
        failed_urls.append(pdf_url)
    # Short pause between requests (presumably to go easy on the server).
    time.sleep(0.5)

# Reporting anything that could not be fetched instead of failing silently
if failed_urls:
    print('Could not download:')
    for pdf_url in failed_urls:
        print(pdf_url)

In [None]:
## Counting how many PDF links we found

len(pdf_urls)

In [None]:
## The same arrow image appears before each line on the page, so its <img> tag
# works as a delimiter for cutting the source code into chunks, one per publication

arrow_tag = '<img src="images/arrow_orange.gif" border="0" align="middle">'

chunks = page_html.split(arrow_tag)

# Element 0 is whatever precedes the first arrow, so it is dropped.
link_lines = chunks[1:]

len(link_lines) ## Checking the length of our list; it should be the same as the number of PDFs.

In [None]:
## Choosing a line from the page at random

import random

# `line` is the sample chunk that the next few cells experiment on.
line = random.choice(link_lines)

line

In [None]:
## Figuring out how to extract the author field
# The author text sits between the first </font> and the first link.

after_font = line.split('</font>')[1]           # everything after the first </font>
before_link = after_font.split('<a href')[0]    # ...up to the first '<a href'
creator = before_link.strip().split('<br')[0]   # ...cut at the first '<br', if present

creator

In [None]:
## Figuring out how to extract the title field
# The title is the text of the first link in the chunk.

item_soup = BeautifulSoup(line, 'lxml')

first_link = item_soup.find('a')
title = first_link.get_text()

title

In [None]:
## Figuring out how to extract the PDF filename
# The filename is whatever follows the last '/' in the first link's href.

link_target = item_soup.find('a')['href']
filename = link_target.rsplit('/', 1)[-1]

filename

In [None]:
## Putting it all together to generate a metadata table
# We're creating a table in the form of a list of lists, each row represented by a list of fields

metadata_lol = []

for line in link_lines:
    try:
        creator = line.split('</font>')[1].split('<a href')[0].strip().split('<br')[0]
        item_soup = BeautifulSoup(line, 'lxml')
        link = item_soup.find('a')
        title = link.get_text()
        filename = link['href'].split('/')[-1]
    except (IndexError, AttributeError, KeyError):
        # Only the failures this parsing can produce are caught:
        #   IndexError     - no '</font>' in the chunk
        #   AttributeError - no <a> tag, so find() returned None
        #   KeyError       - the <a> tag has no href
        # Anything else (including interrupting the kernel) is left to propagate.
        print(line)
    else:
        row = [creator, title, filename]
        metadata_lol.append(row)

# We should get one error along the way, at which point we'll print the offending line.
# In this case we can disregard it.

In [None]:
## Checking the number of rows in our table

len(metadata_lol)

# Q: Why did we find 59 PDFs but only 58 metadata records?
# Hint: look at the line that was printed while the table was being built.

In [None]:
## Writing our metadata table to a CSV file

import csv

header = ["Creator", "Title", "Filename"]

# newline='' lets the csv module control line endings itself, as its
# documentation requires; encoding='utf-8' keeps non-ASCII characters in
# names and titles intact regardless of the platform's default encoding.
with open('ubu_unpub_metadata.csv', 'w', newline='', encoding='utf-8') as file_out:
    csv_writer = csv.writer(file_out)
    csv_writer.writerow(header)
    csv_writer.writerows(metadata_lol)

# Open your newly created CSV in LibreOffice Calc or another spreadsheet 
# program and check whether you need to make any corrections.