In [None]:
import json
import pathlib
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

In [None]:
def _third_next_sibling(node):
    """Return the node three ``next_sibling`` hops after ``node``.

    Returns None when the sibling chain ends before the third hop.
    """
    for _ in range(3):
        if node is None:
            return None
        node = node.next_sibling
    return node


def remove_excess_from_soup(soup):
    """Remove listings flagged as ``metadata-only`` or ``external`` from a page.

    For each ``<p>`` tag carrying one of those classes, both the tag itself
    and the node three siblings after it are destroyed with ``decompose()``.
    NOTE(review): which element sits three siblings away depends on page
    markup that is not visible here -- presumably the matching listing entry.

    Args:
        soup: Parsed document exposing ``find_all``; it is modified in place.

    Returns:
        The same ``soup`` object, after the removals.
    """
    for marker_class in ("metadata-only", "external"):
        markers = soup.find_all("p", marker_class)
        # Look up every companion node before destroying anything, so sibling
        # navigation always runs on the still-intact tree.
        companions = [_third_next_sibling(marker) for marker in markers]

        for marker in markers:
            marker.decompose()
        for companion in companions:
            # A marker near the end of its parent may have no third sibling;
            # skip it instead of failing on None.
            if companion is not None:
                companion.decompose()

    return soup

In [None]:
def grab_senior_project_information(url):
    """Collect title, author and PDF link for every project in a collection.

    Fetches ``url`` and strips non-PDF listings with
    ``remove_excess_from_soup``. If the page has a
    ``div.adjacent-pagination``, every page linked from it is fetched and
    cleaned the same way. On each page, ``p.pdf`` and ``p.article-listing``
    tags are paired by position.

    Args:
        url: Address of the collection's listing page.

    Returns:
        A list of dicts with the keys ``"title"`` (text of the listing's
        link), ``"author"`` (text following that link, minus its first two
        characters) and ``"url"`` (href of the PDF link).
    """
    page = requests.get(url)
    first_soup = remove_excess_from_soup(BeautifulSoup(page.content, "html.parser"))

    all_soups = []
    pagination = first_soup.find("div", {"class": "adjacent-pagination"})
    if pagination is not None:
        # Pagination hrefs may be relative, so resolve them against the URL
        # that was actually served; anchors without an href are ignored.
        page_urls = [urljoin(page.url, anchor.get("href"))
                     for anchor in pagination.find_all("a")
                     if anchor.get("href")]
        all_soups = [
            remove_excess_from_soup(
                BeautifulSoup(requests.get(page_url).content, "html.parser"))
            for page_url in page_urls
        ]
    if not all_soups:
        # No pagination, or a pagination block without usable links: the page
        # already fetched is the only one to read.
        all_soups = [first_soup]

    projects = []
    for page_soup in all_soups:
        pdf_paragraphs = page_soup.find_all("p", "pdf")
        article_paragraphs = page_soup.find_all("p", "article-listing")
        # Entries are matched purely by position, which is why unmatched
        # listings have to be removed from the soup beforehand.
        for pdf_paragraph, article_paragraph in zip(pdf_paragraphs, article_paragraphs):
            title_anchor = article_paragraph.a
            pdf_anchor = pdf_paragraph.a
            if title_anchor is None or pdf_anchor is None:
                # Malformed entry without a link; nothing to record.
                continue

            trailing_text = title_anchor.next_sibling
            # The first two characters of the trailing text are dropped --
            # presumably a ", " separator before the author name (TODO confirm).
            # Entries with no trailing text get an empty author instead of raising.
            author = trailing_text[2:] if isinstance(trailing_text, str) else ""

            projects.append({"title": title_anchor.getText(),
                             "author": author,
                             "url": pdf_anchor.get("href")})
    return projects

In [None]:
def _fetch_series_home(page_url):
    """Fetch ``page_url`` and return its ``div#series-home`` tag, or None if absent."""
    html = requests.get(page_url).content
    return BeautifulSoup(html, "html.parser").find("div", {"id": "series-home"})


def grab_collections(url="http://digitalcommons.bard.edu/undergraduate/"):
    """Crawl the landing page and gather the projects of every collection.

    Each class-less link inside the landing page's ``div#series-home`` is
    followed. On the page it leads to, the first class-less link inside
    ``div#series-home`` is taken as the collection and passed to
    ``grab_senior_project_information``.

    Args:
        url: Address of the landing page that lists the collections.

    Returns:
        A list of dicts with the keys ``"name"`` (the collection link's
        ``title`` attribute), ``"url"`` (its resolved href) and ``"data"``
        (the list returned by ``grab_senior_project_information``). The list
        is empty when the landing page has no ``div#series-home``.
    """
    landing = _fetch_series_home(url)
    if landing is None:
        return []

    collections = []
    for listing_link in landing.find_all("a", {"class": False}):
        listing_href = listing_link.get("href")
        if not listing_href:
            # An anchor without a target cannot be followed.
            continue
        # hrefs may be relative, so resolve them against the page they came from.
        listing_url = urljoin(url, listing_href)

        series_home = _fetch_series_home(listing_url)
        if series_home is None:
            # A page with an unexpected layout should not abort the whole crawl.
            continue
        collection_link = series_home.find("a", {"class": False})
        if collection_link is None or not collection_link.get("href"):
            continue

        collection_url = urljoin(listing_url, collection_link.get("href"))
        collections.append({"name": collection_link.get("title"),
                            "url": collection_url,
                            "data": grab_senior_project_information(collection_url)})
    return collections

In [None]:
def download_pdfs(data):
    """Download every project's PDF into a folder named after its collection.

    A directory ``./<name>`` is created for each collection (an existing one
    is reused, so the function can be re-run). Each project is saved there as
    ``<title> by <author>.pdf``; ``:`` in the title becomes ``- `` and ``/``
    in the title or author becomes ``-`` so the name cannot be read as a
    nested path. A project that cannot be downloaded or written is reported
    with ``print`` and skipped, and no file is created for it.

    Args:
        data: Iterable of dicts with the keys ``"name"`` and ``"data"``, where
            ``"data"`` is an iterable of dicts with the keys ``"title"``,
            ``"author"`` and ``"url"``.

    Returns:
        None.
    """
    for folder in data:
        folder_path = pathlib.Path("./" + folder["name"])
        # exist_ok keeps a second run from failing on folders made by the first.
        folder_path.mkdir(parents=True, exist_ok=True)
        for sproj in folder["data"]:
            try:
                file_name = (sproj["title"].replace(":", "- ").replace("/", "-")
                             + " by "
                             + sproj["author"].replace("/", "-")
                             + ".pdf")
                # Download before opening the target, so a failed request
                # does not leave an empty .pdf behind.
                response = requests.get(sproj["url"])
                # Treat HTTP error responses as failures rather than saving
                # the error page under a .pdf name.
                response.raise_for_status()
                with open(folder_path / file_name, "wb") as f:
                    f.write(response.content)
            except Exception as e:
                # Deliberately best-effort: report the failure and carry on.
                print("{}: Unable to download the following file:\n{}".format(e, sproj))

In [None]:
%%time
collections = grab_collections()
with open("all_senior_projects.json", "w") as injson:
    json.dump(collections, injson, sort_keys=True, indent=4)

In [None]:
%%time
# Download the PDF of every project in `collections` (built by the crawl cell
# above) into one folder per collection under the current working directory.
download_pdfs(collections)