# ScrapANDdownloadPDFs

* This notebook will scrape the University of Detroit Mercy's *Black Abolitionist Archive* for ~900 PDFs (after I clean the data only ~750 will remain). 
* The archive organizes its texts alphabetically by author. There are then two more pages you must click through to download the PDF. As a result, I will create three functions to sift through the three pages. I will then put these functions together in a for-loop to collect all of my intended documents.
* I will then download these PDFs to my data folder.

View the full archive here: https://libraries.udmercy.edu/find/special_collections/digital/baa/index.php.

### Notebook set up

In [290]:
import os
import random
from bs4 import BeautifulSoup
import requests
from string import ascii_uppercase 
import time
import json

In [282]:
os.chdir('/Commjhub/jupyterhub/comm318_fall2019/sstrickberger/comm313_S21/comm313_S21_Final_Project')

In [283]:
os.getcwd()

'/Commjhub/jupyterhub/comm318_fall2019/sstrickberger/comm313_S21/comm313_S21_Final_Project'

In [8]:
# We need to create base URLS for later in the process

base_url = 'https://libraries.udmercy.edu/find/special_collections/digital/baa/'
base_url_letter = 'https://libraries.udmercy.edu/find/special_collections/digital/baa/index.php?browseBy=DC_creator&letter={}&cloud=n'

In [9]:
# This is how the {} will work...here's an example:
#text = 'test{}'
#text.format('love')

### Function to sift through each letter for author URLS

In [20]:
def find_author_URL(letter_URL):
    """
    
    This function sifts through every letter URL -- 
    e.g. the webpage with all A authors -- 
    to pull out the document page URL.
    
    ARGUMENT: letter_URL is the URL for each letter webpage
    VALUES: URLS for each author's homepage listed on the initial link
    
    """
    
    locate_letter_page = requests.get(letter_URL)
    letter_page_text = BeautifulSoup(locate_letter_page.text, 'html.parser')
    author_codes = letter_page_text.find_all('p')

    author_links = []

    for author_code in author_codes:
        author_link = base_url + author_code.find('a').attrs['href'] #find all author links
        author_links.append(author_link)
        

    
    
    return author_links

### Function to sift through all document of a given author

In [26]:
def find_all_documents(author_URL):
    
    """
    
    This function helps sift through all the documents
    of a given author's page.(Most have one, but some have multiple.)
    At the end, it spits out each specific document page links.
    
    ARGUMENT: author_URL is the webpage on which all documents are listed that they wrote
    VALUES: URLS for each document written by the author of the URL
    
    """
    
    locate_author_documents_page = requests.get(author_URL)
    author_documents_page = BeautifulSoup(locate_author_documents_page.text, 'html.parser')

    all_links = author_documents_page.find_all('a', class_='listLink') #find each link's general location

    document_links = [] #create new list to hold all the document links
    for link in all_links:
        document_link = base_url + link['href'] #paste the base link together with the cleaned up found link
        document_links.append(document_link)
    
    return document_links

In [35]:
## TEST?

#url = "https://libraries.udmercy.edu/find/special_collections/digital/baa/index.php?collectionCode=baa&field=DC_creator&term=%22Wake%2C+Ransom+F.%22"
url = 'https://libraries.udmercy.edu/find/special_collections/digital/baa/index.php?collectionCode=baa&field=DC_creator&term=%22White%2C+Jacob+C.%2C+d.+1872%22'
documents_URL = find_all_documents(url)

### Function to pull out each PDF and download it

In [304]:
def final_PDF_URL(document_URL, output_folder):
    
    """
    This function provides the final main step of my crawling. 
    It will find the PDF links for URL's given document.
    
    ARGUMENT: document_URL is the page that houses the PDF links
    VALUES: Downlaod  each PDF document.
    """
    
    #LOADING IN THE page with the PDF
    locate_document_page = requests.get(document_URL)
    document_page = BeautifulSoup(locate_document_page.text, 'html.parser')

    document_page.text.find('Click to view PDF')

    if document_page.text.find('Click to view PDF') == -1:
        print('not a PDF')
        return None

    #Search for bolded (aka important) notes regarding the data
    key_terms_dict = {}
    keys = []
    values = []
    for p in document_page.find_all('p'):

        try: 
            if len(p.find('strong')):

                keys.append(p.text.split(": ")[0])
                values.append(p.text.split(": ")[1])

        except:
            print("No 'strong' in this line")

    for i in range(len(keys)):
        key_terms_dict[keys[i]] = values[i]

    # not all have date published...fill that in here
    try:
        if len(key_terms_dict['Date published'])>1:
            key_terms_dict['Data_note'] = "Complete"
    except:
        key_terms_dict['Date published'] = "Unknown"
        key_terms_dict['Data_note'] = "Incomplete"
    
    # Find the PDF Link 
    PDF_URL = document_page.find('td').find('a')['href']
    
    # NEED HELP!!
    # Download PDF LINK -- NEED HELP
    #PDF = requests.get(PDF_URL)
    

    
    # name file by Title (author) and date published
    filename = f'{key_terms_dict["Title"]}_{key_terms_dict["Date published"]}'

    # in case a file already exists, add digits at the end [TAKEN FROM JAKE DIRECTLY]
    if os.path.isfile(os.path.join(output_folder,filename+'.pdf')):
        filename += "_" + str(random.randint(100,999)) 

    # Create the file name in the dictionary [TAKEN FROM JAKE DIRECTLY]
    key_terms_dict['filename'] = filename + '.pdf'

    # save PDF [TAKEN FROM JAKE DIRECTLY]
    print(f'saving -- {filename}.pdf')
    PDF = requests.get(PDF_URL)
    with open(os.path.join(output_folder,filename+'.pdf'),'wb') as pdf:
        pdf.write(PDF.content)
    
    
   # with open(output_folder + filename + '_sample.json','w', encoding='UTF-8') as out:
    #    out.write(json.dumps(filename))
        
    return key_terms_dict


### Now, let's bring these functions together in a for-loop

In [238]:
#ascii_uppercase

'ABCDEFGHIJKLMNOPQRSTUVWXYZ'

In [305]:
document_json

[{'Date published': 'Unknown',
  'Description of file(s)': 'PDF 3 page, 630 word document (text and images)',
  'Keywords': 'Mental Feast; moral development; Philadelphia; women',
  'Newspaper or publication': 'Liberator',
  'Publication type': 'Newspapers; Speeches',
  'Speaker or author': 'Douglass, S. M. (Sarah Mapps), 1806-1882',
  'Subjects': 'Abolitionists--United States; African American abolitionists; Antislavery movements--United States; Slavery; United States--History--19th century',
  'Title': 'Sarah M. Douglass'},
 {'Date published': '1849-09-13',
  'Description of file(s)': 'PDF 2 page, 433 word document (text and images)',
  'Keywords': 'audio; citizenship; Civil rights; education; franchise; freedom; politics; privileges; vote',
  'Newspaper or publication': 'Liberator',
  'Publication type': 'Newspapers; Speeches',
  'Speaker or author': 'Africanus, S. M.',
  'Subjects': 'Abolitionists--United States; African American abolitionists; Antislavery movements--United States;

In [306]:
#test here

document_URL_test = 'https://libraries.udmercy.edu/find/special_collections/digital/baa/item.php?record_id=756&collectionCode=baa'
#for document_URL in document_page_URL_test:
            
time.sleep(1.5)
print('processing', document_URL_test)
        
doc_retrieve = final_PDF_URL(document_URL_test,'test_data/') # download the doc and get doc info
document_json.append(doc_retrieve) # save doc info to json
            

            
with open('test_data/metadata_sample.json','w', encoding='UTF-8') as out:
    out.write(json.dumps(document_json))

        
# now read through this meta data and pull out relevant info, to sort through docs
            
            
            ##download_pdf = final_PDF_URL(document_URL, 'test_data/') #download and get pdf info
            #document_json.append(download_pdf) # save doc info to json        

processing https://libraries.udmercy.edu/find/special_collections/digital/baa/item.php?record_id=756&collectionCode=baa
No 'strong' in this line
No 'strong' in this line
saving -- William Henry Hall_1863-11-11_484.pdf


In [307]:
document_json = []

#sift through every letter page for the authors on it
for letter in ascii_uppercase:  
    
    time.sleep(1.5)
    print('processing', letter)
    
    letter_URL = base_url_letter.format(letter) #find the URL for the first letter, etc
    
    authors_URL = find_author_URL(letter_URL) #find the URL for each author on the letter page 

    # sift through the authors page for all of their documents
    for author_URL in authors_URL:
        
        time.sleep(1.5)
        print('processing', author_URL)
    
        documents_URL = find_all_documents(author_URL)
        
        # find PDF document
        for document_URL in documents_URL:
            
            time.sleep(1.5)
            print('processing', document_URL)
        
            download_pdf = final_PDF_URL(document_URL, 'data/') #download and get pdf info
            document_json.append(download_pdf) # save doc info to json    
            
            

processing A
processing https://libraries.udmercy.edu/find/special_collections/digital/baa/index.php?collectionCode=baa&field=DC_creator&term="Africanus%2C+S.+M."
processing https://libraries.udmercy.edu/find/special_collections/digital/baa/item.php?record_id=533&collectionCode=baa
No 'strong' in this line
No 'strong' in this line
saving -- S. M. Africanus_1849-09-13_979.pdf
processing https://libraries.udmercy.edu/find/special_collections/digital/baa/index.php?collectionCode=baa&field=DC_creator&term="Alexander%2C+S.+R."
processing https://libraries.udmercy.edu/find/special_collections/digital/baa/item.php?record_id=2164&collectionCode=baa
No 'strong' in this line
No 'strong' in this line
saving -- Solomon R. Alexander_1840-08-03.pdf
processing https://libraries.udmercy.edu/find/special_collections/digital/baa/index.php?collectionCode=baa&field=DC_creator&term="Allan+A.+Lane+to+H%5Benry%5D+W%5Bard%5D+Beecher"
processing https://libraries.udmercy.edu/find/special_collections/digital/ba

processing https://libraries.udmercy.edu/find/special_collections/digital/baa/item.php?record_id=1603&collectionCode=baa
No 'strong' in this line
No 'strong' in this line
saving -- Amos G. Beman_1847-10-06.pdf
processing https://libraries.udmercy.edu/find/special_collections/digital/baa/item.php?record_id=1866&collectionCode=baa
No 'strong' in this line
No 'strong' in this line
saving -- Amos G. Beman_1839-08-01.pdf
processing https://libraries.udmercy.edu/find/special_collections/digital/baa/index.php?collectionCode=baa&field=DC_creator&term="Beman%2C+Jehiel+C."
processing https://libraries.udmercy.edu/find/special_collections/digital/baa/item.php?record_id=1946&collectionCode=baa
No 'strong' in this line
No 'strong' in this line
saving -- Jehiel C. Beman_1843-08-01.pdf
processing https://libraries.udmercy.edu/find/special_collections/digital/baa/index.php?collectionCode=baa&field=DC_creator&term="Bibb%2C+Henry%2C+b.+1815"
processing https://libraries.udmercy.edu/find/special_collecti

processing https://libraries.udmercy.edu/find/special_collections/digital/baa/item.php?record_id=31&collectionCode=baa
No 'strong' in this line
No 'strong' in this line
saving -- William Wells Brown_1849.pdf
processing https://libraries.udmercy.edu/find/special_collections/digital/baa/item.php?record_id=55&collectionCode=baa
No 'strong' in this line
No 'strong' in this line
saving -- William Wells Brown_1854-10-23.pdf
processing https://libraries.udmercy.edu/find/special_collections/digital/baa/item.php?record_id=74&collectionCode=baa
No 'strong' in this line
No 'strong' in this line
saving -- William Wells Brown_1849-07-16.pdf
processing https://libraries.udmercy.edu/find/special_collections/digital/baa/item.php?record_id=112&collectionCode=baa
No 'strong' in this line
No 'strong' in this line
saving -- William Wells Brown_1859.pdf
processing https://libraries.udmercy.edu/find/special_collections/digital/baa/item.php?record_id=288&collectionCode=baa
No 'strong' in this line
No 'strong

processing https://libraries.udmercy.edu/find/special_collections/digital/baa/item.php?record_id=507&collectionCode=baa
No 'strong' in this line
No 'strong' in this line
saving -- Robert Campbell_1861_422.pdf
processing https://libraries.udmercy.edu/find/special_collections/digital/baa/item.php?record_id=2060&collectionCode=baa
No 'strong' in this line
No 'strong' in this line
saving -- Robert Campbell_1861_954.pdf
processing https://libraries.udmercy.edu/find/special_collections/digital/baa/index.php?collectionCode=baa&field=DC_creator&term="Caples%2C+Charles+V."
processing https://libraries.udmercy.edu/find/special_collections/digital/baa/item.php?record_id=1208&collectionCode=baa
No 'strong' in this line
No 'strong' in this line
saving -- Charles V. Caples_1835-10-05.pdf
processing https://libraries.udmercy.edu/find/special_collections/digital/baa/index.php?collectionCode=baa&field=DC_creator&term="Cary%2C+Mary+Ann+Shadd%2C+1823-1893"
processing https://libraries.udmercy.edu/find/sp

processing https://libraries.udmercy.edu/find/special_collections/digital/baa/item.php?record_id=2649&collectionCode=baa
No 'strong' in this line
No 'strong' in this line
saving -- Samuel E. Cornish_1839.pdf
processing https://libraries.udmercy.edu/find/special_collections/digital/baa/index.php?collectionCode=baa&field=DC_creator&term="Craft%2C+William"
processing https://libraries.udmercy.edu/find/special_collections/digital/baa/item.php?record_id=210&collectionCode=baa
No 'strong' in this line
No 'strong' in this line
saving -- William Craft_1851.pdf
processing https://libraries.udmercy.edu/find/special_collections/digital/baa/item.php?record_id=280&collectionCode=baa
No 'strong' in this line
No 'strong' in this line
saving -- William Craft_1851_442.pdf
processing https://libraries.udmercy.edu/find/special_collections/digital/baa/item.php?record_id=565&collectionCode=baa
No 'strong' in this line
No 'strong' in this line
saving -- William Craft_1851_382.pdf
processing https://librarie

processing https://libraries.udmercy.edu/find/special_collections/digital/baa/item.php?record_id=2019&collectionCode=baa
No 'strong' in this line
No 'strong' in this line
saving -- Alexander Crummell_1861_938.pdf
processing https://libraries.udmercy.edu/find/special_collections/digital/baa/item.php?record_id=2149&collectionCode=baa
No 'strong' in this line
No 'strong' in this line
saving -- Alexander Crummell_1865-07-23.pdf
processing https://libraries.udmercy.edu/find/special_collections/digital/baa/item.php?record_id=2273&collectionCode=baa
No 'strong' in this line
No 'strong' in this line
saving -- Alexander Crummell_1854-07-30.pdf
processing D
processing https://libraries.udmercy.edu/find/special_collections/digital/baa/index.php?collectionCode=baa&field=DC_creator&term="Davis%2C+Samuel+H."
processing https://libraries.udmercy.edu/find/special_collections/digital/baa/item.php?record_id=1468&collectionCode=baa
No 'strong' in this line
No 'strong' in this line
saving -- Samuel H. Dav

processing https://libraries.udmercy.edu/find/special_collections/digital/baa/index.php?collectionCode=baa&field=DC_creator&term="Douglas%2C+George"
processing https://libraries.udmercy.edu/find/special_collections/digital/baa/item.php?record_id=6&collectionCode=baa
No 'strong' in this line
No 'strong' in this line
saving -- George Douglas_1843.pdf
processing https://libraries.udmercy.edu/find/special_collections/digital/baa/index.php?collectionCode=baa&field=DC_creator&term="Douglass%2C+H.+Ford"
processing https://libraries.udmercy.edu/find/special_collections/digital/baa/item.php?record_id=213&collectionCode=baa
No 'strong' in this line
No 'strong' in this line
saving -- H. Ford Douglass_1854-08-27.pdf
processing https://libraries.udmercy.edu/find/special_collections/digital/baa/item.php?record_id=386&collectionCode=baa
No 'strong' in this line
No 'strong' in this line
saving -- H. Ford Douglass_1860.pdf
processing https://libraries.udmercy.edu/find/special_collections/digital/baa/it

processing https://libraries.udmercy.edu/find/special_collections/digital/baa/index.php?collectionCode=baa&field=DC_creator&term="Francis%2C+Abner+H."
processing https://libraries.udmercy.edu/find/special_collections/digital/baa/item.php?record_id=266&collectionCode=baa
No 'strong' in this line
No 'strong' in this line
No 'strong' in this line
saving -- Abner H. Francis_1849-08-17.pdf
processing https://libraries.udmercy.edu/find/special_collections/digital/baa/item.php?record_id=865&collectionCode=baa
No 'strong' in this line
No 'strong' in this line
saving -- Abner H. Francis_1849-11-12.pdf
processing https://libraries.udmercy.edu/find/special_collections/digital/baa/item.php?record_id=1854&collectionCode=baa
No 'strong' in this line
No 'strong' in this line
saving -- Abner H. Francis_1848-03-02.pdf
processing G
processing https://libraries.udmercy.edu/find/special_collections/digital/baa/index.php?collectionCode=baa&field=DC_creator&term="G."
processing https://libraries.udmercy.edu

processing https://libraries.udmercy.edu/find/special_collections/digital/baa/item.php?record_id=626&collectionCode=baa
No 'strong' in this line
No 'strong' in this line
saving -- Henry Highland Garnet_1865-02-12_569.pdf
processing https://libraries.udmercy.edu/find/special_collections/digital/baa/item.php?record_id=663&collectionCode=baa
No 'strong' in this line
No 'strong' in this line
saving -- Henry Highland Garnet_1850_487.pdf
processing https://libraries.udmercy.edu/find/special_collections/digital/baa/item.php?record_id=751&collectionCode=baa
No 'strong' in this line
No 'strong' in this line
saving -- Henry Highland Garnet_1862-05-12_323.pdf
processing https://libraries.udmercy.edu/find/special_collections/digital/baa/item.php?record_id=839&collectionCode=baa
No 'strong' in this line
No 'strong' in this line
saving -- Henry Highland Garnet_1850_910.pdf
processing https://libraries.udmercy.edu/find/special_collections/digital/baa/item.php?record_id=854&collectionCode=baa
No 'stro

processing https://libraries.udmercy.edu/find/special_collections/digital/baa/item.php?record_id=219&collectionCode=baa
not a PDF
processing https://libraries.udmercy.edu/find/special_collections/digital/baa/index.php?collectionCode=baa&field=DC_creator&term="Hall%2C+R.%2C+of+California"
processing https://libraries.udmercy.edu/find/special_collections/digital/baa/item.php?record_id=265&collectionCode=baa
No 'strong' in this line
No 'strong' in this line
saving -- R. Hall_1856-12-12.pdf
processing https://libraries.udmercy.edu/find/special_collections/digital/baa/index.php?collectionCode=baa&field=DC_creator&term="Hall%2C+W.+H.+%28William+Henry%29%2C+fl.+1863-1864"
processing https://libraries.udmercy.edu/find/special_collections/digital/baa/item.php?record_id=756&collectionCode=baa
No 'strong' in this line
No 'strong' in this line
saving -- William Henry Hall_1863-11-11.pdf
processing https://libraries.udmercy.edu/find/special_collections/digital/baa/item.php?record_id=1106&collection

processing https://libraries.udmercy.edu/find/special_collections/digital/baa/item.php?record_id=1323&collectionCode=baa
No 'strong' in this line
No 'strong' in this line
saving -- James Theodore Holly_1863-07-26.pdf
processing https://libraries.udmercy.edu/find/special_collections/digital/baa/index.php?collectionCode=baa&field=DC_creator&term="Hood%2C+J.+W.+%28James+Walker%29%2C+1831-1918"
processing https://libraries.udmercy.edu/find/special_collections/digital/baa/item.php?record_id=2645&collectionCode=baa
No 'strong' in this line
No 'strong' in this line
saving -- James Walker Hood_1865-09-29.pdf
processing https://libraries.udmercy.edu/find/special_collections/digital/baa/index.php?collectionCode=baa&field=DC_creator&term="Howard%2C+Shadrach"
processing https://libraries.udmercy.edu/find/special_collections/digital/baa/item.php?record_id=1368&collectionCode=baa
No 'strong' in this line
No 'strong' in this line
saving -- Shadrach Howard_1863.pdf
processing https://libraries.udmercy

processing https://libraries.udmercy.edu/find/special_collections/digital/baa/index.php?collectionCode=baa&field=DC_creator&term="Jones%2C+Josiah%2C+abolitionist"
processing https://libraries.udmercy.edu/find/special_collections/digital/baa/item.php?record_id=1757&collectionCode=baa
No 'strong' in this line
No 'strong' in this line
saving -- Josiah Jones_1842-08-06.pdf
processing https://libraries.udmercy.edu/find/special_collections/digital/baa/index.php?collectionCode=baa&field=DC_creator&term="Jones%2C+Thomas+H."
processing https://libraries.udmercy.edu/find/special_collections/digital/baa/item.php?record_id=2618&collectionCode=baa
No 'strong' in this line
No 'strong' in this line
saving -- Thomas H. Jones_1850.pdf
processing https://libraries.udmercy.edu/find/special_collections/digital/baa/index.php?collectionCode=baa&field=DC_creator&term="Jones%2C+William"
processing https://libraries.udmercy.edu/find/special_collections/digital/baa/item.php?record_id=1233&collectionCode=baa
No 

processing https://libraries.udmercy.edu/find/special_collections/digital/baa/item.php?record_id=141&collectionCode=baa
No 'strong' in this line
No 'strong' in this line
saving -- John W. Lewis_1859.pdf
processing https://libraries.udmercy.edu/find/special_collections/digital/baa/index.php?collectionCode=baa&field=DC_creator&term="Loguen%2C+Jermain+Wesley"
processing https://libraries.udmercy.edu/find/special_collections/digital/baa/item.php?record_id=118&collectionCode=baa
No 'strong' in this line
No 'strong' in this line
saving -- Jermain Wesley Loguen_1859.pdf
processing https://libraries.udmercy.edu/find/special_collections/digital/baa/item.php?record_id=128&collectionCode=baa
No 'strong' in this line
No 'strong' in this line
saving -- Jermain Wesley Loguen_1855.pdf
processing M
processing https://libraries.udmercy.edu/find/special_collections/digital/baa/index.php?collectionCode=baa&field=DC_creator&term="M."
processing https://libraries.udmercy.edu/find/special_collections/digita

processing https://libraries.udmercy.edu/find/special_collections/digital/baa/index.php?collectionCode=baa&field=DC_creator&term="Morrison%2C+Joseph"
processing https://libraries.udmercy.edu/find/special_collections/digital/baa/item.php?record_id=1533&collectionCode=baa
not a PDF
processing https://libraries.udmercy.edu/find/special_collections/digital/baa/index.php?collectionCode=baa&field=DC_creator&term="Myers%2C+Stephen"
processing https://libraries.udmercy.edu/find/special_collections/digital/baa/item.php?record_id=532&collectionCode=baa
not a PDF
processing N
processing https://libraries.udmercy.edu/find/special_collections/digital/baa/index.php?collectionCode=baa&field=DC_creator&term="Nell%2C+William+C.+%28William+Cooper%29%2C+1816-1874."
processing https://libraries.udmercy.edu/find/special_collections/digital/baa/item.php?record_id=227&collectionCode=baa
No 'strong' in this line
No 'strong' in this line
saving -- William C. Nell_1841-09-09.pdf
processing https://libraries.udm

processing https://libraries.udmercy.edu/find/special_collections/digital/baa/item.php?record_id=2383&collectionCode=baa
No 'strong' in this line
No 'strong' in this line
saving -- Nathaniel Paul_1835.pdf
processing https://libraries.udmercy.edu/find/special_collections/digital/baa/item.php?record_id=2524&collectionCode=baa
No 'strong' in this line
No 'strong' in this line
saving -- Nathaniel Paul_1838.pdf
processing https://libraries.udmercy.edu/find/special_collections/digital/baa/index.php?collectionCode=baa&field=DC_creator&term="Paul%2C+Thomas%2C+fl.+1841"
processing https://libraries.udmercy.edu/find/special_collections/digital/baa/item.php?record_id=2395&collectionCode=baa
No 'strong' in this line
No 'strong' in this line
saving -- Thomas Paul_1841-01-27.pdf
processing https://libraries.udmercy.edu/find/special_collections/digital/baa/index.php?collectionCode=baa&field=DC_creator&term="Paulyon%2C+Miss"
processing https://libraries.udmercy.edu/find/special_collections/digital/baa

processing https://libraries.udmercy.edu/find/special_collections/digital/baa/item.php?record_id=1823&collectionCode=baa
No 'strong' in this line
No 'strong' in this line
saving -- William P. Powell_1850-10-01.pdf
processing https://libraries.udmercy.edu/find/special_collections/digital/baa/index.php?collectionCode=baa&field=DC_creator&term="Powers%2C+Jeremiah"
processing https://libraries.udmercy.edu/find/special_collections/digital/baa/item.php?record_id=967&collectionCode=baa
No 'strong' in this line
No 'strong' in this line
No 'strong' in this line
saving -- Jeremiah Powers_1859-12-19.pdf
processing https://libraries.udmercy.edu/find/special_collections/digital/baa/index.php?collectionCode=baa&field=DC_creator&term="Purvis%2C+Robert%2C+1810-1898"
processing https://libraries.udmercy.edu/find/special_collections/digital/baa/item.php?record_id=2468&collectionCode=baa
No 'strong' in this line
No 'strong' in this line
saving -- Charles Lenox Remond_1856-05-28.pdf
processing https://lib

processing https://libraries.udmercy.edu/find/special_collections/digital/baa/item.php?record_id=76&collectionCode=baa
No 'strong' in this line
No 'strong' in this line
saving -- Charles Lenox Remond_1841.pdf
processing https://libraries.udmercy.edu/find/special_collections/digital/baa/item.php?record_id=98&collectionCode=baa
No 'strong' in this line
No 'strong' in this line
saving -- Charles Lenox Remond_1855.pdf
processing https://libraries.udmercy.edu/find/special_collections/digital/baa/item.php?record_id=216&collectionCode=baa
No 'strong' in this line
No 'strong' in this line
saving -- Charles Lenox Remond_1857.pdf
processing https://libraries.udmercy.edu/find/special_collections/digital/baa/item.php?record_id=239&collectionCode=baa
No 'strong' in this line
No 'strong' in this line
saving -- Charles Lenox Remond_1849-04-22.pdf
processing https://libraries.udmercy.edu/find/special_collections/digital/baa/item.php?record_id=378&collectionCode=baa
No 'strong' in this line
No 'strong'

processing https://libraries.udmercy.edu/find/special_collections/digital/baa/index.php?collectionCode=baa&field=DC_creator&term="Rock%2C+John+S.+%28John+Sweat%29%2C+1825-1866"
processing https://libraries.udmercy.edu/find/special_collections/digital/baa/item.php?record_id=582&collectionCode=baa
No 'strong' in this line
No 'strong' in this line
saving -- John S. Rock_1855.pdf
processing https://libraries.udmercy.edu/find/special_collections/digital/baa/item.php?record_id=1239&collectionCode=baa
No 'strong' in this line
No 'strong' in this line
saving -- John S. Rock_1858.pdf
processing https://libraries.udmercy.edu/find/special_collections/digital/baa/item.php?record_id=1912&collectionCode=baa
No 'strong' in this line
No 'strong' in this line
saving -- John S. Rock_1857-08-01.pdf
processing https://libraries.udmercy.edu/find/special_collections/digital/baa/item.php?record_id=184&collectionCode=baa
No 'strong' in this line
No 'strong' in this line
saving -- John Sweat Rock_1860-01-27.pd

processing https://libraries.udmercy.edu/find/special_collections/digital/baa/index.php?collectionCode=baa&field=DC_creator&term="Remond%2C+Charles+Lenox%2C+1810-1873"
processing https://libraries.udmercy.edu/find/special_collections/digital/baa/item.php?record_id=16&collectionCode=baa
No 'strong' in this line
No 'strong' in this line
saving -- Charles Lenox Remond_1847-02-16_555.pdf
processing https://libraries.udmercy.edu/find/special_collections/digital/baa/item.php?record_id=27&collectionCode=baa
No 'strong' in this line
No 'strong' in this line
saving -- Charles Lenox Remond_1843_202.pdf
processing https://libraries.udmercy.edu/find/special_collections/digital/baa/item.php?record_id=76&collectionCode=baa
No 'strong' in this line
No 'strong' in this line
saving -- Charles Lenox Remond_1841_792.pdf
processing https://libraries.udmercy.edu/find/special_collections/digital/baa/item.php?record_id=98&collectionCode=baa
No 'strong' in this line
No 'strong' in this line
saving -- Charles 

processing https://libraries.udmercy.edu/find/special_collections/digital/baa/item.php?record_id=1752&collectionCode=baa
No 'strong' in this line
No 'strong' in this line
No 'strong' in this line
saving -- Sarah Parker Remond_1859_970.pdf
processing https://libraries.udmercy.edu/find/special_collections/digital/baa/item.php?record_id=1805&collectionCode=baa
No 'strong' in this line
No 'strong' in this line
saving -- Sarah Parker Remond_1859_282.pdf
processing https://libraries.udmercy.edu/find/special_collections/digital/baa/item.php?record_id=1933&collectionCode=baa
No 'strong' in this line
No 'strong' in this line
saving -- Sarah Parker Remond_1859_843.pdf
processing https://libraries.udmercy.edu/find/special_collections/digital/baa/index.php?collectionCode=baa&field=DC_creator&term="Rock%2C+John+S.+%28John+Sweat%29%2C+1825-1866"
processing https://libraries.udmercy.edu/find/special_collections/digital/baa/item.php?record_id=582&collectionCode=baa
No 'strong' in this line
No 'strong'

processing https://libraries.udmercy.edu/find/special_collections/digital/baa/item.php?record_id=2205&collectionCode=baa
No 'strong' in this line
No 'strong' in this line
saving -- Jeremiah Burke Sanderson_1855-11-20.pdf
processing https://libraries.udmercy.edu/find/special_collections/digital/baa/index.php?collectionCode=baa&field=DC_creator&term="Scott%2C+Edward%2C+fl.+1857"
processing https://libraries.udmercy.edu/find/special_collections/digital/baa/item.php?record_id=1725&collectionCode=baa
No 'strong' in this line
No 'strong' in this line
saving -- Edward Scott_1858-08-01.pdf
processing https://libraries.udmercy.edu/find/special_collections/digital/baa/item.php?record_id=2401&collectionCode=baa
No 'strong' in this line
No 'strong' in this line
saving -- Edward Scott_1857.pdf
processing https://libraries.udmercy.edu/find/special_collections/digital/baa/index.php?collectionCode=baa&field=DC_creator&term="Sears%2C+Robert"
processing https://libraries.udmercy.edu/find/special_collect

processing https://libraries.udmercy.edu/find/special_collections/digital/baa/item.php?record_id=1378&collectionCode=baa
No 'strong' in this line
No 'strong' in this line
saving -- James McCune Smith_1852-01-13.pdf
processing https://libraries.udmercy.edu/find/special_collections/digital/baa/item.php?record_id=1482&collectionCode=baa
No 'strong' in this line
No 'strong' in this line
saving -- James McCune Smith_1835.pdf
processing https://libraries.udmercy.edu/find/special_collections/digital/baa/item.php?record_id=1697&collectionCode=baa
No 'strong' in this line
No 'strong' in this line
saving -- James McCune Smith_1838-01.pdf
processing https://libraries.udmercy.edu/find/special_collections/digital/baa/item.php?record_id=1895&collectionCode=baa
No 'strong' in this line
No 'strong' in this line
saving -- James McCune Smith_1838-05-08.pdf
processing https://libraries.udmercy.edu/find/special_collections/digital/baa/item.php?record_id=1958&collectionCode=baa
No 'strong' in this line
No 

processing https://libraries.udmercy.edu/find/special_collections/digital/baa/index.php?collectionCode=baa&field=DC_creator&term="Truth%2C+Sojourner%2C+d.+1883"
processing https://libraries.udmercy.edu/find/special_collections/digital/baa/item.php?record_id=2467&collectionCode=baa
No 'strong' in this line
No 'strong' in this line
saving -- Sojouner Truth_1851.pdf
processing https://libraries.udmercy.edu/find/special_collections/digital/baa/item.php?record_id=417&collectionCode=baa
No 'strong' in this line
No 'strong' in this line
saving -- Sojourner Truth_1853.pdf
processing https://libraries.udmercy.edu/find/special_collections/digital/baa/item.php?record_id=850&collectionCode=baa
No 'strong' in this line
No 'strong' in this line
saving -- Sojourner Truth_1854-07-04.pdf
processing https://libraries.udmercy.edu/find/special_collections/digital/baa/item.php?record_id=885&collectionCode=baa
No 'strong' in this line
No 'strong' in this line
saving -- Sojourner Truth_1863-06-03.pdf
process

processing https://libraries.udmercy.edu/find/special_collections/digital/baa/item.php?record_id=122&collectionCode=baa
No 'strong' in this line
No 'strong' in this line
saving -- Samuel Ringgold Ward_1853-06-23.pdf
processing https://libraries.udmercy.edu/find/special_collections/digital/baa/item.php?record_id=178&collectionCode=baa
No 'strong' in this line
No 'strong' in this line
saving -- Samuel Ringgold Ward_1849-01-22.pdf
processing https://libraries.udmercy.edu/find/special_collections/digital/baa/item.php?record_id=342&collectionCode=baa
No 'strong' in this line
No 'strong' in this line
saving -- Samuel Ringgold Ward_1850-04-02.pdf
processing https://libraries.udmercy.edu/find/special_collections/digital/baa/item.php?record_id=408&collectionCode=baa
No 'strong' in this line
No 'strong' in this line
saving -- Samuel Ringgold Ward_1847-05-11.pdf
processing https://libraries.udmercy.edu/find/special_collections/digital/baa/item.php?record_id=461&collectionCode=baa
No 'strong' in t

processing https://libraries.udmercy.edu/find/special_collections/digital/baa/item.php?record_id=1997&collectionCode=baa
No 'strong' in this line
No 'strong' in this line
saving -- William G. Hamilton_1834-06-02_508.pdf
processing https://libraries.udmercy.edu/find/special_collections/digital/baa/index.php?collectionCode=baa&field=DC_creator&term="Williams%2C+H.+M.%2C+fl.+1852"
processing https://libraries.udmercy.edu/find/special_collections/digital/baa/item.php?record_id=2088&collectionCode=baa
No 'strong' in this line
No 'strong' in this line
saving -- H. M. Williams_1851-10-06.pdf
processing https://libraries.udmercy.edu/find/special_collections/digital/baa/index.php?collectionCode=baa&field=DC_creator&term="Williams%2C+James+M.%2C+Rev."
processing https://libraries.udmercy.edu/find/special_collections/digital/baa/item.php?record_id=846&collectionCode=baa
No 'strong' in this line
No 'strong' in this line
saving -- James M. Williams_1860-12-02.pdf
processing https://libraries.udmerc

processing https://libraries.udmercy.edu/find/special_collections/digital/baa/item.php?record_id=2483&collectionCode=baa
No 'strong' in this line
No 'strong' in this line
saving -- William G. Allen_1854_764.pdf
processing https://libraries.udmercy.edu/find/special_collections/digital/baa/index.php?collectionCode=baa&field=DC_creator&term="Alston%2C+William+Johnson"
processing https://libraries.udmercy.edu/find/special_collections/digital/baa/item.php?record_id=2609&collectionCode=baa
No 'strong' in this line
No 'strong' in this line
saving -- William Johnson Alston_1861-09-26_940.pdf
processing https://libraries.udmercy.edu/find/special_collections/digital/baa/index.php?collectionCode=baa&field=DC_creator&term="Anderson%2C+John%2C+b.+1831%3F"
processing https://libraries.udmercy.edu/find/special_collections/digital/baa/item.php?record_id=1767&collectionCode=baa
No 'strong' in this line
No 'strong' in this line
saving -- John Anderson_1863_540.pdf
processing https://libraries.udmercy.ed

processing https://libraries.udmercy.edu/find/special_collections/digital/baa/index.php?collectionCode=baa&field=DC_creator&term="Allan+A.+Lane+to+H%5Benry%5D+W%5Bard%5D+Beecher"
processing https://libraries.udmercy.edu/find/special_collections/digital/baa/item.php?record_id=2528&collectionCode=baa
not a PDF
processing https://libraries.udmercy.edu/find/special_collections/digital/baa/index.php?collectionCode=baa&field=DC_creator&term="Allen%2C+William+G.%2C+fl.+1849-1853"
processing https://libraries.udmercy.edu/find/special_collections/digital/baa/item.php?record_id=290&collectionCode=baa
No 'strong' in this line
No 'strong' in this line
saving -- William G. Allen_1853-12-01_393.pdf
processing https://libraries.udmercy.edu/find/special_collections/digital/baa/item.php?record_id=368&collectionCode=baa
No 'strong' in this line
No 'strong' in this line
saving -- William G. Allen_1853_796.pdf
processing https://libraries.udmercy.edu/find/special_collections/digital/baa/item.php?record_id

### Download and get a sense of the data

In [312]:
with open('data/metadata.json','w', encoding='UTF-8') as out:
    out.write(json.dumps(document_json))

In [313]:
len(document_json)

906

In [310]:
document_json[1]

{'Data_note': 'Complete',
 'Date published': '1840-08-03',
 'Description of file(s)': 'PDF 2 page, 553 word document (text and images)',
 'Keywords': 'audio; Britain; emancipation; West Indies',
 'Newspaper or publication': 'Liberator',
 'Publication type': 'Newspapers; Speeches',
 'Speaker or author': 'Alexander, S. R.',
 'Subjects': 'Abolitionists--United States; African American abolitionists; Antislavery movements--United States; Slavery; Slaves--Emancipation--United States; United States--History--19th century',
 'Title': 'Solomon R. Alexander',
 'filename': 'Solomon R. Alexander_1840-08-03.pdf'}

## scratch work for the final function

In [273]:

#locate_document_page = requests.get(documents_URL[0])
#document_page = BeautifulSoup(locate_document_page.text, 'html.parser')

#document_page.text.find('Click to view PDF')


1190

In [274]:
#document_page.find('td').find('a')['href']

'http://libraries.udmercy.edu/digital_collections/baa/Hubbard_25793spe.pdf'

In [260]:
#document_URL= 'https://libraries.udmercy.edu/find/special_collections/digital/baa/item.php?record_id=2135&collectionCode=baa'
#locate_document_page = requests.get(document_URL)
#document_page = BeautifulSoup(locate_document_page.text, 'html.parser')

#key_terms_dict = {}
#keys = []
#values = []
#for p in document_page.find_all('p'):
    

#    try: 
#        if len(p.find('strong')):

#            keys.append(p.text.split(": ")[0])
#            values.append(p.text.split(": ")[1])
#               
#    except:
#        print("No 'strong' in this line")

#for i in range(len(keys)):
##    key_terms_dict[keys[i]] = values[i]
        
#key_terms_dict

No 'strong' in this line
No 'strong' in this line


{'Description of file(s)': 'PDF 3 page, 630 word document (text and images)',
 'Keywords': 'Mental Feast; moral development; Philadelphia; women',
 'Newspaper or publication': 'Liberator',
 'Publication type': 'Newspapers; Speeches',
 'Speaker or author': 'Douglass, S. M. (Sarah Mapps), 1806-1882',
 'Subjects': 'Abolitionists--United States; African American abolitionists; Antislavery movements--United States; Slavery; United States--History--19th century',
 'Title': 'Sarah M. Douglass'}

In [264]:
#try:
#    if len(key_terms_dict['Date published'])>1:
#        print("Has date published")
#except:
#    key_terms_dict['Date published'] = "Unknown"
#    print('new made')
#    print(key_terms_dict['Date published'])

Has date published


In [255]:
#LOADING IN THE page with the PDF

#document_URL = 'https://libraries.udmercy.edu/find/special_collections/digital/baa/item.php?record_id=2135&collectionCode=baa'
#document_URL= 'https://libraries.udmercy.edu/find/special_collections/digital/baa/item.php?record_id=963&collectionCode=baa'
#locate_document_page = requests.get(document_URL)
#document_page = BeautifulSoup(locate_document_page.text, 'html.parser')

#document_page.text.find('Click to view PDF')

#if document_page.text.find('Click to view PDF') == -1:
#    print('not a PDF')
#    #return none

#document_page.text
#pick out key info regarding the data_set

ConnectionError: HTTPSConnectionPool(host='libraries.udmercy.edu', port=443): Max retries exceeded with url: /find/special_collections/digital/baa/item.php?record_id=963&collectionCode=baa (Caused by NewConnectionError('<urllib3.connection.VerifiedHTTPSConnection object at 0x7f167186a8d0>: Failed to establish a new connection: [Errno -2] Name or service not known'))

In [223]:
#Search for bolded (aka important) notes regarding the data
#key_terms_dict = {}
#keys = []
#values = []
#for p in document_page.find_all('p'):
    

#    try: #
#        if len(p.find('strong')):

 #           keys.append(p.text.split(": ")[0])
  #          values.append(p.text.split(": ")[1])
               
  #  except:
  #      print("No 'strong' in this line")

#for i in range(len(keys)):
#    key_terms_dict[keys[i]] = values[i]
        
#key_terms_dict

No 'strong' in this line
No 'strong' in this line


{'Date published': '1859-08',
 'Description of file(s)': 'PDF 4 page, 854 word document (text and images)',
 'Keywords': 'August 1st; British; celebration; emancipation; England; freedom; Jamaica; West Indies',
 'Newspaper or publication': 'Weekly Anglo-African (1859 - 1862)',
 'People': 'Bleby, Rev. Henry; Clarkson, Thomas; Wilberforce, William',
 'Publication type': 'Newspapers; Speeches',
 'Speaker or author': 'White, Jacob C., d. 1872',
 'Subjects': 'Abolitionists--United States; African American abolitionists; Antislavery movements--United States; Slavery; United States--History--19th century',
 'Title': 'Jacob C. White, Jr.'}

In [235]:
# NEED HELP!!
# Download PDF LINK -- NEED HELP
#PDF = requests.get(PDF_URL)

# name file by Title (author) and date published
#filename = f'{key_terms_dict["Title"]}_{key_terms_dict["Date published"]}'

# in case a file already exists, add digits at the end [TAKEN FROM JAKE DIRECTLY]
#if os.path.isfile(os.path.join('sam_test',filename+'.pdf')):
#    filename += "_" + str(random.randint(100,999))

# Find the PDF Link 
#PDF_URL = document_page.find('td').find('a')['href']

# Create the file name in the dictionary
#key_terms_dict['filename'] = filename + '.pdf'


# save PDF
#print(f'saving -- {filename}.pdf')
#PDF = requests.get(PDF_URL)
#with open(os.path.join('sam_test',filename+'.pdf'),'wb') as pdf:
#    pdf.write(PDF.content)
#return key_terms_dict

saving -- Jacob C. White, Jr._1859-08_587.pdf
