# Library of Congress Scraper

## Load Libraries

In [1]:
#pandas for general data wrangling
import pandas as pd

#selenium for robot browsing
import selenium
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

## Define Scraping Function

In [17]:
def family_tree():
    
    # Set heroku webdriver options
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_options.add_argument("--no-sandbox")
   
    # Set window size
    chrome_options.add_argument("window-size=1400,800")
    # chrome_options.add_argument("--headless")
    
    ####### CSV NAME #######
    # Read in csv of current U.S. newspapers as dataframe
    level_0 = pd.read_csv("current_newspapers_c.csv")
    
    
    # Retrieve only the columns needed
    df_loop_through = level_0[["title", "url", "lccn"]]
    
    # Create a column for lccn of newspapers that currently exist
    df_loop_through['current_lccn'] = df_loop_through['lccn']
    
    # Create a column for level of newspaper lineage. Level 1 is a newspaper immediately preceding a currently operating newspaper, level 2 is a newspaper immediately preceding a Level 1 newspaper, and so on.
    df_loop_through['level'] = "1"
    
    # Define the path to the robot browser (tell it where to find the driver, rename to driver)
    driver = webdriver.Chrome(executable_path = "chromedriver", options = chrome_options)
    
    # Create an empty list for checking duplicates
    dupes_check = []
    
    # Write a for loop that will increase the lineage level by 1 every time the scraper runs through the loops nested below until it has run 20 times
    for lineage_level in range(0,30):
        lineage_value = lineage_level+1
        
        # Create an empty dataframe with column names that match the dataframe that contains current newspapers 
        temp_df = pd.DataFrame(columns=["title", "url", "lccn", "current_lccn", "level"])
        
        # Write a for loop that will iterate through each row in the selected dataframe
        for row in df_loop_through.itertuples():
           
            # Identify that in every row, the second value will be a url
            url = row[2]
           
            # Identify that in every row, the fourth value will be the current lccn
            current_lccn = row[4]
            
            # Tell the robot browser to visit every url
            driver.get(url)
           
            # Direct scraper to the page element that follows the "Preceding Titles:" header
            preceding_titles = driver.find_elements_by_xpath("//dt[text()='Preceding Titles:']/following::dd[1]/ul/li/a")
            
            # Wait for "Preceding Titles" to load
            # preceding_titles = WebDriverWait(driver, 15).until(EC.element_to_be_clickable((By.XPATH, "//dt[text()='Preceding Titles:']/following::dd[1]/ul/li/a"))) 
            
            # Write a for loop that gathers information about each link under the "Preceding Titles:" header
            for link in preceding_titles:
                # Grab the text of each link (these are the actual titles)
                link_text = link.text
               
                # Grab each url
                link_url = link.get_attribute("href")
                
                # Grab the lccn by extracting the characters between the penultimate slash and final slash in the url
                link_lccn = link.get_attribute("href").rstrip('/').split('/')[-1]
                
                ##### Check for duplicates #####
                # Add link_url to duplicates check list
                dupes_check.append(link_url)
               
                # Convert duplicates check list to a set. Sets automatically remove duplicates.
                #dupes_set = set(dupes_check)
               
                # Check if the list and set lengths are equal. If they are equal, there are no duplicates. If they are not equal, the list has duplicates.
                if len(dupes_check) != len(set(dupes_check)):
                    dupes_check = list(set(dupes_check))
                   
                else:  
                    # Create a list from these elements
                    preceding_titles_list = [link_text,link_url,link_lccn,current_lccn,lineage_value]
                    
                    # Index list items to the columns that were created in the empty dataframe
                    preceding_titles_series = pd.Series(preceding_titles_list,index=temp_df.columns)
                    
                    # Append series to empty dataframe. Information for each preceding title will appear as a single row.
                    temp_df = temp_df.append(preceding_titles_series,ignore_index=True) 
                    dupes_check = list(set(dupes_check))
                    
                # Write to CSV
                temp_df.to_csv(f'newspaper_lineage_level_{str(lineage_value)}c.csv',index=False,header=True) 
            
        # Overwrite dataframe that was originally fed into the for loop with the temporary dataframe contents. This will prompt the scraper to run through the newly compiled set of urls for the next lineage level. 
        df_loop_through = temp_df
        print(lineage_value)
                                 


In [18]:
# Run function
family_tree()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_loop_through['current_lccn'] = df_loop_through['lccn']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_loop_through['level'] = "1"


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
