# Scraping Union List of Sanborn Maps

This Jupyter notebook contains the code to scrape the list of Sanborn maps from the Berkeley library website and put them into a CSV format. 

In [4]:
#Importing libraries
import requests #Lets us make requests to a website
from bs4 import BeautifulSoup #Importing Beautiful Soup for webscraping

In [5]:
# saving a website url
union_list = "http://www.lib.berkeley.edu/EART/sanborn_union_list"

# Download the union-list landing page and parse it
r = requests.get(union_list)
soup = BeautifulSoup(r.text, 'html.parser')

#Finding all the links on the main page
# link.get('href') returns None for an <a> tag that has no href attribute,
# so this list can contain None entries.
all_links = [link.get('href') for link in soup.find_all('a')]

# Use %-interpolation: passing the count as a second print() argument would
# emit the literal "%s" followed by the number.
print('All links: %s' % len(all_links))


All links: %s 72


In [6]:
# Keep only the links whose last path segment starts with 'sanbul'.
state_links = []
for link in all_links:
    # Anchors without an href yield None. Skip them entirely so the check
    # below never runs against a stale value left over from the previous link.
    if link is None:
        continue
    final_url_element = link.split('/')[-1]
    if final_url_element.startswith('sanbul'):
        state_links.append(link)

# Only the first 54 matches are kept.
# NOTE(review): presumably everything after that is not a per-state page - confirm.
state_links = state_links[0:54]
#state_links

#How to add to a list
# state_links.append(URL)

In [7]:
# Getting out other links
# A URL is treated as a "long" state when the text from character 42 onward
# begins with an underscore.
# NOTE(review): offset 42 presumably lands right after the state code in the
# absolute URL - confirm against the actual link format.
long_states = [url for url in state_links if url[42:].startswith("_")]


In [8]:
#Looping through long lists to pull out extras
# For every "long" state page, collect the site-relative /EART links it
# contains and turn them into absolute URLs.
final_list = []
for state in long_states:
    r = requests.get(state)
    Long_soup = BeautifulSoup(r.text, 'html.parser')

    # hrefs found on this sub-page. A cell-local name is used so the
    # main-page `all_links` list is not overwritten by this loop.
    page_links = [anchor.get('href') for anchor in Long_soup.find_all('a')]

    # Only site-relative links under /EART, sorted so that links sharing the
    # same prefix end up next to each other for the de-duplication below.
    target_links = sorted(
        href for href in page_links
        if href is not None and href.startswith("/EART")
    )

    # Characters 12-19 of the previously accepted link
    target_link = ""
    for href in target_links:
        if href[13:].startswith("lib"):
            # NOTE(review): presumably a library-list page rather than map data - confirm
            continue
        elif href[12:20] != target_link:
            final_list.append("http://www.lib.berkeley.edu" + href)
            target_link = href[12:20]

#Adding long links to master list
# Skip URLs that are already present so the same page is never scraped twice
# (for example when a sub-page links to itself, or when this cell is re-run).
for link in final_list:
    if link not in state_links:
        state_links.append(link)

state_links.sort()

In [9]:
#Looping through the list of URLs

#Importing libraries
import csv #Lets us write to a csv file
import requests #Lets us make requests to a website
from bs4 import BeautifulSoup #Importing Beautiful Soup for webscraping
import re

#Creating placeholder lists and variable to use in the loop
# Four parallel output columns; one entry is added to each per library row.
cities, dates, libs, states = [], [], [], []
# Most recently seen city / date / library markup. These persist between
# elements (and between pages) until a newer one is encountered.
city = ""
date = ""
lib = ""

for state in state_links:
    if state is None:
        continue

    #parsing the url
    state_list = state
    r = requests.get(state_list)
    soup_state = BeautifulSoup(r.text, 'html.parser')

    #Getting the data in a more workable format, already rendered as strings
    string_soup = [
        str(tag) for tag in soup_state.find_all(re.compile(r'(p|div)'))
    ]

    #Creating lists of each value
    for item in string_soup:
        if "city" in item:
            city = item
            continue
        if "date" in item:
            date = item
            continue
        if "class=\"lib\"" not in item:
            continue

        #Appending at the lib level because one date/city can list several libraries
        lib = item
        # The fixed slices drop a constant number of leading/trailing characters.
        # NOTE(review): presumably the surrounding opening and closing tags - confirm.
        cities.append(city[16:-4])
        dates.append(date[18:-6])
        libs.append(lib[17:-6])
        states.append(state)

#Writing out the results as a CSV file
# newline='' hands line-ending control to the csv module, as its documentation
# requires; without it, extra blank rows appear on platforms that translate "\n".
with open('results.csv', 'w', newline='') as outfile:
    writer = csv.writer(outfile)
    # One row per (state URL, city, date, library); zip stops at the shortest list.
    writer.writerows(zip(states, cities, dates, libs))

# Switching to LoC Sanborn

In [1]:
import csv
from urllib.parse import unquote

import requests
from bs4 import BeautifulSoup
    


In [2]:
# Get list of states
loc = "https://www.loc.gov/rr/geogmap/sanborn/"
html = requests.get(loc).text
soup = BeautifulSoup(html, "lxml")
states_html = soup.find("select", {"id": "stateID"}).find_all("option")

# Map every <option>'s value attribute to its visible text
state_lookup = {option['value']: option.text for option in states_html}
del state_lookup['BLANK']  #  remove the "BLANK" option
state_lookup
                

{'1': 'Alabama',
 '10': 'District of Columbia',
 '11': 'Florida',
 '12': 'Georgia',
 '13': 'Hawaii',
 '14': 'Idaho',
 '15': 'Illinois',
 '16': 'Indiana',
 '17': 'Iowa',
 '18': 'Kansas',
 '19': 'Kentucky',
 '2': 'Alaska',
 '20': 'Louisiana',
 '21': 'Maine',
 '22': 'Maryland',
 '23': 'Massachusetts',
 '29': 'Michigan',
 '3': 'Arizona',
 '30': 'Minnesota',
 '31': 'Mississippi',
 '32': 'Missouri',
 '33': 'Montana',
 '34': 'Nebraska',
 '35': 'Nevada',
 '36': 'New Hampshire',
 '37': 'New Jersey',
 '38': 'New Mexico',
 '39': 'New York',
 '4': 'Arkansas',
 '40': 'North Carolina',
 '41': 'North Dakota',
 '42': 'Ohio',
 '43': 'Oklahoma',
 '44': 'Oregon',
 '45': 'Pennsylvania',
 '46': 'Rhode Island',
 '47': 'South Carolina',
 '48': 'South Dakota',
 '49': 'Tennessee',
 '5': 'California',
 '50': 'Texas',
 '51': 'Utah',
 '52': 'Vermont',
 '53': 'Virginia',
 '54': 'Washington',
 '55': 'West Virginia',
 '56': 'Wisconsin',
 '57': 'Wyoming',
 '7': 'Colorado',
 '8': 'Connecticut',
 '9': 'Delaware'}

In [3]:
# URL template for a single state's page; {0} is filled with the state ID
state_loc = "https://www.loc.gov/rr/geogmap/sanborn/states.php?stateID={0}"
# Iterating the dict yields its keys, i.e. the state IDs
state_urls = list(map(state_loc.format, state_lookup))
state_urls

['https://www.loc.gov/rr/geogmap/sanborn/states.php?stateID=45',
 'https://www.loc.gov/rr/geogmap/sanborn/states.php?stateID=2',
 'https://www.loc.gov/rr/geogmap/sanborn/states.php?stateID=49',
 'https://www.loc.gov/rr/geogmap/sanborn/states.php?stateID=33',
 'https://www.loc.gov/rr/geogmap/sanborn/states.php?stateID=56',
 'https://www.loc.gov/rr/geogmap/sanborn/states.php?stateID=30',
 'https://www.loc.gov/rr/geogmap/sanborn/states.php?stateID=29',
 'https://www.loc.gov/rr/geogmap/sanborn/states.php?stateID=34',
 'https://www.loc.gov/rr/geogmap/sanborn/states.php?stateID=7',
 'https://www.loc.gov/rr/geogmap/sanborn/states.php?stateID=38',
 'https://www.loc.gov/rr/geogmap/sanborn/states.php?stateID=43',
 'https://www.loc.gov/rr/geogmap/sanborn/states.php?stateID=57',
 'https://www.loc.gov/rr/geogmap/sanborn/states.php?stateID=1',
 'https://www.loc.gov/rr/geogmap/sanborn/states.php?stateID=41',
 'https://www.loc.gov/rr/geogmap/sanborn/states.php?stateID=15',
 'https://www.loc.gov/rr/geo

In [5]:
# Collect the URL of every city page linked from each state page.
city_urls = []
city_loc = "https://www.loc.gov/rr/geogmap/sanborn/{0}"
for iteration, state_url in enumerate(state_urls):
    print(iteration)
    html = requests.get(state_url).text
    soup = BeautifulSoup(html, "lxml")

    # Reset for every state: otherwise a page with no matching table would
    # silently reuse the previous state's table (or fail on the first state).
    city_table = None
    for table in soup.find_all('table')[1:]:
        if "Fire Insurance Maps of" in table.text:
            city_table = table.table  # we want the nested table
            break
    if city_table is None:
        print("no city table found:", state_url)
        continue

    for row in city_table.find_all('tr'):
        # Rows without a link cannot produce a city URL; skip them
        if row.a is None or not row.a.get('href'):
            continue
        city = city_loc.format(row.a['href'])
        # City names may contain spaces; encode them so the URL is requestable
        city_urls.append(city.replace(" ", "%20"))
        print(city)
        
        
        

0
https://www.loc.gov/rr/geogmap/sanborn/city.php?CITY=Albion&stateID=45
https://www.loc.gov/rr/geogmap/sanborn/city.php?CITY=Aliquippa&stateID=45
https://www.loc.gov/rr/geogmap/sanborn/city.php?CITY=Allentown&stateID=45
https://www.loc.gov/rr/geogmap/sanborn/city.php?CITY=Altoona&stateID=45
https://www.loc.gov/rr/geogmap/sanborn/city.php?CITY=Ambridge&stateID=45
https://www.loc.gov/rr/geogmap/sanborn/city.php?CITY=Annville&stateID=45
https://www.loc.gov/rr/geogmap/sanborn/city.php?CITY=Archbald&stateID=45
https://www.loc.gov/rr/geogmap/sanborn/city.php?CITY=Ashley&stateID=45
https://www.loc.gov/rr/geogmap/sanborn/city.php?CITY=Austin&stateID=45
https://www.loc.gov/rr/geogmap/sanborn/city.php?CITY=Avoca&stateID=45
https://www.loc.gov/rr/geogmap/sanborn/city.php?CITY=Avonmore&stateID=45
https://www.loc.gov/rr/geogmap/sanborn/city.php?CITY=Bangor&stateID=45
https://www.loc.gov/rr/geogmap/sanborn/city.php?CITY=Bath&stateID=45
https://www.loc.gov/rr/geogmap/sanborn/city.php?CITY=Beaver Fal

In [None]:
def chunks(l, n):
    """Yield successive slices of `l`, each holding at most `n` items.

    The final slice is shorter when len(l) is not a multiple of n.
    """
    # Step through the start offsets 0, n, 2n, ... and slice n items from each
    yield from (l[start:start + n] for start in range(0, len(l), n))
        
# Parallel output columns: exactly one entry is added to each per table row,
# so the lists always stay the same length and line up when zipped.
dates = []
sheets = []
geos = []
comments = []
urls = []
cities = []
states = []
for iteration, city_url in enumerate(city_urls):
    print(iteration, ":", city_url)

    # City and state come from the URL being scraped right now (not from a
    # variable left over from another cell). Splitting on the '&stateID='
    # marker works for one- and two-digit state IDs alike, whereas a fixed
    # slice would cut the last letter off cities in single-digit states.
    query = city_url.split('?')[-1]
    city_part, _sep, state = query.rpartition('&stateID=')
    # Drop the leading 'CITY=' and undo the %20 encoding applied to the URL
    city = unquote(city_part[5:])

    html = requests.get(city_url).text
    soup = BeautifulSoup(html, "lxml")
    for table in soup.find_all('table')[1:]:
        if "Fire Insurance Maps of" in table.text:
            sheet_table = table.table  # we want the nested table
            if sheet_table is None:
                # A matching table without a nested table has nothing to read
                continue
            rows = sheet_table.find_all("td")
            # Five consecutive cells are read as one record:
            # date, sheets, geo, comment, url
            for item in chunks(rows, 5):
                cells = [cell.getText() for cell in item]
                # Pad a short trailing group so every column still receives
                # a value and the rows cannot drift out of alignment.
                cells += [''] * (5 - len(cells))
                date, sheet, geo, comment, url = cells

                cities.append(city)
                states.append(state)
                dates.append(date)
                sheets.append(sheet)
                geos.append(geo)
                comments.append(comment)
                urls.append(url)

#Writing out the results as a CSV file
print("outputing csv")
# newline='' hands line-ending control to the csv module, as its documentation
# requires; without it, extra blank rows appear on platforms that translate "\n".
with open('loc.csv', 'w', newline='') as outfile:
    writer = csv.writer(outfile)
    # One row per sheet record; zip stops at the shortest of the column lists.
    writer.writerows(zip(cities, states, dates, sheets, geos, comments, urls))

        


0 : https://www.loc.gov/rr/geogmap/sanborn/city.php?CITY=Albion&stateID=45
1 : https://www.loc.gov/rr/geogmap/sanborn/city.php?CITY=Aliquippa&stateID=45
2 : https://www.loc.gov/rr/geogmap/sanborn/city.php?CITY=Allentown&stateID=45
3 : https://www.loc.gov/rr/geogmap/sanborn/city.php?CITY=Altoona&stateID=45
4 : https://www.loc.gov/rr/geogmap/sanborn/city.php?CITY=Ambridge&stateID=45
5 : https://www.loc.gov/rr/geogmap/sanborn/city.php?CITY=Annville&stateID=45
6 : https://www.loc.gov/rr/geogmap/sanborn/city.php?CITY=Archbald&stateID=45
7 : https://www.loc.gov/rr/geogmap/sanborn/city.php?CITY=Ashley&stateID=45
8 : https://www.loc.gov/rr/geogmap/sanborn/city.php?CITY=Austin&stateID=45
9 : https://www.loc.gov/rr/geogmap/sanborn/city.php?CITY=Avoca&stateID=45
10 : https://www.loc.gov/rr/geogmap/sanborn/city.php?CITY=Avonmore&stateID=45
11 : https://www.loc.gov/rr/geogmap/sanborn/city.php?CITY=Bangor&stateID=45
12 : https://www.loc.gov/rr/geogmap/sanborn/city.php?CITY=Bath&stateID=45
13 : https

In [189]:
len(city_urls)


4895