## Notebook Objectives

Our objective is to scrape bridge height and coordinate data via the surface tracks API.

In [1]:
import glob
import json
import os
import re
import time

import numpy as np
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By

## Selenium

View documentation [here](https://selenium-python.readthedocs.io/) to see how to web scrape using the Selenium library. 

In [21]:
# Load the login details from a local JSON file so the username and password
# are never written into the notebook itself.
# The login cell reads the keys 'amember_login' and 'amember_pass'.
# JSON is UTF-8 by specification, so decode it explicitly instead of relying
# on the platform's default locale encoding.
with open('st_credentials.json', encoding='utf-8') as creds:
    credentials = json.load(creds)

In [22]:
# Instantiate webdriver object for chrome browser
driver = webdriver.Chrome()

# Open the member login page
driver.get("https://www.surfacetracks.com/amember/login")

# Locate the login form fields by their `name` attribute.
# find_element(By.<strategy>, value) is used because the find_element_by_*
# helper methods were removed in Selenium 4.3.
user_element = driver.find_element(By.NAME, "amember_login")
pass_element = driver.find_element(By.NAME, "amember_pass")

# Type in the credentials loaded from the JSON file
user_element.send_keys(credentials['amember_login'])
pass_element.send_keys(credentials['amember_pass'])

# Click the login button (the <input> whose value is 'Login')
driver.find_element(By.XPATH, "//input[@value='Login']").click()

Now that the Selenium driver is logged in with our credentials, we can access the surface tracks API.

In [23]:
# Sample API url: request a single feature record by its id, using the same
# driver object that performed the login
driver.get('https://www.surfacetracks.com/plus/get-feature.php?id=107368')

In [24]:
# Locate the <pre> element containing the json data.
# The XPath matches on the element's exact inline style string
# (NOTE(review): presumably the style the browser applies when rendering a raw
# response - confirm it is stable across Chrome versions).
# find_element(By.XPATH, ...) replaces find_element_by_xpath, which was
# removed in Selenium 4.3.
api_get = driver.find_element(By.XPATH, "//pre[@style='word-wrap: break-word; white-space: pre-wrap;']")

In [25]:
# Display the element's text: the JSON payload, still as an unparsed string
api_get.text

'{"UID":"107368","Type":"Pro","Est":"426","Latitude":"30.43443","Longitude":"-84.28134","ChainName":"Low Clearance","Name":"Low Clearance","Address":"E Bloxham St","City":"Tallahassee","St":"FL","Zip":"32301","Phone":"","Web":"","Directions":"","Cat":"LC12","County":"","RecordID":"","Elev":"","trail":"","hike":"","AmenitiesExtra":"","Comments":"12 - 6 height clearance (136)","brand":"low-clearance","CatName":"","PushPin":"low_12_pushpin","bIcon":"<\\/div>\', LC12Icon)","Keyfilter":"-136%","Pin":"low_12_6_pushpin"}'

## Auto Scrape (w/ Selenium)

Our next goal is to automate the scraping process by iterating over the ID number in the API URL. We want to request every possible ID so that no record is missed.

In [26]:
# For an initial test the ID prefix is fixed and only the last two digits vary
base_url = 'https://www.surfacetracks.com/plus/get-feature.php?id=1073'

# Lookup table from each digit 0-9 to its one-character string form
num_dict = {digit: str(digit) for digit in range(10)}

In [27]:
# Build one URL per two-digit suffix, 00 through 99, appended to base_url.
# str.zfill left-pads single digits with a zero ('7' -> '07'), which replaces
# the manual ones/tens carry counters and the digit lookup table.
url_list = [base_url + str(suffix).zfill(2) for suffix in range(100)]

In [28]:
# Inspect the generated URLs: all combinations from 107300 to 107399
url_list

['https://www.surfacetracks.com/plus/get-feature.php?id=107300',
 'https://www.surfacetracks.com/plus/get-feature.php?id=107301',
 'https://www.surfacetracks.com/plus/get-feature.php?id=107302',
 'https://www.surfacetracks.com/plus/get-feature.php?id=107303',
 'https://www.surfacetracks.com/plus/get-feature.php?id=107304',
 'https://www.surfacetracks.com/plus/get-feature.php?id=107305',
 'https://www.surfacetracks.com/plus/get-feature.php?id=107306',
 'https://www.surfacetracks.com/plus/get-feature.php?id=107307',
 'https://www.surfacetracks.com/plus/get-feature.php?id=107308',
 'https://www.surfacetracks.com/plus/get-feature.php?id=107309',
 'https://www.surfacetracks.com/plus/get-feature.php?id=107310',
 'https://www.surfacetracks.com/plus/get-feature.php?id=107311',
 'https://www.surfacetracks.com/plus/get-feature.php?id=107312',
 'https://www.surfacetracks.com/plus/get-feature.php?id=107313',
 'https://www.surfacetracks.com/plus/get-feature.php?id=107314',
 'https://www.surfacetrac

In [29]:
# Visit every generated URL and collect the raw response text.
# As the output below shows, ids with no record come back as the string 'null'.
json_list = []

for url in url_list:

    driver.get(url)
    # find_element(By.XPATH, ...) replaces find_element_by_xpath, which was
    # removed in Selenium 4.3
    api_scrape = driver.find_element(By.XPATH, "//pre[@style='word-wrap: break-word; white-space: pre-wrap;']").text
    json_list.append(api_scrape)

    # Pause between requests so that we do not overload the server
    time.sleep(1.5)

In [30]:
# Inspect the scraped responses, one string per requested URL
json_list

['null',
 'null',
 'null',
 'null',
 'null',
 'null',
 'null',
 'null',
 'null',
 'null',
 'null',
 'null',
 'null',
 'null',
 'null',
 'null',
 'null',
 'null',
 'null',
 'null',
 'null',
 'null',
 'null',
 'null',
 'null',
 'null',
 'null',
 'null',
 'null',
 'null',
 'null',
 'null',
 'null',
 'null',
 'null',
 'null',
 'null',
 'null',
 'null',
 'null',
 'null',
 'null',
 'null',
 'null',
 'null',
 'null',
 'null',
 'null',
 'null',
 'null',
 'null',
 'null',
 'null',
 'null',
 'null',
 'null',
 'null',
 'null',
 'null',
 'null',
 'null',
 'null',
 'null',
 'null',
 'null',
 'null',
 'null',
 'null',
 '{"UID":"107368","Type":"Pro","Est":"426","Latitude":"30.43443","Longitude":"-84.28134","ChainName":"Low Clearance","Name":"Low Clearance","Address":"E Bloxham St","City":"Tallahassee","St":"FL","Zip":"32301","Phone":"","Web":"","Directions":"","Cat":"LC12","County":"","RecordID":"","Elev":"","trail":"","hike":"","AmenitiesExtra":"","Comments":"12 - 6 height clearance (136)","brand":"