# Geo BFS Web Scraper
Author: Ryan Kish

Last Updated: February 30, 2019

### Inputs

In [1]:
# Identifier of this scrape run; later cells use it to build the
# ./Data/Scrape_<n>/ folder path and the ./Data/Scrape_<n>.csv output name.
SCRAPE_NUMBER = 1
# Request template. searchState calls TEMPLATE_CALL.format(lat, long), so the
# real value needs two positional placeholders (latitude first, then longitude).
TEMPLATE_CALL = "FILL WITH XML REQUEST"
# Column names of the aggregated table. Each shop record's values are assigned
# positionally, so the order here must match the order of the record's fields.
SHOP_COLUMNS = ['FILL WITH COLUMNS IN XML RESPONSE']

### Libraries

In [2]:
import json
import zipcodes
from scipy.spatial import ConvexHull, convex_hull_plot_2d
import requests
import numpy as np
import pandas as pd
from os import listdir
from os.path import isfile, join
import ast

### Set-up Zipcodes

In [3]:
# Load the zip code database from a CSV in the working directory, then show
# its shape and first rows as a sanity check.
zips = pd.read_csv("zip_code_database.csv", low_memory=False)
print(zips.shape)
zips.head()

(42632, 15)


Unnamed: 0,zip,type,decommissioned,primary_city,acceptable_cities,unacceptable_cities,state,county,timezone,area_codes,world_region,country,latitude,longitude,irs_estimated_population_2015
0,501,UNIQUE,0,Holtsville,,I R S Service Center,NY,Suffolk County,America/New_York,631,,US,40.81,-73.04,562
1,544,UNIQUE,0,Holtsville,,Irs Service Center,NY,Suffolk County,America/New_York,631,,US,40.81,-73.04,0
2,601,STANDARD,0,Adjuntas,,"Colinas Del Gigante, Jard De Adjuntas, Urb San...",PR,Adjuntas Municipio,America/Puerto_Rico,787939,,US,18.16,-66.72,0
3,602,STANDARD,0,Aguada,,"Alts De Aguada, Bo Guaniquilla, Comunidad Las ...",PR,Aguada Municipio,America/Puerto_Rico,787939,,US,18.38,-67.18,0
4,603,STANDARD,0,Aguadilla,Ramey,"Bda Caban, Bda Esteves, Bo Borinquen, Bo Ceiba...",PR,Aguadilla Municipio,America/Puerto_Rico,787,,US,18.43,-67.15,0


In [None]:
# Build {state: {zip string: False}} for every distinct value of the "state"
# column. Zip codes are read as numbers, so they are left-padded with "0" to
# five characters. The False flag is what searchState checks (and later sets)
# to decide whether a zip code still needs to be used as a search seed.
states = zips["state"].unique()
state_zip_codes = {
    abbr: {str(code).rjust(5, "0"): False for code in zips[zips.state == abbr].zip}
    for abbr in states
}
state_zip_codes.keys()

## Algo

In [None]:
def _frontier_points(coordinates):
    """Return the points (as tuples) from which the search should expand next.

    With three or more points only the vertices of their convex hull are
    returned; with one or two points every point is returned.

    Parameters
    ----------
    coordinates : list of numpy arrays, each ``[latitude, longitude]``.
        Must be non-empty.
    """
    if len(coordinates) > 2:
        points = np.array(coordinates)
        try:
            hull = ConvexHull(points)
        except RuntimeError:
            # scipy's QhullError subclasses RuntimeError and is raised for
            # degenerate input (e.g. all points on one line). No 2-D hull
            # exists then, so expand from every point instead of aborting.
            return [tuple(point) for point in points]
        return [tuple(points[vert_index]) for vert_index in hull.vertices]
    return [tuple(point) for point in coordinates]


def searchState(state, state_data, max_attempts=10):
    """Scrape every shop of one state and write the records to a text file.

    Each zip code of the state that is not yet flagged in ``state_data`` is
    used as a seed: the service is queried at the zip code's coordinates and
    the search then expands breadth-first from the convex-hull vertices of the
    newly discovered shops until a query returns nothing new. Every new
    in-state shop is written as one ``str(dict)`` line to
    ``./Data/Scrape_<SCRAPE_NUMBER>/<state>_records.txt``.

    Parameters
    ----------
    state : str
        State abbreviation; compared against each shop's ``"State"`` field.
    state_data : dict
        Maps zip code strings to a bool flag. Mutated in place: zip codes in
        which a shop was found are set to True so they are not searched again.
    max_attempts : int, optional
        How many times one request is retried when the response body is not
        valid JSON (default 10, the previously hard-coded value).

    Returns
    -------
    None
    """
    from collections import deque

    print("+++++++++++++++++++++++++++++++++++++++++++++++++++\n+++++++++++++++++++++++++++++++++++++++++++++++++++")
    print("State: {}".format(state))

    num_requests = 0
    witnessed = set()  # coordinates of every shop already written to the file

    # `with` guarantees the file is closed even if a request or parse fails.
    with open("./Data/Scrape_{}/{}_records.txt".format(SCRAPE_NUMBER, state), "w") as state_file:
        # Iterate over a snapshot of the keys because flags are updated below.
        for zipcode in list(state_data):
            # don't request data for zip codes already covered by an earlier search
            if state_data[zipcode]:
                continue

            print("    Zipcode: {}".format(zipcode))

            # get lat and long of zipcode; skip unknown zip codes and PO boxes
            zip_data = zipcodes.matching(zipcode)
            if len(zip_data) == 0 or zip_data[0]['zip_code_type'] == 'PO BOX':
                continue

            # breadth-first search: a FIFO queue so points are visited in
            # the order they were discovered
            queue = deque([(zip_data[0]["lat"], zip_data[0]["long"])])
            queue_seen = set()
            while queue:
                current_point = queue.popleft()

                # Retry while the body is not valid JSON.
                payload = None
                for _ in range(max_attempts):
                    resp = requests.get(TEMPLATE_CALL.format(current_point[0], current_point[1]))
                    num_requests += 1
                    try:
                        payload = resp.json()
                        break
                    except ValueError:
                        # requests' JSON decode errors subclass ValueError
                        continue
                if payload is None:
                    # Every attempt failed; skip this point rather than
                    # crashing on a response that was never decoded.
                    print("        No valid JSON for point {} after {} attempts; skipping".format(current_point, max_attempts))
                    continue
                raw_data = payload["Data"]

                coordinates = []
                for shop in raw_data:
                    pair_coord = (shop["Latitude"], shop["Longitude"])
                    shop_zip = shop["Zip"]
                    shop_state = shop["State"]

                    # only consider shops in state
                    if shop_zip not in state_data or shop_state != state:
                        continue

                    # Mark the zip code the shop was found in so the outer
                    # loop does not start a new search from it. The seed zip
                    # is marked as well, as before.
                    state_data[shop_zip] = True
                    state_data[zipcode] = True

                    # if hasn't previously been found or added to queue, save and write
                    if pair_coord not in queue_seen and pair_coord not in witnessed:
                        state_file.write(str(shop))
                        state_file.write("\n")
                        coordinates.append(np.array([shop["Latitude"], shop["Longitude"]]))
                        witnessed.add(pair_coord)

                # add edge points to queue
                if len(coordinates) > 0:
                    for point in _frontier_points(coordinates):
                        queue.append(point)
                        queue_seen.add(point)
                print("        Cumulative number stores: {}".format(len(witnessed)))

    print("Number of requests sent: {}".format(num_requests))

In [None]:
def parse_abbr(states):
    """Run searchState for each state abbreviation in ``states``, in order.

    Each abbreviation must be a key of ``state_zip_codes``; its zip code
    dictionary is passed to searchState as the state's data.
    """
    for abbr in states:
        zip_flags = state_zip_codes[abbr]
        searchState(abbr, zip_flags)

## Scrape

In [None]:
# Abbreviations to scrape in this run. parse_abbr looks each one up in
# state_zip_codes, so every entry must be a key of that dictionary.
batch = ['AK','AL','AP']
parse_abbr(batch)

## Create CSV and Count Stores
If a `.DS_Store` file causes an error, navigate to the scrape folder and run `$ find . -name '.DS_Store' -type f -delete` to remove it.


In [None]:
import os.path
from os import path

In [None]:
# Sort the state abbreviations in place so the per-state files are
# aggregated in a stable order.
states.sort()

In [None]:
# Start from an empty table with one column per entry of SHOP_COLUMNS.
all_records = pd.DataFrame(columns = SHOP_COLUMNS)

In [None]:
# Aggregate the per-state record files of this scrape into a single CSV.
# Each line of a record file is the str() of one shop dictionary; it is parsed
# back with ast.literal_eval and its values become one row, matched to
# SHOP_COLUMNS by position.
#
# Rows are collected in a list and the table is built once at the end. This
# avoids re-allocating the frame for every appended row, and makes the cell
# safe to re-run without duplicating rows.
count = 0  # number of non-empty state files aggregated
rows = []
for state in states:
    records_path = "./Data/Scrape_{}/{}_records.txt".format(SCRAPE_NUMBER, state)
    if not path.isfile(records_path):
        continue
    # `with` closes the file even if reading fails.
    with open(records_path, "r") as records_file:
        text = records_file.read()
    if len(text) == 0:
        continue
    state_count = 0
    for store in text.strip().split("\n"):
        if len(store) > 0:
            values = list(ast.literal_eval(store).values())
            # Keep the strictness of row-wise assignment: a record whose
            # field count differs from the column count is an error.
            if len(values) != len(SHOP_COLUMNS):
                raise ValueError(
                    "Record in {} has {} fields, expected {}".format(
                        records_path, len(values), len(SHOP_COLUMNS)))
            rows.append(values)
            state_count += 1
    print(state_count)
    count += 1
# dtype=object keeps every value exactly as parsed (no numeric upcasting).
all_records = pd.DataFrame(rows, columns=SHOP_COLUMNS, dtype=object)
all_records.to_csv("./Data/Scrape_{}.csv".format(SCRAPE_NUMBER))
print("Aggregated: {} files".format(count))