# Introduction
## SBA - Small Business Profiles for the States and Territories

The Office of Advocacy’s Small Business Profiles are an annual analysis of each state’s small business activities. Each profile gathers the latest information from key federal data-gathering agencies to provide a snapshot of small business health and economic activity. This year’s profiles report on state economic growth and employment; small business employment, industry composition, and turnover; plus business owner demographics and county-level employment change. 

https://www.sba.gov/

In [1]:
# Widen the notebook's page container to 96% by injecting a CSS override
# into the rendered output.
from IPython.core.display import display, HTML
display(HTML("""<style> .container {width:96% !important;}</style>"""))

from IPython.display import IFrame

In [2]:
import pandas as pd
import multiprocessing
import numpy as np
from multiprocessing.dummy import Pool as ThreadPool
from functools import partial
import math

# Handle s3 or local
import s3fs
from os import listdir
from os.path import isfile, join
import subprocess

# Dataset
This dataset from the U.S. Small Business Administration (SBA) can be downloaded from this website:

https://www.sba.gov/advocacy/small-business-profiles-states-and-territories-2016

# Experiment:
Assess the pros and cons of the most popular libraries for reading PDFs.

# Packages

PyPDF2

tabula-py

pdfquery


# Downloading the PDFs

In [3]:
import requests
from bs4 import BeautifulSoup

In [4]:
# Landing page that links to the 2016 state and territory profile PDFs.
website = 'https://www.sba.gov/advocacy/small-business-profiles-states-and-territories-2016'
# requests does not time out by default, so bound the wait; then fail loudly
# on an HTTP error instead of parsing an error page into an empty link list.
web = requests.get(website, timeout=30)
web.raise_for_status()

In [5]:
# Parse the downloaded page using Python's built-in 'html.parser' backend.
text = BeautifulSoup(web.text, 'html.parser')

In [6]:
# Collect the target of the first link in each paragraph, keeping only
# those whose URL contains "https".
links = []
for p in text.select('p'):
    anchor = p.find('a')
    if anchor is None:
        # Paragraph without a link.
        continue
    # Read the href attribute directly rather than splitting the tag's
    # string form on quotes, which could pick up a different attribute or a
    # fragment of the link text.
    href = anchor.get('href', '')
    if 'https' in href:
        links.append(href)

links[:5]

['https://www.sba.gov/sites/default/files/advocacy/all_profiles_10_18_16.pdf',
 'https://www.sba.gov/sites/default/files/advocacy/United_States.pdf',
 'https://www.sba.gov/sites/default/files/advocacy/Alabama.pdf',
 'https://www.sba.gov/sites/default/files/advocacy/Alaska.pdf',
 'https://www.sba.gov/sites/default/files/advocacy/Arizona.pdf']

In [7]:
import urllib
from subprocess import call
# NOTE(review): urllib.URLopener is the Python 2 location of this class
# (Python 3 moved it to urllib.request) — confirm the target interpreter.
pdf = urllib.URLopener()
# Fetch one profile into the working directory as "Alabama.pdf"; retrieve()
# returns a (filename, headers) pair.
pdf.retrieve('https://www.sba.gov/sites/default/files/advocacy/Alabama.pdf', "Alabama.pdf")

('Alabama.pdf', <httplib.HTTPMessage instance at 0x7f2335f783f8>)

In [8]:
# Show the file name (last URL segment) of every collected link.
for link in links:
    f = link.rsplit('/', 1)[-1]
    # Bulk download is disabled; uncomment to save each PDF locally.
    # pdf.retrieve(link, f)
    # The parenthesised form prints the same on Python 2 and is valid Python 3.
    print(f)

all_profiles_10_18_16.pdf
United_States.pdf
Alabama.pdf
Alaska.pdf
Arizona.pdf
Arkansas.pdf
California.pdf
Colorado.pdf
Connecticut.pdf
Delaware.pdf
District_of_Columbia.pdf
Florida.pdf
Georgia.pdf
Hawaii.pdf
Idaho.pdf
Illinois.pdf
Indiana.pdf
Iowa.pdf
Kansas.pdf
Kentucky.pdf
Louisiana.pdf
Maine.pdf
Maryland.pdf
Massachusetts.pdf
Michigan.pdf
Minnesota.pdf
Mississippi.pdf
Missouri.pdf
Montana.pdf
Nebraska.pdf
Nevada.pdf
New_Hampshire.pdf
New_Jersey.pdf
New_Mexico.pdf
New_York.pdf
North_Carolina.pdf
North_Dakota.pdf
Ohio.pdf
Oklahoma.pdf
Oregon.pdf
Pennsylvania.pdf
Rhode_Island.pdf
South_Carolina.pdf
South_Dakota.pdf
Tennessee.pdf
Texas.pdf
Utah.pdf
Vermont.pdf
Virginia.pdf
Washington.pdf
West_Virginia.pdf
Wisconsin.pdf
Wyoming.pdf
US_Territories.pdf
American_Samoa.pdf
Guam.pdf
Northern_Marianas.pdf
Puerto_Rico.pdf
US_Virgin_Islands.pdf
SBP_FAQ_FIN.pdf


## The PDFs

In [9]:
# from IPython.display import IFrame
# IFrame("United_States.pdf", width=1000, height=800)

![alt text](Pages_1-2.png)

![alt text](Pages_3-4.png)

## Path to the files

In [10]:
import sys
# Put the parent directory first on the import path so the Tools package resolves.
sys.path.insert(0,'../')
# NOTE(review): this star import presumably supplies path_s3 / path_local
# used by the cells below — confirm in Tools/paths.py.
from Tools.paths import *

In [11]:
def list_files(path, ext='pdf'):
    """List the base names of the files with extension `ext` found at `path`.

    Args:
        path: Location to list. A value starting with ``s3://`` is listed
            with the AWS CLI (``aws s3 ls``); anything else is treated as a
            local directory.
        ext: Extension to keep, without the leading dot.

    Returns:
        File names with the trailing ``.<ext>`` removed, in listing order.

    Raises:
        subprocess.CalledProcessError: If the ``aws`` command exits non-zero.
        OSError: If the local directory cannot be listed.
    """
    suffix = '.{}'.format(ext)
    if path.startswith('s3://'):
        # universal_newlines makes check_output return text (not bytes) on
        # both Python 2 and Python 3.
        listing = subprocess.check_output(['aws', 's3', 'ls', path],
                                          universal_newlines=True)
        # Keep the last space-separated field of each row as the name.
        # NOTE(review): names containing spaces would be truncated here.
        names = [row.split(" ")[-1] for row in listing.splitlines()]
    else:
        # List the directory that was asked for, skipping sub-directories.
        names = [f for f in listdir(path) if isfile(join(path, f))]
    # Strip only the trailing suffix; str.replace would also remove any
    # earlier occurrence of ".<ext>" inside the name.
    return [f[:-len(suffix)] for f in names if f.endswith(suffix)]

In [12]:
def path(path, name, ext='pdf'):
    """Build a file path from a prefix, a base name, and an extension.

    `path` is used verbatim as the prefix (no separator is inserted), then
    `name`, a dot, and `ext` are appended.
    """
    return '{0}{1}.{2}'.format(path, name, ext)

In [13]:
# Show the profile names (file names without ".pdf") found at path_s3.
list_files(path_s3)

['Alabama',
 'Alaska',
 'American_Samoa',
 'Arizona',
 'Arkansas',
 'California',
 'Colorado',
 'Connecticut',
 'Delaware',
 'District_of_Columbia',
 'Florida',
 'Georgia',
 'Guam',
 'Hawaii',
 'Idaho',
 'Illinois',
 'Indiana',
 'Iowa',
 'Kansas',
 'Kentucky',
 'Louisiana',
 'Maine',
 'Maryland',
 'Massachusetts',
 'Michigan',
 'Minnesota',
 'Mississippi',
 'Missouri',
 'Montana',
 'Nebraska',
 'Nevada',
 'New_Hampshire',
 'New_Jersey',
 'New_Mexico',
 'New_York',
 'North_Carolina',
 'North_Dakota',
 'Northern_Marianas',
 'Ohio',
 'Oklahoma',
 'Oregon',
 'Pennsylvania',
 'Puerto_Rico',
 'Rhode_Island',
 'SBP_FAQ_FIN',
 'South_Carolina',
 'South_Dakota',
 'Tennessee',
 'Texas',
 'US_Territories',
 'US_Virgin_Islands',
 'United_States',
 'Utah',
 'Vermont',
 'Virginia',
 'Washington',
 'West_Virginia',
 'Wisconsin',
 'Wyoming',
 'all_profiles_10_18_16']