In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import requests
import re
import json

In [2]:
def checkdigit(num):
    """Return ``num`` unchanged if it is made up solely of digits, else ``"NA"``.

    ``str.isdigit`` is called on ``num``, so it must be a string. An empty
    string, or one containing any non-digit character (sign, comma, space,
    letter, ...), yields the ``"NA"`` placeholder.
    """
    return num if num.isdigit() else "NA"

# Yellow: yellow header stating $2 tickets, Red: change of ticket price from $1 to $2 warning
# change of ticket price from 1/15/12 - PBSalesbystate12012.htm, table 6
def shiftcheck(table_rows):
    """Return the row offset the scraper subtracts from its row indices.

    The result is 0 when the first row holds a ``<td bgcolor="Yellow">`` cell
    or the second row holds a ``<td bgcolor="Red">`` cell; otherwise it is 1.
    The second row is only inspected when the first row has no yellow cell.
    """
    yellow_cell = table_rows[0].find('td', attrs={'bgcolor': 'Yellow'})
    if yellow_cell:
        return 0

    red_cell = table_rows[1].find('td', attrs={'bgcolor': 'Red'})
    if red_cell:
        return 0

    return 1

In [3]:
def _colspan2_cells(row):
    """Return the ``<td colspan="2">`` cells of ``row`` (indexed per drawing column by the scraper)."""
    return row.find_all('td', attrs={'colspan': '2'})


def _drop_chars(text, chars):
    """Return ``text`` with every character listed in ``chars`` removed."""
    for char in chars:
        text = text.replace(char, '')
    return text


def _parse_table(table):
    """Extract one record per drawing column from a single ``<center>`` block.

    Parameters
    ----------
    table : bs4.element.Tag
        A ``<center>`` element whose first ``<table>`` holds the sales grid.

    Returns
    -------
    list of dict
        Records with the keys ``date``, ``jackpot``, ``total_sold``,
        ``draw_sales``, ``pp_sales`` and ``states_data``, ordered from the
        right-most column to the left-most one.
    """
    records = []
    table_rows = table.table.find_all('tr')

    # shiftcheck() yields 0 or 1; every row index below is written for the
    # 0 case and moved up by the returned offset otherwise.
    base_shift = shiftcheck(table_rows)

    # The date row is the reference: it has one colspan=2 cell per drawing.
    date_cells = _colspan2_cells(table_rows[2 - base_shift])
    entries = len(date_cells)
    print("entries: " + str(entries))
    if entries == 0:
        # Nothing to parse; avoid touching rows that are only read per entry.
        return records

    # Everything below depends on the table only, not on the column being
    # read, so it is looked up once instead of once per drawing.
    jackpot_cells = _colspan2_cells(table_rows[1 - base_shift])
    sales_cells = [cell for cell in table_rows[4 - base_shift].find_all('td')
                   if cell.text != 'Total\nSales']

    # When the table has no Lime-coloured row, total_sold is "NA" and the
    # state rows are read starting one row earlier.
    has_total_row = table.find('tr', attrs={'bgcolor': 'Lime'}) is not None
    if has_total_row:
        total_cells = _colspan2_cells(table_rows[5 - base_shift])
        state_shift = base_shift
    else:
        total_cells = None
        state_shift = base_shift + 1

    # State rows are those whose first cell text is exactly two characters long.
    state_rows = []
    for row in table_rows[6 - state_shift:]:
        cells = row.find_all('td')
        if len(cells[0].text) == 2:
            state_rows.append(cells)

    # looping from right to left: most recent -> past
    for i in range(entries - 1, -1, -1):
        # jackpot amount: kept only when the cell holds exactly one run of
        # digits; any other count is recorded as 'NA'.
        jackpot_numbers = re.findall(r'\d+', jackpot_cells[i].text)
        jackpot = jackpot_numbers[0] if len(jackpot_numbers) == 1 else 'NA'

        # date
        date = date_cells[i].text

        # each drawing occupies two adjacent cells: draw sales, power play sales
        col_1, col_2 = i * 2, (i * 2) + 1

        # total draw sales & power play sales
        draw_sales = checkdigit(_drop_chars(sales_cells[col_1].text, '*$,'))
        pp_sales = checkdigit(_drop_chars(sales_cells[col_2].text, '*$,'))

        # total tickets sold
        if has_total_row:
            total_sold = checkdigit(total_cells[i].text.replace(',', ''))
        else:
            total_sold = "NA"

        # states sales (+1 skips the leading state-name cell of each row)
        states_data = []
        for cells in state_rows:
            states_data.append({
                'state': cells[0].text,
                'state_draw_sales': checkdigit(_drop_chars(cells[col_1 + 1].text, '$,')),
                'state_pp_sales': checkdigit(_drop_chars(cells[col_2 + 1].text, '$,')),
            })

        records.append({'date': date,
                        'jackpot': jackpot,
                        'total_sold': total_sold,
                        'draw_sales': draw_sales,
                        'pp_sales': pp_sales,
                        'states_data': states_data})

    return records


def scraper(link, timeout=30):
    """Download a sales-by-state page and parse every ``<center>`` table in it.

    Parameters
    ----------
    link : str
        Absolute URL of the page to fetch.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).
        Without a timeout ``requests.get`` can block indefinitely.

    Returns
    -------
    list of dict
        One record per drawing, in page order by table and right-to-left
        within each table. Numeric fields are digit strings or ``"NA"``.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status, so that an error page
        is never parsed as if it were sales data.
    """
    response = requests.get(link, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    all_tables = soup.find_all('center')

    data = []
    # table loop; enumerate gives the position directly instead of searching
    # the list again for every table.
    for table_number, table in enumerate(all_tables, start=1):
        print("table " + str(table_number) + " start")
        data.extend(_parse_table(table))

    print('~-~-~ {} COMPLETE!! ~-~-~'.format(link))
    return data

In [4]:
# Base URL and the page names to scrape, listed in the order they are fetched.
home_url = 'http://www.lottoreport.com/'
links = [
    'PBSalesbystate.htm', 'PBSalesbystate2016.htm',
    'PBSalesbystate22015.htm', 'PBSalesbystate12015.htm',
    'PBSalesbystate22014.htm', 'PBSalesbystate12014.htm',
    'PBSalesbystate22013.htm', 'PBSalesbystate12013.htm',
    'PBSalesbystate22012.htm', 'PBSalesbystate12012.htm',
    'PBSalesbystate22011.htm', 'PBSalesbystate12011.htm',
    'PBSalesbystate22010.htm', 'PBSalesbystate12010.htm',
]

# One {'link': page name, 'data': scraper output} dict per page.
total_data = []
# link loop; enumerate supplies the 1-based position without re-searching
# the list on every iteration.
for link_number, link in enumerate(links, start=1):
    print("******************************")
    print("link " + str(link_number) + " start")
    link_current = home_url + link
    total_data.append({'link': link, 'data': scraper(link_current)})

******************************
link 1 start
table 1 start
entries: 9
table 2 start
entries: 9
table 3 start
entries: 9
table 4 start
entries: 9
table 5 start
entries: 9
table 6 start
entries: 9
table 7 start
entries: 9
table 8 start
entries: 9
table 9 start
entries: 9
table 10 start
entries: 9
table 11 start
entries: 9
~-~-~ http://www.lottoreport.com/PBSalesbystate.htm COMPLETE!! ~-~-~
******************************
link 2 start
table 1 start
entries: 2
table 2 start
entries: 9
table 3 start
entries: 9
table 4 start
entries: 9
table 5 start
entries: 9
table 6 start
entries: 9
table 7 start
entries: 9
table 8 start
entries: 9
table 9 start
entries: 9
table 10 start
entries: 9
table 11 start
entries: 9
table 12 start
entries: 9
table 13 start
entries: 9
~-~-~ http://www.lottoreport.com/PBSalesbystate2016.htm COMPLETE!! ~-~-~
******************************
link 3 start
table 1 start
entries: 6
table 2 start
entries: 9
table 3 start
entries: 9
table 4 start
entries: 9
table 5 start
entrie

In [5]:
# Serialise the scraped results (the list of {'link', 'data'} dicts built
# above) into a single JSON string.
json_data = json.dumps(total_data)