In [31]:
from requests import get

from requests.exceptions import RequestException

from contextlib import closing
from bs4 import BeautifulSoup
import pandas as pd

import re
import time
from my_secrets import GOOGLE_MAPS_API_KEY
import json

REQUEST_ARGS = dict(
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'},
)


class NoResponseError(Exception):
    pass

class NotFoundError(Exception):
    pass

def simple_get(url,args = REQUEST_ARGS):
    """
    Attempts to get the content at `url` by making an HTTP GET request.
    If the content-type of response is some kind of HTML/XML, return the
    text content, otherwise return None.
    """
    try:
        with closing(get(url,**args)) as resp: #stream=True
            if is_good_response(resp):
                return resp
            elif resp.status_code == 404:
                print('404')
                raise NotFoundError()
            else:
                print('no response')
                return NoResponseError(f'HTTP status code: {resp.status_code}')

    except RequestException as e:
        log_error('Error during requests to {0} : {1}'.format(url, str(e)))
        return None

def is_good_response(resp):
    """
    Returns True if the response seems to be HTML, False otherwise.
    """
    content_type = resp.headers['Content-Type'].lower()
    return (resp.status_code == 200 
            and content_type is not None 
            and content_type.find('html') > -1)



def log_error(e):
    """
    It is always a good idea to log errors. 
    This function just prints them, but you can
    make it do anything.
    """
    print(e)

SITE = "http://www.district-trivia.com"

WHERE = "/where/is-trivia"

DAYS = ["Monday","Tuesday","Wednesday","Thursday","Friday","Saturday","Sunday"]

DAY_DICT = dict(
    Mon = 'Monday',Tue = 'Tuesday',
    Wed = 'Wednesday',Thu = 'Thursday',Fri='Friday',
    Sat='Saturday',Sun='Sunday'
)

In [3]:

response = simple_get(SITE+WHERE)
raw_html = response.content
print(len(raw_html))

html = BeautifulSoup(raw_html, 'html.parser')
dcvenues = html.find_all("a",href=re.compile("^/venues/dc"))
mdvenues = html.find_all("a",href=re.compile("^/venues/maryland"))
vavenues = html.find_all("a",href=re.compile("^/venues/virginia"))
vdict = dict(DC=dcvenues,MD=mdvenues,VA=vavenues)
venue_list = []

for st,venues in vdict.items():
    for tag in venues:
        if tag.string!=None:

            response = simple_get(SITE+tag.get('href'))
            raw_html = response.content
            #print(len(raw_html))
            html = BeautifulSoup(raw_html, 'html.parser')

            time_tag = html.find("h5")
            #print(timeTag.text)
            vday,vtime = time_tag.text.split(" at ")
            table = html.find_all('div',attrs={"class":"four columns"})
            # print(table)
            url = table[1].find('a').get('href')
            # fb = table[1].find()
            # addressTag = table[-1].find('p')
            
            # for br in addressTag.find_all("br"):
            #     br.replace_with("\n")

            # venueAddresses[i] = addressTag.text
            #print(addressTag.text)
            time.sleep(1)

            venue_list.append(
                dict(
                    name = tag.string,
                    href = tag.get('href'),
                    url = url,
                    state = st,
                    day = vday,
                    time = vtime,
                )
            )

# venues = dcvenues + mdvenues + vavenues

# venues = [tag for tag in venues if tag.string!=None]
# n = len(venues)
# venueUrls = [tag.get('href') for tag in venues]

# venueNames = [tag.string for tag in venues]
# #print(venues[1].get('href'))
# #print(venues[1].text)
# print(len(venueNames))
# print(venues[:5])
# print(venueNames[:5])
# print(venueUrls[:5])

# venueTimes = [None]*n
# venueDays = [None]*n
# venueAddresses = [None]*n
# i=0
# for tag in venues:  
#     response = simple_get(SITE+tag.get('href'))
#     raw_html = response.content
#     #print(len(raw_html))
#     html = BeautifulSoup(raw_html, 'html.parser')

#     timeTag = html.find("h5")
#     #print(timeTag.text)
#     venueDays[i],venueTimes[i] = timeTag.text.split(" at ")
#     table = html.find_all('div',attrs={"class":"four columns"})
#     addressTag = table[-1].find('p')
    
#     for br in addressTag.find_all("br"):
#         br.replace_with("\n")

#     venueAddresses[i] = addressTag.text
#     #print(addressTag.text)
#     i += 1
#     time.sleep(1)
#     print(i)
#     #for x in table:
#     #    print(x.find('p'))
#     #    addressTag = x.find('p')
#     #addressTags = html.descendants.find("p",string=re.compile("Washington, DC 2"))
#     #print(addressTag.text)
#     #print(addressTag.text)
#     #print(timeTag.text)

# venueData = pd.DataFrame(data={'Venue':venueNames,
#     'Address':venueAddresses,'Day':venueDays,'Time':venueTimes, 'Host':'District Trivia'})


68600


In [4]:
len(venue_list)

31

In [5]:
df_dist = pd.DataFrame.from_records(venue_list)

In [6]:
df_dist = df_dist[['name','url','state','day','time']]
#df_dist['day'] = df_dist['day']#.str.lower()

In [7]:
df_dist

Unnamed: 0,name,url,state,day,time
0,Franklin Hall (Tuesdays),https://www.franklinhalldc.com/,DC,Tuesday,7:00PM
1,Stadium Sports Bar,https://www.stadiumsportsdc.com/,DC,Wednesday,7:00PM
2,Capitol Cider House,https://capitolciderhouse.com/,DC,Thursday,7:00PM
3,The Eleanor (NoMa),https://www.eleanordc.com/,DC,Sunday,7:30PM
4,Across the Pond,https://www.acrosstheponddc.com/,DC,Wednesday,7:30PM
5,Crown & Crow,http://www.thecrownandcrow.com/,DC,Wednesday,7:30PM
6,Astro Beer Hall,https://www.astrobeerhall.com,DC,Wednesday,7:00PM
7,Hook Hall,https://www.hookhall.com/,DC,Thursday,7:00PM
8,Crimson,https://crimson-dc.com/,DC,Tuesday,7:00PM
9,The Fainting Goat,https://www.faintinggoatdc.com/,DC,Wednesday,7:00PM


In [10]:

SITE = 'https://triviakings.com/locations/'

raw_html = simple_get(SITE).content
print(len(raw_html))

html = BeautifulSoup(raw_html, 'html.parser')
venueTable = html.find('table')

dfs = pd.read_html(SITE)

# venueData2 = dfs[0]

# venueData2 = venueData2. 

# venueData2.to_csv('test.csv')

15745


In [11]:
df_king = dfs[0].dropna()

In [12]:
# rename Pandas columns to lower case
df_king['url']= pd.NA
df_king.columns= df_king.columns.str.lower()

df_king = df_king.rename(
    columns = {"location name":"name","pm": "time"}
)
df_king = df_king[['name','url','state','day','time']]

def day_convert(st):
    try:
        return DAY_DICT[st]
    except KeyError:
        return st

df_king['time'] =  df_king['time'].astype(str)+'PM'
df_king['day'] = df_king['day'].apply(day_convert)

df_king




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_king['url']= pd.NA


Unnamed: 0,name,url,state,day,time
1,Madhatter,,DC,Monday,7:00PM
2,Tight Five Pub,,DC,Monday,7:00PM
3,Crystal City Sports Pub,,VA,Monday,8:00PM
5,Irish Channel Pub,,DC,Tuesday,7:00PM
7,Nanny O'Briens,,DC,Tuesday,7:30PM
8,Crafthouse Arlington,,VA,Tuesday,7:30PM
9,Crafthouse Fairfax,,VA,Tuesday,7:30PM
10,Crafthouse Reston,,VA,Tuesday,7:30PM
11,Samuel Beckett's,,VA,Tuesday,7:30PM
12,Trusty's,,DC,Tuesday,8:00PM


In [13]:
df = pd.concat([df_dist,df_king]).query("day==@DAYS")

In [14]:
def clean_name(nm):
    for day in DAYS:
        nm = nm.replace(f'({day})','')
        nm = nm.replace(f'({day}s)','')
    return (nm.replace('RVILL WED','Rockville').strip())


df['clean_name'] = df['name'].apply(clean_name)
df


Unnamed: 0,name,url,state,day,time,clean_name
0,Franklin Hall (Tuesdays),https://www.franklinhalldc.com/,DC,Tuesday,7:00PM,Franklin Hall
1,Stadium Sports Bar,https://www.stadiumsportsdc.com/,DC,Wednesday,7:00PM,Stadium Sports Bar
2,Capitol Cider House,https://capitolciderhouse.com/,DC,Thursday,7:00PM,Capitol Cider House
3,The Eleanor (NoMa),https://www.eleanordc.com/,DC,Sunday,7:30PM,The Eleanor (NoMa)
4,Across the Pond,https://www.acrosstheponddc.com/,DC,Wednesday,7:30PM,Across the Pond
5,Crown & Crow,http://www.thecrownandcrow.com/,DC,Wednesday,7:30PM,Crown & Crow
6,Astro Beer Hall,https://www.astrobeerhall.com,DC,Wednesday,7:00PM,Astro Beer Hall
7,Hook Hall,https://www.hookhall.com/,DC,Thursday,7:00PM,Hook Hall
8,Crimson,https://crimson-dc.com/,DC,Tuesday,7:00PM,Crimson
9,The Fainting Goat,https://www.faintinggoatdc.com/,DC,Wednesday,7:00PM,The Fainting Goat


In [35]:
home = [38.9191, -77.0363]


def get_place(name):
    params = {
        'input': name,
        'inputtype':'textquery',
        'fields':"%2C".join(['formatted_address','geometry']),
        'key':GOOGLE_MAPS_API_KEY,
        'locationbias':f'point:{home[0]},{home[1]}'
    }

    GOOGLE_MAPS_API_URL = 'https://maps.googleapis.com/maps/api/geocode/json'
    PLACES_URL = 'https://maps.googleapis.com/maps/api/place/findplacefromtext/json'

    url = f"https://maps.googleapis.com/maps/api/place/findplacefromtext/json?input=Museum%20of%20Contemporary%20Art%20Australia&inputtype=textquery&fields=formatted_address%2Cname%2Crating%2Copening_hours%2Cgeometry&key={GOOGLE_MAPS_API_KEY}"


    formatted_url = PLACES_URL + '?' + '&'.join(["{}={}".format(k,v) for k,v in params.items()])  # !
    # print(formatted_url)

    payload={}
    headers = {}

    # response = request("GET", url, headers=headers, data=payload)
    response = get(formatted_url,headers=headers, data=payload)
    # print(response.url)
    # print(response.text)
    return json.loads(response.text)


# venues = pd.read_csv('trivia_venues.csv',index_col=0)
# print(len(venues))

# for i in range(len(venues)):
#     params['address']=venues.loc[i,'Address']
#     req = requests.get(GOOGLE_MAPS_API_URL, params=params)
#     res = req.json()
#     result = res['results'][0]
#     lat = result['geometry']['location']['lat']
#     lng = result['geometry']['location']['lng']
#     venues.at[i,'lat']=lat
#     venues.at[i,'lng']=lng
#     print(i)
#     time.sleep(1) if i%50==0 else 1

# print(venues.loc[0:3,:])
# venues.to_csv('trivia_venues_loc.csv')

get_place(df['clean_name'][0])

{'candidates': [{'formatted_address': '1348 Florida Ave NW, Washington, DC 20009, United States',
   'geometry': {'location': {'lat': 38.9200265, 'lng': -77.0312801},
    'viewport': {'northeast': {'lat': 38.92145292989272,
      'lng': -77.02997327010728},
     'southwest': {'lat': 38.91875327010728, 'lng': -77.03267292989271}}}}],
 'status': 'OK'}

In [36]:
place_data = [get_place(nm) for nm in df['clean_name']]

In [41]:
[place for place in place_data if len(place['candidates'])]

[{'candidates': [{'formatted_address': '8150 Baltimore Ave, College Park, MD 20740, United States',
    'geometry': {'location': {'lat': 38.9908676, 'lng': -76.93432849999999},
     'viewport': {'northeast': {'lat': 38.99231942989272,
       'lng': -76.93305917010727},
      'southwest': {'lat': 38.98961977010728, 'lng': -76.9357588298927}}}},
   {'formatted_address': '8180 Maple Lawn Blvd, Fulton, MD 20759, United States',
    'geometry': {'location': {'lat': 39.148425, 'lng': -76.906903},
     'viewport': {'northeast': {'lat': 39.14969662989272,
       'lng': -76.90566587010728},
      'southwest': {'lat': 39.14699697010728, 'lng': -76.90836552989272}}}}],
  'status': 'OK'}]

In [43]:
addresses = [dict(address=place['candidates'][0]['formatted_address'],
    lat=place['candidates'][0]['geometry']['location']['lat'],
    lng=place['candidates'][0]['geometry']['location']['lng']) for place in place_data]

In [44]:
df_addr = pd.DataFrame.from_records(addresses)
df_fin = pd.concat([df.reset_index(drop=True), df_addr], axis=1)

In [47]:
df_fin.query('day=="Wednesday"')

Unnamed: 0,name,url,state,day,time,clean_name,address,lat,lng
1,Stadium Sports Bar,https://www.stadiumsportsdc.com/,DC,Wednesday,7:00PM,Stadium Sports Bar,"300 Tingey St SE, Washington, DC 20003, United...",38.875173,-77.001475
4,Across the Pond,https://www.acrosstheponddc.com/,DC,Wednesday,7:30PM,Across the Pond,"1732 Connecticut Ave NW, Washington, DC 20009,...",38.913633,-77.046017
5,Crown & Crow,http://www.thecrownandcrow.com/,DC,Wednesday,7:30PM,Crown & Crow,"1317 14th St NW, Washington, DC 20005, United ...",38.907862,-77.031566
6,Astro Beer Hall,https://www.astrobeerhall.com,DC,Wednesday,7:00PM,Astro Beer Hall,"1306 G St NW, Washington, DC 20005, United States",38.898153,-77.030278
9,The Fainting Goat,https://www.faintinggoatdc.com/,DC,Wednesday,7:00PM,The Fainting Goat,"1330 U St NW, Washington, DC 20009, United States",38.916773,-77.03084
14,The Big Stick,http://thebigstick.com/,DC,Wednesday,8:00PM,The Big Stick,"20 M St SE, Washington, DC 20003, United States",38.876657,-77.007631
15,Irish Channel (Chinatown),http://www.irishchanneldc.com/,DC,Wednesday,7:00PM,Irish Channel (Chinatown),"500 H St NW, Washington, DC 20001, United States",38.899468,-77.019158
20,Looney's Pub (Wednesday),http://www.looneyspubmd.com/college_park.html,MD,Wednesday,7:00PM,Looney's Pub,"8150 Baltimore Ave, College Park, MD 20740, Un...",38.990868,-76.934328
24,World of Beer (RVILL WED),https://worldofbeer.com/Locations/Rockville,MD,Wednesday,7:00PM,World of Beer (Rockville),"196 E Montgomery Ave #B, Rockville, MD 20850, ...",39.084257,-77.150331
29,Courthaus Social,mailto:courthaussocial@gmail.com?Subject=Distr...,VA,Wednesday,7:00PM,Courthaus Social,"2300 Clarendon Blvd, Arlington, VA 22201, Unit...",38.890194,-77.086535


In [None]:
df_fin.to_csv('trivia_venues_loc.csv')
