In [1]:
import csv
import os
import urllib.request as u
import string as s
import xml.etree.ElementTree as ET
import pandas as pd

In [2]:
src = '../../../data/src/beer-mapping-updated/'

In [3]:
def xml2df(xml_data):
    root = ET.XML(xml_data) # element tree
    all_records = []
    for i, child in enumerate(root):
        record = {}
        for subchild in child:
            record[subchild.tag] = subchild.text
            all_records.append(record)
    return pd.DataFrame(all_records)

## Extract Breweries by State

In [4]:
API_KEY = '39e5e12ab391034227fa825c0e93e4b5'
url_request_start = 'http://beermapping.com/webservice/locstate/'
states = ["AL", "AK", "AZ", "AR", "CA", "CO", "CT", "DC", "DE", "FL", "GA", 
          "HI", "ID", "IL", "IN", "IA", "KS", "KY", "LA", "ME", "MD", 
          "MA", "MI", "MN", "MS", "MO", "MT", "NE", "NV", "NH", "NJ", 
          "NM", "NY", "NC", "ND", "OH", "OK", "OR", "PA", "RI", "SC", 
          "SD", "TN", "TX", "UT", "VT", "VA", "WA", "WV", "WI", "WY"]

for i, state in enumerate(states):
    url = url_request_start+API_KEY+'/'+state
    request = u.Request(url)
    response_raw = str(u.urlopen(request).read())
    response_formatted = response_raw[response_raw.find('<bmp_locations>'):-1].replace('&', 'and')
    xml_data = response_formatted
    if i==0:
        df_parsed = xml2df(xml_data)
    else:
        df_parsed = df_parsed.append(xml2df(xml_data))

df_parsed = df_parsed.drop_duplicates()
df_parsed = df_parsed.reset_index()
df_parsed = df_parsed.drop('index', 1)   

with pd.option_context('display.max_rows', None, 'display.max_columns', 3):
    print(df_parsed.head(n=5))

                                             blogmap  ...      zip
0  http://beermapping.com/maps/blogproxy.php?loci...  ...    35801
1  http://beermapping.com/maps/blogproxy.php?loci...  ...    35124
2  http://beermapping.com/maps/blogproxy.php?loci...  ...    35401
3  http://beermapping.com/maps/blogproxy.php?loci...  ...    35222
4  http://beermapping.com/maps/blogproxy.php?loci...  ...    35901

[5 rows x 14 columns]


## Export to CSV

In [5]:
# save data to csv file
fname = os.path.abspath(os.path.join(src, 'beermapping_breweries.csv'))
df_parsed.to_csv(fname, quoting=csv.QUOTE_NONNUMERIC, encoding='utf-8', index=False)

## Extract Image, Score and Map Data

In [6]:
# map info: http://beermapping.com/webservice/locmap/API_KEY/ID
# score info: http://beermapping.com/webservice/locscore/API_KEY/ID
# image info: http://beermapping.com/webservice/locimage/API_KEY/ID

map_url = 'http://beermapping.com/webservice/locmap/' + API_KEY + '/'
score_url = 'http://beermapping.com/webservice/locscore/' + API_KEY + '/'
image_url = 'http://beermapping.com/webservice/locimage/' + API_KEY + '/'

iter_tups = [(map_url, 'map'),(score_url, 'score'),(image_url, 'image')]
df_dict={}

for url, dtype in iter_tups:
    for index, row in df_parsed.iterrows():
        if row['id']:
            id_ = row['id']
        else:
            continue
        url_ = url + id_
        insert_id = '<id>' + str(id_) + '</id>'
        try:
            request = u.Request(url_)
            response_raw = str(u.urlopen(request).read())
        except:
            continue
        if index==0:
            response_formatted = response_raw[response_raw.find('<bmp_locations>'):
                                              response_raw.find('</bmp_locations>')].replace('&', 'and')
            response_formatted = response_formatted[:response_formatted.find('</location>')] + insert_id +'</location>'
        else:
            insert_s = response_raw[response_raw.find('<bmp_locations>'):
                                              response_raw.find('</bmp_locations>')].replace(
                '&', 'and').replace('<bmp_locations>','')
            if insert_s=='' or not insert_s:
                continue
            insert_s = insert_s[:insert_s.find('</location>')] + insert_id +'</location>'
            response_formatted += insert_s
    xml_data = response_formatted + '</bmp_locations>'
    print(xml_data)
    df_dict[dtype] = xml2df(xml_data)
    df_dict[dtype] = df_dict[dtype].drop_duplicates()
    print('Finished extracting ' + dtype + ' object.')
    print(df_dict[dtype].head(n=5))
    

<bmp_locations><location><name>1892 East Tavern</name><status>Beer Bar</status><lat>34.738449</lat><lng>-86.575702</lng><map></map><altmap></altmap><id>15896</id></location><location><name>AlaBrew</name><status>Homebrew</status><lat>33.333988</lat><lng>-86.782765</lng><map></map><altmap></altmap><id>8397</id></location><location><name>Alcove International Tavern</name><status>Beer Bar</status><lat>33.207921</lat><lng>-87.564704</lng><map></map><altmap></altmap><id>12810</id></location><location><name></name><status></status><lat>0.000000</lat><lng>0.000000</lng><map></map><altmap></altmap><id>15706</id></location><location><name></name><status></status><lat>0.000000</lat><lng>0.000000</lng><map></map><altmap></altmap><id>15004</id></location><location><name></name><status></status><lat>0.000000</lat><lng>0.000000</lng><map></map><altmap></altmap><id>20646</id></location><location><name></name><status></status><lat>0.000000</lat><lng>0.000000</lng><map></map><altmap></altmap><id>17056</

Finished extracting map object.
   altmap     id        lat         lng   map                         name  \
0    None  15896  34.738449  -86.575702  None             1892 East Tavern   
7    None   8397  33.333988  -86.782765  None                      AlaBrew   
14   None  12810  33.207921  -87.564704  None  Alcove International Tavern   
21   None  15706   0.000000    0.000000  None                         None   
28   None  15004   0.000000    0.000000  None                         None   

      status  
0   Beer Bar  
7   Homebrew  
14  Beer Bar  
21      None  
28      None  
<bmp_locations><location><overall>0</overall><selection>0</selection><service>0</service><atmosphere>0</atmosphere><food>0</food><reviewcount>0</reviewcount><fbscore></fbscore><fbcount></fbcount><id>15896</id></location><location><overall>0</overall><selection>0</selection><service>0</service><atmosphere>0</atmosphere><food>0</food><reviewcount>0</reviewcount><fbscore></fbscore><fbcount></fbcount><id>8397<

Finished extracting score object.
   atmosphere fbcount fbscore food     id overall reviewcount selection  \
0           0    None    None    0  15896       0           0         0   
9           0    None    None    0   8397       0           0         0   
18          0    None    None    0  12810       0           0         0   
27          0    None    None    0  15706       0           0         0   
36          0    None    None    0  15004       0           0         0   

   service  
0        0  
9        0  
18       0  
27       0  
36       0  


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.


Finished extracting image object.
   caption credit crediturl directurl height     id imagedate imageid  \
0     None   None      None      None   None  15896      None    None   
12    None   None      None      None   None   8397      None    None   
24    None   None      None      None   None  12810      None    None   
36    None   None      None      None   None  15706      None    None   
48    None   None      None      None   None  15004      None    None   

   imageurl score thumburl width  
0      None  None     None  None  
12     None  None     None  None  
24     None  None     None  None  
36     None  None     None  None  
48     None  None     None  None  


## Export to CSV

In [7]:
for url, dtype in iter_tups:
    fname = os.path.abspath(os.path.join(src, '%s.csv' % dtype))
    df_dict[dtype].to_csv(fname, 
                          quoting=csv.QUOTE_NONNUMERIC, encoding='utf-8', index=False)