# Atlas Obscura scraper

In [72]:
import pandas as pd
from bs4 import BeautifulSoup as bs
import requests
import json
import time
import re
import pickle
import os
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
from collections import Counter

### getting links to crawl through

the following cell is marked as "raw" so it won't run with the notebook, since it's already scraped. To "turn it back on", select and press **Y**

In [10]:
# to open them later:
with open('data/listoflocations', 'rb') as infile:
    listcopy = pickle.load(infile)
    
print(listcopy[0:3])

['/things-to-do/australia/places', '/things-to-do/canada/places', '/things-to-do/china/places']


### crawling through each location page

now that we have all the links for locations directories from Atlas Obscura, we can crawl those to get the links for each individual location.

In [153]:
# function to crawl a location and get links to destinations
def crawlao(aourl):
    links = []
    pagecount = 1
    hitend = False
    while hitend is False:
        page = requests.get('https://www.atlasobscura.com' + aourl + '?page=' + str(pagecount))
        bspage = bs(page.text, 'html.parser')
        bslocs = bspage.find_all('a', 'content-card content-card-place')
        linktmp = [x.get('href') for x in bslocs]
        if len(linktmp) < 1:
            hitend = True
            print('we have reached the end')
        else:
            links += linktmp
            print(str(pagecount), sep=' ', end=' ', flush=True)
            #print('scraping page ' + str(pagecount) + ' of ' + aourl)
            pagecount += 1
    return links

Again, we're turning the cell "off" by turning it into raw - we don't want to scrape all those pages multiple times!

In [20]:
# load saved list:
with open('data/masterlinks', 'rb') as infile:
    aolinks = pickle.load(infile)
    
aolinks.keys()

# example way to subset the links by country, so we can quickly see what the data structure looks like
sublinks = {k: aolinks[k] for k in ('ghana', 'kenya')}
sublinks

{'ghana': ['/places/kakum-canopy-walk',
  '/places/sacred-crocodile-ponds-of-paga',
  '/places/mole-national-park',
  '/places/cape-coast-castle',
  '/places/tengzug-shrine',
  '/places/larabanga-mosque',
  '/places/ga-adangbe-caskets',
  '/places/agbogbloshie'],
 'kenya': ['/places/gedi-ruins',
  '/places/giraffe-manor',
  '/places/giraffe-centre',
  '/places/kitum-cave',
  '/places/marafa-depression',
  '/places/mara-river-crossing',
  '/places/carnivore-restaurant-nairobi',
  '/places/kitengela',
  '/places/donkeys-of-lamu-island',
  '/places/nairobi-railway-museum',
  '/places/george-adamson-s-grave',
  '/places/iten-home-of-champions']}

In [14]:
# count the total number of links I'll have to scrape
# len(value) for key, value in d.items()
sum([len(x) for _,x in aolinks.items()])

16491

### scrape all the links!

In [26]:
# create the directory where we'll save the scraped pages

if os.path.exists('jsons') == False: os.makedirs('jsons')

'''errors with scraping are pretty common,
this is a simple checkpointing in case it gets messed up
'''

if os.path.isfile('data/AOmasterdata'):
    AOdata = pickle.load(open('data/AOmasterdata', 'rb'))  
else:
    AOdata = pd.DataFrame()


In [28]:
# set scraping session parameters
session = requests.Session()
retry = Retry(connect=3, backoff_factor=0.5)
adapter = HTTPAdapter(max_retries=retry)
session.mount('http://', adapter)
session.mount('https://', adapter)

CPU times: user 170 µs, sys: 1 µs, total: 171 µs
Wall time: 175 µs


do some cleaning up due to the multiple merging of tables...

In [40]:
AOdata.reset_index(drop = True, inplace = True)

In [67]:
print('duplicated ids: ' + str(sum(AOdata.duplicated('id'))))
print('duplicated titles: ' + str(sum(AOdata.duplicated('title'))))

'''
There are several entries with duplicated titles that are NOT duplicated;
for example, there are muliple "Hidden Beach" in different cities.
Important to subset by "id", not "title".
'''

#AOdata[AOdata['title'] == 'Soumaya Museum' ]

# dupnamesnotid = list(AOdata[AOdata.duplicated('title') & ~AOdata.duplicated('id') ]['title'])

# for i in dupnamesnotid:
#     print(AOdata[AOdata['title'] == i])

#AOdata[AOdata['title'] == 'Titanic Memorial']

duplicated ids: 2118
duplicated titles: 2141


'\nThere are several entries with duplicated titles that are NOT duplicated;\nfor example, there are muliple "Hidden Beach" in different cities.\nImportant to subset by "id", not "title".\n'

In [193]:
AOdata_clean = AOdata.drop_duplicates('id').reset_index(drop=True)

AOdata_clean['keywords'] = AOdata_clean['keywords'].replace('(,section-Atlas)|(section-Atlas)', '', regex=True)

print("raw table: " + str(len(AOdata)))
print("clean table: " + str(len(AOdata_clean)))

# with open('AOmasterdata', 'wb') as outfile:
#     pickle.dump(AOdata_clean, outfile)

raw table: 16233
clean table: 14115


In [101]:
with open('data/AOmasterdata-full-week3', 'wb') as outfile:
     pickle.dump(AOdata, outfile)

with open('data/AOmasterdata-nodup-week3', 'wb') as outfile:
     pickle.dump(AOdata_clean, outfile)

In [102]:
megastring = ",".join(AOdata_clean['keywords'].tolist())

keys = megastring.split(",")

keystable = pd.DataFrame(Counter(keys), index=['count']).T.reset_index().rename(columns={'index':'keyword'})

keystable = keystable.sort_values(by='count', ascending=False).reset_index(drop=True)

print(keystable.head())

keystable.to_csv('data/keywords.csv')

                   keyword  count
0   architectural oddities   1407
1                  museums   1385
2  museums and collections   1201
3                      art    915
4                  history    912


In [127]:
AOdata_clean[AOdata_clean['id'] == 7898]

Unnamed: 0,id,title,subtitle,city,country,location,url,physical_status,lat,lng,keywords,description
6282,7898,Museum de Gevangenpoort (The Prison Gate Museum),A seven-century-old jail tells a not-so-pleasa...,The Hague,Netherlands,"The Hague, Netherlands",https://www.atlasobscura.com/places/the-prison...,,52.079508,4.310292,"prisons,museums,museums and collections,tortur...",Ironically (or appropriately) situated in the ...


In [145]:
# Pull JPG URL from files

img_url = []

aofile = [x for x in os.listdir('jsons') if x.endswith(".html")]


for entry in aofile:
    with open('jsons/'+entry, 'r') as infile:
        aohtml_pre = infile.read()

    aohtml = bs(aohtml_pre, 'html.parser')

    # get JSON info
    summaryinfo = aohtml.find('script', string=re.compile('AtlasObscura.current_place')).text
    # remove trailing garbage
    summaryinfo = re.sub('(\s+)|(;)|(AtlasObscura.current_place = )', ' ', summaryinfo)
    json_summaryinfo = json.loads(summaryinfo)

    img_url.append({'id': json_summaryinfo['id'], 
              'imgurl': aohtml.find('a',{"class": "js-trigger-lightbox gallery-image-container"})['data-lightbox-src']})
    
    # save file with the id as its new name
    with open('jsons_byid/'+str(json_summaryinfo['id'])+'.html', "w") as file:
        file.write(str(aohtml_pre))

img_url_df = pd.DataFrame(img_url)

In [151]:
# check if all values are found in the main dataset
set(img_url_df['id']).issubset(AOdata_clean['id'])

True

In [199]:
# making sure the merge is successful:
print(pd.merge(AOdata_clean, img_url_df, on='id', how='left', indicator=True).groupby('_merge').count()['id'])

AOdata_clean = pd.merge(AOdata_clean, img_url_df, on='id', how='left')

AOdata_clean.head(3)

_merge
left_only         0
right_only        0
both          14115
Name: id, dtype: int64


Unnamed: 0,id,title,subtitle,city,country,location,url,physical_status,lat,lng,keywords,description,imgurl_x,imgurl_y
0,5561,Lake Hillier,An Australian lake whose pink hue defies scien...,,Australia,Australia,https://www.atlasobscura.com/places/lake-hillier,,-34.094179,123.203276,"wonders of salt,natural wonders,watery wonders","From a distance, Lake Hillier of Australia’s R...",https://assets.atlasobscura.com/media/W1siZiIs...,https://assets.atlasobscura.com/media/W1siZiIs...
1,4290,Gippsland Lakes Bioluminescence,Australian Lake set aglow by Noctiluca Scintil...,Raymond Island,Australia,"Raymond Island, Australia",https://www.atlasobscura.com/places/lake-gipps...,,-37.922431,147.791342,"watery wonders,bioluminescence","The conditions were rare, and they were perfec...",https://assets.atlasobscura.com/media/W1siZiIs...,https://assets.atlasobscura.com/media/W1siZiIs...
2,596,The Haunted Bookshop,"Occult books, Tarot, Oddities.",Melbourne,Australia,"Melbourne, Australia",https://www.atlasobscura.com/places/haunted-bo...,,-37.815472,144.961689,"repositories of knowledge,occult,bookstores",The creepy Haunted Bookshop in Melbourne is lo...,https://assets.atlasobscura.com/media/W1siZiIs...,https://assets.atlasobscura.com/media/W1siZiIs...


In [198]:
with open('data/AOmasterdata-nodup-week3', 'wb') as outfile:
     pickle.dump(AOdata_clean, outfile)