In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

# Step 1 Scrape the info card

![info cards.png](images/info%20cards.png)

The first step is to scrape the info card of each POI (point of interest), which contains its title, geolocation,
the URL of its main article page, etc. Those ugly try/except clauses are a necessary evil:
not every info card has every field, so lookups can fail during scraping!

In [2]:
def parse_card(card, data):
    """Extract the fields of one info card and append them to ``data``.

    Parameters
    ----------
    card : object exposing ``select_one(css_selector)``
        One ``.CardWrapper`` element of a listing page.
    data : dict of str -> list
        Column accumulator. Must contain the keys 'title', 'country',
        'city', 'description_short', 'page_url', 'latitude', 'longitude'
        and 'img_url'. It is mutated in place.

    Every one of those keys receives exactly one new entry per call, so the
    lists stay the same length. A field that is absent from the card is
    recorded as the empty string.
    """
    def _extract(getter):
        # A selector without a match yields None, so `.text` raises
        # AttributeError and `[...]` raises TypeError; a matched tag that
        # lacks the attribute raises KeyError. Anything else is a real bug
        # and is allowed to surface.
        try:
            return getter()
        except (AttributeError, KeyError, TypeError):
            return ''

    # The card's anchor carries most of the metadata as attributes;
    # look it up once instead of once per field.
    link = card.select_one('div > a')

    getters = {
        'title': lambda: card.select_one('div > a > div > h3 > span').text,
        'country': lambda: link['data-country'],
        'city': lambda: link['data-city'],
        'description_short': lambda: card.select_one(
            'div > a > div > div.Card__content.js-subtitle-content').text.strip(),
        # Stored without a scheme; href is appended to the bare host name.
        'page_url': lambda: 'www.atlasobscura.com' + link['href'],
        'latitude': lambda: link['data-lat'],
        'longitude': lambda: link['data-lng'],
        'img_url': lambda: card.select_one('div > a > figure > img')['data-src'],
    }
    for key, getter in getters.items():
        data[key].append(_extract(getter))


In [3]:
# Column accumulator: one list per output column. parse_card appends to the
# card fields; the loop below records which listing page each row came from.
data = {column: [] for column in ('title', 'country', 'city', 'description_short',
                                  'page_url', 'latitude', 'longitude', 'img_url',
                                  'page')}

# Walk listing pages 1 through 144
for page_number in range(1, 145):
    listing_url = 'https://www.atlasobscura.com/things-to-do/united-kingdom/places?page={}'.format(page_number)
    response = requests.get(listing_url)
    listing = BeautifulSoup(response.content, 'html.parser')

    for info_card in listing.select('.CardWrapper'):
        parse_card(info_card, data)
        data['page'].append(page_number)
    print('Page {} done'.format(page_number))

    # Pause before requesting the next listing page
    time.sleep(5)

df = pd.DataFrame(data)

# Save the final result twice: an iteration-1 snapshot and the 'latest' file
for output_name in ('atlas-obscura-data-iter1.xlsx', 'atlas-obscura-data-latest.xlsx'):
    df.to_excel(output_name, index=False)

Page 1 done
Page 2 done
Page 3 done
Page 4 done


# Step 2 Scrape each article

![article.png](images/article.png)

Now that we have each POI's URL to its main article page, we can finally scrape the articles!
However, as this is going to be a long run,
it's important to save intermediate results every now and then.

In [4]:
def scrape_page(url, timeout=30):
    """Download one POI article page and extract the fields we keep.

    Parameters
    ----------
    url : str
        Address of the article page. ``parse_card`` stores these without a
        scheme ('www.atlasobscura.com/...'), so 'https://' is prepended
        unless the value already starts with 'http://' or 'https://'.
    timeout : float, optional
        Seconds handed to ``requests.get`` so a stalled connection raises
        instead of blocking the whole run.

    Returns
    -------
    dict
        Keys 'html', 'url-site', 'url-google-map', 'article' and 'tags',
        all strings. A piece that is not present on the page is ''.
        'article' is '' when the page has no ``#place-body`` element.

    Raises
    ------
    Whatever ``requests.get`` raises on network failure or timeout.
    """
    if not url.startswith(('http://', 'https://')):
        url = 'https://' + url
    page = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(page.content, 'html.parser')

    def _href(selector):
        # No match -> select_one returns None -> TypeError on indexing;
        # a matched tag without an href raises KeyError.
        try:
            return soup.select_one(selector)['href']
        except (TypeError, KeyError):
            return ''

    result = {}
    try:
        result['html'] = str(soup)
    except Exception:
        result['html'] = ''

    result['url-site'] = _href('#place-container > div.DDPage__content-row.grid-row > div.DDPageSiderail__column.grid-col-lg-4.grid-col-md-5 > div.DDPageSiderail > aside.DDPageSiderail__details > div.DDPageSiderail__website.hidden-print > a')
    result['url-google-map'] = _href('.DDPageSiderail__directions-link')

    # The article is the main body, plus the directions paragraph when present.
    # The previous fallback dereferenced '#place-body' again inside `except`,
    # so a page without it raised instead of yielding an empty article.
    body = soup.select_one('#place-body')
    directions = soup.select_one('#place-container > div.DDPage__content-row.grid-row > div.DDPageContent__column.grid-col-lg-6.grid-col-md-7 > div.DDP__direction-copy')
    if body is None:
        result['article'] = ''
    elif directions is None:
        result['article'] = body.text.strip()
    else:
        result['article'] = body.text.strip() + '\n' + directions.text.strip()

    result['tags'] = ', '.join(item.text.strip() for item in soup.select('.itemTags__link'))

    return result

In [None]:
# This cell is safe to run more than once: rows whose 'article' is still empty
# (never scraped, or failed last time) are retried, rows already filled are kept.
# The file is an Excel workbook, so it has to be read with read_excel.
df = pd.read_excel('atlas-obscura-data-latest.xlsx')
cols = ['url-site', 'url-google-map', 'article', 'tags', 'html']
for col in cols:
    # Only create columns that do not exist yet, so earlier results survive a re-run.
    if col not in df.columns:
        df[col] = ''
    # Blank cells are read back as NaN; normalise them to '' so the
    # "still missing" test below works on reloaded data.
    df[col] = df[col].astype(object).where(df[col].notna(), '')

# NOTE(review): Excel caps a cell at 32,767 characters, so the 'html' column
# is likely truncated or rejected on save - confirm, or store it elsewhere.
for i, (row, url) in enumerate(df['page_url'].items()):
    # Rows without a usable URL (blank in step 1, NaN after reload) are skipped.
    if df.at[row, 'article'] == '' and isinstance(url, str) and url:
        try:
            result = scrape_page(url)
        except Exception as exc:
            # Keep going: the row stays empty and is retried on the next run.
            print('POI {} failed: {}'.format(i, exc))
        else:
            for col in cols:
                # Single-step .at assignment; chained df[col].iloc[i] = ... may
                # write to a temporary copy.
                df.at[row, col] = result[col]
            print('POI {} Done'.format(i))
        time.sleep(2)
    if i % 200 == 0:
        # Checkpoint: numbered snapshot plus the rolling 'latest' file.
        df.to_excel('atlas-obscura-data-iter2-{}.xlsx'.format(i), index=False)
        df.to_excel('atlas-obscura-data-latest.xlsx', index=False)

# Save the final result
df.to_excel('atlas-obscura-data-iter2-{}.xlsx'.format('final'), index=False)
df.to_excel('atlas-obscura-data-latest.xlsx', index=False)
