In [1]:
from __future__ import print_function, division
import requests
requests.__path__

['/Users/sean/anaconda3/envs/metis/lib/python3.7/site-packages/requests']

Request a page of listing from boats.com:

In [2]:
url = 'https://www.boats.com/boats-for-sale/?boat-type=sail&class=sail-cruiser&length=40-50ft&Page=1'
response = requests.get(url)
response.status_code

200

In [3]:
response.url

'https://www.boats.com/boats-for-sale/?boat-type=sail&class=sail-cruiser&length=40-50ft&page=1'

In [4]:
page = response.text

Load the page content into a BeautifulSoup object.

In [5]:
from bs4 import BeautifulSoup
soup = BeautifulSoup(page, "lxml")

In [6]:
#print(soup.prettify())

## Explorations to find the items I want:
Grab all the listings in the list on this page.  Then grab the html list items

In [7]:
all_list_items = soup.find(id='listings-srp').find_all('li')

Iterate through the listings collection to find the actual boat listing items that have this attribute:

In [8]:
print(all_list_items[10].attrs)

{'data-listing-id': '5637501'}


In [9]:
if 'data-listing-id' in all_list_items[10].attrs:
    print(True)

True


In [10]:
all_list_items[10].has_attr('data-listing-id')

True

The hyperlink (a tag) contains BoatID and the URL we need.

In [11]:
all_list_items[10].find('a').attrs

{'href': '/sailing-boats/2020-bali-4-5-5637501/',
 'data-reporting-click-product-id': '5637501',
 'data-reporting-click-listing-type': 'standard listing'}

In [12]:
print(all_list_items[10].find('a')['href'])

/sailing-boats/2020-bali-4-5-5637501/


After the above exploratory work I was able to create the code below to extract boat listing link data.

In [13]:
import pandas as pd
import requests
import sys
import os.path
import pickle
import time
import re
from bs4 import BeautifulSoup

# Initialize variables
# Note this url is missing the page:
url = 'https://www.boats.com/boats-for-sale/?boat-type=sail&class=sail-cruiser&length=40-50ft&page='
# When a page out of range is requested boat.com redirects to this page:
redirect_page = 'https://www.boats.com/boats-for-sale/?boat-type=sail'
# Path for storing the page number last scraped:
pagination_page_path = './data/pagination_page.pickle'
# Path for storing the data listings dataframe:
All_Listings_path = './data/All_Listings.pkl'
# Column definitions for the data listings dataframe:
columns = ['link', 'make_model', 'year', 'price', 'seller', 'location']

# If either of the two data files don't exist start from scratch.
# Otherwise, read ojbects from those files to resume scraping from where it left off.
if (not os.path.isfile(pagination_page_path)) or (not os.path.isfile(All_Listings_path)):
    # One or both data files don't exist so start from scratch (page 1).
    pagination_page = 1
    All_Listings = pd.DataFrame(index=['BoatID'],columns=columns)
    with open(pagination_page_path, 'wb') as f:
        # Pickle the 'pagination_page' using the highest protocol available.
        pickle.dump(pagination_page, f, pickle.HIGHEST_PROTOCOL)
    print('Saved initial boats.com pagination page number: 1')
else:
    # TODO: Should implement a try/catch here in case file is corrupt.
    # Load boats.com pagination page number and previously scraped boat data.
    with open(pagination_page_path, 'rb') as f:
        # The protocol version used is detected automatically, so we do not
        # have to specify it.
        pagination_page = pickle.load(f)
    All_Listings = pd.read_pickle(All_Listings_path)
    print('Read current boats.com pagination page:', pagination_page, 
          'Starting with:', len(All_Listings), 'boats.')
# Build/Complete the source url
url = url + str(pagination_page)

# Iterate over all pages of boat listings to pull needed information:
run_loop = True
while run_loop:
    # Get webpage content
    response = requests.get(url)
    if response.status_code != 200:
        sys.exit(response.status_code, ' Error for:', url)
    if response.url == redirect_page:  #This occurs when you ask for a page out of range.
        run_loop = False
        print('Boat.com redirected to all sailboats!  Delete data files to restart.')
        break
    page = response.text
    # Engage BeautifulSoup and a selected parser
    soup = BeautifulSoup(page, "lxml")

    # Extract the part of the DOM that contains the list of boats
    all_list_items = soup.find(id='listings-srp').find_all('li')
    # Loop through the list items to find the actual listings
    # Other items are advertisements and new boats
    for list_item in all_list_items:
        # Actual listing contain this attribute, others are advertisements
        if list_item.has_attr('data-listing-id'):
            # Extract the pieces of information contained in each list item
            BoatID = list_item['data-listing-id']
            # Duplicate listings are possible
            if BoatID in All_Listings.index.values:
                break
            link = list_item.find('a')['href']
            make_model = list_item.find('h2').string.strip()
            year = list_item.find(class_='year').string.strip()
            price = list_item.find(class_='price').text.strip().replace('$','')\
                .replace(',','').replace('\n','').replace('(Sale Pending)','')
            seller = list_item.find(class_='seller').text.replace('Seller ','').strip()
            location = list_item.find(class_='country').text.strip().title()
            new_boat = pd.DataFrame([[link, make_model, year, price, seller, location]],\
                                    columns=columns,index=[BoatID])
            All_Listings = All_Listings.append(new_boat)

    # Save the listings and the next page index in case something happens:
    All_Listings.to_pickle(All_Listings_path)
    pagination_page += 1
    with open(pagination_page_path, 'wb') as f:
        # Pickle the 'pagination_page' using the highest protocol available.
        pickle.dump(pagination_page, f, pickle.HIGHEST_PROTOCOL)
    report = 'Obtained and processed Boats.com page '+\
             str(int(pagination_page)-1).strip()+' and stored data: '+\
             str(len(All_Listings))+' boats scraped.\r'
    sys.stdout.write(report)
    sys.stdout.flush()
    url = re.sub(r'page=[0-9]+','page='+str(pagination_page),url)
    time.sleep(2)

All_Listings

Read current boats.com pagination page: 204 Starting with: 3021 boats.
Boat.com redirected to all sailboats!  Delete data files to restart.


Unnamed: 0,link,make_model,year,price,seller,location
BoatID,,,,,,
5637501,/sailing-boats/2020-bali-4-5-5637501/,Bali 4.5,2020,Request Price,"Cruising Yachts Unlimited, Inc","All Locations, California"
7213466,/sailing-boats/1996-beneteau-oc-400-7213466/,Beneteau OC 400,1996,89500,Sail Place Inc Kenosha Office,"Kenosha , Wisconsin"
7216014,/sailing-boats/2002-hunter-466-7216014/,Hunter 466,2002,125900,"Latitude Yacht Brokerage, LLC","Stamford, Connecticut"
7191853,/sailing-boats/2018-jeanneau-sun-odyssey-419-7...,Jeanneau Sun Odyssey 419,2018,265000,Bluenose Yacht Sales- Newport,"Portsmouth, Rhode Island"
...,...,...,...,...,...,...
601866,/sailing-boats/2002-beneteau-first-47-7-601866/,Beneteau First 47.7,2002,148085,Giulio Cesare Giua,"Roma, Italy"
3106946,/sailing-boats/2009-dufour-425-3106946/,Dufour 425,2009,108596,Star Cruising,"Toscana, Italy"
3275826,/sailing-boats/1987-custom-ketch-50-3275826/,Custom Ketch 50,1987,245643,Dickies of Bangor,United Kingdom
915071,/sailing-boats/2004-beneteau-oceanis-clipper-4...,Beneteau Oceanis Clipper 423,2004,193059,CCE Yachting,"Sardinia Island, Italy"


In [14]:
All_Listings.describe()

Unnamed: 0,link,make_model,year,price,seller,location
count,3020,3020,3020,3020,3020,3020
unique,3020,1405,80,1089,627,1075
top,/sailing-boats/1988-contest-48-5840617/,Bavaria Cruiser 46,2008,Request Price,Selymar Yachts,Spain
freq,1,49,171,90,85,72


Note: all links are now unique. (above)  I have 3020 boat links.  There are no duplicates:

In [15]:
All_Listings[All_Listings.duplicated()]

Unnamed: 0,link,make_model,year,price,seller,location


In [16]:
All_Listings[All_Listings.index.duplicated()]

Unnamed: 0,link,make_model,year,price,seller,location


In [17]:
All_Listings.tail()

Unnamed: 0,link,make_model,year,price,seller,location
601866,/sailing-boats/2002-beneteau-first-47-7-601866/,Beneteau First 47.7,2002,148085,Giulio Cesare Giua,"Roma, Italy"
3106946,/sailing-boats/2009-dufour-425-3106946/,Dufour 425,2009,108596,Star Cruising,"Toscana, Italy"
3275826,/sailing-boats/1987-custom-ketch-50-3275826/,Custom Ketch 50,1987,245643,Dickies of Bangor,United Kingdom
915071,/sailing-boats/2004-beneteau-oceanis-clipper-4...,Beneteau Oceanis Clipper 423,2004,193059,CCE Yachting,"Sardinia Island, Italy"
1015053,/sailing-boats/1973-nautor-swan-sparkman-steph...,Nautor Swan Sparkman & Stephens 44/039,1973,125000,"Sparkman & Stephens, LLC","Mamaroneck, New York"


In [18]:
response.url

'https://www.boats.com/boats-for-sale/?boat-type=sail'