# Boat Information Scraping
This notebook takes the output of the Scraping_Listing_Links notebook (the pickled All_Listings dataframe) and scrapes each of the webpages listed in its link column.  The details scraped for each boat are captured in a new dataframe.

[This](https://www.boats.com/sailing-boats/1996-beneteau-oc-400-7213466/) is a sample boat details page.  If that link is stale, follow any USED boat listing from [this](https://www.boats.com/boats-for-sale/?boat-type=sail&class=sail-cruiser&length=40-50ft) page.

In [151]:
import pandas as pd
import requests
import sys
import os.path
import pickle
import time
import re
from bs4 import BeautifulSoup
from bs4 import NavigableString

Read in the dataframe with the list of boat links:

In [2]:
All_Listings_path = './data/All_Listings.pkl'
# The pickled frame carries a stray, otherwise-empty first row whose index
# label is 'BoatID'; remove it.  errors='ignore' keeps this cell runnable if
# the upstream notebook ever stops emitting that row (a plain drop would
# raise KeyError), and chaining avoids mutating the frame in place.
All_Listings = pd.read_pickle(All_Listings_path).drop(['BoatID'], errors='ignore')
All_Listings.describe()

Unnamed: 0,link,make_model,year,price,seller,location
count,3020,3020,3020,3020,3020,3020
unique,3020,1405,80,1089,627,1075
top,/sailing-boats/2020-nautitech-46-open-7078170/,Bavaria Cruiser 46,2008,Request Price,Selymar Yachts,Spain
freq,1,49,171,90,85,72


In [347]:
# Configuration shared by the scraping helpers below.

# Pickle file holding the per-boat detail dataframe:
boats_df_path = './data/boats_df.pkl'
# Pickle file holding the per-boat 'features' dataframe:
boats_features_df_path = './data/boats_features_df.pkl'
# Pickle file holding the links that still have to be scraped:
links_to_scrape_path = './data/links_to_scrape.pkl'
# Pickle file holding the links whose scrape failed:
fails_path = './data/fails.pkl'
# Site root; concatenating it with a value from the link column gives the boat url:
url_root = 'https://www.boats.com'

# Columns of the boats_df dataframe:
boat_columns = [
    'price', 'make', 'model', 'year', 'cls',
    'length_ft', 'LOA_ft', 'displacement_lb', 'fuel_type', 'power_hp',
    'eng_make', 'eng_model', 'eng_year', 'eng_hrs', 'location',
]
# Columns of the boats_features_df dataframe:
feat_columns = [
    'electronics', 'inside equipment', 'outside equipment',
    'rigging', 'sails',
]
# Columns of the fails dataframe:
fails_columns = ['path']

In [362]:
def get_links_to_scrape(Restart=False):
    """Return the dataframe of listings that still have to be scraped.

    When a pickle exists at links_to_scrape_path and Restart is falsy, that
    pickle is loaded.  Otherwise a copy of All_Listings is returned.  Either
    way a one-line summary with the number of rows is printed.
    """
    resume = os.path.isfile(links_to_scrape_path) and not Restart
    if resume:
        remaining = pd.read_pickle(links_to_scrape_path)
        message = 'The list of links remaining to scrape was read:'
    else:
        remaining = All_Listings.copy()
        message = 'The list of links to scrape was initialized: '
    print(message, len(remaining),' boats.')
    return remaining

def get_boats_df(Restart=False):
    """Return the boat details dataframe, resuming from disk when possible.

    Loads the pickle at boats_df_path when it exists and Restart is falsy;
    otherwise returns an empty dataframe with the boat_columns columns.
    A short status line is printed in both cases.
    """
    have_saved = os.path.isfile(boats_df_path)
    if have_saved and not Restart:
        boats = pd.read_pickle(boats_df_path)
        print(len(boats),' boats read from file into boats_df.')
        return boats
    boats = pd.DataFrame(columns=boat_columns)
    print('boats_df was initialized: 0 boats.')
    return boats

def get_boats_features_df(Restart=False):
    """Return the boat features dataframe, resuming from disk when possible.

    Loads the pickle at boats_features_df_path when it exists and Restart is
    falsy; otherwise returns an empty dataframe with the feat_columns columns.
    A short status line is printed in both cases.
    """
    have_saved = os.path.isfile(boats_features_df_path)
    if have_saved and not Restart:
        features = pd.read_pickle(boats_features_df_path)
        print(len(features),' boats features read from file into boats_features_df.')
        return features
    features = pd.DataFrame(columns=feat_columns)
    print('boats_features_df was initialized: 0 boats.')
    return features

def get_fails_df(Restart=False):
    """Return the dataframe of failed scrapes, resuming from disk when possible.

    Loads the pickle at fails_path when it exists and Restart is falsy;
    otherwise returns an empty dataframe with the fails_columns columns.
    A short status line is printed in both cases.
    """
    have_saved = os.path.isfile(fails_path)
    if have_saved and not Restart:
        fails = pd.read_pickle(fails_path)
        print(len(fails),' failed scrapings read from file into fails_df.')
        return fails
    fails = pd.DataFrame(columns=fails_columns)
    print('failed_df was initialized: 0 boats.')
    return fails
    

def find_detail(section,data_id_strs):
    """Look up the values of labelled table rows inside a page section.

    For each entry of data_id_strs the section is searched for a <th> whose
    text matches the entry; the value is the first child of the <td> found
    in the same row.

    Parameters
    ----------
    section : parsed page element exposing find() (a BeautifulSoup tag).
    data_id_strs : list of header labels (strings or compiled patterns).

    Returns
    -------
    A list with one value per label, in order.  A value is None when the
    header is not found, when its row has no <td>, or when the <td> is
    empty.  NavigableString values are converted to plain str; any other
    node is returned unchanged.  When exactly one label is requested the
    bare value is returned instead of a one-element list.
    """
    return_list = []
    for item in data_id_strs:
        value = None
        header = section.find('th', text=item)
        if header is not None:
            cell = header.parent.find('td')
            # A header row without a <td>, or with an empty one, carries no
            # value; report None rather than aborting the whole scrape.
            if cell is not None and cell.contents:
                value = cell.contents[0]
                if isinstance(value, NavigableString):
                    value = str(value)
        return_list.append(value)
    if len(return_list) == 1:
        return_list = return_list[0]
    return return_list

def find_list(section,data_id_strs):
    """Locate the value cells of labelled table rows inside a page section.

    Works like find_detail but returns the nodes untouched (no conversion to
    str) so the caller can keep navigating the parsed tree from them.

    Parameters
    ----------
    section : parsed page element exposing find() (a BeautifulSoup tag).
    data_id_strs : list of header labels (strings or compiled patterns).

    Returns
    -------
    A list with, for each label in order, the first child of the <td> that
    shares a row with the matching <th>.  The entry is None when the header
    is not found, when its row has no <td>, or when the <td> is empty.
    When exactly one label is requested the bare entry is returned instead
    of a one-element list.
    """
    return_list = []
    for item in data_id_strs:
        node = None
        header = section.find('th', text=item)
        if header is not None:
            cell = header.parent.find('td')
            # A header row without a <td>, or with an empty one, carries no
            # value; report None rather than aborting the whole scrape.
            if cell is not None and cell.contents:
                node = cell.contents[0]
        return_list.append(node)
    if len(return_list) == 1:
        return_list = return_list[0]
    return return_list

def scrape_listing(listing):
    """Download one boat details page and extract its data.

    Parameters
    ----------
    listing : a row of the links dataframe.  listing.name is used as the
        boat ID and listing.link is appended to url_root to form the url.

    Returns
    -------
    (new_boat, new_feat, description) on success:
        new_boat    - one-row dataframe with the boat_columns columns,
        new_feat    - one-row dataframe with the feat_columns columns,
        both indexed by the boat ID;
        description - the element with class 'desc-text' (None if absent).
    ([], [], []) when the request was redirected or the page has no
    'accordion-content' section.  The caller treats a first element that is
    not a DataFrame as a failed scrape.

    Raises
    ------
    SystemExit when the response status code is not 200.
    """
    boatID = listing.name
    url = url_root + listing.link
    # Without a timeout a stalled connection would hang the whole run.
    response = requests.get(url, timeout=30)
    # Catch error response codes <> 200:
    if response.status_code != 200:
        # status_code is an int; it must be converted before concatenation.
        sys.exit(str(response.status_code) + ' Error for:' + url)
    if response.url != url:
        print('BoatID ',boatID, 'link was redirected.')
        return [],[],[]
    page = response.text
    # Engage BeautifulSoup and a selected parser
    soup = BeautifulSoup(page, "lxml")

    # Get info from the 'Boat Details' section
    details = soup.find(class_='accordion-content')
    if details is None:
        # Nothing to look values up in; report the listing as failed.
        print('BoatID ',boatID, 'page has no boat details section.')
        return [],[],[]
    make, model, year, price, cls, length, fuel_type, location = \
        find_detail(details,['Make','Model','Year','Price','Class',
                             'Length','Fuel Type','Location'])
    # Clean up price and length
    if price is not None: price = re.sub(r'[US$,]','',str(price))
    length = _strip_unit(length, 'ft')

    # Get info from the 'Measurements' section and clean off expected units:
    measurements = soup.find(id='measurements')
    if measurements is None:
        LOA, displacement = None, None
    else:
        LOA, displacement = find_detail(measurements,['LOA',re.compile("Displacement+")])
    displacement = _strip_unit(displacement, 'lb')

    # Get info from the 'Propulsion' section:
    propulsion = soup.find(id='propulsion')
    if propulsion is None:
        eng_make, eng_model, eng_year, pwr, eng_hrs = None, None, None, None, None
    else:
        eng_make, eng_model, eng_year, pwr, eng_hrs = find_detail(
            propulsion,
            ['Engine Make', 'Engine Model','Engine Year','Power',
             'Engine usage (hours)'])
    pwr = _strip_unit(pwr, 'hp')

    # Get info from the 'Features' section; one list per feat_columns entry:
    feature_headers = ['Electronics','Inside Equipment',
                       'Outside Equipment/Extras','Rigging','Sails']
    features = soup.find(id='features')
    if features is None:
        feature_lists = [[] for _ in feature_headers]
    else:
        feature_lists = [_feature_items(node)
                         for node in find_list(features, feature_headers)]

    description = soup.find(class_='desc-text')
    new_boat = pd.DataFrame([[price, make, model, year, cls, length, LOA,
                              displacement, fuel_type, pwr,
                              eng_make, eng_model, eng_year, eng_hrs,
                              location]],columns=boat_columns,index=[boatID])
    new_feat = pd.DataFrame([feature_lists],columns=feat_columns,index=[boatID])
    return new_boat, new_feat, description

def _strip_unit(value, unit):
    """Return value as text with unit removed and whitespace trimmed; None stays None."""
    if value is None:
        return None
    return str(value).replace(unit,'').strip()

def _feature_items(node):
    """Return the text of every <li> under node's parent, or None when node is None."""
    if node is None:
        return None
    # find_list hands back the first child of the <td>; step up to the <td>
    # itself to collect every list item.  Empty <li> elements are skipped.
    return [str(li.contents[0]) for li in node.parent.find_all('li') if li.contents]

def save_data(list_to_save):
    """Pickle each object of list_to_save to its paired path.

    list_to_save is a sequence of [object, path] entries; for every entry
    the object's to_pickle method is called with the path.  Returns None.
    """
    for entry in list_to_save:
        frame, destination = entry[0], entry[1]
        frame.to_pickle(destination)

In [363]:
# Start at the beginning or where we left off:
Restart = False
fails_df = get_fails_df(Restart)
boats_df = get_boats_df(Restart)
boats_features_df = get_boats_features_df(Restart)
links_to_scrape = get_links_to_scrape(Restart)
boats_scraped_since_last_pickle = 0

# Work through the remaining links one at a time.  Every processed link is
# removed from links_to_scrape, whether the scrape succeeded or failed.
while len(links_to_scrape)>0:
    listing = links_to_scrape.iloc[0]
    new_boat, new_feat, description = scrape_listing(listing)
    if isinstance(new_boat, pd.DataFrame):
        # DataFrame.append was removed in pandas 2.0; pd.concat is equivalent.
        boats_df = pd.concat([boats_df, new_boat])
        boats_features_df = pd.concat([boats_features_df, new_feat])
        # new_boat has exactly one row; .iloc[0] reads it positionally
        # without relying on how integer keys are resolved against the index.
        year = new_boat.year.iloc[0]
        make = new_boat.make.iloc[0]
        model = new_boat.model.iloc[0]
        report = 'Obtained and processed '
        if year is not None: report += str(year)
        if make is not None: report += ' ' + str(make)
        if model is not None: report += ' ' + str(model)
        report += '.  Total of ' + str(len(boats_df)) + ' boat listings scraped.\r'
        # Blank the previous status line before writing the new one.
        sys.stdout.write(" "*1000 + '\r')
        sys.stdout.write(report)
        sys.stdout.flush()
        boats_scraped_since_last_pickle += 1
    else:
        # scrape_listing signals failure with a non-DataFrame first element.
        fails_df = pd.concat([fails_df,
                              pd.DataFrame([[listing.link]],
                                           columns=fails_columns,
                                           index=[listing.name])],
                             sort=False)
    links_to_scrape.drop([listing.name],inplace=True)
    # Checkpoint after every 10 successfully scraped boats.
    if boats_scraped_since_last_pickle > 9:
        save_data([[boats_df,boats_df_path],
                   [boats_features_df,boats_features_df_path],
                   [links_to_scrape,links_to_scrape_path],
                   [fails_df,fails_path]])
        boats_scraped_since_last_pickle = 0
    # Pause between requests.
    time.sleep(5)

# Final checkpoint: without it the boats scraped since the last periodic
# save, and the final links/fails state, would never reach disk.
save_data([[boats_df,boats_df_path],
           [boats_features_df,boats_features_df_path],
           [links_to_scrape,links_to_scrape_path],
           [fails_df,fails_path]])
print('Complete')

1  failed scrapings read from file into fails_df.
124  boats read from file into boats_df.
124  boats features read from file into boats_features_df.
The list of links remaining to scrape was read: 2895  boats.
BoatID  7233077 link was redirected. Oceanis Clipper 423.  Total of 205 boat listings scraped.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       

BoatID  6935340 link was redirected.30.  Total of 1050 boat listings scraped.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                           

BoatID  7236991 link was redirected. Sun Odyssey 45.  Total of 1908 boat listings scraped.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                              

In [361]:
# Inspect the listing row most recently assigned by the scraping loop.
listing

link          /sailing-boats/2012-southerly-42-rst-7186391/
make_model                                 Southerly 42 RST
year                                                   2012
price                                                401113
seller                                    Clipper Marine HQ
location            Chichester, West Sussex, United Kingdom
Name: 7186391, dtype: object

In [364]:
# Number of rows collected in boats_df.
len(boats_df)

3001

In [365]:
# Number of rows collected in boats_features_df.
len(boats_features_df)

3001

In [366]:
# Number of links still waiting to be scraped.
len(links_to_scrape)

0

In [369]:
# Show the last 10 rows of boats_df.
boats_df.tail(10)

Unnamed: 0,price,make,model,year,cls,length_ft,LOA_ft,displacement_lb,fuel_type,power_hp,eng_make,eng_model,eng_year,eng_hrs,location
588501,€169000,Salona,45,2004,Cruiser (Sail),45,45 ft,,Diesel,,Volvo,,,,Spain
912268,€95000,Beneteau,Oceanis 411,1998,Cruiser (Sail),40,40 ft 6 in,,Diesel,50.0,Volvo,,1998.0,400.0,"Denia, Spain"
3113317,€205000,Wauquiez,Centurion 45,2004,Cruiser (Sail),46,45 ft 11 in,10500 kg,Diesel,75.0,Yanmar,,2004.0,2300.0,"Central Tyrrenian Sea, Italy"
678265,29000,Islander,P 40,1983,Cruiser (Sail),40,40 ft,,Diesel,,Pathfinder,,,,"Denison/Lake Texoma, Texas"
601838,€110000,Beneteau,First 44.7,2005,Racer,45,44 ft 11 in,9120 kg,Diesel,54.0,Yanmar,Yanmar,2005.0,,"Roma, Italy"
601866,€135000,Beneteau,First 47.7,2002,Cruiser/Racer,49,48 ft 7 in,11500 kg,Diesel,75.0,YANMAR,4JH3-TE,2002.0,,"Roma, Italy"
3106946,€99000,Dufour,425,2009,Cruiser (Sail),42,42 ft 4 in,8500 kg,Diesel,,Volvo,55 hp,2008.0,,"Toscana, Italy"
3275826,£199000,Custom,Ketch 50,1987,Cruiser (Sail),50,50 ft,,Diesel,,"1 X 85.0 DeutzDT4.29, Diesel",,,60.0,United Kingdom
915071,€176000,Beneteau,Oceanis Clipper 423,2004,Cruiser (Sail),42,41 ft 12 in,,Diesel,,,,,,"SARDINIA ISLAND, Italy"
1015053,125000,Nautor Swan,Sparkman & Stephens 44/039,1973,Cruiser (Sail),44,44 ft,28000,Diesel,55.0,Yanmar,4JH3E,2001.0,,"Mamaroneck, New York"


In [370]:
# Show the last 10 rows of boats_features_df.
boats_features_df.tail(10)

Unnamed: 0,electronics,inside equipment,outside equipment,rigging,sails
588501,[],[],[],[],[]
912268,"[Depthsounder, Radar, TV set, Plotter, DVD pla...","[Hot water, Refrigerator]",,[Steering wheel],"[Spinnaker, Furling mainsail, Furling genoa]"
3113317,"[Depthsounder, Radar, Log-speedometer, Wind sp...","[Bow thruster, Electric bilge pump, Oven, Manu...","[Teak cockpit, Cockpit shower, Teak sidedecks,...","[Steering wheel, Electric winch, Spinnaker pole]","[Fully battened mainsail, Spinnaker, Gennaker/..."
678265,[],[],[],[],[]
601838,"[Depthsounder, Log-speedometer, Wind speed and...","[Stern thruster, Electric bilge pump, Oven, Ma...","[Teak cockpit, Cockpit shower, Outboard engine...","[Steering wheel, Spinnaker pole]","[Fully battened mainsail, Storm jib, Battened ..."
601866,"[Depthsounder, Radar, Log-speedometer, Wind sp...","[Bow thruster, Electric bilge pump, Oven, Manu...","[Teak cockpit, Cockpit shower, Teak sidedecks,...","[Steering wheel, Electric winch]","[Fully battened mainsail, Gennaker/Cruising sp..."
3106946,"[Depthsounder, Log-speedometer, Wind speed and...","[Electric bilge pump, Oven, Manual bilge pump,...","[Teak cockpit, Cockpit shower, Outboard engine...",[Steering wheel],"[Fully battened mainsail, Furling genoa]"
3275826,[],[],[],[],[]
915071,[],[],[],[],[]
1015053,"[Depthsounder, Radar, Log-speedometer, Wind sp...","[Electric bilge pump, Oven, Manual bilge pump,...","[Teak cockpit, Teak sidedecks, Cockpit cushions]","[Steering wheel, Spinnaker pole]","[Fully battened mainsail, Storm jib, Battened ..."


In [371]:
# Show the listings recorded as failed scrapes.
fails_df

Unnamed: 0,path
5976749,/sailing-boats/1989-hans-christian-41t-5976749/
7233077,/sailing-boats/2008-grand-soleil-cantiere-del-...
7170010,/sailing-boats/2008-jeanneau-sun-odyssey-42-ds...
7142064,/sailing-boats/1987-belliure-40-7142064/
7137736,/sailing-boats/2002-catalina-42-mkii-7137736/
7130989,/sailing-boats/2007-dufour-425-grand-large-713...
7007101,/sailing-boats/2007-jeanneau-sun-odyssey-45-70...
6996130,/sailing-boats/2002-fountaine-pajot-belize-43-...
6935340,/sailing-boats/1994-island-packet-40-6935340/
6828202,/sailing-boats/2007-dufour-485-grand-large-682...
