In [None]:
# If last run crashed, uncomment following line
#driver.close()

# Load necessary libraries
import time
import dateutil.parser # No longer needed if dates won't be parsed on the fly 
import pandas as pd
from ast import literal_eval
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Set chrome options
chrome_options = Options()
chrome_options.add_argument("--no-sandbox")
chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
chrome_options.add_experimental_option('useAutomationExtension', False)
#chrome_options.add_argument("--headless")
# Chrome parses this switch as "width,height"; the former "1920x1080"
# spelling is not a valid value for it.
chrome_options.add_argument("--window-size=1920,1080")
# Content-setting preferences for images.
# NOTE(review): value 2 presumably means "block" (skip image downloads to
# speed up page loads) -- confirm against Chrome's preference documentation.
chrome_prefs = {
    "profile.default_content_settings": {"images": 2},
    "profile.managed_default_content_settings": {"images": 2},
}
chrome_options.add_experimental_option("prefs", chrome_prefs)

# Define webdriver
driver = webdriver.Chrome(options=chrome_options)


# Load book links to scrape
# Every non-blank line of the file is parsed with literal_eval; the rest of
# the script reads each entry through .get('bookUrl'), .get('score') and
# .get('votes'). Blank lines (e.g. a trailing newline at the end of the file)
# are skipped, because literal_eval raises on an empty string.
book_links = []
with open('list_data_all_BBE.txt') as f:
    for line in f:
        line = line.strip()
        if line:
            book_links.append(literal_eval(line))
    
# Define necessary functions for scraper
# All functions using .find_element should catch NoSuchElementException;
# otherwise a crash will occur if the element is not present.
# Alternatively, we could use .find_elements and check for a non-empty list.
def get_bookId(string):
    """Return the part of *string* after its last "/".

    When *string* contains no "/", the whole string is returned; when it
    ends with "/", the result is the empty string.
    """
    _, _, last_segment = string.rpartition("/")
    return last_segment
    
def get_title(driver):
    """Return the text of the element with id ``bookTitle``.

    NoSuchElementException is deliberately NOT caught here: the main
    scraping loop calls this function inside a retry loop and uses that
    exception as the signal that the page has not loaded correctly.
    """
    return driver.find_element(By.ID, 'bookTitle').text

def get_series(driver):
    """Return the text of the ``bookSeries`` element without surrounding parentheses.

    Returns "" when the element is not present, in line with the other
    optional-field getters in this file.
    """
    try:
        series = driver.find_element(By.ID, 'bookSeries').text
    except NoSuchElementException:
        return ""
    # strip('()') removes any run of "(" / ")" characters at both ends.
    return series.strip('()')
    
def get_author(driver):
    """Return the text of the ``bookAuthors`` element without its leading "by " label.

    Only a prefix is removed. The previous ``replace("by ", "")`` also deleted
    every later "by " in the text, which corrupted names containing it
    (e.g. "Bobby Brown" became "BobBrown").

    Returns "" when the element is not present.
    """
    try:
        author = driver.find_element(By.ID, 'bookAuthors').text
    except NoSuchElementException:
        return ""
    prefix = "by "
    if author.startswith(prefix):
        author = author[len(prefix):]
    return author

def get_rating(driver):
    """Return the text of the ``<span itemprop="ratingValue">`` element as a string.

    Returns "" when the element is not present.
    """
    try:
        return str(driver.find_element(By.XPATH, '//span[@itemprop="ratingValue"]').text)
    except NoSuchElementException:
        return ""

def get_description(driver):
    """Return the innerText of a ``<span>`` inside the ``description`` element.

    When more than one span is found the second one is used, when exactly
    one is found that one is used, and "" is returned when there is none.
    The page is queried once; the original repeated the same XPath lookup
    up to four times per call.
    """
    spans = driver.find_elements(By.XPATH, '//*[(@id = "description")]//span')
    if not spans:
        return ""
    # Prefer the second span when present, otherwise fall back to the only one.
    chosen = spans[1] if len(spans) > 1 else spans[0]
    return chosen.get_attribute('innerText')

def get_language(driver):
    """Return the innerText of the element with ``itemprop="inLanguage"``, or "" if absent."""
    try:
        return driver.find_element(By.XPATH, '//*[@itemprop="inLanguage"]').get_attribute('innerText')
    except NoSuchElementException:
        return ""

def get_isbn(driver):
    """Return the innerText of the first element with ``itemprop="isbn"``.

    When no such element exists, the placeholder "9999999999999" is returned
    (this getter uses a sentinel instead of an empty string).
    The page is queried once instead of twice.
    """
    matches = driver.find_elements(By.XPATH, '//*[@itemprop="isbn"]')
    if not matches:
        return "9999999999999"
    return matches[0].get_attribute('innerText')

def get_genres(driver):
    """Return a list of genre labels found on the page.

    For every element with class ``elementList``, the text of its child
    with class ``left`` is collected; elements without such a child are
    skipped. A label of the form "A > B ..." is reduced to its second
    component ("B"); any other label is kept unchanged.
    """
    genres = []
    for e in driver.find_elements(By.CLASS_NAME, "elementList"):
        try:
            genres.append(e.find_element(By.CLASS_NAME, 'left').text)
        except NoSuchElementException:
            pass
    # Test for the exact separator that is split on: checking for a bare '>'
    # raised IndexError on labels containing '>' without surrounding spaces.
    return [x.split(' > ')[1] if ' > ' in x else x for x in genres]

def get_bookFormat(driver):
    """Return the innerText of the element with ``itemprop="bookFormat"``, or "" if absent."""
    try:
        return driver.find_element(By.XPATH, '//*[@itemprop="bookFormat"]').get_attribute('innerText')
    except NoSuchElementException:
        return ""

def get_edition(driver):
    """Return the innerText of the element with ``itemprop="bookEdition"``, or "" if absent."""
    try:
        return driver.find_element(By.XPATH, '//*[@itemprop="bookEdition"]').get_attribute('innerText')
    except NoSuchElementException:
        return ""

def get_pages(driver):
    """Return the page count as a string, or "" if the element is absent.

    The value is the innerText of the element with
    ``itemprop="numberOfPages"`` with the " pages" suffix removed.
    """
    try:
        pages = driver.find_element(By.XPATH, '//*[@itemprop="numberOfPages"]').get_attribute('innerText')
    except NoSuchElementException:
        return ""
    return pages.replace(" pages", "")

def get_characters(driver):
    """Return the innerText of every link whose href contains "/characters/".

    The result is a list in document order; it is empty when the page has
    no such links.
    """
    links = driver.find_elements(By.XPATH, '//a[contains(@href, "/characters/")]')
    return [link.get_attribute('innerText') for link in links]

#############
# The following three functions are not nice.
# It might be possible to define just one function that returns all three values, as all of them
# are stored on the same element <- that would also obfuscate the code. WIP

def get_publisher(driver):
    """Return the publisher name parsed from the second ``div.row`` element.

    The element's innerText is split on " by ". Only when that yields
    exactly two parts is the second part used, truncated at the first
    " (f". In every other case, including a missing element, "" is
    returned.
    """
    try:
        details = driver.find_element(By.XPATH, '(//div[@class="row"])[2]').get_attribute('innerText')
    except NoSuchElementException:
        return ""
    parts = details.split(" by ")
    if len(parts) != 2:
        return ""
    # Cut off a trailing parenthesised note that starts with " (f".
    return parts[1].split(" (f")[0]

def get_publishDate(Driver):
    """Return the publication date text parsed from the second ``div.row`` element.

    The element's innerText is split on " by ". With exactly two parts,
    the first part minus "Published " is returned; otherwise the first
    part is additionally truncated at its first "(". The date is returned
    as raw page text (it is not parsed or reformatted). Returns "" when
    the element is missing.

    The parameter keeps its original capitalised name for backward
    compatibility. It is now actually used: the original body ignored it
    and read the module-level ``driver`` instead.
    """
    try:
        details = Driver.find_element(By.XPATH, '(//div[@class="row"])[2]').get_attribute('innerText')
    except NoSuchElementException:
        return ""
    parts = details.split(" by ")
    if len(parts) == 2:
        return parts[0].replace("Published ", "")
    return parts[0].split("(")[0].replace("Published ", "")
    
def get_firstPublishDate(Driver):
    """Return the first-publication date formatted as "%m/%d/%y", or "".

    Scans the elements with class ``row`` and uses the first one that
    contains a ``<nobr>`` whose text includes "shed ". The text after that
    marker (with ")" stripped from both ends) is parsed with
    ``dateutil.parser.parse``.

    Returns "" when no such row exists or when the text cannot be parsed.

    The parameter keeps its original capitalised name for backward
    compatibility. It is now actually used: the original body ignored it
    and read the module-level ``driver`` instead.

    NOTE: "%y" is a two-digit year, so the century is lost in the output
    (1813 and 2013 both render as "13"). The format is kept unchanged so
    existing output stays comparable.
    """
    for row in Driver.find_elements(By.CLASS_NAME, 'row'):
        nobrs = row.find_elements(By.TAG_NAME, 'nobr')
        if not nobrs:
            continue
        parts = nobrs[0].text.split("shed ")
        if len(parts) < 2:
            # No "shed " marker in this <nobr>; try the next row
            # instead of failing with IndexError.
            continue
        try:
            parsed = dateutil.parser.parse(parts[1].strip(")"))
        except (ValueError, OverflowError):
            # Unparseable date text: report "no date" rather than aborting the scrape.
            return ""
        return parsed.strftime("%m/%d/%y")
    return ""

#############

def get_awards(driver):
    """Return the innerText of every element with class ``award`` (empty list if none)."""
    return [award.get_attribute('innerText') for award in driver.find_elements(By.CLASS_NAME, 'award')]

def get_numRatings(driver):
    """Return the ``content`` attribute of ``<meta itemprop="ratingCount">``, or "" if absent."""
    try:
        meta = driver.find_element(By.XPATH, '//meta[@itemprop="ratingCount"]')
    except NoSuchElementException:
        return ""
    return meta.get_attribute('content')
    
def get_ratingsByStars(driver):
    """Return the per-star rating counts as a list of strings.

    The counts are taken from the ``<script type="text/javascript+protovis">``
    element: the text between the first "[" and the next "]" is split on
    ", ". Returns [] when the script element is missing or contains no "[".
    """
    try:
        script = driver.find_element(By.XPATH, '//script[@type="text/javascript+protovis"]')
    except NoSuchElementException:
        return []
    try:
        return script.get_attribute('innerText').split('[')[1].split(']')[0].split(', ')
    except IndexError:
        # Script present but without a "[...]" literal.
        return []

def get_likedPercent(driver):
    """Return the innerText of the element matched by ``#rating_distribution+ .value``.

    Each attempt clicks the ``rating_details`` link, waits up to 20 s for
    the ``div.close`` element to become visible, clicks it, and then reads
    the value. On TimeoutException the function sleeps (20 s, growing by
    20 s per attempt) and retries, up to 10 attempts.

    Returns "" when a required element does not exist, and also when every
    attempt timed out. The original raised UnboundLocalError in the latter
    case because the result variable was never assigned.
    """
    likedPercent = ""
    for attempt in range(10):
        try:
            driver.find_element(By.XPATH, '//a[@id="rating_details"]').click()
            WebDriverWait(driver, 20).until(EC.visibility_of_element_located((By.XPATH, '//div[@class="close"]'))).click()
            likedPercent = driver.find_element(By.CSS_SELECTOR, '#rating_distribution+ .value').get_attribute('innerText')
        except NoSuchElementException:
            return ""
        except TimeoutException:
            # Back off progressively before the next attempt.
            time.sleep(20+attempt*20)
        else:
            break

    return likedPercent

def get_setting(driver):
    """Return the list of settings (places) linked from the page.

    For every link whose href contains "/places/": if its first following
    sibling has non-empty innerText, the entry is the link's innerText,
    a space, and the sibling's innerText; otherwise the entry is the
    link's ``text`` attribute. Newlines are removed from every entry.

    The sibling is looked up once per link, and a link without any
    following sibling is treated like one with an empty sibling instead of
    raising NoSuchElementException.
    """
    setting = []
    for e in driver.find_elements(By.XPATH, '//a[contains(@href, "/places/")]'):
        try:
            sibling_text = e.find_element(By.XPATH, "following-sibling::*").get_attribute("innerText")
        except NoSuchElementException:
            sibling_text = ""
        if sibling_text:
            setting.append(e.get_attribute('innerText') + " " + sibling_text)
        else:
            setting.append(e.get_attribute('text'))
    return [x.replace("\n", "") for x in setting]

def get_coverImg(driver):
    """Return the ``src`` attribute of ``<img id="coverImage">``, or "" if absent."""
    try:
        return driver.find_element(By.XPATH, '//img[@id="coverImage"]').get_attribute('src')
    except NoSuchElementException:
        return ""

# Time control
startTime = time.time()

# Book info extraction and broken link list
books = []
broken = []

# Define range of books <- JUST FOR TESTING
startBook = 0
# Clamp the upper bound so the loop never indexes past the end of book_links
# when the input file holds fewer than 20000 entries.
endBook = min(20000, len(book_links))

# Iteration over a small range for testing, should be iterating over the full list.
for i in range(startBook, endBook):
    bookUrl = book_links[i].get('bookUrl')
    driver.get(bookUrl)

    # Print some progress
    if i%500 == 0:
        print(i)
    elif i%100 == 0:
        print(" "+str(int((i-startBook)*100/(endBook-startBook)))+'% ', end = '')
    elif i%10 == 0:
        print('.', end = '')

    # Wait for login popup and close (will open on second page)
    if i == startBook+1:
        try:
            WebDriverWait(driver, 20).until(EC.visibility_of_element_located((By.XPATH, '(//img[@alt="Dismiss"])[2]'))).click()
        except TimeoutException:
            pass

    # Skip broken pages
    if driver.find_element(By.XPATH, '//head').get_attribute('innerText') == "":
        print("Broken")
        broken.append(bookUrl)
        continue

    # Test for error, title always presents, if not found, wait and retry
    title = None
    for attempt in range(10):
        try:
            title = get_title(driver)
        except NoSuchElementException:
            print("ooops, try: "+str(i))
            #driver.save_screenshot("error_"+str(i)+".png")
            time.sleep(20+attempt*20)
        else:
            break

    # Every attempt failed: record the link as broken instead of carrying on
    # with the previous book's title (or an undefined name on the first book).
    if title is None:
        print("Broken")
        broken.append(bookUrl)
        continue

    book = {
        "bookId": get_bookId(bookUrl),
        "title": title,
        "series": get_series(driver),
        "author": get_author(driver),
        "rating": get_rating(driver),
        "description": get_description(driver),
        "language": get_language(driver),
        "isbn": get_isbn(driver),
        "genres": get_genres(driver),
        "characters": get_characters(driver),
        "bookFormat": get_bookFormat(driver),
        "edition": get_edition(driver),
        "pages": get_pages(driver),
        "publisher": get_publisher(driver),
        "publishDate": get_publishDate(driver),
        "firstPublishDate": get_firstPublishDate(driver),
        "awards": get_awards(driver),
        "numRatings": get_numRatings(driver),
        "ratingsByStars": get_ratingsByStars(driver),
        "likedPercent": get_likedPercent(driver),
        "setting": get_setting(driver),
        "coverImg": get_coverImg(driver),
        "bbeScore": book_links[i].get('score'),
        "bbeVotes": book_links[i].get('votes')
    }
    books.append(book)

    # Partial save: rewrites the whole file with everything scraped so far.
    # UTF-8 is set explicitly so non-ASCII titles/authors do not depend on the
    # platform's default encoding.
    if i%250 == 0:
        with open('books_partial_20k.txt', 'w', encoding='utf-8') as f:
            # Separate loop name so the current `book` is not rebound.
            for saved in books:
                f.write("%s\n" % saved)

# Time control
endTime = time.time()
print("--- %s seconds ---" % (endTime - startTime))

# Save books to file for further development (one dict repr per line)
with open('books_1_20k.txt', 'w', encoding='utf-8') as f:
    for saved in books:
        f.write("%s\n" % saved)

driver.close()

In [None]:
# Stand-alone cell that closes the browser window held by `driver`.
# NOTE(review): presumably meant to be run by hand after a crashed run (see
# the note at the top of the first cell); after a successful run the
# scraping cell has already called driver.close() itself.
driver.close()

In [19]:
# Build a DataFrame from the list of scraped book dicts (one record per book).
df = pd.DataFrame.from_records(books)
# Bare expression so the notebook renders the DataFrame as the cell output.
df

Unnamed: 0,bookId,title,series,author,rating,description,language,isbn,genres,characters,...,publishDate,firstPublishDate,awards,numRatings,ratingsByStars,likedPercent,setting,coverImg,bbeScore,bbeVotes
0,2767052-the-hunger-games,The Hunger Games,The Hunger Games #1,Suzanne Collins,4.33,WINNING MEANS FAME AND FORTUNE.LOSING MEANS CE...,English,9780439023481,"[Young Adult, Fiction, Dystopia, Fantasy, Scie...","[Katniss Everdeen, Peeta Mellark, Cato (Hunger...",...,09/14/08,,[Locus Award Nominee for Best Young Adult Book...,6376780,"[3444695, 1921313, 745221, 171994, 93557]",96,"[District 12, Panem, Capitol, Panem, Panem (Un...",https://i.gr-assets.com/images/S/compressed.ph...,2993517,30513
1,2.Harry_Potter_and_the_Order_of_the_Phoenix,Harry Potter and the Order of the Phoenix,Harry Potter #5,"J.K. Rowling, Mary GrandPré (Illustrator)",4.5,There is a door at the end of a silent corrido...,English,9780439358071,"[Fantasy, Young Adult, Fiction, Magic, Childre...","[Sirius Black, Draco Malfoy, Ron Weasley, Petu...",...,09/27/04,06/21/03,[Bram Stoker Award for Works for Young Readers...,2507623,"[1593642, 637516, 222366, 39573, 14526]",98,[Hogwarts School of Witchcraft and Wizardry (U...,https://i.gr-assets.com/images/S/compressed.ph...,2993517,30513
2,2657.To_Kill_a_Mockingbird,To Kill a Mockingbird,To Kill a Mockingbird,Harper Lee,4.28,The unforgettable novel of a childhood in a sl...,English,9999999999999,"[Classics, Fiction, Historical Fiction, School...","[Scout Finch, Atticus Finch, Jem Finch, Arthur...",...,05/23/06,07/11/60,"[Pulitzer Prize for Fiction (1961), Audie Awar...",4501075,"[2363896, 1333153, 573280, 149952, 80794]",95,"[Maycomb, Alabama (United States)]",https://i.gr-assets.com/images/S/compressed.ph...,2993517,30513
3,1885.Pride_and_Prejudice,Pride and Prejudice,,"Jane Austen, Anna Quindlen (Introduction)",4.26,Alternate cover edition of ISBN 9780679783268S...,English,9999999999999,"[Classics, Fiction, Romance, Historical Fictio...","[Mr. Bennet, Mrs. Bennet, Jane Bennet, Elizabe...",...,10/10/00,01/28/13,[],2998241,"[1617567, 816659, 373311, 113934, 76770]",94,"[United Kingdom, Derbyshire, England (United K...",https://i.gr-assets.com/images/S/compressed.ph...,2993517,30513
4,41865.Twilight,Twilight,The Twilight Saga #1,Stephenie Meyer,3.6,About three things I was absolutely positive.\...,English,9780316015844,"[Young Adult, Fantasy, Romance, Vampires, Fict...","[Edward Cullen, Jacob Black, Laurent, Renee, B...",...,09/06/06,10/05/05,"[Georgia Peach Book Award (2007), Buxtehuder B...",4964519,"[1751460, 1113682, 1008686, 542017, 548674]",78,"[Forks, Washington (United States), Phoenix, A...",https://i.gr-assets.com/images/S/compressed.ph...,2993517,30513
5,19063.The_Book_Thief,The Book Thief,,Markus Zusak (Goodreads Author),4.37,Librarian's note: An alternate cover edition c...,English,9780375831003,"[Historical Fiction, Fiction, Young Adult, His...","[Liesel Meminger, Hans Hubermann, Rudy Steiner...",...,03/14/06,09/01/05,[National Jewish Book Award for Children’s and...,1834276,"[1048230, 524674, 186297, 48864, 26211]",96,"[Molching (Germany), Germany]",https://i.gr-assets.com/images/S/compressed.ph...,2993517,30513
6,170448.Animal_Farm,Animal Farm,,"George Orwell, Russell Baker (Preface), C.M. W...",3.95,Librarian's note: There is an Alternate Cover ...,English,9780451526342,"[Classics, Fiction, Dystopia, Fantasy, Literat...","[Snowball, Napoleon, Clover, Boxer, Old Major,...",...,04/27/96,08/17/45,"[Prometheus Hall of Fame Award (2011), Retro H...",2740713,"[986764, 958699, 545475, 165093, 84682]",91,"[England, United Kingdom]",https://i.gr-assets.com/images/S/compressed.ph...,2993517,30513
7,11127.The_Chronicles_of_Narnia,The Chronicles of Narnia,The Chronicles of Narnia (Publication Order) #1–7,"C.S. Lewis, Pauline Baynes (Illustrator)",4.26,"Journeys to the end of the world, fantastic cr...",English,9999999999999,"[Fantasy, Classics, Fiction, Young Adult, Chil...","[Polly, Aslan, Lucy Pevensie, Edmund Pevensie,...",...,09/16/02,10/27/56,[],517740,"[254964, 167572, 74362, 15423, 5419]",96,"[London, England]",https://i.gr-assets.com/images/S/compressed.ph...,2993517,30513
8,30.J_R_R_Tolkien_4_Book_Boxed_Set,J.R.R. Tolkien 4-Book Boxed Set: The Hobbit an...,The Lord of the Rings #0-3,J.R.R. Tolkien,4.6,"This four-volume, boxed set contains J.R.R. To...",English,9780345538376,"[Fantasy, Fiction, Classics, Adventure, Scienc...","[Frodo Baggins, Gandalf, Bilbo Baggins, Gollum]",...,09/25/12,10/20/55,[],110146,"[78217, 22857, 6628, 1477, 967]",98,[Middle-earth],https://i.gr-assets.com/images/S/compressed.ph...,2993517,30513
9,18405.Gone_with_the_Wind,Gone with the Wind,,Margaret Mitchell,4.3,"Scarlett O'Hara, the beautiful, spoiled daught...",English,9780446675536,"[Classics, Historical Fiction, Fiction, Romanc...","[Scarlett O'Hara, Rhett Butler, Ashley Wilkes,...",...,04/01/99,06/30/36,"[Pulitzer Prize for Novel (1937), National Boo...",1074620,"[602138, 275517, 133535, 39008, 24422]",94,"[Atlanta, Georgia (United States)]",https://i.gr-assets.com/images/S/compressed.ph...,2993517,30513
