In [1]:
import requests, re
from bs4 import BeautifulSoup as bs
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import time, random
from urllib.request import urlopen
import urllib.parse
import numpy as np
from IPython.display import display

In [2]:
def get_soup(url): # Retrieves soup from website using py requests module and 'person' headers
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:95.0) Gecko/20100101 Firefox/95.0',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, br',
        'Referer': 'https://duckduckgo.com/'
        }
    response = requests.get(url, headers=headers)
    status = response.status_code
    if status == 200:
      page = response.text
      soup = bs(page)
    else:
      print(f"Oops! Received status code {status}")
    return soup

In [30]:
def scrape_goodreads(url): # returns a df containing titles, ratings, urls, date published
    soup = get_soup(url)
    list_results = []
    
    # get list of titles
    soup_titles = soup.find_all('a', class_='bookTitle')
    titles = [title.text.strip() for title in soup_titles]
    
    # get authors
    author_info = soup.find_all('a', class_='authorName')
    authors = [author.text for author in author_info]
    author_urls = [author.get('href') for author in author_info]
    
    # get list of ratings 
    stars = soup.find_all(class_='minirating')
    stars = [star.text.split('—') for star in stars]

    # some stars have messed up title starting with a string. Clean this up.
    for star in stars:
        s1 = star[0].strip()
        star[0] = star[0].strip()
        m = re.search(r"\d", s1)
        index = m.start()
        if index:
            star[0] = star[0][index:]

        
    # clean up extra white spaces   
    cleaned_stars = []
    for line in stars:
        transfer = []
        for item in line:
            transfer += [item.strip()]
        cleaned_stars += [transfer]

    
    
    # get year published
#     dates = soup.find_all(class_='minirating')
#     dates = [date.parent.text.split()[-1] for date in dates]
    
    # get url for GR page for specific title
    urls = [url.get('href') for url in soup_titles]
    
    #assemble list of dicts for each title
    for i in range(len(titles)):
        results = {}
        title = titles[i]
        results['title'] = title
        results['author'] = authors[i]
        results['author_url'] = author_urls[i]
        
        if cleaned_stars[i][0][0].isnumeric():       
            results['stars'] = cleaned_stars[i][0]
        else: results['stars'] = 'error'
        results['number of ratings'] = cleaned_stars[i][1]
#         results['year published'] = dates[i]
        results['url'] = urls[i]
        left = '(' in titles[i]
        right = ')' in titles[i]
        pound = '#' in titles[i]
        if left and right and pound:
            results['is_series'] = True
            
            # clean up title
            pound_ind = title.index('#')
            right_ind = title.index(')', pound_ind)
            title = title[::-1]
            left_ind = title.index('(', title.index('#'))
            left_ind = len(title) - left_ind - 1
            title = title[::-1]
            title = title[:left_ind] + title[right_ind + 1:]
            title = title.strip()
            results['title'] = title

        else: results['is_series'] = False


        list_results += [results]

    # convert list of dicts into df
    list_results = pd.DataFrame(list_results)
    
    # clean and return df
    return list_results

    



In [23]:
def get_good_reads(first_page_url):
    base_url = first_page_url[:-1]
    df = pd.DataFrame()
    pages = list(range(1,11))
    random.shuffle(pages)
    for page_number in pages:
        url = base_url + str(page_number)
        df = df.append(scrape_goodreads(url))
        df.reset_index(drop=True, inplace=True)
#         display(df)
        df.to_csv('Figures/master_GR_df.csv')
        time.sleep(10 + random.uniform(-1,1))
    

In [22]:
get_good_reads('https://www.goodreads.com/list/show/1.Best_Books_Ever?page=8')

Unnamed: 0,title,author,stars,number of ratings,url,is_series
0,Night,Elie Wiesel,4.34 avg rating,"1,076,237 ratings",/book/show/1617.Night,True
1,The Westing Game,Ellen Raskin,4.00 avg rating,"195,985 ratings",/book/show/902.The_Westing_Game,False
2,Hunted,P.C. Cast,3.91 avg rating,"178,631 ratings",/book/show/4134071-hunted,True
3,We Need to Talk About Kevin,Lionel Shriver,4.04 avg rating,"188,422 ratings",/book/show/80660.We_Need_to_Talk_About_Kevin,False
4,The Bean Trees,Barbara Kingsolver,3.98 avg rating,"140,989 ratings",/book/show/30868.The_Bean_Trees,True
...,...,...,...,...,...,...
95,The Sisterhood of the Traveling Pants,Ann Brashares,3.81 avg rating,"664,248 ratings",/book/show/452306.The_Sisterhood_of_the_Travel...,True
96,Tempted,P.C. Cast,3.94 avg rating,"162,671 ratings",/book/show/6262365-tempted,True
97,Howl and Other Poems,Allen Ginsberg,4.12 avg rating,"101,515 ratings",/book/show/6295.Howl_and_Other_Poems,False
98,Jitterbug Perfume,Tom Robbins,4.19 avg rating,"74,960 ratings",/book/show/8682.Jitterbug_Perfume,False


Unnamed: 0,title,author,stars,number of ratings,url,is_series
0,Night,Elie Wiesel,4.34 avg rating,"1,076,237 ratings",/book/show/1617.Night,True
1,The Westing Game,Ellen Raskin,4.00 avg rating,"195,985 ratings",/book/show/902.The_Westing_Game,False
2,Hunted,P.C. Cast,3.91 avg rating,"178,631 ratings",/book/show/4134071-hunted,True
3,We Need to Talk About Kevin,Lionel Shriver,4.04 avg rating,"188,422 ratings",/book/show/80660.We_Need_to_Talk_About_Kevin,False
4,The Bean Trees,Barbara Kingsolver,3.98 avg rating,"140,989 ratings",/book/show/30868.The_Bean_Trees,True
...,...,...,...,...,...,...
95,A Room with a View,E.M. Forster,3.91 avg rating,"162,567 ratings",/book/show/3087.A_Room_with_a_View,False
96,The 5th Wave,Rick Yancey,4.02 avg rating,"429,786 ratings",/book/show/16101128-the-5th-wave,True
97,Bleak House,Charles Dickens,4.00 avg rating,"105,472 ratings",/book/show/31242.Bleak_House,False
98,Snow Falling on Cedars,David Guterson,3.85 avg rating,"177,376 ratings",/book/show/77142.Snow_Falling_on_Cedars,False


Unnamed: 0,title,author,stars,number of ratings,url,is_series
0,Night,Elie Wiesel,4.34 avg rating,"1,076,237 ratings",/book/show/1617.Night,True
1,The Westing Game,Ellen Raskin,4.00 avg rating,"195,985 ratings",/book/show/902.The_Westing_Game,False
2,Hunted,P.C. Cast,3.91 avg rating,"178,631 ratings",/book/show/4134071-hunted,True
3,We Need to Talk About Kevin,Lionel Shriver,4.04 avg rating,"188,422 ratings",/book/show/80660.We_Need_to_Talk_About_Kevin,False
4,The Bean Trees,Barbara Kingsolver,3.98 avg rating,"140,989 ratings",/book/show/30868.The_Bean_Trees,True
...,...,...,...,...,...,...
95,Thoughtless,S.C. Stephens,4.09 avg rating,"137,872 ratings",/book/show/13517535-thoughtless,True
96,Factotum,Charles Bukowski,3.93 avg rating,"68,700 ratings",/book/show/497199.Factotum,False
97,The 5 Love Languages: The Secret to Love that ...,Gary Chapman,4.25 avg rating,"350,393 ratings",/book/show/23878688-the-5-love-languages,False
98,Bitten,Kelley Armstrong,4.00 avg rating,"101,756 ratings",/book/show/11918.Bitten,True


Unnamed: 0,title,author,stars,number of ratings,url,is_series
0,Night,Elie Wiesel,4.34 avg rating,"1,076,237 ratings",/book/show/1617.Night,True
1,The Westing Game,Ellen Raskin,4.00 avg rating,"195,985 ratings",/book/show/902.The_Westing_Game,False
2,Hunted,P.C. Cast,3.91 avg rating,"178,631 ratings",/book/show/4134071-hunted,True
3,We Need to Talk About Kevin,Lionel Shriver,4.04 avg rating,"188,422 ratings",/book/show/80660.We_Need_to_Talk_About_Kevin,False
4,The Bean Trees,Barbara Kingsolver,3.98 avg rating,"140,989 ratings",/book/show/30868.The_Bean_Trees,True
...,...,...,...,...,...,...
95,Murder on the Orient Express,Agatha Christie,4.19 avg rating,"468,866 ratings",/book/show/853510.Murder_on_the_Orient_Express,True
96,The Clan of the Cave Bear,Jean M. Auel,4.07 avg rating,"245,498 ratings",/book/show/40611463-the-clan-of-the-cave-bear,True
97,Throne of Glass,Sarah J. Maas,4.18 avg rating,"785,417 ratings",/book/show/7896527-throne-of-glass,True
98,The Canterbury Tales,Geoffrey Chaucer,3.52 avg rating,"203,891 ratings",/book/show/2696.The_Canterbury_Tales,False


Unnamed: 0,title,author,stars,number of ratings,url,is_series
0,Night,Elie Wiesel,4.34 avg rating,"1,076,237 ratings",/book/show/1617.Night,True
1,The Westing Game,Ellen Raskin,4.00 avg rating,"195,985 ratings",/book/show/902.The_Westing_Game,False
2,Hunted,P.C. Cast,3.91 avg rating,"178,631 ratings",/book/show/4134071-hunted,True
3,We Need to Talk About Kevin,Lionel Shriver,4.04 avg rating,"188,422 ratings",/book/show/80660.We_Need_to_Talk_About_Kevin,False
4,The Bean Trees,Barbara Kingsolver,3.98 avg rating,"140,989 ratings",/book/show/30868.The_Bean_Trees,True
...,...,...,...,...,...,...
95,"Everything, Everything",Nicola Yoon,4.02 avg rating,"488,177 ratings",/book/show/18692431-everything-everything,False
96,The Crying of Lot 49,Thomas Pynchon,3.68 avg rating,"80,587 ratings",/book/show/2794.The_Crying_of_Lot_49,False
97,Magician,Raymond E. Feist,4.27 avg rating,"66,598 ratings",/book/show/43916.Magician,True
98,The Forty Rules of Love,Elif Shafak,4.13 avg rating,"149,457 ratings",/book/show/6642715-the-forty-rules-of-love,False


Unnamed: 0,title,author,stars,number of ratings,url,is_series
0,Night,Elie Wiesel,4.34 avg rating,"1,076,237 ratings",/book/show/1617.Night,True
1,The Westing Game,Ellen Raskin,4.00 avg rating,"195,985 ratings",/book/show/902.The_Westing_Game,False
2,Hunted,P.C. Cast,3.91 avg rating,"178,631 ratings",/book/show/4134071-hunted,True
3,We Need to Talk About Kevin,Lionel Shriver,4.04 avg rating,"188,422 ratings",/book/show/80660.We_Need_to_Talk_About_Kevin,False
4,The Bean Trees,Barbara Kingsolver,3.98 avg rating,"140,989 ratings",/book/show/30868.The_Bean_Trees,True
...,...,...,...,...,...,...
95,World Without End,Ken Follett,4.26 avg rating,"225,485 ratings",/book/show/5064.World_Without_End,True
96,The Glass Menagerie,Tennessee Williams,3.71 avg rating,"121,402 ratings",/book/show/92517.The_Glass_Menagerie,False
97,Speaker for the Dead,Orson Scott Card,4.07 avg rating,"237,137 ratings",/book/show/7967.Speaker_for_the_Dead,True
98,Sense and Sensibility,Jane Austen,4.08 avg rating,"1,048,592 ratings",/book/show/14935.Sense_and_Sensibility,False


Unnamed: 0,title,author,stars,number of ratings,url,is_series
0,Night,Elie Wiesel,4.34 avg rating,"1,076,237 ratings",/book/show/1617.Night,True
1,The Westing Game,Ellen Raskin,4.00 avg rating,"195,985 ratings",/book/show/902.The_Westing_Game,False
2,Hunted,P.C. Cast,3.91 avg rating,"178,631 ratings",/book/show/4134071-hunted,True
3,We Need to Talk About Kevin,Lionel Shriver,4.04 avg rating,"188,422 ratings",/book/show/80660.We_Need_to_Talk_About_Kevin,False
4,The Bean Trees,Barbara Kingsolver,3.98 avg rating,"140,989 ratings",/book/show/30868.The_Bean_Trees,True
...,...,...,...,...,...,...
95,I Am the Messenger,Markus Zusak,4.02 avg rating,"154,738 ratings",/book/show/19057.I_Am_the_Messenger,False
96,The Blood of Olympus,Rick Riordan,4.40 avg rating,"250,957 ratings",/book/show/18705209-the-blood-of-olympus,True
97,Small Gods,Terry Pratchett,4.30 avg rating,"107,794 ratings",/book/show/34484.Small_Gods,True
98,Mystic River,Dennis Lehane,4.13 avg rating,"146,332 ratings",/book/show/21671.Mystic_River,False


Unnamed: 0,title,author,stars,number of ratings,url,is_series
0,Night,Elie Wiesel,4.34 avg rating,"1,076,237 ratings",/book/show/1617.Night,True
1,The Westing Game,Ellen Raskin,4.00 avg rating,"195,985 ratings",/book/show/902.The_Westing_Game,False
2,Hunted,P.C. Cast,3.91 avg rating,"178,631 ratings",/book/show/4134071-hunted,True
3,We Need to Talk About Kevin,Lionel Shriver,4.04 avg rating,"188,422 ratings",/book/show/80660.We_Need_to_Talk_About_Kevin,False
4,The Bean Trees,Barbara Kingsolver,3.98 avg rating,"140,989 ratings",/book/show/30868.The_Bean_Trees,True
...,...,...,...,...,...,...
95,Robinson Crusoe,Daniel Defoe,3.68 avg rating,"269,538 ratings",/book/show/2932.Robinson_Crusoe,True
96,Harry Potter Series Box Set,J.K. Rowling,4.72 avg rating,"269,448 ratings",/book/show/862041.Harry_Potter_Series_Box_Set,True
97,Midnight's Children,Salman Rushdie,3.98 avg rating,"111,295 ratings",/book/show/14836.Midnight_s_Children,False
98,The English Patient,Michael Ondaatje,3.87 avg rating,"122,214 ratings",/book/show/11713.The_English_Patient,False


Unnamed: 0,title,author,stars,number of ratings,url,is_series
0,Night,Elie Wiesel,4.34 avg rating,"1,076,237 ratings",/book/show/1617.Night,True
1,The Westing Game,Ellen Raskin,4.00 avg rating,"195,985 ratings",/book/show/902.The_Westing_Game,False
2,Hunted,P.C. Cast,3.91 avg rating,"178,631 ratings",/book/show/4134071-hunted,True
3,We Need to Talk About Kevin,Lionel Shriver,4.04 avg rating,"188,422 ratings",/book/show/80660.We_Need_to_Talk_About_Kevin,False
4,The Bean Trees,Barbara Kingsolver,3.98 avg rating,"140,989 ratings",/book/show/30868.The_Bean_Trees,True
...,...,...,...,...,...,...
95,Invisible Man,Ralph Ellison,3.89 avg rating,"168,139 ratings",/book/show/16981.Invisible_Man,False
96,The Things They Carried,Tim O'Brien,4.13 avg rating,"277,116 ratings",/book/show/133518.The_Things_They_Carried,False
97,James and the Giant Peach,Roald Dahl,4.01 avg rating,"423,472 ratings",/book/show/6689.James_and_the_Giant_Peach,False
98,The Complete Works,William Shakespeare,4.46 avg rating,"54,051 ratings",/book/show/569564.The_Complete_Works,False


Unnamed: 0,title,author,stars,number of ratings,url,is_series
0,Night,Elie Wiesel,4.34 avg rating,"1,076,237 ratings",/book/show/1617.Night,True
1,The Westing Game,Ellen Raskin,4.00 avg rating,"195,985 ratings",/book/show/902.The_Westing_Game,False
2,Hunted,P.C. Cast,3.91 avg rating,"178,631 ratings",/book/show/4134071-hunted,True
3,We Need to Talk About Kevin,Lionel Shriver,4.04 avg rating,"188,422 ratings",/book/show/80660.We_Need_to_Talk_About_Kevin,False
4,The Bean Trees,Barbara Kingsolver,3.98 avg rating,"140,989 ratings",/book/show/30868.The_Bean_Trees,True
...,...,...,...,...,...,...
95,The Notebook,Nicholas Sparks,4.12 avg rating,"1,464,216 ratings",/book/show/33648131-the-notebook,True
96,Winnie-the-Pooh,A.A. Milne,4.32 avg rating,"301,205 ratings",/book/show/99107.Winnie_the_Pooh,True
97,The Complete Stories and Poems,Edgar Allan Poe,4.37 avg rating,"233,410 ratings",/book/show/23919.The_Complete_Stories_and_Poems,False
98,Interview with the Vampire,Anne Rice,4.00 avg rating,"526,137 ratings",/book/show/43763.Interview_with_the_Vampire,True


In [24]:
def get_good_reads_2(first_page_url):
    base_url = first_page_url[:-1]
    df = pd.DataFrame()
    pages = list(range(10,21))
    random.shuffle(pages)
    for page_number in pages:
        url = base_url + str(page_number)
        df = df.append(scrape_goodreads(url))
        df.reset_index(drop=True, inplace=True)
#         display(df)
        df.to_csv('Figures/master_GR_df_2.csv')
        time.sleep(10 + random.uniform(-1,1))
    

In [31]:
get_good_reads_2('https://www.goodreads.com/list/show/1.Best_Books_Ever?page=8')

In [32]:
testDF = scrape_goodreads('https://www.goodreads.com/search?q=rationality+steven+pinker')

In [33]:
testDF

Unnamed: 0,title,author,author_url,stars,number of ratings,url,is_series
0,SUMMARY OF RATIONALITY BY STEVEN PINKER: What ...,Wilson Gabriels,https://www.goodreads.com/author/show/21854615...,0.00 avg rating,0 ratings,/book/show/59349571-summary-of-rationality-by-...,False
1,Workbook: Rationality by Steven Pinker (IKPA):...,IKPA,https://www.goodreads.com/author/show/22023315...,0.00 avg rating,0 ratings,/book/show/59699961-workbook?from_search=true&...,False
2,Summary Analysis of Rationality By Steven Pink...,Caleb Reads,https://www.goodreads.com/author/show/21506417...,0.00 avg rating,0 ratings,/book/show/59819996-summary-analysis-of-ration...,False
3,"Rationality: What It Is, Why It Seems Scarce, ...",Steven Pinker,https://www.goodreads.com/author/show/3915.Ste...,3.89 avg rating,"1,396 ratings",/book/show/56224080-rationality?from_search=tr...,False
4,"Summary Of Rationality: What It Is, Why It See...",James Baden,https://www.goodreads.com/author/show/21836599...,1.00 avg rating,2 ratings,/book/show/59131661-summary-of-rationality?fro...,False
5,Steven Pinker Collection 3 Books Set (Enlighte...,Steven Pinker,https://www.goodreads.com/author/show/3915.Ste...,0.00 avg rating,0 ratings,/book/show/59149426-steven-pinker-collection-3...,False
6,"Rationality: What It Is, Why It Seems Scarce, ...",Pinker Steven,https://www.goodreads.com/author/show/21936798...,4.00 avg rating,3 ratings,/book/show/59406681-rationality?from_search=tr...,False
7,"Summary of Rationality: What It Is, Why It's S...",BestPrint,https://www.goodreads.com/author/show/21345922...,0.00 avg rating,0 ratings,/book/show/59848357-summary-of-rationality?fro...,False
8,Summary of Steven Pinker’s Rationality,Bolt Books,https://www.goodreads.com/author/show/22087178...,0.00 avg rating,0 ratings,/book/show/60071748-summary-of-steven-pinker-s...,False


In [34]:
import scrapeFunc

In [35]:
dir(scrapeFunc)

['Options',
 '__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__spec__',
 'bs',
 'display',
 'get_pages_year_authorAttr',
 'get_soup',
 'get_soup_driver',
 'get_soup_driver_proxy',
 'get_soup_proxy',
 'import_df',
 'np',
 'pd',
 'random',
 're',
 'requests',
 'scrape_goodreads',
 'search_imdb',
 'time',
 'urllib',
 'urlopen',
 'webdriver']

In [36]:
driver.quit()

NameError: name 'driver' is not defined