In [10]:
import requests
import csv 
from bs4 import BeautifulSoup
import lxml
import re
from tqdm import tqdm
import time

import pandas as pd

In [11]:
list_url= 'https://www.goodreads.com/list/show/134.Best_Non_Fiction_no_biographies_'

books_df = pd.DataFrame(columns = ['Title', 'Author', 'Description' 
                                   ,'Num_ratings', 'Avg_rating', 'Genres_list',
                                   'Main_genre', 'Secondary_genre' , 'Num_pages', 'ISBN','Link'])


In [12]:
def get_book_list(list_url, num_pages = 10):
    
    books_dict = {}
    
    for i in tqdm(range(1,num_pages)):
        bookpage=str(i)
        stuff=requests.get(list_url + '?page=' + bookpage)
        soup = BeautifulSoup(stuff.text, 'html.parser')

        for e in soup.select('.bookTitle'):
            books_dict[e.get_text(strip = 'True')] = e['href']

        time.sleep(2)
    
    return books_dict

In [13]:
def get_key(val, my_dict): 
    for key, value in my_dict.items(): 
         if val == value: 
                return key 
  
    return "key doesn't exist"

In [14]:
def get_main_genres(d):
    
    genre_dict = {}
    genres = d.select("div.elementList div.right a")
        
    for g in genres:
        genre_dict[g['href'].partition('=')[2]] = int(re.sub(',','',g.get_text().split()[0]))
        
    if len(genre_dict)>0:
        Max_genre = max(genre_dict, key= lambda x: genre_dict[x]) 
    else:
        Max_genre = None
    
    if len(genre_dict)>1:
        second_largest_value = list(sorted(genre_dict.values()))[-2]
    else:
        second_largest_value = None
    
    second_genre = get_key(second_largest_value, genre_dict)
        
    return genre_dict, Max_genre, second_genre

In [15]:
def get_book_details(book_url):
    
    book_dict = {}
    
    page=requests.get(book_url)
    soup=BeautifulSoup(page.text, 'html.parser')
    
    title = soup.select("meta[property='og:title']")[0]['content']
    author = soup.find_all('a', class_ = 'authorName')[0].get_text(strip = True)

    desc = soup.find('div', id = 'description')
    if desc is not None:
        try:
            description = desc.find_all('span')[1].get_text()
        except:
            description = desc.find_all('span')[0].get_text()
    else:
        description = None

    if soup.find('meta', itemprop = 'ratingCount') is not None:
        num_ratings = soup.find('meta', itemprop = 'ratingCount')['content']
    else:
        num_ratings = None

    if soup.find('span', itemprop = 'ratingValue') is not None:    
        rating = soup.find('span', itemprop = 'ratingValue').get_text(strip = 'True')
    else:
        rating = None

    genre_list, main_genre, secondary_genre = get_main_genres(soup)

    if soup.select("meta[property='books:page_count']") is not None and len(soup.select("meta[property='books:page_count']")) != 0: 
        num_pages = soup.select("meta[property='books:page_count']")[0]['content']
    else:
        num_pages = None

    if soup.select("meta[property='books:isbn']") is not None and len(soup.select("meta[property='books:isbn']")) != 0:
        isbn = soup.select("meta[property='books:isbn']")[0]['content']
    else:
        isbn = None

    book_dict = {
        'Title' : title,
        'Author' : author,
        'Description' : description,
        'Num_ratings' : num_ratings,
        'Avg_rating' : rating,
        'Genres_list' : genre_list,
        'Main_genre' : main_genre,
        'Secondary_genre': secondary_genre,
        'Num_pages' : num_pages,
        'ISBN' : isbn,
        'Link' : book_url
    }
    
    return book_dict

In [16]:
books_dict = {}

books_dict = get_book_list(list_url, num_pages = 3)

100%|██████████| 2/2 [00:16<00:00,  8.05s/it]


In [17]:
for book in tqdm(list(books_dict.values())):
    
    book_url = 'https://www.goodreads.com/' + str(book)
    print("Extracting from {}".format(book_url))
    books_df = books_df.append(get_book_details(book_url), ignore_index= True)

  0%|          | 0/200 [00:00<?, ?it/s]

Extracting from https://www.goodreads.com//book/show/21996.The_Devil_in_the_White_City


  0%|          | 1/200 [00:05<18:53,  5.69s/it]

Extracting from https://www.goodreads.com//book/show/1202.Freakonomics


  1%|          | 2/200 [00:11<18:39,  5.65s/it]

Extracting from https://www.goodreads.com//book/show/21.A_Short_History_of_Nearly_Everything


  2%|▏         | 3/200 [00:16<17:54,  5.46s/it]

Extracting from https://www.goodreads.com//book/show/1842.Guns_Germs_and_Steel


  2%|▏         | 4/200 [00:21<17:12,  5.27s/it]

Extracting from https://www.goodreads.com//book/show/3228917-outliers


  2%|▎         | 5/200 [00:26<17:00,  5.24s/it]

Extracting from https://www.goodreads.com//book/show/1898.Into_Thin_Air


  3%|▎         | 6/200 [00:31<17:11,  5.31s/it]

Extracting from https://www.goodreads.com//book/show/2612.The_Tipping_Point


  4%|▎         | 7/200 [00:37<17:05,  5.31s/it]

Extracting from https://www.goodreads.com//book/show/3109.The_Omnivore_s_Dilemma


  4%|▍         | 8/200 [00:42<17:13,  5.38s/it]

Extracting from https://www.goodreads.com//book/show/32145.Stiff


  4%|▍         | 9/200 [00:48<17:13,  5.41s/it]

Extracting from https://www.goodreads.com//book/show/1617.Night


  5%|▌         | 10/200 [00:51<15:32,  4.91s/it]

Extracting from https://www.goodreads.com//book/show/6493208-the-immortal-life-of-henrietta-lacks


  6%|▌         | 11/200 [00:55<14:12,  4.51s/it]

Extracting from https://www.goodreads.com//book/show/40102.Blink


  6%|▌         | 12/200 [00:58<13:09,  4.20s/it]

Extracting from https://www.goodreads.com//book/show/386187.Midnight_in_the_Garden_of_Good_and_Evil


  6%|▋         | 13/200 [01:02<12:51,  4.13s/it]

Extracting from https://www.goodreads.com//book/show/1097.Fast_Food_Nation


  7%|▋         | 14/200 [01:07<13:28,  4.35s/it]

Extracting from https://www.goodreads.com//book/show/14743.The_God_Delusion


  8%|▊         | 15/200 [01:13<14:55,  4.84s/it]

Extracting from https://www.goodreads.com//book/show/10847.Under_the_Banner_of_Heaven


  8%|▊         | 16/200 [01:19<15:57,  5.20s/it]

Extracting from https://www.goodreads.com//book/show/4069.Man_s_Search_for_Meaning


  8%|▊         | 17/200 [01:24<15:52,  5.21s/it]

Extracting from https://www.goodreads.com//book/show/3869.A_Brief_History_of_Time


  9%|▉         | 18/200 [01:30<15:46,  5.20s/it]

Extracting from https://www.goodreads.com//book/show/1869.Nickel_and_Dimed


 10%|▉         | 19/200 [01:33<14:11,  4.71s/it]

Extracting from https://www.goodreads.com//book/show/10569.On_Writing


 10%|█         | 20/200 [01:39<15:18,  5.10s/it]

Extracting from https://www.goodreads.com//book/show/63697.The_Man_Who_Mistook_His_Wife_for_a_Hat_and_Other_Clinical_Tales


 10%|█         | 21/200 [01:45<15:32,  5.21s/it]

Extracting from https://www.goodreads.com//book/show/2767.A_People_s_History_of_the_United_States


 11%|█         | 22/200 [01:48<14:06,  4.76s/it]

Extracting from https://www.goodreads.com//book/show/23692271-sapiens


 12%|█▏        | 23/200 [01:52<12:43,  4.31s/it]

Extracting from https://www.goodreads.com//book/show/105992.Helter_Skelter


 12%|█▏        | 24/200 [01:57<13:34,  4.63s/it]

Extracting from https://www.goodreads.com//book/show/5632446-columbine


 12%|█▎        | 25/200 [02:03<14:22,  4.93s/it]

Extracting from https://www.goodreads.com//book/show/33313.Kitchen_Confidential


 13%|█▎        | 26/200 [02:06<13:06,  4.52s/it]

Extracting from https://www.goodreads.com//book/show/6289283-born-to-run


 14%|█▎        | 27/200 [02:10<12:10,  4.22s/it]

Extracting from https://www.goodreads.com//book/show/25019.The_Professor_and_the_Madman


 14%|█▍        | 28/200 [02:14<11:55,  4.16s/it]

Extracting from https://www.goodreads.com//book/show/22463.The_Origin_of_Species


 14%|█▍        | 29/200 [02:18<12:14,  4.29s/it]

Extracting from https://www.goodreads.com//book/show/55030.Cosmos


 15%|█▌        | 30/200 [02:23<12:17,  4.34s/it]

Extracting from https://www.goodreads.com//book/show/61535.The_Selfish_Gene


 16%|█▌        | 31/200 [02:27<12:27,  4.42s/it]

Extracting from https://www.goodreads.com//book/show/475.Collapse


 16%|█▌        | 32/200 [02:31<11:35,  4.14s/it]

Extracting from https://www.goodreads.com//book/show/27333.Silent_Spring


 16%|█▋        | 33/200 [02:35<11:49,  4.25s/it]

Extracting from https://www.goodreads.com//book/show/25460.Animal_Vegetable_Miracle


 17%|█▋        | 34/200 [02:42<13:33,  4.90s/it]

Extracting from https://www.goodreads.com//book/show/11138.Mere_Christianity


 18%|█▊        | 35/200 [02:45<12:14,  4.45s/it]

Extracting from https://www.goodreads.com//book/show/2184798.Blood_River


 18%|█▊        | 36/200 [02:50<12:05,  4.42s/it]

Extracting from https://www.goodreads.com//book/show/8600.Eats_Shoots_Leaves


 18%|█▊        | 37/200 [02:54<12:18,  4.53s/it]

Extracting from https://www.goodreads.com//book/show/139069.Endurance


 19%|█▉        | 38/200 [03:00<13:14,  4.90s/it]

Extracting from https://www.goodreads.com//book/show/16902.Walden


 20%|█▉        | 39/200 [03:04<12:35,  4.69s/it]

Extracting from https://www.goodreads.com//book/show/17780.In_the_Heart_of_the_Sea


 20%|██        | 40/200 [03:11<14:13,  5.33s/it]

Extracting from https://www.goodreads.com//book/show/16213.The_Hot_Zone


 20%|██        | 41/200 [03:17<14:10,  5.35s/it]

Extracting from https://www.goodreads.com//book/show/4865.How_to_Win_Friends_and_Influence_People


 21%|██        | 42/200 [03:23<14:42,  5.59s/it]

Extracting from https://www.goodreads.com//book/show/296662.Lies_My_Teacher_Told_Me


 22%|██▏       | 43/200 [03:29<15:17,  5.84s/it]

Extracting from https://www.goodreads.com//book/show/315425.In_Defense_of_Food


 22%|██▏       | 44/200 [03:32<13:13,  5.09s/it]

Extracting from https://www.goodreads.com//book/show/1067.1776


 22%|██▎       | 45/200 [03:37<12:41,  4.91s/it]

Extracting from https://www.goodreads.com//book/show/1237300.The_Shock_Doctrine


 23%|██▎       | 46/200 [03:41<12:06,  4.72s/it]

Extracting from https://www.goodreads.com//book/show/48855.The_Diary_of_a_Young_Girl


 24%|██▎       | 47/200 [03:44<10:29,  4.11s/it]

Extracting from https://www.goodreads.com//book/show/1713426.Predictably_Irrational


 24%|██▍       | 48/200 [03:47<09:53,  3.91s/it]

Extracting from https://www.goodreads.com//book/show/2715.Salt


 24%|██▍       | 49/200 [03:51<09:28,  3.76s/it]

Extracting from https://www.goodreads.com//book/show/40961608-the-worst-hard-time


 25%|██▌       | 50/200 [03:55<09:45,  3.90s/it]

Extracting from https://www.goodreads.com//book/show/108229.The_Perfect_Storm


 26%|██▌       | 51/200 [04:00<10:36,  4.27s/it]

Extracting from https://www.goodreads.com//book/show/8520610-quiet


 26%|██▌       | 52/200 [04:05<11:04,  4.49s/it]

Extracting from https://www.goodreads.com//book/show/11472.We_Wish_to_Inform_You_That_Tomorrow_We_Will_Be_Killed_with_Our_Families


 26%|██▋       | 53/200 [04:11<11:46,  4.80s/it]

Extracting from https://www.goodreads.com//book/show/12609.The_Spirit_Catches_You_and_You_Fall_Down


 27%|██▋       | 54/200 [04:17<12:36,  5.18s/it]

Extracting from https://www.goodreads.com//book/show/6178648-nothing-to-envy


 28%|██▊       | 55/200 [04:21<12:09,  5.03s/it]

Extracting from https://www.goodreads.com//book/show/767171.The_Rise_and_Fall_of_the_Third_Reich


 28%|██▊       | 56/200 [04:25<10:50,  4.52s/it]

Extracting from https://www.goodreads.com//book/show/242472.The_Black_Swan


 28%|██▊       | 57/200 [04:31<11:59,  5.03s/it]

Extracting from https://www.goodreads.com//book/show/39020.1491


 29%|██▉       | 58/200 [04:36<11:58,  5.06s/it]

Extracting from https://www.goodreads.com//book/show/96123.All_the_President_s_Men


 30%|██▉       | 59/200 [04:40<11:23,  4.85s/it]

Extracting from https://www.goodreads.com//book/show/24.In_a_Sunburned_Country


 30%|███       | 60/200 [04:46<11:32,  4.94s/it]

Extracting from https://www.goodreads.com//book/show/33514.The_Elements_of_Style


 30%|███       | 61/200 [04:50<11:04,  4.78s/it]

Extracting from https://www.goodreads.com//book/show/239186.Isaac_s_Storm


 31%|███       | 62/200 [04:55<11:19,  4.92s/it]

Extracting from https://www.goodreads.com//book/show/28212.And_the_Band_Played_On


 32%|███▏      | 63/200 [05:01<11:36,  5.08s/it]

Extracting from https://www.goodreads.com//book/show/248787.The_World_Without_Us


 32%|███▏      | 64/200 [05:05<11:09,  4.92s/it]

Extracting from https://www.goodreads.com//book/show/76401.Bury_My_Heart_at_Wounded_Knee


 32%|███▎      | 65/200 [05:11<11:42,  5.20s/it]

Extracting from https://www.goodreads.com//book/show/347610.King_Leopold_s_Ghost


 33%|███▎      | 66/200 [05:16<11:39,  5.22s/it]

Extracting from https://www.goodreads.com//book/show/1301.Moneyball


 34%|███▎      | 67/200 [05:19<09:56,  4.48s/it]

Extracting from https://www.goodreads.com//book/show/12543.Bird_by_Bird


 34%|███▍      | 68/200 [05:23<09:07,  4.15s/it]

Extracting from https://www.goodreads.com//book/show/36086.The_Ghost_Map


 34%|███▍      | 69/200 [05:28<09:41,  4.44s/it]

Extracting from https://www.goodreads.com//book/show/7170627-the-emperor-of-all-maladies


 35%|███▌      | 70/200 [05:33<10:28,  4.83s/it]

Extracting from https://www.goodreads.com//book/show/17349.The_Demon_Haunted_World


 36%|███▌      | 71/200 [05:37<09:46,  4.54s/it]

Extracting from https://www.goodreads.com//book/show/28862.The_Prince


 36%|███▌      | 72/200 [05:42<10:07,  4.74s/it]

Extracting from https://www.goodreads.com//book/show/38855.Confederates_in_the_Attic


 36%|███▋      | 73/200 [05:48<10:36,  5.01s/it]

Extracting from https://www.goodreads.com//book/show/7603.Reading_Lolita_in_Tehran


 37%|███▋      | 74/200 [05:53<10:12,  4.86s/it]

Extracting from https://www.goodreads.com//book/show/4806.Longitude


 38%|███▊      | 75/200 [05:58<10:33,  5.07s/it]

Extracting from https://www.goodreads.com//book/show/2082136.Bonk


 38%|███▊      | 76/200 [06:03<10:10,  4.92s/it]

Extracting from https://www.goodreads.com//book/show/6402364-superfreakonomics


 38%|███▊      | 77/200 [06:09<10:38,  5.19s/it]

Extracting from https://www.goodreads.com//book/show/29.The_Mother_Tongue


 39%|███▉      | 78/200 [06:15<11:06,  5.46s/it]

Extracting from https://www.goodreads.com//book/show/586472.The_Right_Stuff


 40%|███▉      | 79/200 [06:21<11:38,  5.77s/it]

Extracting from https://www.goodreads.com//book/show/30474.The_Communist_Manifesto


 40%|████      | 80/200 [06:26<11:16,  5.63s/it]

Extracting from https://www.goodreads.com//book/show/13839.The_Botany_of_Desire


 40%|████      | 81/200 [06:34<12:08,  6.12s/it]

Extracting from https://www.goodreads.com//book/show/36072.The_7_Habits_of_Highly_Effective_People


 41%|████      | 82/200 [06:39<11:29,  5.84s/it]

Extracting from https://www.goodreads.com//book/show/9938498-in-the-garden-of-beasts


 42%|████▏     | 83/200 [06:43<10:25,  5.34s/it]

Extracting from https://www.goodreads.com//book/show/2199.Team_of_Rivals


 42%|████▏     | 84/200 [06:49<10:46,  5.57s/it]

Extracting from https://www.goodreads.com//book/show/18956.Homicide


 42%|████▎     | 85/200 [06:53<09:50,  5.14s/it]

Extracting from https://www.goodreads.com//book/show/24113.G_del_Escher_Bach


 43%|████▎     | 86/200 [06:57<09:00,  4.74s/it]

Extracting from https://www.goodreads.com//book/show/568236.A_Distant_Mirror


 44%|████▎     | 87/200 [07:03<09:31,  5.06s/it]

Extracting from https://www.goodreads.com//book/show/18300212-the-trigger


 44%|████▍     | 88/200 [07:09<09:59,  5.36s/it]

Extracting from https://www.goodreads.com//book/show/11366.The_Guns_of_August


 44%|████▍     | 89/200 [07:15<10:13,  5.53s/it]

Extracting from https://www.goodreads.com//book/show/7940583-chasing-the-devil


 45%|████▌     | 90/200 [07:23<11:39,  6.36s/it]

Extracting from https://www.goodreads.com//book/show/16884.The_Making_of_the_Atomic_Bomb


KeyboardInterrupt: 

In [290]:
books_df

Unnamed: 0,Title,Author,Description,Num_ratings,Avg_rating,Genres_list,Main_genre,Secondary_genre,Num_pages,ISBN,Link
0,Diplomacy,Henry Kissinger,"A brilliant, sweeping history of diplomacy tha...",6452,4.22,"{'history': 391, 'politics': 325, 'non-fiction...",history,politics,912,9780671510992,https://www.goodreads.com//book/show/781183.Di...
1,The Happiness Advantage,Shawn Achor,Our most commonly held formula for success is ...,23766,4.18,"{'non-fiction': 555, 'psychology': 517, 'self-...",non-fiction,psychology,272,9780307591562,https://www.goodreads.com//book/show/9484114-t...
2,The Stuff of Thought,Steven Pinker,New York Times bestselling author Steven Pinke...,9712,3.90,"{'non-fiction': 593, 'science': 412, 'psycholo...",non-fiction,science,499,9780670063277,https://www.goodreads.com//book/show/373969.Th...
3,Civilization and Its Discontents,Sigmund Freud,It stands as a brilliant summary of the views ...,30963,3.79,"{'psychology': 1218, 'philosophy': 909, 'non-f...",psychology,philosophy,127,9780393301588,https://www.goodreads.com//book/show/357636.Ci...
4,The Corporation,Joel Bakan,The inspiration for the film that won the 2004...,4084,4.07,"{'non-fiction': 196, 'economics': 119, 'politi...",non-fiction,economics,228,9780743247467,https://www.goodreads.com//book/show/108583.Th...
...,...,...,...,...,...,...,...,...,...,...,...
306,A Treasury of Deception,Michael Farquhar,"We may say that honesty is the best policy, bu...",657,3.62,"{'history': 84, 'non-fiction': 62, 'historical...",history,non-fiction,304,9780143035442,https://www.goodreads.com//book/show/401503.A_...
307,The Girls Who Went Away,Ann Fessler,A powerful and groundbreaking revelation of th...,5819,4.19,"{'non-fiction': 438, 'history': 279, 'feminism...",non-fiction,history,354,9781594200946,https://www.goodreads.com//book/show/116477.Th...
308,Fatal Vision,Joe McGinniss,Fatal Vision is the electrifying true story of...,22182,4.11,"{'true-crime': 739, 'non-fiction': 306, 'crime...",true-crime,non-fiction,684,9780451165664,https://www.goodreads.com//book/show/333907.Fa...
309,The Divided Self,R.D. Laing,"In The Divided Self (1960), Laing contrasted t...",4869,4.09,"{'psychology': 463, 'non-fiction': 182, 'philo...",psychology,non-fiction,224,9780140135374,https://www.goodreads.com//book/show/496585.Th...


In [292]:
books_df.to_csv('scraped_books.csv')