# Web scraping script #1: fetching metadata from Audible

In [31]:
from bs4 import BeautifulSoup
import requests
import pickle
from re import search
from time import sleep
import pandas as pd
import sys

In [32]:
# Root of the Audible site; relative hrefs scraped from the pages are appended to it.
aud_key = 'https://www.audible.com'
# Address of the page that lists every category.
categories = aud_key + '/categories'
# Download the categories page once and parse it for the functions below.
page = requests.get(categories)
categories_page = BeautifulSoup(page.content, 'html.parser')

### Useful functions

In [33]:
def get_all_categories(categories_page):
    '''
    function to get all categories and sub-categories

    For every category container found in ``categories_page`` the category
    name is read and, for each of its sub-category links, the sub-category
    page is downloaded (using the module-level ``aud_key`` as URL prefix) to
    look up its "all books" link via ``get_all_books_subcat``.

    categories_page: parsed (BeautifulSoup) categories page
    returns: dict {category name: [(all-books link, sub-category name), ...]}

    A sub-category whose page cannot be downloaded, even after one retry,
    is reported and skipped instead of aborting the whole crawl.
    '''
    containers=categories_page.findAll("div",{"class":"bc-col-responsive singleCategoryContainer bc-col-3"})
    major_categories=dict()
    for container in containers:
        # str() turns the BeautifulSoup string into a plain str so the result
        # does not keep a reference to the whole parse tree (cheaper to hold
        # and to pickle).
        cat=str(container.findAll("a",{"class":"bc-link categoryLink bc-color-link"})[0].contents[0])
        subcat=container.findAll("a",{"class":"bc-link subCategoryLink bc-color-link"})
        major_categories[cat]=[]
        for n in subcat:
            web=aud_key+n['href']
            try:
                page=requests.get(web)
            except requests.RequestException:
                # One retry after a short pause; `sleep` is what the file
                # imports (`from time import sleep`), `time` itself is not.
                sleep(0.5)
                try:
                    page=requests.get(web)
                except requests.RequestException as err:
                    print ("Skipping %s: %s"%(web,err))
                    continue
            sub_cat_page=BeautifulSoup(page.content, 'html.parser')
            all_books_link=get_all_books_subcat(sub_cat_page)
            major_categories[cat].append((all_books_link,str(n.contents[0])))
    return major_categories

def get_node_id(major_categories):
    '''
    replace each sub-category link by the audible key it carries

    major_categories: dict {category: [(link, sub-category name), ...]}
    returns: dict {category: [(key, sub-category name), ...]} where the key
    is the text after the last '=' in the part of the link that follows the
    last '?'
    '''
    def _key_from_link(link):
        # Text after the last '?' (whole link if there is none) ...
        query=link.rpartition('?')[2]
        # ... and, within it, the text after the last '='.
        return query.rpartition('=')[2]

    return {
        category: [(_key_from_link(entry[0]),entry[1]) for entry in entries]
        for category,entries in major_categories.items()
    }

def get_all_books_subcat(sub_cat_page):
    '''
    return the href of the "all in category" link of a sub-category page

    sub_cat_page: parsed (BeautifulSoup) sub-category page
    '''
    # The link sits inside the right-aligned column of the page.
    link_column=sub_cat_page.find("div",{"class":'bc-col-responsive bc-text-right bc-col-4'})
    all_link=link_column.find('a',{'class':'bc-link allInCategoryPageLink bc-color-link'})
    return all_link['href']

def get_dataframe(major_categories):
    '''
    get one large dataframe with the data of all categories
    (the original note sizes it at about 22MB)

    major_categories: dict {category: [(node id, sub-category name), ...]}
    returns: pandas DataFrame with one row per distinct book entry and a
    fresh 0..n-1 index

    Pages 0-24 of the search results are requested for every sub-category.
    A page that cannot be downloaded, even after one retry, is reported and
    skipped.
    '''
    lst=[]
    for type_name,val in major_categories.items():
        for subtype in val:
            node=str(subtype[0])
            subtype_name=subtype[1]
            for page in range(0,25):
                weblink='https://www.audible.com/search?ref=a_search_c1_sort_5&pf_rd_p=073d8370-97e5-4b7b-be04-aa06cf22d7dd&pf_rd_r=QM0BY2YEWDHNKGKHYKAK&node='+node+'&feature_six_browse-bin=18685580011&feature_twelve_browse-bin=18685552011&sort=review-rank&pageSize=50&page='+str(page)
                try:
                    subtype_page=requests.get(weblink)
                except requests.RequestException:
                    # One retry after a short pause; `sleep` is what the file
                    # imports (`from time import sleep`), `time` itself is not.
                    sleep(0.5)
                    try:
                        subtype_page=requests.get(weblink)
                    except requests.RequestException as err:
                        print ("Skipping %s: %s"%(weblink,err))
                        continue
                sub_cat_page=BeautifulSoup(subtype_page.content, 'html.parser')
                lst=get_information_from_a_page(sub_cat_page,type_name,subtype_name,lst,page)

    final_df=pd.DataFrame(lst, columns=['Category','Sub Category','Book Name',
                              'Book Link','Subtitle','Author Names','Narrator Names',
                              'Runtime','Release Date','Ratings','Reviewers'])
    # drop_duplicates(inplace=True) returns None, so chaining on it fails;
    # use the returning forms and renumber the surviving rows.
    final_df=final_df.drop_duplicates().reset_index(drop=True)
    return final_df

def save_data_by_cat(major_categories,category):
    '''
    get data for one category and save it to csv

    major_categories: dict {category: [(node id, sub-category name), ...]}
    category: key of major_categories to scrape

    Pages 0-24 of the search results are requested for every sub-category
    of ``category``. A page that cannot be downloaded, even after one
    retry, is reported and skipped. The de-duplicated rows are written to
    "<category without spaces>_df.csv" in the current directory.
    '''
    lst=[]
    type_name=category
    for subtype in major_categories[category]:
        node=str(subtype[0])
        subtype_name=subtype[1]
        for page in range(0,25):
            weblink='https://www.audible.com/search?ref=a_search_c1_sort_5&pf_rd_p=073d8370-97e5-4b7b-be04-aa06cf22d7dd&pf_rd_r=QM0BY2YEWDHNKGKHYKAK&node='+node+'&feature_six_browse-bin=18685580011&feature_twelve_browse-bin=18685552011&sort=review-rank&pageSize=50&page='+str(page)
            try:
                subtype_page=requests.get(weblink)
            except requests.RequestException:
                # One retry after a short pause; `sleep` is what the file
                # imports (`from time import sleep`), `time` itself is not.
                sleep(0.5)
                try:
                    subtype_page=requests.get(weblink)
                except requests.RequestException as err:
                    print ("Skipping %s: %s"%(weblink,err))
                    continue
            sub_cat_page=BeautifulSoup(subtype_page.content, 'html.parser')
            lst=get_information_from_a_page(sub_cat_page,type_name,subtype_name,lst,page)

    final_df=pd.DataFrame(lst, columns=['Category','Sub Category','Book Name',
                              'Book Link','Subtitle','Author Names','Narrator Names',
                              'Runtime','Release Date','Ratings','Reviewers'])
    final_df=final_df.drop_duplicates()
    final_df.to_csv(category.replace(" ", "")+'_df.csv')

def get_information_from_a_page(sub_cat_page,type_name,subtype_name,lst,page):
    '''
    given a parsed search-result page, get all relevant metadata from it

    sub_cat_page: parsed (BeautifulSoup) search-result page
    type_name, subtype_name: category / sub-category stored with every row
    lst: list the rows are appended to (modified in place)
    page: page number; not used here, kept for the callers
    returns: ``lst`` with one row appended per product entry, laid out as
    [category, sub-category, book name, book link, subtitle, authors,
    narrators, runtime, release date, rating, reviewers]; fields without a
    value are stored as the string 'None'
    '''
    for book in sub_cat_page.findAll("li",{"class":"bc-list-item productListItem"}):
        book_name=book["aria-label"]
        # First link found inside any list item of the entry.
        book_link=[b['href'] for a in book.findAll("li",{"class":'bc-list-item'}) for b in a.findAll("a")][0]

        subtitle_item=book.find("li",{"class":"bc-list-item subtitle"})
        if subtitle_item is not None:
            # str() keeps a plain string in the row instead of a BeautifulSoup
            # string that would hold on to the whole parsed page.
            subtitle=str(subtitle_item.findAll("span")[0].contents[0])
        else:
            subtitle='None'

        author_list=book.find("li",{"class":"bc-list-item authorLabel"}).findAll("a")
        author_names=','.join([a.contents[0] for a in author_list])

        # The narrator entry is optional: a missing item (find() gives None),
        # an empty link or a non-text child all mean "no narrator".
        try:
            narrator_list=book.find("li",{"class":"bc-list-item narratorLabel"}).findAll("a")
            narrator_names=','.join([a.contents[0] for a in narrator_list])
        except (AttributeError,IndexError,TypeError):
            narrator_names='None'

        # Keep the text after the last ':' of the first runtime span.
        runtime_list=book.find("li",{"class":"bc-list-item runtimeLabel"}).findAll("span")
        runtime=runtime_list[0].contents[0].split(':')[-1]

        # Keep the last whitespace-separated token of the first release span.
        release_list=book.find("li",{"class":"bc-list-item releaseDateLabel"}).findAll("span")
        release=release_list[0].contents[0].split()[-1]

        ratings_item=book.find("li",{"class":"bc-list-item ratingsLabel"})
        rating=[s.contents[0] for s in ratings_item.findAll("span",{"class":"bc-text bc-pub-offscreen"})]
        # NOTE(review): only the first character of the rating text is kept
        # (a text starting with "4.5" would be stored as "4") - confirm this
        # is intended.
        rate=rating[0][0] if rating else 'None'

        review_list=ratings_item.findAll("span",{"class":"bc-text bc-size-small bc-color-secondary"})
        review=review_list[0].contents[0]
        if 'Not rated' in review:
            rev='None'
        else:
            rev=review.split()[0]

        lst.append([type_name,subtype_name,book_name,book_link,
                   subtitle,author_names,narrator_names,runtime,
                   release,rate,rev])
    return lst

#### Pickle categories information

In [17]:
# Scrape the category tree, reduce the links to their node ids and store the
# result so later runs can reload it from disk.
major_categories=get_all_categories(categories_page)
major_categories_node=get_node_id(major_categories)
# Raised before pickling; presumably needed because the scraped names can be
# BeautifulSoup strings whose pickling recurses deeply - TODO confirm.
sys.setrecursionlimit(10000)
# `with` closes the file even if pickling fails.
with open("categories.p", "wb") as outfile:
    pickle.dump(major_categories_node, outfile)

'Arts & Entertainment'

In [34]:
# Reload the category -> [(node id, sub-category name), ...] mapping written
# by the pickling cell above. pickle must only be used on trusted files;
# this one is produced locally by this notebook.
with open('categories.p','rb') as infile:
    major_categories = pickle.load(infile)


### Write one csv per category (covering all of its sub categories)

In [30]:
# major_categories has the shape
# {'Arts & Entertainment': [('18571912011', 'Architecture')], ...}
# Scrape every category in turn and write its csv file.
for item in major_categories:
    save_data_by_cat(major_categories,category=item)
    print ("Just wrote file for %s"%(item))

Just wrote file for Science & Engineering
Just wrote file for Science Fiction & Fantasy
Just wrote file for Sports & Outdoors
Just wrote file for Teen & Young Adult
Just wrote file for Travel & Tourism
