In [315]:
import datetime
import os
import time
import urllib.robotparser
from pprint import pprint

import dateparser
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup as bsoup
from selenium import webdriver
from selenium.webdriver import ActionChains

In [316]:
# Import GeckoDriverManager module.
from webdriver_manager.firefox import GeckoDriverManager

# Location of a previously downloaded geckodriver binary.
GECKODRIVER_PATH = './geckodriver.exe'

# Start the Firefox browser used by the scraper functions.
# Reuse the local binary when it is present; otherwise let
# GeckoDriverManager install one (first run on a machine), so the cell
# no longer needs a line commented in or out by hand.
if os.path.exists(GECKODRIVER_PATH):
    driver = webdriver.Firefox(executable_path=GECKODRIVER_PATH)
else:
    driver = webdriver.Firefox(executable_path=GeckoDriverManager().install())


In [317]:
# Bandcamp Web Scraper Data format information

'''

scrape_data = pd.DataFrame(columns=['Release Title', 
                                    'Artist Name', 
                                    'Artist Location',
                                    'Release Date',
                                    'Release URL',
                                    'Release Genre',
                                    'Release Sub-Genre',
                                    'Search Format',
                                    'Search Week',
                                    'Search Category',
                                    'All Lyrics',
                                    'Number of Tracks',
                                    'Track Info',
                                    'Number of Fans',
                                    'Tags',
                                    'Scrape Date'])
                                    
### dictionary entry for track listing

track_info_entry = {
    'Track Title' : '',
    'Track Lyrics' : '',
    'Track Number' : '#',
    'Track Duration': '##:##'
}
                                    
'''

# Week codes accepted by the search ("w" parameter) and their labels.
# Listed from most recent to oldest: today, this week, last week,
# then 2 through 6 weeks ago.
WEEK_DICT = dict([
    (-1,  'today'),
    (0,   'this week'),
    (678, 'last week'),
    (677, '2 weeks ago'),
    (676, '3 weeks ago'),
    (675, '4 weeks ago'),
    (674, '5 weeks ago'),
    (673, '6 weeks ago'),
])

# every release URL seen so far, so that the same album is not
# collected again by a later search
all_URLS_collected = []

# accumulated release entries (one dict per release); turned into a
# dataframe once scraping is finished
scrape_data = []

In [318]:
# scrape bandcamp release page function
def _fill_release_metadata(soup, entry):
    """Copy title, artist, location, release date, tags and genre defaults into entry."""
    # Get title and artist name from the '<title>, by <artist>' meta tag
    title_tag = soup.find('meta',{'name':'title'})
    title_parts = title_tag.get('content', '').split(', by ') if title_tag is not None else []
    entry['Release Title'] = title_parts[0] if title_parts and title_parts[0] else 'N/A'
    entry['Artist Name'] = title_parts[1] if len(title_parts) > 1 else 'N/A'

    # Get artist location
    location_tag = soup.find('span',{'class':'location secondaryText'})
    entry['Artist Location'] = location_tag.string if location_tag is not None else 'N/A'

    # Get release date (first node of the credits block, e.g. 'released <date>')
    credits_tag = soup.find('div',{'class':'tralbumData tralbum-credits'})
    raw_date = None
    if credits_tag is not None and credits_tag.contents:
        raw_date = credits_tag.contents[0].string
    if raw_date:
        raw_date = raw_date.strip('"').strip(' ').replace('releases','').replace('released','')
        entry['Release Date'] = dateparser.parse(raw_date)
    else:
        entry['Release Date'] = None

    # Get tags
    entry['Tags'] = [tag.string for tag in soup.find_all('a',{'class':'tag'})]

    # If no genre / subgenre was supplied by the caller, mark as 'N/A'
    if 'Release Genre' not in entry:
        entry['Release Genre'] = 'N/A'
    if 'Release Sub-Genre' not in entry:
        entry['Release Sub-Genre'] = 'N/A'


def _extract_track_lyrics(track):
    """Return the lyrics attached to a track row, or None when the row has none."""
    info_div = track.find('div',{'class':'info_link'})
    info_anchor = info_div.a if info_div is not None else None
    lyric_link = info_anchor.get('href') if info_anchor is not None else None
    if not lyric_link or '#lyrics' not in lyric_link:
        return None

    lyric_row = track.findNext('tr',{'class':'lyricsRow'})
    lyric_cell = lyric_row.find('td',{'colspan':'4'}) if lyric_row is not None else None
    lyric_div = lyric_cell.div if lyric_cell is not None else None
    if lyric_div is None:
        return None

    # .string is None when the div holds more than one child node;
    # fall back to the concatenated text in that case
    lyric_text = lyric_div.string
    if lyric_text is None:
        lyric_text = lyric_div.get_text()
    return lyric_text.strip('"').replace('\r\n','\n')


def _parse_track_row(track):
    """Build the track dict for one track row; returns (track_obj, lyrics_or_None)."""
    # Track number
    try:
        track_num = int(track.find('td',{'class':'track-number-col'}).div.string.strip('.'))
    except (AttributeError, TypeError, ValueError):
        track_num = 'N/A'

    # Track title (two markup variants are tried)
    title_tag = track.find('span',{'class':'track-title'})
    if title_tag is None:
        title_div = track.find('div',{'class':'title'})
        title_tag = title_div.span if title_div is not None else None
    track_title = title_tag.string if title_tag is not None else 'N/A'

    # Track duration
    duration_tag = track.find('span',{'class':'time secondaryText'})
    if duration_tag is not None and duration_tag.string:
        track_duration = duration_tag.string.replace('\n','').strip(' ')
    else:
        track_duration = 'N/A'

    # Track lyrics
    lyrics = _extract_track_lyrics(track)

    track_obj = {'Track Title': track_title,
                 'Track Lyrics': lyrics if lyrics is not None else 'N/A',
                 'Track Number': track_num,
                 'Track Duration': track_duration}
    return track_obj, lyrics


def _fill_track_info(soup, entry):
    """Fill 'Track Info', 'All Lyrics' and 'Number of Tracks' in entry."""
    entry['Track Info'] = []

    # Pages without a track table (e.g. single-track pages) yield no tracks
    track_table = soup.find('table',{'id':'track_table'})
    tracks = track_table.find_all('tr',{'class':'track_row_view'}) if track_table is not None else []

    all_lyrics = ''
    for track in tracks:
        track_obj, lyrics = _parse_track_row(track)
        if lyrics is not None:
            all_lyrics += '\n' + lyrics
        entry['Track Info'].append(track_obj)

    entry['All Lyrics'] = all_lyrics if all_lyrics != '' else 'N/A'
    entry['Number of Tracks'] = len(entry['Track Info'])


def _count_fans(URL, fan_page_wait):
    """Open URL in the shared browser and return the fan count, or '>MAX_FANS' when capped."""
    driver.get(URL)

    found_all_fans = False
    for _ in range(MAX_FAN_PAGES):
        try:
            more_thumbs = driver.find_element_by_xpath('//a[@class="more-thumbs"]')
            if 'display: none' in (more_thumbs.get_attribute("style") or ''):
                found_all_fans = True
                break
            # Expand the next page of fans; without this click the link
            # never becomes hidden and the loop always runs to the cap.
            more_thumbs.click()
            time.sleep(fan_page_wait)
        except Exception:
            # No (clickable) "more" link: whatever is shown is the full list
            found_all_fans = True
            break

    if not found_all_fans:
        return '>' + str(MAX_FANS)

    try:
        parent_element = driver.find_element_by_xpath('//div[@class="no-writing"]')
        return len(parent_element.find_elements_by_tag_name("a"))
    except Exception:
        return 0


def scrape_release_page(URL,entry,output=False,fan_page_wait=0.5):
    """Scrape one Bandcamp release page and append the result to scrape_data.

    Parameters:
        URL: address of the release page.
        entry: dict of fields already known for the release (for example
            'Release Genre'); it is copied, the caller's dict is not modified.
        output: when truthy, print progress messages.
        fan_page_wait: seconds to wait after each click on the "more fans"
            link before checking the page again.

    Returns None. A URL already present in all_URLS_collected is skipped;
    otherwise the URL is recorded there. Nothing is appended to scrape_data
    when the HTTP request fails or does not return status 200.
    """
    entry = entry.copy()

    entry['Release URL'] = URL
    entry['Scrape Date'] = datetime.datetime.now().date()

    # don't add entry if it already exists in scrape dataframe
    if URL in all_URLS_collected:
        return
    all_URLS_collected.append(URL)

    if output:
        print('retrieving info for ' + URL + ' ...')

    headers = {'user-agent': 'Mozilla/5.0'}
    try:
        # timeout keeps one unresponsive page from stalling the whole scrape
        response = requests.get(URL,headers=headers,timeout=30)
    except requests.RequestException as err:
        print('request failed for ' + URL + ': ' + str(err))
        return

    if response.status_code != 200:
        if output:
            print('skipping ' + URL + ' (HTTP status ' + str(response.status_code) + ')')
        return

    soup = bsoup(response.text, 'lxml')

    _fill_release_metadata(soup, entry)
    _fill_track_info(soup, entry)

    # Popularity Index
    entry['Number of Fans'] = _count_fans(URL, fan_page_wait)

    # add entry to scrape data
    scrape_data.append(entry)

In [319]:
# scrape search page function
# scrape the number of pages in the given search URL
def scrape_search_page(URL,num_pages=None,output=False,page_wait=1):
    """Collect release URLs from a Bandcamp search page and scrape each one.

    Parameters:
        URL: search URL; its query string may carry g (genre), s (search
            category), gn (location), f (format), t (sub-genre) and
            w (week) parameters. Missing parameters fall back to
            'all' / 'top' / 0 / 'all' / 'all' / 0.
        num_pages: number of result pages to read. None (the default)
            means the module-level NUM_PAGES, looked up at call time so
            the function can be defined before the configuration cell runs.
        output: when truthy, print the collected URLs and pass the flag
            on to scrape_release_page.
        page_wait: seconds to wait after clicking 'next' before reading
            the page source again.

    Returns None; results are appended to scrape_data by
    scrape_release_page.
    """
    if num_pages is None:
        num_pages = NUM_PAGES

    # read parameters from link (defaults apply when a key is absent)
    params = {'g': 'all', 's': 'top', 'gn': '0', 'f': 'all', 't': 'all', 'w': '0'}
    query = URL.partition('?')[2]
    for parameter in query.split('&'):
        key, sep, value = parameter.partition('=')
        if sep and key in params:
            params[key] = value

    genre = params['g']
    search_category = params['s']
    location = int(params['gn'])
    formatt = params['f']
    subgenre = params['t']
    week = int(params['w'])

    # fields shared by every release found through this search
    entry = {'Release Genre': genre,
             'Release Sub-Genre': subgenre,
             'Search Format': formatt,
             # unknown week codes are kept as their number instead of failing
             'Search Week': WEEK_DICT.get(week, str(week)),
             'Search Category': search_category}

    print('scraping ' + search_category + ' ' + genre + ' (' + subgenre + ') albums from location \'' + str(location) + '\' in ' + formatt + ' format for week ' + str(week) + '...')

    driver.get(URL)
    time.sleep(INITIAL_SEARCH_WAIT)

    # extract URLs from link tags
    release_URLS = []

    for page_index in range(num_pages):
        # create bsoup object from the page currently shown in the browser
        soup = bsoup(str(driver.page_source), 'lxml')

        # find album link tags on this page
        for tag in soup.find_all('a', {'class':'item-title'}):
            href = tag.get('href')
            if not href:
                continue
            tag_url = href.split('?')[0]
            if tag_url not in release_URLS and tag_url not in all_URLS_collected:
                release_URLS.append(tag_url)

        # nothing to advance to after the last requested page
        if page_index == num_pages - 1:
            break

        try:
            driver.find_element_by_xpath("//a[contains(text(), 'next')]").click()
        except Exception:
            # no usable 'next' link: keep what was collected so far
            break

        # give the next page of results time to render
        time.sleep(page_wait)

    if output:
        pprint(release_URLS)

    for release_URL in release_URLS:
        scrape_release_page(release_URL,entry,output=output)

In [320]:
###################### USER INPUTS ######################################################

# Genres whose search pages are scraped ('all' is currently switched off).
GENRES = [
    #'all',
    'rock', 'metal', 'alternative', 'hip-hop-rap', 'experimental',
    'punk', 'pop', 'acoustic', 'funk', 'country',
    'blues', 'ambient', 'soundtrack', 'world', 'jazz',
    'r-b-soul', 'devotional', 'classical', 'reggae', 'latin',
]

# When True, each genre search is followed by one search per subgenre.
SCRAPE_SUBGENRES = False

# Subgenres per genre; only consulted when SCRAPE_SUBGENRES is True.
SUBGENRES = {
    'rock': ['indie','prog-rock','post-rock','rock-roll','psychedelic-rock'],
    'metal': [],
    'alternative': [],
    'hip-hop-rap': [],
    'experimental': [],
    'punk': [],
    'folk': [],
    'pop': [],
    'acoustic': [],
    'funk': [],
    'country': [],
    'blues': [],
}

# Search categories to scrape, chosen from ['top', 'new', 'rec']:
#   top = best-selling
#   new = new arrivals
#   rec = artist-recommended
SEARCH_CATEGORIES = ['top', 'new']

# Locations to scrape; 0 returns search results for all locations.
LOCATIONS = [0]

# Formats to scrape, chosen from ['all','digital','vinyl','cd','cassette'].
FORMATS = ['all']

# Weeks to scrape.
# valid weeks: [-1,0,678,677,676,675,674,673]
# (today, this week, last week, then 2, 3, 4, 5, 6 weeks ago)
WEEKS = [0]

# When True, the week parameter is added to the search URL.
SCRAPE_WEEKS = False

# Number of result pages to scrape (10 produces 40-100 results).
NUM_PAGES = 10

# Seconds to wait for the Bandcamp Discover page to load initially.
INITIAL_SEARCH_WAIT = 5

# Upper bound on fans retrieved for the popularity index.
MAX_FANS = 1000

########################################################################################

# Number of fan-result pages to go through before MAX_FANS is considered reached.
MAX_FAN_PAGES = MAX_FANS // 60


In [322]:
# main web scraper loop
# One search per (genre, category, location, format, week) combination,
# optionally followed by one search per configured subgenre.

for genre in GENRES:
    for category in SEARCH_CATEGORIES:
        for location in LOCATIONS:
            for formatt in FORMATS:
                for week in WEEKS:
                    # base search URL; the week filter is appended only on request
                    scrape_URL = 'https://bandcamp.com/?g={}&s={}&p=0&gn={}&f={}'.format(
                        genre, category, location, formatt)
                    if SCRAPE_WEEKS:
                        scrape_URL += '&w={}'.format(week)

                    scrape_search_page(scrape_URL,output=True)

                    # subgenre searches reuse the genre URL with a tag parameter
                    if not (SCRAPE_SUBGENRES and genre in SUBGENRES):
                        continue
                    for subgenre in SUBGENRES[genre]:
                        subgenre_scrape_URL = '{}&t={}'.format(scrape_URL, subgenre)
                        scrape_search_page(subgenre_scrape_URL,output=False)

scraping top all (all) albums from location '0' in all format for week 0...
['https://thecaretaker.bandcamp.com/album/everywhere-at-the-end-of-time',
 'https://alchemyofflesh.bandcamp.com/album/ageless-abominations',
 'https://kingdude.bandcamp.com/album/beware-of-darkness',
 'https://magdalenabay.bandcamp.com/album/mercurial-world',
 'https://nugenea.bandcamp.com/album/marechi-with-c-lia-kameni',
 'https://stevehartlett.bandcamp.com/album/308',
 'https://taraka1111.bandcamp.com/album/welcome-to-paradise-lost',
 'https://deafheavens.bandcamp.com/album/infinite-granite',
 'https://0101.bandcamp.com/album/p-tisserie-snail',
 'https://selbalamir.bandcamp.com/album/swell',
 'https://tysegall.bandcamp.com/album/harmonizer',
 'https://crass.bandcamp.com/album/christ-alive-the-rehearsal',
 'https://tanyamorgan.bandcamp.com/album/don-and-von',
 'https://meandcassity.bandcamp.com/album/covers-four',
 'https://toytonics.bandcamp.com/album/italomania',
 'https://alixperez.bandcamp.com/album/burni

In [323]:
# JOHN KING CAVE scrapes
# Albums are scraped one by one, each tagged with the 'folk' genre.

for album_slug in ('devil-rides-beside',
                   'oh-my-love',
                   'emotion-tread-light',
                   '720-split',
                   'sing-a-song'):
    scrape_release_page('https://johnkingcave.bandcamp.com/album/' + album_slug,
                        {'Release Genre':'folk'})

# Bug: Doesn't work for tracks
#scrape_release_page('https://johnkingcave.bandcamp.com/track/i-love-america',{'Release Genre':'folk'})
#scrape_release_page('https://johnkingcave.bandcamp.com/track/shes-coming-down-the-line-2',{'Release Genre':'folk'})
#scrape_release_page('https://johnkingcave.bandcamp.com/track/labyrinth-of-faith',{'Release Genre':'folk'})


In [324]:
# Build the dataframe from the collected entries, keeping the first row
# for each release URL, then show the first rows.
scrape_df = pd.DataFrame(scrape_data).drop_duplicates(subset=["Release URL"])

scrape_df.head()

Unnamed: 0,Release Genre,Release Sub-Genre,Release URL,Release Title,Artist Name,Artist Location,Release Date,Tags,Track Info,Number of Tracks,Number of Fans
0,all,all,https://portrayalofguilt.bandcamp.com/album/po...,portrayal of guilt / Chat Pile Split,portrayal of guilt,"Austin, Texas",2021-08-17,"[punk, black metal, hardcore, metal, post-hard...","[{'Track Title': 'Touched by an Angel', 'Track...",2,>1000
1,all,all,https://johncarpentermusic.bandcamp.com/album/...,Halloween Kills OST,John Carpenter,"Los Angeles, California",2021-10-15,"[alternative, electronic, new age, prog-rock, ...","[{'Track Title': 'Logos Kill', 'Track Lyrics':...",20,>1000
2,all,all,https://tossportal.bandcamp.com/album/still-sl...,still slipping vol. 1,Joy Orbison,"London, UK",2021-08-13,"[electronic, London]","[{'Track Title': 'w/ dad & frankie', 'Track Ly...",14,>1000
3,all,all,https://chatpile.bandcamp.com/album/portrayal-...,portrayal of guilt / Chat Pile Split,Chat Pile,"Oklahoma City, Oklahoma",2021-08-17,"[death metal, hardcore, industrial, metal, noi...",[{'Track Title': 'portrayal of guilt - Touched...,2,>1000
4,all,all,https://cherubs.bandcamp.com/album/slo-blo-4-f...,SLO BLO 4 FRNZ & SXY,Cherubs,"Austin, Texas",2021-10-15,"[alternative, rock, alternative rock, indie ro...","[{'Track Title': 'Die Robbin'', 'Track Lyrics'...",10,>1000
...,...,...,...,...,...,...,...,...,...,...,...
79,folk,,https://johnkingcave.bandcamp.com/album/devil-...,Devil Rides Beside,John King Cave,"Albuquerque, New Mexico",2020-05-01,"[folk, alt-country, alternative, americana, el...","[{'Track Title': 'Half Moon', 'Track Lyrics': ...",10,8
80,folk,,https://johnkingcave.bandcamp.com/album/oh-my-...,Oh My Love,John King Cave,"Albuquerque, New Mexico",2020-12-25,"[folk, alt-country, country, country western, ...","[{'Track Title': 'Oh My Love (Theme)', 'Track ...",10,1
81,folk,,https://johnkingcave.bandcamp.com/album/emotio...,Emotion / Tread Light,John King Cave,"Albuquerque, New Mexico",2020-01-01,"[folk, alternative, electronica, indie, pop, A...","[{'Track Title': 'Emotion', 'Track Lyrics': 'E...",3,0
82,folk,,https://johnkingcave.bandcamp.com/album/720-split,720 Split,John King Cave,"Albuquerque, New Mexico",2020-06-12,"[folk, hip-hop, rap, pop, Albuquerque]",[{'Track Title': 'Gran Torino (feat. Naomi Van...,2,0


In [325]:
# Write the scraped dataframe to disk as CSV (pickle alternative kept below).
#scrape_df.to_pickle('scrape_data.pkl')
scrape_df.to_csv(path_or_buf='scrape_data.csv')
