In [86]:
from bs4 import BeautifulSoup as bsoup
import urllib.robotparser
import requests
import pandas as pd
import numpy as np
from pprint import pprint
from selenium import webdriver
from selenium.webdriver import ActionChains
import time
import dateparser

In [17]:
# webdriver_manager supplies the geckodriver binary that Selenium needs to drive Firefox.
from webdriver_manager.firefox import GeckoDriverManager

# Start the Firefox session shared by the scraping cells below. The value
# returned by install() is handed to Selenium as the geckodriver executable path.
driver = webdriver.Firefox(
    executable_path=GeckoDriverManager().install(),
)



Current firefox version is 1 cannot be loaded. 
Get LATEST driver version for 1 cannot be loaded. 
Driver [C:\Users\Sam\.wdm\drivers\geckodriver\win64\v0.29.1\geckodriver.exe] found in cache


In [160]:
# Release formats that can be requested, based on Bandcamp URL options.
FORMATS = ['all', 'digital', 'vinyl', 'cd', 'cassette']

# Search categories:
#   top = best-selling
#   new = new arrivals
#   rec = artist-recommended
SEARCH_CATEGORIES = ['top', 'new', 'rec']

# Scrape results: a plain list that receives one dict per scraped release.
scrape_data = []
# NOTE: the triple-quoted text below is an inert string expression, not live
# code. It records an earlier DataFrame-based layout for scrape_data. If it is
# ever revived, a comma is missing after 'Number of Tracks', which would
# silently merge that column name with 'Track Info'.
'''
scrape_data = pd.DataFrame(columns=['Release Title', 
                                    'Artist Name', 
                                    'Artist Location',
                                    'Release Date',
                                    'Release URL',
                                    'Release Genre',
                                    'Release Sub-Genre',
                                    'Number of Tracks'
                                    'Track Info',
                                    'Number of Fans',
                                    'Tags'])
'''

# Blank template showing the keys of one track-listing entry.
track_info_entry = dict.fromkeys(
    ['Track Title', 'Track Lyrics', 'Track Number', 'Track Duration'],
    '',
)

# Upper bound on how many fans are retrieved for the popularity index.
MAX_FANS = 1000
# Number of fan pages that bound corresponds to
# (presumably 60 fans are shown per page -- TODO confirm).
MAX_FAN_PAGES = MAX_FANS // 60

In [161]:
# scrape bandcamp release page function
def _count_release_fans(URL, click_delay):
    """Count the fan links shown on a release page using the shared Selenium driver.

    The page is opened in the module-level ``driver``. While the "more-thumbs"
    link is present and not hidden it is clicked (presumably each click loads
    one more page of fan thumbnails -- TODO confirm against the live site),
    up to MAX_FAN_PAGES times.

    Args:
        URL: address of the release page.
        click_delay: seconds to sleep after each click so the newly requested
            thumbnails can be added to the page.

    Returns:
        int: number of ``<a>`` elements inside ``div.no-writing`` once the
            list is fully expanded (or could not be expanded further).
        str: ``'>' + str(MAX_FANS)`` when the link is still visible after
            MAX_FAN_PAGES clicks, or ``'N/A'`` when the fan container is
            missing from the page.
    """
    # Local import keeps this block self-contained; selenium is already a
    # dependency of the notebook.
    from selenium.common.exceptions import WebDriverException

    driver.get(URL)

    pages_loaded = 0
    while True:
        try:
            more_thumbs = driver.find_element_by_xpath('//a[@class="more-thumbs"]')
            link_style = more_thumbs.get_attribute("style") or ''
        except WebDriverException:
            # no "more" link on the page: everything is already displayed
            break

        if 'display: none' in link_style:
            # link is hidden once there is nothing left to load
            break

        if pages_loaded >= MAX_FAN_PAGES:
            # still more to load after the allowed number of pages
            return '>' + str(MAX_FANS)

        try:
            more_thumbs.click()
        except WebDriverException:
            # could not expand further; the count below is then a lower bound
            break

        pages_loaded += 1
        time.sleep(click_delay)

    try:
        parentElement = driver.find_element_by_xpath('//div[@class="no-writing"]')
    except WebDriverException:
        return 'N/A'
    return len(parentElement.find_elements_by_tag_name("a"))


def scrape_release_page(URL, entry, fan_page_delay=1.0):
    """Scrape one Bandcamp release page and append the result to ``scrape_data``.

    The page is downloaded with ``requests`` and parsed with BeautifulSoup for
    the static fields; the fan count is read through the Selenium ``driver``.

    Args:
        URL: address of the release page.
        entry: dict of seed values (e.g. 'Release Genre', 'Release Sub-Genre').
            It is copied, not modified, so a caller may safely reuse the same
            dict for several releases without every stored record ending up
            as the same object.
        fan_page_delay: seconds to wait after each "more fans" click.

    Returns:
        None. On HTTP 200 a new record dict is pretty-printed and appended to
        the module-level ``scrape_data`` list; any other status is reported
        and skipped.

    Record keys written: 'Release URL', 'Number of Tracks', 'Release Title',
    'Artist Name', 'Artist Location', 'Release Date', 'Tags', 'Release Genre'
    (first tag unless supplied), 'Release Sub-Genre' ('N/A' unless supplied),
    'Track Info' and 'Number of Fans'.
    """
    # Private copy: the record stored in scrape_data must not alias the
    # caller's dict, otherwise later scrapes overwrite earlier rows.
    record = dict(entry)
    record['Release URL'] = URL

    # TODO: skip URLs that are already present in scrape_data

    print('retrieving info for ' + URL + ' ...')

    headers = {'user-agent': 'Mozilla/5.0'}
    response = requests.get(URL, headers=headers)

    if response.status_code != 200:
        print('skipping ' + URL + ' (HTTP status ' + str(response.status_code) + ')')
        return

    soup = bsoup(response.text, 'lxml')

    # Get number of tracks (first word of the og:description meta tag)
    num_tracks = int(soup.find('meta', {'property': 'og:description'})['content'].split(' ')[0])
    record['Number of Tracks'] = num_tracks

    # Get title and artist name
    title = soup.find('meta', {'name': 'title'})['content'].split(', by ')
    record['Release Title'] = title[0]
    record['Artist Name'] = title[1]

    # Get artist location
    record['Artist Location'] = soup.find('span', {'class': 'location secondaryText'}).string

    # Get release date: strip the leading "releases"/"released" word, then parse
    release_date = soup.find('div', {'class': 'tralbumData tralbum-credits'}).contents[0].string.strip('"').strip(' ').replace('releases', '').replace('released', '')
    record['Release Date'] = dateparser.parse(release_date)

    # Get tags
    tags_list = [tag.string for tag in soup.find_all('a', {'class': 'tag'})]
    record['Tags'] = tags_list

    # Get genre (if not obtained already) from the tags list
    if 'Release Genre' not in record:
        record['Release Genre'] = tags_list[0] if tags_list else 'N/A'

    # If no subgenre was supplied, mark it as not available
    if 'Release Sub-Genre' not in record:
        record['Release Sub-Genre'] = 'N/A'

    # Get track information
    record['Track Info'] = []
    tracks = soup.find('table', {'id': 'track_table'}).find_all('tr', {'class': 'track_row_view'})

    for track in tracks:
        # Track number
        track_num = int(track.find('td', {'class': 'track-number-col'}).div.string.strip('.'))

        # Track title
        track_title = track.find('span', {'class': 'track-title'})
        if track_title:
            track_title = track_title.string
        else:
            track_title = track.find('div', {'class': 'title'}).span.string

        # Track duration
        track_duration = track.find('span', {'class': 'time secondaryText'})
        if track_duration:
            track_duration = track_duration.string.replace('\n', '').strip(' ')
        else:
            track_duration = 'N/A'

        # Track lyrics: resolve the info link step by step so a row without
        # the div or without an anchor simply has no lyrics
        track_lyrics = 'N/A'
        info_link = track.find('div', {'class': 'info_link'})
        link_tag = info_link.a if info_link is not None else None
        track_lyric_link = link_tag.get('href') if link_tag is not None else None
        if track_lyric_link and '#lyrics' in track_lyric_link:
            lyric_tag = track.findNext('tr', {'class': 'lyricsRow'})
            track_lyrics = lyric_tag.find('td', {'colspan': '4'}).div.string.strip('"').replace('\r\n', '\n')

        # Add track object to tracks list
        record['Track Info'].append({
            'Track Title': track_title,
            'Track Lyrics': track_lyrics,
            'Track Number': track_num,
            'Track Duration': track_duration,
        })

    # Popularity index
    record['Number of Fans'] = _count_release_fans(URL, fan_page_delay)

    pprint(record)
    print()

    # add record to scrape data
    scrape_data.append(record)

In [155]:
# Spot-check the release scraper on two pages, each starting from an empty entry.
scrape_release_page(
    URL='https://johnkingcave.bandcamp.com/album/devil-rides-beside',
    entry={},
)
scrape_release_page(
    URL='https://music.sufjan.com/album/a-beginners-mind',
    entry={},
)

retrieving info for https://johnkingcave.bandcamp.com/album/devil-rides-beside ...
{'Artist Location': 'Albuquerque, New Mexico',
 'Artist Name': 'John King Cave',
 'Number of Fans': 8,
 'Number of Tracks': 12,
 'Release Date': datetime.datetime(2020, 5, 1, 0, 0),
 'Release Genre': 'folk',
 'Release Sub-Genre': 'N/A',
 'Release Title': 'Devil Rides Beside',
 'Release URL': 'https://johnkingcave.bandcamp.com/album/devil-rides-beside',
 'Tags': ['folk',
          'alt-country',
          'alternative',
          'americana',
          'electronica',
          'folk',
          'folk pop',
          'folk punk',
          'indie pop',
          'indie rock',
          'soul',
          'Albuquerque'],
 'Track Info': [{'Track Duration': '04:27',
                 'Track Lyrics': 'Well I got a half moon, \n'
                                 'Yes I got a half moon mistletoeing me \n'
                                 "Half moon, that's quite a half smile you got "
                             

{'Artist Location': 'New York, New York',
 'Artist Name': 'Sufjan Stevens & Angelo De Augustine',
 'Number of Fans': '>1000',
 'Number of Tracks': 14,
 'Release Date': datetime.datetime(2021, 9, 24, 0, 0),
 'Release Genre': 'folk',
 'Release Sub-Genre': 'N/A',
 'Release Title': "A Beginner's Mind",
 'Release URL': 'https://music.sufjan.com/album/a-beginners-mind',
 'Tags': ['folk', 'cinematic', 'movies', 'singer-songwriter', 'New York'],
 'Track Info': [{'Track Duration': '03:43',
                 'Track Lyrics': 'I have a memory \n'
                                 'Of a time and place where history resigned\n'
                                 'Now my apology  \n'
                                 'All the light came in to fulminate my mind\n'
                                 ' \n'
                                 'Reach out, reach out \n'
                                 'To all the ones who came before you\n'
                                 'Ponder what is right\n'
                 

In [170]:
# Spot-check the search scraper on one discover page with genre and sub-genre preset.
scrape_search_page(
    URL='https://bandcamp.com/?g=rock&s=top&p=2&gn=0&f=all&t=prog-rock',
    entry={'Release Genre': 'rock', 'Release Sub-Genre': 'prog-rock'},
)


[]


In [169]:
# Turn the scraped records into a table, one row per record.
scrape_df = pd.DataFrame(scrape_data)
# When nothing was scraped the frame has no columns, and
# drop_duplicates(subset=...) would raise KeyError; only de-duplicate
# when the key column is actually present.
if 'Release URL' in scrape_df.columns:
    scrape_df = scrape_df.drop_duplicates(subset=['Release URL'])
scrape_df

Unnamed: 0,Release Genre,Release Sub-Genre,Release URL,Number of Tracks,Release Title,Artist Name,Artist Location,Release Date,Tags,Track Info,Number of Fans
0,rock,prog-rock,https://lucidsins.bandcamp.com/album/cursed,8,Cursed!,Lucid Sins,"Scotland, UK",2021-05-03,"[retro rock, rock, doom rock, folk rock, harmo...","[{'Track Title': 'Joker's Dance', 'Track Lyric...",>1000
1,rock,prog-rock,https://oldmanwizard.com/album/kill-your-serva...,10,Kill Your Servants Quietly,Old Man Wizard,"San Diego, California",2021-11-05,"[rock, alternative, doom, folk, garage rock, g...","[{'Track Title': 'I Prayed', 'Track Lyrics': '...",61


In [166]:
# scrape search page function
def scrape_search_page(URL,entry):
    """Scrape every release linked from one Bandcamp search/discover page.

    The page is opened in the shared Selenium ``driver``; after waiting
    REFRESH_RATE seconds the rendered HTML is parsed and each
    ``<a class="item-title">`` link is passed to ``scrape_release_page``.

    Args:
        URL: address of the search page.
        entry: dict of seed values (e.g. genre / sub-genre) for every release
            found on this page. Each release receives its own copy, so the
            records stored by ``scrape_release_page`` are independent objects
            and the caller's dict is left unchanged.

    Returns:
        None.
    """
    driver.get(URL)

    # this is just to ensure that the page is loaded
    time.sleep(REFRESH_RATE)
    html = driver.page_source

    # create bsoup object
    soup = bsoup(str(html), 'lxml')

    # find album link tags on this page
    link_tags = soup.find_all('a', {'class':'item-title'})
    pprint(link_tags)

    # extract URLs from link tags, dropping the query string;
    # anchors that carry no href are skipped instead of raising KeyError
    release_URLS = [tag['href'].split('?')[0] for tag in link_tags if tag.get('href')]

    for release_URL in release_URLS:
        # hand over a fresh copy: passing the same dict for every release
        # would make every stored record the same (last-written) object
        scrape_release_page(release_URL, dict(entry))
    

In [42]:
###################### INPUTS ################################

# Genres to scrape.
genres = ['all', 'rock']

# Also scrape the sub-genres listed in SUBGENRES for each genre?
scrape_subgenres = True

# Sub-genres to scrape per genre (only used when scrape_subgenres = True).
SUBGENRES = {
    'rock': [
        'all',
        'indie',
        'prog-rock',
        'post-rock',
        'rock-roll',
        'psychedelic-rock',
    ],
    'metal':        [],
    'alternative':  [],
    'hip-hop-rap':  [],
    'experimental': [],
    'punk':         [],
    'folk':         [],
    'pop':          [],
    'acoustic':     [],
    'funk':         [],
    'country':      [],
    'blues':        [],
}

# Search categories to scrape.
search_categories = ['top']

# Locations to scrape.
# location = 0 returns search results for all locations
locations = [0]

# Formats to scrape.
formats = ['all']

# Number of result pages to scrape.
pages = 10

# Refresh rate (if page isn't loading fast enough to retrieve links)
REFRESH_RATE = 5

###########################################################


In [85]:
# web scraper loop: visit every genre / category / location / format / page combination

for genre in genres:
    for category in search_categories:
        for location in locations:
            for formatt in formats:
                for page in range(pages):

                    # seed values for this page; the same dict object is handed
                    # to the genre search and to each sub-genre search below
                    entry = {'Release Genre': genre}

                    scrape_URL = ''.join([
                        'https://bandcamp.com/?g=', genre,
                        '&s=', category,
                        '&p=', str(page),
                        '&gn=', str(location),
                        '&f=', formatt,
                    ])
                    scrape_search_page(scrape_URL, entry)

                    # nothing more to do for this page unless sub-genre
                    # scraping is enabled and this genre has an entry
                    if not (scrape_subgenres and genre in SUBGENRES):
                        continue

                    for subgenre in SUBGENRES[genre]:
                        entry['Release Sub-Genre'] = subgenre
                        subgenre_scrape_URL = ''.join([scrape_URL, '&t=', subgenre])
                        scrape_search_page(subgenre_scrape_URL, entry)

[<a class="item-title" data-bind="attr: { 'href': itemURL }, text: title, click: playMe" href="https://johncarpentermusic.bandcamp.com/album/halloween-kills-ost?from=discover-top">Halloween Kills OST</a>,
 <a class="item-title" data-bind="attr: { 'href': itemURL }, text: title, click: playMe" href="https://portrayalofguilt.bandcamp.com/album/portrayal-of-guilt-chat-pile-split?from=discover-top">portrayal of guilt / Chat Pile Split</a>,
 <a class="item-title" data-bind="attr: { 'href': itemURL }, text: title, click: playMe" href="https://tossportal.bandcamp.com/album/still-slipping-vol-1?from=discover-top">still slipping vol. 1</a>,
 <a class="item-title" data-bind="attr: { 'href': itemURL }, text: title, click: playMe" href="https://chatpile.bandcamp.com/album/portrayal-of-guilt-chat-pile-split?from=discover-top">portrayal of guilt / Chat Pile Split</a>,
 <a class="item-title" data-bind="attr: { 'href': itemURL }, text: title, click: playMe" href="https://cherubs.bandcamp.com/album/slo