In [1]:
from bs4 import BeautifulSoup
import urllib.request
import pandas as pd
import time

In [2]:
def discogs_to_dict(html):
    """Extract the details of a discogs release page into a dict.

    Fields that can hold several entries (artists, genres, track names, ...)
    are stored as one string with the entries separated by semicolons.

    Besides parsing ``html``, this function downloads the page of the release's
    label (one extra HTTP request) to count the releases on that label and how
    many of them are for sale.

    Parameters
    ----------
    html : markup of a single release page, passed straight to BeautifulSoup.

    Returns
    -------
    dict with the keys 'artist', 'compilation', 'title', 'label', 'release',
    'format', 'format details', 'country', 'year', 'genre', 'style',
    'number of tracks', 'track names', 'notes', 'number of versions',
    'countries of versions', 'years of versions', 'number for sale',
    'lowest price for sale', 'number have', 'number want', 'average rating',
    'number of ratings', 'last sold', 'lowest price sold', 'median price sold',
    'highest price sold', 'number on label for sale' and 'number on label'.

    Raises
    ------
    IndexError if a mandatory element (title header, 'content' divs,
    statistics block, label link) is missing from the release page.
    urllib.error.URLError if the label page cannot be downloaded.
    """

    #Create the dict
    details_dict = {}
    #Read into BeautifulSoup
    album_soup = BeautifulSoup(html, 'html.parser')

    #The title header holds the artist(s) followed by a separator and the title
    title_strings = list(album_soup.find_all("h1", id = 'profile_title')[0].stripped_strings)

    #Extract the artist
    artist = ' '.join(title_strings[0:-2]).replace(' , ', ', ')
    #If a compilation take the artists from each track, otherwise keep the artist found above
    if artist == 'Various':
        track_artist_names = []
        for track_artist in album_soup.find_all("td", class_ = "tracklist_track_artists"):
            artist_strings = list(track_artist.stripped_strings)
            #The second string is used as the artist name; tracks without one get an empty entry
            track_artist_names.append(artist_strings[1] if len(artist_strings) > 1 else '')
        details_dict['artist'] = '; '.join(track_artist_names)
        details_dict['compilation'] = True
    else:
        details_dict['artist'] = artist
        details_dict['compilation'] = False

    #Extract a few straightforward fields
    details_dict['title'] = title_strings[-1]

    #The profile fields are addressed by the position of their 'content' div:
    #the first one holds label and release, the last five hold format, country, year, genre and style
    content_divs = album_soup.find_all("div", class_ = 'content')
    label_strings = list(content_divs[0].stripped_strings)
    details_dict['label'] = label_strings[0]
    details_dict['release'] = ''.join(label_strings[1].split(' ')[1:])

    #The first item in Format is the important one (vinyl, cd, 8-track etc.) so store this separately
    format_list = ' '.join(list(content_divs[-5].stripped_strings)).split(',')
    details_dict['format'] = format_list[0][:-1]
    details_dict['format details'] = ';'.join(format_list[1:])[1:]

    #Extract more straightforward fields
    details_dict['country'] = '; '.join(list(content_divs[-4].stripped_strings)[:])
    details_dict['year'] = '; '.join(list(content_divs[-3].stripped_strings)[:])
    #Every second string is skipped for genre and style (presumably the separators between entries)
    details_dict['genre'] = '; '.join(list(content_divs[-2].stripped_strings)[0::2])
    details_dict['style'] = '; '.join(list(content_divs[-1].stripped_strings)[0::2])
    details_dict['number of tracks'] = len(album_soup.find_all("td", class_="tracklist_track_pos"))
    details_dict['track names'] = '; '.join([x.get_text()
                                             for x in album_soup.find_all("span", class_ = "tracklist_track_title")])

    #These next fields are not always present
    notes = album_soup.find_all("div", id ="notes")
    if notes:
        #Skip the first string of the notes block
        details_dict['notes'] = ' '.join(list(notes[0].stripped_strings)[1:])
    else:
        details_dict['notes'] = ''

    versions = album_soup.find_all("div", id ="m_versions")
    if versions:
        #The count is the last word of the first string, minus its trailing character
        details_dict['number of versions'] = int(list(versions[0].stripped_strings)[0].split(' ')[-1][:-1])
        details_dict['countries of versions'] = '; '.join([x.get_text()
                                                           for x in versions[0].find_all("td", class_ = "country has_header")])
        details_dict['years of versions'] = '; '.join([x.get_text()
                                                       for x in versions[0].find_all("td", class_ = "year has_header")])
    else:
        details_dict['number of versions'] = 0
        details_dict['countries of versions'] = ''
        details_dict['years of versions'] = ''

    offers = album_soup.find_all("span", itemprop = "offers")
    for_sale = list(offers[0].stripped_strings) if offers else []
    if for_sale:
        details_dict['number for sale'] = int(for_sale[0].split(' ')[0].replace(',',''))
        #Drop the leading currency symbol and any thousands separators
        details_dict['lowest price for sale'] = float(for_sale[2][1:].replace(',',''))
    else:
        details_dict['number for sale'] = 0
        details_dict['lowest price for sale'] = None

    #Get the previous sale information (the values sit at fixed positions in the statistics block)
    statistics = list(album_soup.find_all("div", id = "statistics")[0].stripped_strings)
    details_dict['number have'] = int(statistics[2].replace(',',''))
    details_dict['number want'] = int(statistics[4].replace(',',''))
    details_dict['average rating'] = None if statistics[6] == '--' else float(statistics[6])
    #Strip thousands separators here too, like for the have/want counts
    details_dict['number of ratings'] = int(statistics[9].replace(',',''))
    details_dict['last sold'] = None if statistics[11] == 'Never' else statistics[11]
    details_dict['lowest price sold'] = None if statistics[13] == '--' else float(statistics[13][1:].replace(',',''))
    details_dict['median price sold'] = None if statistics[15] == '--' else float(statistics[15][1:].replace(',',''))
    details_dict['highest price sold'] = None if statistics[17] == '--' else float(statistics[17][1:].replace(',',''))

    #Get the html of the label page (only the part of the link before the first '-' is requested)
    label_url = content_divs[0].find_all('a', href = True)[0]['href'].split('-')[0]
    #No `data` argument: passing data=b'None' would turn this into a POST with the body "None"
    req = urllib.request.Request(url='https://www.discogs.com' + label_url,
                                 headers={'User-Agent':'Python discogs test'})
    #Close the connection as soon as the page has been read
    with urllib.request.urlopen(req) as response:
        label_html = response.read()
    label_soup = BeautifulSoup(label_html, 'html.parser')

    #Find the number of records on the label being sold (0 when the page shows no offers)
    label_offers = label_soup.find_all("span", itemprop = "offers")
    label_for_sale = list(label_offers[0].stripped_strings) if label_offers else []
    if label_for_sale:
        details_dict['number on label for sale'] = int(label_for_sale[0].split(' ')[0].replace(',',''))
    else:
        details_dict['number on label for sale'] = 0

    #Find the number of releases on the label (0 when the page shows no pagination total)
    label_totals = label_soup.find_all("strong", class_ = "pagination_total")
    on_label = list(label_totals[0].stripped_strings) if label_totals else []
    if on_label:
        details_dict['number on label'] = int(on_label[0].split(' ')[-1].replace(',',''))
    else:
        details_dict['number on label'] = 0

    return details_dict

In [7]:
#Request a page of LPs
#TODO: should change the user-agent to something more specific

#Let's look at the first 500 pages of most wanted jazz LPs
main_url = "https://www.discogs.com/search/?sort=want%2Cdesc&format_exact=LP&genre_exact=Jazz&type=release&page="
prefix = 'https://www.discogs.com'
request_headers = {'User-Agent':'Python discogs test'}

#Collect one dict per album; building the dataframe from the list avoids
#growing it row by row (DataFrame.append is quadratic and gone in pandas 2.0)
album_records = []
#Store all the albums in this dataframe
allalbums_df = pd.DataFrame()

#For all 500 pages
#NOTE(review): assumes result pages are numbered from 1 - the previous
#range(0,501) made 501 requests, including page=0. Confirm against the site.
for pagenum in range(1, 501):
    #No `data` argument: passing data=b'None' would send a POST with the body "None"
    req = urllib.request.Request(url=main_url + str(pagenum),
                                 headers=request_headers)
    with urllib.request.urlopen(req) as response:
        html = response.read()

    #Use BeautifulSoup to pick out all the albums on the page
    albums = BeautifulSoup(html, 'lxml').find_all("a", class_="search_result_title")

    #Take the url of each album
    for album in albums:
        album_url = prefix + album['href']

        #Wait a second between each request to not swamp the website
        time.sleep(1)

        #Read in the html of an album page
        req = urllib.request.Request(url=album_url,
                                     headers=request_headers)
        with urllib.request.urlopen(req) as response:
            html = response.read()

        #Pick out the data from the html
        album_data = discogs_to_dict(html)
        album_data['url'] = album_url
        album_records.append(album_data)

    #Rebuild the dataframe once per page and save to csv, so progress survives a crash
    allalbums_df = pd.DataFrame(album_records)
    allalbums_df.to_csv('../data/raw/albums.csv',encoding='utf-8')

In [8]:
#Show the last five scraped albums as a sanity check
allalbums_df.tail(n=5)

Unnamed: 0,artist,average rating,compilation,countries of versions,country,format,format details,genre,highest price sold,label,...,number on label,number on label for sale,number want,release,style,title,track names,url,year,years of versions
24995,Intercity Sound Association,4.25,False,Germany; Germany; Germany,Germany,Vinyl,LP; Album; Reissue; Remastered,Jazz; Funk / Soul; Stage & Screen,17.98,Sonorama,...,178,1128,47,SonoramaL-69,Space-Age; Disco; Jazz-Funk; Easy Listening,Phillysound,City Train; Alaska Flight; Proud Horse; Night ...,https://www.discogs.com/Intercity-Sound-Associ...,21 Sep 2012,1976; 2012; 2012
24996,菅野光亮 *,4.64,False,Japan,Japan,Vinyl,LP; Album; Limited Edition; Gatefold,Jazz,53.66,RCA,...,56492,351721,47,"JRS-7262,",Contemporary Jazz,詩仙堂の秋 [Shisendo No Aki],武将 [Busho]; くもの糸 [Kumo No Ito]; 寂光 [Jakko]; 詩仙...,https://www.discogs.com/%E8%8F%85%E9%87%8E%E5%...,17 Dec 2014,1976
24997,Ennio Morricone,4.52,False,Italy; Italy; Germany; Belgium; Italy,Germany,Vinyl,LP; Album; Gatefold,Jazz; Classical; Stage & Screen,17.13,General Music,...,191,1070,47,"87582IU,",Soundtrack; Soul-Jazz; Score; Easy Listening,Mein Name Ist Nobody = Il Mio Nome E' Nessuno ...,Mein Name Ist Nobody = Il Mio Nome E' Nessuno;...,https://www.discogs.com/Ennio-Morricone-Mein-N...,1973,1973; 2015; Unknown; 1973; 2015
24998,Rascal Reporters,4.0,False,US,US,Vinyl,LP; Album,Jazz; Rock,76.65,Hebbardesque Records,...,6,16,47,HR004,Avantgarde; Prog Rock,Ridin' On A Bummer,Elements; The Hills; Rio; Mike Newfield; Barre...,https://www.discogs.com/Rascal-Reporters-Ridin...,1984,2005
24999,Brother Jack McDuff,4.63,False,US; US; UK,US,Vinyl,LP; Album; Mono,Jazz,23.99,Prestige,...,4558,20169,47,PR7422,Soul-Jazz,Hot Barbeque,Hot Barbeque; The Party's Over; Briar Patch; H...,https://www.discogs.com/Brother-Jack-McDuff-Ho...,1966,Unknown; 1966; 1993
