In [20]:
from bs4 import BeautifulSoup
import requests as req

import spotipy
from spotipy.oauth2 import SpotifyClientCredentials, SpotifyOAuth
import sys

import numpy as np
import pandas as pd
import json

from tqdm.notebook import tqdm
import time

scope = "user-library-read"
sp = spotipy.Spotify(auth_manager=SpotifyOAuth(scope=scope))
spotify = spotipy.Spotify(auth_manager=SpotifyClientCredentials())

# Get Names of Chicago Rap Artists ([Every Noise](https://everynoise.com/))

In [21]:
def generate_beautifulSoup(url):
    Web = req.get(url)
    S = BeautifulSoup(Web.text, 'lxml')
    return(S)

def top_n_artists(S, n=10):
    
    # finding the div where all of the names are stored
    canvas = S.find('div', class_="canvas")
    
    artists = []
    for div in canvas:

        
        # the elements in canvas alternate between divs with information and blank tags, 
        # we only want to look at the ones with information
        if (len(div) > 1):

            # finding the font size
            # font size measures artist importance
            style = div.get('style')
            style_attr = style.split('; ')
            importance = '';
            for pair in style_attr:
                attr, val = pair.split(': ')
                if (attr == 'font-size'):
                    importance = val
                    
            # finding the artist name 
            onclick = div.get('onclick')
            artist_string = onclick.split(', ')[1]
            artist = artist_string.split('"')[1]
            
            # appending info
            artists.append((artist, importance))



    # sorting artist by "importance"
    artists = sorted(artists, key=lambda x: x[1], reverse=True)[0:n]
    top_names = [x[0] for x in artists]

    
    
    return(top_names)

In [22]:
# running both functions with chicagorap
S = generate_beautifulSoup('https://everynoise.com/engenremap-chicagorap.html')
artist_names = top_n_artists(S, n=10)
print(artist_names)

['Kanye West', 'Juice WRLD', 'Polo G', 'Lil Durk', 'King Von', 'Chief Keef', 'Jeremih', 'Chance the Rapper', 'G Herbo', 'Lupe Fiasco']


# Get All Tracks from Artists' Albums and their Features' Albums

In [23]:
def get_artists_from_name(artist_names, show_output=False):
    '''
    get artist information from a list of artist names
    '''
    
    artists = []

    print("Name -> Artist:")
    for name in tqdm(artist_names):
        results = spotify.search(q='artist:' + name, type='artist')
        items = results['artists']['items']
        if len(items) > 0:
            artist = items[0]
            artists.append(artist)
            if show_output: print((artist['name'], artist['uri'], artist['genres']))
            
    return artists

def get_albums_from_artist(artists, show_output=False):
    '''
    get all albums from artist without duplicates
    '''

    albums = []
    albums_set = set()
    
    print("Artist -> Album:")
    for artist in tqdm(artists):

        results = spotify.artist_albums(artist['uri'], album_type='album')

        # only add new albums
        for album in results['items']:
            all_artists = [album_artist['name'] for album_artist in album['artists']]
            album_and_artist = f"{album['name']} {' '.join(all_artists)}" # assume static artist order

            if album_and_artist not in albums_set: albums.append(album)
            albums_set.add(album_and_artist)

        # continue loading albums if there are more
        while results['next']:
            results = spotify.next(results)

            for album in results['items']:
                all_artists = [album_artist['name'] for album_artist in album['artists']]
                album_and_artist = f"{album['name']} {' '.join(all_artists)}"

                if album_and_artist not in albums_set: albums.append(album)
                albums_set.add(album_and_artist)
    
    return albums

def get_artists_and_tracks_from_albums(albums):
    '''
    get all songs from a list albums
    '''
    feature_names = []
    tracks = []
    track_set = set()
    
    print("Album -> Artists, Tracks:")
    for album in tqdm(albums):
        # get results
        results = spotify.album_tracks(album['uri'])
        
        # add album release date to each track
        album_info = spotify.album(album['uri'])
        for track in results['items']:
            track['release_date'] = album_info['release_date']
            
        # add all tracks to tracks list
        tracks.extend(results['items'])

        # add artist features to features list
        for info in results['items']:
            feature_names.extend([artist['name'] for artist in info['artists']])

        # continue loading tracks if there are more
        while results['next']:
            results = spotify.next(results)
            tracks.extend(results['items'])

            for info in results['items']:
                feature_names.extend([artist['name'] for artist in info['artists']])

    feature_names = list(set(feature_names))
    return feature_names, tracks

In [24]:
# go from artist names -> artist -> albums -> featured artists/tracks
artists = get_artists_from_name(artist_names)
albums = get_albums_from_artist(artists)
feature_names, tracks = get_artists_and_tracks_from_albums(albums)

Name -> Artist:


  0%|          | 0/10 [00:00<?, ?it/s]

Artist -> Album:


  0%|          | 0/10 [00:00<?, ?it/s]

Album -> Artists, Tracks:


  0%|          | 0/148 [00:00<?, ?it/s]

In [25]:
# small summary
print(f"Number of Albums: {len(albums)}")
print(f"Sample: {[albums[i*len(albums)//10]['name'] for i in range(10)]}")
    
print(f'\nNumber of Featured Artists: {len(feature_names)}')
print(f"Sample: {[feature_names[i*len(feature_names)//10] for i in range(10)]}")

print(f"\nNumber of tracks: {len(tracks)}")
print(f"Sample: {[tracks[i*len(tracks)//10]['name'] for i in range(10)]}")

Number of Albums: 148
Sample: ['Donda (Deluxe)', 'The College Dropout', 'The Voice', 'Remember My Name', 'The GloFiles (Pt. 3)', 'The GloFiles (Pt. 2)', 'Big Gucci Sosa', 'Sorry 4 The Weight (Deluxe Edition)', 'Jeremih', 'Welcome to Fazoland']

Number of Featured Artists: 304
Sample: ['Kirk Franklin', 'DIXSON', 'Rick Ross', 'Bon Iver', 'Trippie Redd', 'J. Cole', 'KIDS SEE GHOSTS', 'Francis and the Lights', 'Hypno Carlito', 'Lil 40']

Number of tracks: 2310
Sample: ['Donda Chant', 'Lil Jimmy Skit', 'No Interviews', 'Check', 'NASCAR', 'Almoney', "Don't Lose no Load", 'Ten Toes Down', 'Jumpin', 'Love 2 Stunt']


In [26]:
# second round of pulling information
more_artists = get_artists_from_name(feature_names)
more_albums = get_albums_from_artist(more_artists)
more_feature_names, more_tracks = get_artists_and_tracks_from_albums(more_albums)

Name -> Artist:


  0%|          | 0/304 [00:00<?, ?it/s]

Artist -> Album:


  0%|          | 0/304 [00:00<?, ?it/s]

Album -> Artists, Tracks:


  0%|          | 0/2861 [00:00<?, ?it/s]

In [27]:
# small summary for finalized list
print(f"Number of Albums: {len(more_albums)}")
print(f"Sample: {[more_albums[i*len(more_albums)//10]['name'] for i in range(10)]}")
    
print(f'\nNumber of Featured Artists: {len(more_feature_names)}')
print(f"Sample: {[more_feature_names[i*len(more_feature_names)//10] for i in range(10)]}")

print(f"\nNumber of tracks: {len(more_tracks)}")
print(f"Sample: {[more_tracks[i*len(more_tracks)//10]['name'] for i in range(10)]}")

Number of Albums: 2861
Sample: ['LONG LIVE LOVE', 'Slime Season 4', 'Life I Live', 'Big Gucci Sosa', 'Eastside Piru', '2014 Forest Hills Drive', 'Me Against the World', 'Math Class 2', 'The Randy Newman Songbook, Vol. 3', 'A Legendary Christmas']

Number of Featured Artists: 4645
Sample: ['Frank White', 'Ryan Adams', 'T-Moe', 'Game', 'Young Quis', 'Cootie', 'M.T.P.', 'T@', 'Camouflage', 'CHOC']

Number of tracks: 45040
Sample: ['F.A.V.O.R', 'My Boys (feat. Ralo, Trouble and Lil Durk)', 'Ticket (feat. Coach Rick)', 'According to My Watch', 'Scared of the Dark (feat. Riff Raff)', 'Joy to the World', 'Phenomonon', 'Brotha - The Inferno Mix', 'We Are The Hustlaz', 'New Hunnids (feat. Yung Bans & Gunna)']


In [28]:
# final pull of artist info
final_artists = get_artists_from_name(more_feature_names)

Name -> Artist:


  0%|          | 0/4645 [00:00<?, ?it/s]

In [29]:
# save each artist as json
for i, artist in tqdm(enumerate(final_artists)):
    with open(f"artists/artist{i}.json", "w") as outfile:
        json.dump(artist, outfile)    

0it [00:00, ?it/s]

In [30]:
# save each track as json
for i, track in tqdm(enumerate(more_tracks)):
    with open(f"tracks/track{i}.json", "w") as outfile:
        json.dump(track, outfile)

0it [00:00, ?it/s]