In [1]:
import numpy as np
import pandas as pd
import json
import re

import requests
from bs4 import BeautifulSoup

In [2]:
# Accumulator for the scraped data: genre name -> list of artist names.
# It starts empty and is filled in one key at a time by later cells.
genres = dict()

### Rap

In [3]:
# Ranker (Rap)
url = 'https://www.ranker.com/crowdranked-list/the-greatest-rappers-of-all-time'
# The timeout (seconds) stops a stalled connection from hanging the notebook;
# raise_for_status() raises requests.HTTPError on a 4xx/5xx response so an
# error page is never parsed into a silently empty artist list.
response = requests.get(url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.text, 'lxml')

# Names are read from the `content` attribute of every <meta itemprop="name">
# tag. The entry equal to the list heading is skipped (presumably the page's
# own title rather than an artist). Names are lower-cased and stripped.
list_heading = 'The Greatest Rappers of All Time'
rappers_ranker_1 = [
    tag['content'].lower().strip()
    for tag in soup.find_all('meta', {'itemprop': 'name'})
    if tag['content'] != list_heading
]

In [4]:
# Ranker 2 (Rap)
url = 'https://www.ranker.com/list/best-hip-hop-artists-in-2018/ranker-music'
# The timeout (seconds) stops a stalled connection from hanging the notebook;
# raise_for_status() raises requests.HTTPError on a 4xx/5xx response so an
# error page is never parsed into a silently empty artist list.
response = requests.get(url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.text, 'lxml')

# Names are read from the `content` attribute of every <meta itemprop="name">
# tag. The entry equal to the list heading is skipped (presumably the page's
# own title rather than an artist). Names are lower-cased and stripped.
list_heading = 'The Best Rappers Of 2018, Ranked'
rappers_ranker_2 = [
    tag['content'].lower().strip()
    for tag in soup.find_all('meta', {'itemprop': 'name'})
    if tag['content'] != list_heading
]

In [5]:
# Merge both rap lists, drop duplicate names, then sort. Sorting has to come
# after set(): a set has no defined order, so sorting first was wasted work
# and left the stored list in arbitrary order.
genres['rap'] = sorted(set(rappers_ranker_1 + rappers_ranker_2))

### Metal

In [6]:
url = 'https://www.thetoptens.com/top-heavy-metal-bands/'
# The timeout (seconds) stops a stalled connection from hanging the notebook;
# raise_for_status() raises requests.HTTPError on a 4xx/5xx response so an
# error page is never parsed into a silently empty artist list.
response = requests.get(url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.text, 'lxml')

# Find all commenters: the text of the first <b> inside each <p> is collected
# (presumably the username heading a user comment; confirm against the page
# markup). Paragraphs without a <b> are ignored.
commenters = []
for paragraph in soup.find_all('p'):
    username_tag = paragraph.find('b')
    if username_tag:
        commenters.append(username_tag.get_text().lower().strip())

# Toptens (Metal)
# Every <b> after the first is a candidate artist name. Candidates that match
# a collected commenter name, or that start with '1. ', are discarded.
# A set makes the membership test constant-time; `commenters` stays a list.
commenter_lookup = set(commenters)
metal_toptens = []
for bold_tag in soup.find_all('b')[1:]:
    artist_name = bold_tag.get_text().lower().strip()
    if artist_name in commenter_lookup:
        continue
    if artist_name.startswith('1. '):
        continue
    metal_toptens.append(artist_name)

In [7]:
# Drop duplicate names, then sort. Sorting has to come after set(): a set has
# no defined order, so sorting first was wasted work and left the stored list
# in arbitrary order.
genres['metal'] = sorted(set(metal_toptens))

### Country

In [8]:
# Ranker (Country)
url = 'https://www.ranker.com/list/top-country-artists-of-all-time/samantha-dillinger'
# The timeout (seconds) stops a stalled connection from hanging the notebook;
# raise_for_status() raises requests.HTTPError on a 4xx/5xx response so an
# error page is never parsed into a silently empty artist list.
response = requests.get(url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.text, 'lxml')

# Names are read from the `content` attribute of every <meta itemprop="name">
# tag. The entry equal to the list heading is skipped (presumably the page's
# own title rather than an artist). Names are lower-cased and stripped.
list_heading = 'The Top Country Artists of All Time'
country_ranker = [
    tag['content'].lower().strip()
    for tag in soup.find_all('meta', {'itemprop': 'name'})
    if tag['content'] != list_heading
]

In [9]:
# Ranker (Modern country)
url = 'https://www.ranker.com/list/best-new-country-artists/ranker-music'
# The timeout (seconds) stops a stalled connection from hanging the notebook;
# raise_for_status() raises requests.HTTPError on a 4xx/5xx response so an
# error page is never parsed into a silently empty artist list.
response = requests.get(url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.text, 'lxml')

# Names are read from the `content` attribute of every <meta itemprop="name">
# tag. The entry equal to the list heading is skipped (presumably the page's
# own title rather than an artist). Names are lower-cased and stripped.
list_heading = 'The Best New Country Artists'
modern_country_ranker = [
    tag['content'].lower().strip()
    for tag in soup.find_all('meta', {'itemprop': 'name'})
    if tag['content'] != list_heading
]

In [10]:
# Merge both country lists, drop duplicate names, then sort. Sorting has to
# come after set(): a set has no defined order, so sorting first was wasted
# work and left the stored list in arbitrary order.
genres['country'] = sorted(set(country_ranker + modern_country_ranker))

### Rock

In [11]:
# Ranker (Rock)
url = 'https://www.ranker.com/crowdranked-list/the-best-rock-bands-of-all-time'
# The timeout (seconds) stops a stalled connection from hanging the notebook;
# raise_for_status() raises requests.HTTPError on a 4xx/5xx response so an
# error page is never parsed into a silently empty artist list.
response = requests.get(url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.text, 'lxml')

# Names are read from the `content` attribute of every <meta itemprop="name">
# tag. The entry equal to the list heading is skipped (presumably the page's
# own title rather than an artist). Names are lower-cased and stripped.
list_heading = 'The Best Rock Bands of All Time'
rock_ranker = [
    tag['content'].lower().strip()
    for tag in soup.find_all('meta', {'itemprop': 'name'})
    if tag['content'] != list_heading
]

In [12]:
# Ranker (Modern Rock)
url = 'https://www.ranker.com/list/modern-rock-bands-and-musicians/reference'
# The timeout (seconds) stops a stalled connection from hanging the notebook;
# raise_for_status() raises requests.HTTPError on a 4xx/5xx response so an
# error page is never parsed into a silently empty artist list.
response = requests.get(url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.text, 'lxml')

# Names are read from the `content` attribute of every <meta itemprop="name">
# tag. The entry equal to the list heading is skipped (presumably the page's
# own title rather than an artist). Names are lower-cased and stripped.
list_heading = 'The Best Modern Rock Bands/Artists'
modern_rock_ranker = [
    tag['content'].lower().strip()
    for tag in soup.find_all('meta', {'itemprop': 'name'})
    if tag['content'] != list_heading
]

In [13]:
# Ranker (Indie Rock)
url = 'https://www.ranker.com/list/indie-bands-and-artists/reference'
# The timeout (seconds) stops a stalled connection from hanging the notebook;
# raise_for_status() raises requests.HTTPError on a 4xx/5xx response so an
# error page is never parsed into a silently empty artist list.
response = requests.get(url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.text, 'lxml')

# Names are read from the `content` attribute of every <meta itemprop="name">
# tag. The entry equal to the list heading is skipped (presumably the page's
# own title rather than an artist). Names are lower-cased and stripped.
list_heading = 'The Best Indie Bands & Artists'
indie_ranker = [
    tag['content'].lower().strip()
    for tag in soup.find_all('meta', {'itemprop': 'name'})
    if tag['content'] != list_heading
]

In [14]:
# Merge the three rock lists, drop duplicate names, then sort. Sorting has to
# come after set(): a set has no defined order, so sorting first was wasted
# work and left the stored list in arbitrary order.
genres['rock'] = sorted(set(rock_ranker + modern_rock_ranker + indie_ranker))

### Pop

In [15]:
# Billboard (Pop)
url = 'https://www.billboard.com/charts/greatest-of-all-time-pop-songs-artists'
# The timeout (seconds) stops a stalled connection from hanging the notebook;
# raise_for_status() raises requests.HTTPError on a 4xx/5xx response so an
# error page is never parsed into a silently empty artist list.
response = requests.get(url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.text, 'lxml')

# Collect the text of every <a> whose href contains '/music/' (presumably the
# links to artist pages; confirm against the chart markup). Newlines are
# removed before the text is lower-cased and stripped.
# The text gets its own name so the loop variable is not rebound from a tag
# to a string.
pop_billboard = []
for link in soup.find_all('a', {'href': re.compile('/music/')}):
    link_text = link.get_text().replace('\n', '')
    pop_billboard.append(link_text.lower().strip())

In [16]:
# Drop duplicate names, then sort. Sorting has to come after set(): a set has
# no defined order, so sorting first was wasted work and left the stored list
# in arbitrary order.
genres['pop'] = sorted(set(pop_billboard))

### Soul

In [17]:
# Ranker (Soul)
url = 'https://www.ranker.com/list/soul-music-bands-and-musicians/reference'
# The timeout (seconds) stops a stalled connection from hanging the notebook;
# raise_for_status() raises requests.HTTPError on a 4xx/5xx response so an
# error page is never parsed into a silently empty artist list.
response = requests.get(url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.text, 'lxml')

# Names are read from the `content` attribute of every <meta itemprop="name">
# tag. The entry equal to the list heading is skipped (presumably the page's
# own title rather than an artist). Names are lower-cased and stripped.
list_heading = 'The Best Soul Singers/Groups of All Time'
soul_ranker = [
    tag['content'].lower().strip()
    for tag in soup.find_all('meta', {'itemprop': 'name'})
    if tag['content'] != list_heading
]

In [18]:
# Drop duplicate names, then sort. Sorting has to come after set(): a set has
# no defined order, so sorting first was wasted work and left the stored list
# in arbitrary order.
genres['soul'] = sorted(set(soul_ranker))

### Remove duplicates across genres

In [19]:
# Resolve artists listed under two genres. Each rule is (genre that gives the
# artist up, genre that keeps the artist). Rules run strictly in this order,
# and each one reads the lists as left by the rules before it.
overlap_rules = [
    ('country', 'pop'),    # Country & pop (becomes pop)
    ('country', 'rock'),   # Country & rock (becomes rock)
    ('rock', 'metal'),     # Metal & rock (becomes metal)
    ('rock', 'pop'),       # Pop & rock (becomes pop)
    ('pop', 'rap'),        # Pop & rap (becomes rap)
]

for losing_genre, winning_genre in overlap_rules:
    kept = []
    for artist in genres[losing_genre]:
        if artist not in genres[winning_genre]:
            kept.append(artist)
    # The filtered list is stored back in sorted order.
    kept.sort()
    genres[losing_genre] = kept

### Convert *genres* dictionary to JSON

In [20]:
# Serialise the genres mapping to pretty-printed JSON text: keys are emitted
# in sorted order and nested levels are indented by four spaces.
json_genres = json.dumps(
    genres,
    sort_keys=True,
    indent=4,
)

In [21]:
# Write the serialised genres to data/json_genres.json (relative to the
# working directory). The `with` block closes the handle even if write()
# raises. Forward slashes resolve to the same file on Windows; on other
# platforms the old backslash path named a single oddly named file in the
# working directory instead of a file inside data/.
with open('./data/json_genres.json', 'w') as file:
    file.write(json_genres)