In [1]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
import pickle
import re
import psycopg2 as pg
from sqlalchemy import create_engine

# In this notebook I grab all of the artists I will be using in my recommender app

I grabbed them from different lists on [ranker.com](https://www.ranker.com/), a crowd-sourced rankings site.

# Grab all rap artists

[Ranker's best rappers of all time](https://www.ranker.com/crowdranked-list/the-greatest-rappers-of-all-time)

In [2]:
rap_url = "https://www.ranker.com/crowdranked-list/the-greatest-rappers-of-all-time"

In [3]:
# Fetch the static HTML of the rap list page. The timeout keeps this cell from
# hanging indefinitely if ranker.com never answers.
response = requests.get(rap_url, timeout=10)
response.status_code

200

In [4]:
page = response.text
type(page)

str

We use Beautiful Soup to scrape the list of rappers.

In [5]:
soup = BeautifulSoup(page, "lxml")

In [6]:
soup.find_all("li", {"class": "gridItem_main__3gWq0"});

Looks like this is a dynamic page and we will need Selenium

In [7]:
# make a function for scraping Ranker.com pages
def get_ranker_list_as_dict(url, driver_path="/Applications/chromedriver"):
    """
    Scrape a Ranker.com list page and return its artists.

    The page is opened in a Selenium-driven Chrome browser, the "LOAD MORE"
    link is clicked until it can no longer be clicked, and the expanded page
    source is then parsed with Beautiful Soup.

    input:
        url         -- the url of a Ranker list page
        driver_path -- path to the chromedriver binary (defaults to the
                       location this notebook was originally run with)
    output:
        a list of dictionaries, one per artist, in page order. Each has the
        keys "name", "picture_url" (None when the row has no image url) and
        "ranker_ranking" (zero-based position of the row on the page).
    """
    # local import: only this function needs the selenium exception type
    from selenium.common.exceptions import WebDriverException

    page_list = []
    driver = webdriver.Chrome(driver_path)

    try:
        # have the driver grab the url
        driver.get(url)
        # let it sleep for page to load
        time.sleep(3)

        more_left = True
        while more_left:
            # while there are more "hidden items" keep clicking load more button
            try:
                # NOTE(review): find_element_by_link_text was removed in
                # Selenium 4.3; use find_element(By.LINK_TEXT, ...) when upgrading.
                driver.find_element_by_link_text("LOAD MORE").click()
            except WebDriverException:
                # the link is gone or cannot be clicked: treat the list as
                # fully loaded. Only selenium errors are caught, so Ctrl-C and
                # programming errors still surface instead of silently
                # truncating the list.
                more_left = False

            # let it sleep to give it time to load next section
            time.sleep(1)

        # grab the page source from the driver
        html = driver.page_source
    finally:
        # always end the browser session, even if scraping failed part-way
        driver.quit()

    # pass the page source to beautiful soup (same parser as the static attempt)
    soup = BeautifulSoup(html, "lxml")

    # now we loop through the page with beautiful soup
    artist_rows = soup.find_all("li", {"class": "gridItem_main__3gWq0"})
    for i, artist in enumerate(artist_rows):
        # grab the artist name from the name element
        name_holder = artist.find("meta", itemprop="name")
        name = name_holder.get("content") if name_holder is not None else None
        if name is None:
            # a row without a name cannot be matched to anything later on;
            # skip it but keep the ranking of the remaining rows unchanged
            continue

        # grab the artist image url; fall back to "data-src" when the tag
        # carries no "src" attribute
        picture_holder = artist.find("img")
        picture_url = None
        if picture_holder is not None:
            picture_url = picture_holder.get("src")
            if picture_url is None:
                picture_url = picture_holder.get("data-src")

        # make a dict for the artist and append to the page list
        page_list.append(
            {"name": name, "picture_url": picture_url, "ranker_ranking": i}
        )

    return page_list

### Run the function to grab all the rap artists

In [8]:
# rapper_list = get_ranker_list_as_dict(rap_url)

In [9]:
# pickle the list of data
# comment out so we dont re run
"""
with open('../artist_data/rapper_list.pkl', 'wb') as f:
    pickle.dump(rapper_list, f)
""";



In [10]:
with open('../artist_data/rapper_list.pkl', 'rb') as f:
    rapper_list = pickle.load(f)

In [11]:
len(rapper_list)

223

# Get All Rap Groups

[Ranker's Best Rap Groups of All Time](https://www.ranker.com/crowdranked-list/overall-best-hip-hop-crew)

In [42]:
rap_group_url = "https://www.ranker.com/crowdranked-list/overall-best-hip-hop-crew"
# rap_group_list = get_ranker_list_as_dict(rap_group_url)

In [43]:
# pickle the list of data
# comment out so we dont re run
# (the path must match the '../artist_data/rap_group_list.pkl' file that is loaded back in)
"""
with open('../artist_data/rap_group_list.pkl', 'wb') as f:
    pickle.dump(rap_group_list, f)
""";



In [12]:
with open('../artist_data/rap_group_list.pkl', 'rb') as f:
    rap_group_list = pickle.load(f)

In [45]:
len(rap_group_list)

123

# Grab Country Artists

[Ranker best country singers of all time](https://www.ranker.com/list/top-country-artists-of-all-time/samantha-dillinger)

In [87]:
country_url = "https://www.ranker.com/list/top-country-artists-of-all-time/samantha-dillinger"

In [88]:
# NOTE(review): unlike the rap, blues and alternative scrape calls in this
# notebook, this call is not commented out, so a full re-run opens Chrome and
# re-scrapes the page. The list is reloaded from its pickle file in a later cell.
country_singer_list=get_ranker_list_as_dict(country_url)

In [92]:
# pickle the list of data
# comment out so we dont re run
"""
with open('../artist_data/country_singer_list.pkl', 'wb') as f:
    pickle.dump(country_singer_list, f)

""";

In [13]:
with open('../artist_data/country_singer_list.pkl', 'rb') as f:
    country_singer_list = pickle.load(f)

In [91]:
len(country_singer_list)

291

# Grab Classic Rock 
[Ranker Top Classic Rock Bands](https://www.ranker.com/crowdranked-list/greatest-classic-rock-bands)

In [102]:
classic_rock_url = "https://www.ranker.com/crowdranked-list/greatest-classic-rock-bands"

In [103]:
# NOTE(review): unlike the rap, blues and alternative scrape calls in this
# notebook, this call is not commented out, so a full re-run opens Chrome and
# re-scrapes the page. The list is reloaded from its pickle file in a later cell.
classic_rock_list=get_ranker_list_as_dict(classic_rock_url)

In [107]:
# pickle the list of data
# comment out so we dont re run
"""
with open('../artist_data/classic_rock_list.pkl', 'wb') as f:
    pickle.dump(classic_rock_list, f)
""";

In [14]:
with open('../artist_data/classic_rock_list.pkl', 'rb') as f:
    classic_rock_list = pickle.load(f)

In [108]:
len(classic_rock_list)

263

# Grab Blues 
[Ranker's best blues artists of all time](https://www.ranker.com/crowdranked-list/the-best-blues-artists-of-all-time?ref=collections_page)

In [56]:
blues_url = "https://www.ranker.com/crowdranked-list/the-best-blues-artists-of-all-time?ref=collections_page"

In [57]:
#blues_list=get_ranker_list_as_dict(blues_url)

In [58]:
# pickle the list of data
# comment out so we dont re run
"""
with open('../artist_data/blues_list.pkl', 'wb') as f:
    pickle.dump(blues_list, f)
""";

In [15]:
with open('../artist_data/blues_list.pkl', 'rb') as f:
    blues_list = pickle.load(f)

In [62]:
len(blues_list)

225

# Grab Alternative
[Rankers Alternative Artists](https://www.ranker.com/list/alternative-bands-and-artists/reference?ref=collections_page)

In [63]:
alternative_url = "https://www.ranker.com/list/alternative-bands-and-artists/reference?ref=collections_page"

In [64]:
#alternative_list=get_ranker_list_as_dict(alternative_url)

In [65]:
# pickle the list of data
# comment out so we dont re run
"""
with open('../artist_data/alternative_list.pkl', 'wb') as f:
    pickle.dump(alternative_list, f)
""";

In [16]:
with open('../artist_data/alternative_list.pkl', 'rb') as f:
    alternative_list = pickle.load(f)

In [67]:
len(alternative_list)

421

# Check for genre overlap and combine genres

In [109]:
# (genre label, scraped artist list) pairs, in the order they get merged.
# Solo rappers and rap groups share the same genre label.
artist_dict_list = [
    ("Rap/Hip-Hop", rapper_list),
    ("Rap/Hip-Hop", rap_group_list),
    ("Country", country_singer_list),
    ("Blues", blues_list),
    ("Alternative", alternative_list),
    ("Classic Rock", classic_rock_list),
]

In [54]:
# Map every artist name to a comma-separated string of the genres whose lists
# it appears in, e.g. "Country, Blues".
artist_genre_dictionary = {}
for genre, artist_list in artist_dict_list:
    for artist in artist_list:
        name = artist["name"]
        if name not in artist_genre_dictionary:
            # first time we see this artist
            artist_genre_dictionary[name] = genre
            continue

        if genre in artist_genre_dictionary[name].split(", "):
            # the artist is already tagged with this genre (two lists share
            # the "Rap/Hip-Hop" label, and a list may repeat a name), so do
            # not append the same label a second time
            continue

        print("adding genre for {}".format(name))
        new_genre = artist_genre_dictionary[name] + ", " + genre
        print("new genre: {}".format(new_genre))
        artist_genre_dictionary[name] = new_genre

adding genre for Ray Charles
new genre: Country, Blues
adding genre for Johnny Cash
new genre: Country, Blues
adding genre for Elvis Presley
new genre: Country, Blues
adding genre for Beastie Boys
new genre: Rap/Hip-Hop, Alternative
adding genre for Gorillaz
new genre: Rap/Hip-Hop, Alternative
adding genre for The Black Keys
new genre: Blues, Alternative
adding genre for Jack White
new genre: Blues, Alternative
adding genre for Amy Winehouse
new genre: Blues, Alternative
adding genre for The Prodigy
new genre: Rap/Hip-Hop, Alternative
adding genre for The Allman Brothers Band
new genre: Blues, Alternative
adding genre for Ryan Adams
new genre: Country, Alternative
adding genre for Old 97's
new genre: Country, Alternative
adding genre for Led Zeppelin
new genre: Alternative, Classic Rock
adding genre for Pink Floyd
new genre: Alternative, Classic Rock
adding genre for The Jimi Hendrix Experience
new genre: Alternative, Classic Rock
adding genre for Eagles
new genre: Country, Classic Rock

In [55]:
# number of distinct artists across all genre lists
len(artist_genre_dictionary)

1486

In [56]:
"""
with open('../artist_data/artist_genre_dictionary.pkl', 'wb') as f:
    pickle.dump(artist_genre_dictionary, f)
""";

In [17]:
with open('../artist_data/artist_genre_dictionary.pkl', 'rb') as f:
    artist_genre_dictionary = pickle.load(f)

# Create DataFrame

To put the data in my database I'm going to make a dataframe and load that in.
Create the dataframe with the columns: artist_name, genre, artist_picture_ranker_url, artist_picture_genius_url, ranker_ranking

I commented this out because I saved the dataframe and don't want to accidentally overwrite it.

In [72]:
# create the df
"""
artist_df = pd.DataFrame(list(artist_genre_dictionary.items()), columns =['artist_name', 'genre']) 
artist_df.head()
"""

In [74]:
# we want to add picture and ranking 

In [2]:
# initialize with nans
"""
artist_df["artist_picture_ranker_url"] = np.nan
artist_df["artist_picture_genius_url"] = np.nan
artist_df["ranker_ranking"] = np.nan
""";

In [3]:
"""
# loop through out artist dicts and add the info to the dataframe
for genre_list in artist_dict_list: 
    for artist in genre_list[1]:
        name = artist["name"]
        image = artist["picture_url"]
        ranking = artist["ranker_ranking"]
        artist_df.loc[artist_df["artist_name"] == name, "artist_picture_ranker_url"] = image
        artist_df.loc[artist_df["artist_name"] == name, "ranker_ranking"] = ranking
""";

In [111]:
artist_df

Unnamed: 0,artist_name,genre,artist_picture_ranker_url,artist_picture_genius_url,ranker_ranking
0,Tupac,Rap/Hip-Hop,https://imgix.ranker.com/user_node_img/115/228...,,0.0
1,The Notorious B.I.G.,Rap/Hip-Hop,https://imgix.ranker.com/user_node_img/111/221...,,1.0
2,Eminem,Rap/Hip-Hop,https://imgix.ranker.com/user_node_img/47/9378...,,2.0
3,Kendrick Lamar,Rap/Hip-Hop,https://imgix.ranker.com/user_node_img/3107/62...,,3.0
4,Dr. Dre,Rap/Hip-Hop,https://imgix.ranker.com/user_node_img/45/8916...,,4.0
...,...,...,...,...,...
1481,Gentle Giant,Classic Rock,https://imgix.ranker.com/node_img/53/1053647/o...,,258.0
1482,Camel,Classic Rock,https://imgix.ranker.com/user_node_img/125/249...,,259.0
1483,Strawberry Alarm Clock,Classic Rock,https://imgix.ranker.com/node_img/106/2104252/...,,260.0
1484,MC5,Classic Rock,https://imgix.ranker.com/user_node_img/78/1550...,,261.0


In [112]:
with open('../artist_data/artist_dataframe.pkl', 'rb') as f:
    artist_df = pickle.load(f)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1486 entries, 0 to 1485
Data columns (total 5 columns):
artist_name                  1486 non-null object
genre                        1486 non-null object
artist_picture_ranker_url    1486 non-null object
artist_picture_genius_url    0 non-null float64
ranker_ranking               1486 non-null float64
dtypes: float64(2), object(3)
memory usage: 58.2+ KB


In [118]:
# True when no artist name appears in more than one row
artist_df["artist_name"].is_unique

True

Every column except `artist_picture_genius_url` (which has not been populated yet) is full and there are no duplicate artists, that's good. Let's move to a database.

In [119]:
"""
with open('../artist_data/artist_dataframe.pkl', 'wb') as f:
    pickle.dump(artist_df, f)
"""

# Create Database

### Put the dataframe into a table in my database

In [18]:
with open('../artist_data/artist_dataframe.pkl', 'rb') as f:
    artist_df = pickle.load(f)

In [20]:
# Postgres info to connect
connection_args = {
    'host': 'localhost',  
    'dbname': 'lyricsdb',  
}

connection = pg.connect(**connection_args)

In [128]:
cursor = connection.cursor()

In [142]:
"""
# write the dataframe to the same lower-case `artists` table that is queried back with SELECT
engine = create_engine('postgresql://localhost:5432/lyricsdb')
artist_df.to_sql('artists', engine)
""";

In [27]:
query = 'SELECT * FROM artists;'

target_df = pd.read_sql(query, connection)

In [28]:
target_df

Unnamed: 0,index,artist_name,genre,artist_picture_ranker_url,artist_picture_genius_url,ranker_ranking
0,0,Tupac,Rap/Hip-Hop,https://imgix.ranker.com/user_node_img/115/228...,,0.0
1,1,The Notorious B.I.G.,Rap/Hip-Hop,https://imgix.ranker.com/user_node_img/111/221...,,1.0
2,2,Eminem,Rap/Hip-Hop,https://imgix.ranker.com/user_node_img/47/9378...,,2.0
3,3,Kendrick Lamar,Rap/Hip-Hop,https://imgix.ranker.com/user_node_img/3107/62...,,3.0
4,4,Dr. Dre,Rap/Hip-Hop,https://imgix.ranker.com/user_node_img/45/8916...,,4.0
...,...,...,...,...,...,...
1481,1481,Gentle Giant,Classic Rock,https://imgix.ranker.com/node_img/53/1053647/o...,,258.0
1482,1482,Camel,Classic Rock,https://imgix.ranker.com/user_node_img/125/249...,,259.0
1483,1483,Strawberry Alarm Clock,Classic Rock,https://imgix.ranker.com/node_img/106/2104252/...,,260.0
1484,1484,MC5,Classic Rock,https://imgix.ranker.com/user_node_img/78/1550...,,261.0


# it works!!!