In [1]:
# load environmental variables
from dotenv import load_dotenv
load_dotenv()

True

# Scrape Mississippi Studios calendar

In [2]:
# load libraries
import os
import spotipy
import spotipy.util as util

import pickle

import requests
from bs4 import BeautifulSoup
import re
import pandas as pd

import datetime

In [3]:
# scrape calendar page for mississippi
site = "https://www.mississippistudios.com/calendar/"
# no headers
response = requests.get(site)
content = response.content

## Parse the scraped website using BeautifulSoup4

In [4]:
soup = BeautifulSoup(content, 'lxml')

### BROKEN: Save data as a text file
It seems that BeautifulSoup objects cannot be pickled...

Will need to save html from the initial URL request if we want to do this, or alternatively just output the dataframe.

In [None]:
# give today's date to file name
todays_date = datetime.date.today()
filename = str(todays_date) + '_mississippi_studios_cal.txt'
filename

In [None]:
# write data to file
#filename = '2020-01-22_mississippi_studios_cal.txt'
f = open( filename, 'w' )
f.write(soup.prettify())
f.close()

In [None]:
# load a file
filename = '2020-02-18_mississippi_studios_cal.txt'
with open(filename) as f:
    soup = BeautifulSoup(f.read(), 'lxml')

## Parse the soup

In [5]:
days = soup.find_all('td')

dates = []
headliners = []
openers = []
venues = []

for day in days:
    
    events = day.find_all(class_='one-event')
    if events:
            date = day.section.span['title']

    for event in events:
        # record a date
        dates.append(date)
        
        # record venue
        venue = event.find(class_='venue').text
        venues.append(venue)
        
        # get headliner and add to list
        heads = event.find(class_='headliners').text
        headliners.append(heads)
        
        # get openers and add to list 
        supps = event.find_all(class_='supports')
        
        opener_list = []
        for supp in supps:
            opener_list.append(supp.text)

        # format multiple openers to match multiple headliners
        # (to allow vectorized string methods later on)
        opener_string = ' / '.join(opener_list)
        
        openers.append(opener_string)

In [72]:
d = {'headliners': headliners, 'openers': openers, 'date': dates, 'venue': venues}
all_shows = pd.DataFrame(data = d)
all_shows

Unnamed: 0,headliners,openers,date,venue
0,Trout Steak Revival,Left Coast Country,2020-02-19T00:00:00-08:00,Mississippi Studios
1,Dustbowl Revival,Jared & The Mill,2020-02-20T00:00:00-08:00,Mississippi Studios
2,Mike and the Moonpies / Quaker City Night Hawks,,2020-02-21T00:00:00-08:00,Polaris Hall
3,Sexy Pants,Internet Beef / Rap Class,2020-02-21T00:00:00-08:00,Mississippi Studios
4,Dirty Honey - Rolling 7s Tour,The Amazons,2020-02-22T00:00:00-08:00,Polaris Hall
...,...,...,...,...
106,The BellRays / Slim Cessna's Auto Club,,2020-06-18T00:00:00-07:00,Mississippi Studios
107,Bing & Ruth,,2020-06-19T00:00:00-07:00,Mississippi Studios
108,Polaris (from The Adventures of Pete & Pete),,2020-07-08T00:00:00-07:00,Mississippi Studios
109,Archers Of Loaf,,2020-07-09T00:00:00-07:00,Mississippi Studios


## Pickling the `all_shows` dataframe

In [14]:
today = datetime.date.today()
today = today.strftime('%m-%d-%Y')
pickle_name = today + '_all_shows.pickle'
pickle_name

'02-19-2020_all_shows.pickle'

In [15]:
# pickle the dataframe
all_shows.to_pickle(pickle_name)

In [None]:
# load the pickled dataframe
# all_shows = pd.read_pickle('02-19-2020_all_shows.pickle')

## Remove cancelled shows and shows moved to other venues

In [74]:
# convert all strings to lowercase
all_shows['headliners'] = all_shows['headliners'].str.lower()
all_shows['openers'] = all_shows['openers'].str.lower()
all_shows['venue'] = all_shows['venue'].str.lower()

In [75]:
# remove cancelled shows
shows = all_shows[~all_shows['headliners'].str.contains('cancelled')]

In [76]:
# remove shows that have been moved to other venues
shows = shows[~shows['headliners'].str.contains('moved to')]

In [77]:
shows.head()

Unnamed: 0,headliners,openers,date,venue
0,trout steak revival,left coast country,2020-02-19T00:00:00-08:00,mississippi studios
1,dustbowl revival,jared & the mill,2020-02-20T00:00:00-08:00,mississippi studios
2,mike and the moonpies / quaker city night hawks,,2020-02-21T00:00:00-08:00,polaris hall
3,sexy pants,internet beef / rap class,2020-02-21T00:00:00-08:00,mississippi studios
4,dirty honey - rolling 7s tour,the amazons,2020-02-22T00:00:00-08:00,polaris hall


## Combine headliners and openers into a single string

I decided to do this so that I use vectorized string methods on **all** artist listings **at once**. To keep consistent with the Mississippi calendar scheme, I'll use a ` / ` as a seperator.

Alternatively, I could've written my cleaning functions then applied them to a list of `[shows['headliners'], shows['openers']]`.

In [78]:
# create full event lineup as a single string
marquee = []
for index, row in shows.iterrows():
    # if no openers, billing is just headliner(s)
    if not row['openers']:  
        performers = row['headliners']
    # if openers, billing is headliner(s) + opener(s)
    else:
        performers = row['headliners'] + " / " + row['openers']
    
    marquee.append(performers)

In [79]:
# save combined lineup to shows dataframe as a new column 'marquee'
shows['marquee'] = marquee
shows

Unnamed: 0,headliners,openers,date,venue,marquee
0,trout steak revival,left coast country,2020-02-19T00:00:00-08:00,mississippi studios,trout steak revival / left coast country
1,dustbowl revival,jared & the mill,2020-02-20T00:00:00-08:00,mississippi studios,dustbowl revival / jared & the mill
2,mike and the moonpies / quaker city night hawks,,2020-02-21T00:00:00-08:00,polaris hall,mike and the moonpies / quaker city night hawks
3,sexy pants,internet beef / rap class,2020-02-21T00:00:00-08:00,mississippi studios,sexy pants / internet beef / rap class
4,dirty honey - rolling 7s tour,the amazons,2020-02-22T00:00:00-08:00,polaris hall,dirty honey - rolling 7s tour / the amazons
...,...,...,...,...,...
106,the bellrays / slim cessna's auto club,,2020-06-18T00:00:00-07:00,mississippi studios,the bellrays / slim cessna's auto club
107,bing & ruth,,2020-06-19T00:00:00-07:00,mississippi studios,bing & ruth
108,polaris (from the adventures of pete & pete),,2020-07-08T00:00:00-07:00,mississippi studios,polaris (from the adventures of pete & pete)
109,archers of loaf,,2020-07-09T00:00:00-07:00,mississippi studios,archers of loaf


# Cleaning strings

- remove 'an evening with' and 'an evening of'
- remove 'sold out: '
- remove 'solo' and '(solo)'

In [92]:
# playing with the pattern
# unclear why adding a '\b' after 'evening' results in 0 hits
pattern = r"\ban evening (?:of|with) "
shows[shows['marquee'].str.contains(pattern)]['marquee']

28    sold out: jason lytle: an evening of acoustic ...
29    jason lytle: an evening of acoustic piano and ...
50      an evening with caitlin canty with noam pikelny
61                 an evening with steep canyon rangers
73                           an evening with dan mangan
Name: marquee, dtype: object

In [93]:
# remove "an evening of/with " from artist strings
pattern = r"\ban evening (?:of|with) "
shows['marquee'] = [re.sub(pattern, "", each) for each in shows['marquee']]

In [98]:
# remove "sold out: " from artist strings
shows['marquee'] = shows['marquee'].str.replace('sold out: ','')

In [95]:
# remove instances of 'solo' and '(solo)' from artist strings
pattern = r"\b[(]?solo[)]?\b"
shows['marquee'] = [re.sub(pattern, "", each) for each in shows['marquee']]

In [102]:
# remove 'unplugged' from artist strings
pattern = r"\bunplugged\b"
shows['marquee'] = [re.sub(pattern, "", each) for each in shows['marquee']]

## Separating by venue

In [10]:
shows['venue'].value_counts()

mississippi studios    81
polaris hall           26
Name: venue, dtype: int64

There are 81 events at Mississippi Studios, and 26 at Polaris Hall.

In [24]:
# extract events at Mississippi
mississippi_shows = shows[shows['venue'].str.contains('mississippi')].copy()
mississippi_shows

Unnamed: 0,headliners,openers,date,venue
0,trout steak revival,left coast country,2020-02-19T00:00:00-08:00,mississippi studios
1,dustbowl revival,jared & the mill,2020-02-20T00:00:00-08:00,mississippi studios
3,sexy pants,internet beef / rap class,2020-02-21T00:00:00-08:00,mississippi studios
5,dave hause & the mermaid,northcote,2020-02-22T00:00:00-08:00,mississippi studios
6,kandace springs with special guest jimmie herrod,,2020-02-23T00:00:00-08:00,mississippi studios
...,...,...,...,...
106,the bellrays / slim cessna's auto club,,2020-06-18T00:00:00-07:00,mississippi studios
107,bing & ruth,,2020-06-19T00:00:00-07:00,mississippi studios
108,polaris (from the adventures of pete & pete),,2020-07-08T00:00:00-07:00,mississippi studios
109,archers of loaf,,2020-07-09T00:00:00-07:00,mississippi studios


In [25]:
# extract events at Polaris Hall
polaris_shows = shows[shows['venue'].str.contains('polaris')].copy()
polaris_shows

Unnamed: 0,headliners,openers,date,venue
2,mike and the moonpies / quaker city night hawks,,2020-02-21T00:00:00-08:00,polaris hall
4,dirty honey - rolling 7s tour,the amazons,2020-02-22T00:00:00-08:00,polaris hall
13,reptaliens / cones,,2020-03-01T00:00:00-08:00,polaris hall
19,michaela anne,barna howard,2020-03-06T00:00:00-08:00,polaris hall
21,the shivas,máscaras / bad shadows,2020-03-07T00:00:00-08:00,polaris hall
34,jason boland and the stragglers,elaina kay,2020-03-19T00:00:00-07:00,polaris hall
36,old salt union,left coast country,2020-03-20T00:00:00-07:00,polaris hall
38,2020 xray awards,,2020-03-21T00:00:00-07:00,polaris hall
39,2020 xray after party,,2020-03-21T00:00:00-07:00,polaris hall
46,sonny and the sunsets,michael hurley / with special guests the gonks,2020-03-26T00:00:00-07:00,polaris hall


### Creating datetime objects

In [None]:
# convert dates to datetime objects
shows['date'] = [datetime.datetime.fromisoformat(date) for date in shows['date']]

In [None]:
shows

Note: A `timedelta` object represents a duration, the difference between two dates or times.

In [None]:
# convert datetime objects to date objects
shows['date'] = [each.date() for each in shows['date']]
shows

In [None]:
# get today's date as a date object
today_date = datetime.date.today()
# test as if today was March 10 so that "past" events can be filtered out

In [None]:
# calculate how many days from today's date the event is
# convert timedeltas to ints
# create mask to remove shows in the past
days_until_event = [(each - today_date).days for each in date_object]
past_events_mask = [each >= 0 for each in days_until_event]
shows[past_events_mask]

### Sold out shows
Artists playing sold out shows do not need to be removed, as tickets may still be obtainable from third parties.

The string 'sold out', however, should be removed from the string.

In [None]:
# flagging sold out shows
sold_out_shows = shows[shows['headliners'].str.contains('sold out')]
sold_out_shows

In [26]:
sold_out_shows = mississippi_shows[mississippi_shows['marquee'].str.contains('sold out')]
sold_out_shows

KeyError: 'marquee'

It appears that "sold out" is always
- at the beginning of the headline
- followed by ": "
Thus, we can remove the "sold out: " part of the string.

In [16]:
# remove "sold out: " from headliner strings
# shows['headliners'] = shows['headliners'].str.replace('sold out: ','')
mississippi_shows['marquee'] = mississippi_shows['marquee'].str.replace('sold out: ','')

### String cleaning: "an evening with/of"

In [None]:
# use regex to replace evening
evenings = shows[shows['headliners'].str.contains('an evening ')]
evenings

In [17]:
evenings = mississippi_shows[mississippi_shows['marquee'].str.contains('an evening ')]
evenings

Unnamed: 0,headliners,openers,date,venue,marquee
28,sold out: jason lytle: an evening of acoustic ...,,2020-03-13T00:00:00-07:00,mississippi studios,jason lytle: an evening of acoustic piano and ...
29,jason lytle: an evening of acoustic piano and ...,,2020-03-13T00:00:00-07:00,mississippi studios,jason lytle: an evening of acoustic piano and ...
61,an evening with steep canyon rangers,,2020-04-03T00:00:00-07:00,mississippi studios,an evening with steep canyon rangers
73,an evening with dan mangan,,2020-04-14T00:00:00-07:00,mississippi studios,an evening with dan mangan


Note that in the code block below, the `?:` indicates a non-capturing group:

In [None]:
pattern = r"\ban evening (?:of|with) "
shows[shows['headliners'].str.contains(pattern, flags=re.I)]

In [None]:
# remove "an evening of/with " from headliner strings
pattern = r"\ban evening (?:of|with) "
shows['headliners'] = [re.sub(pattern, "", each) for each in shows['headliners']]

In [18]:
# remove "an evening of/with " from headliner strings
pattern = r"\ban evening (?:of|with) "
mississippi_shows['headliners'] = [re.sub(pattern, "", each) for each in mississippi_shows['headliners']]

In [None]:
# lines containing the string 'an evening'
shows[shows['headliners'].str.contains('an evening')]

In [None]:
shows[shows['headliners'].str.contains(':')]['headliners']

In [None]:
shows[shows['headliners'].str.contains(' - ')]['headliners']

### Non-musical events
The page for Raphael Saadiq says: "This event is not a musical performance" on the click-to page, but this is not the case for every non-musical event (e.g. "the comedians following tool on tour: the tour...".

## Problem strings and their natures:

### Special characters and strings
- ` - `
    - bookended by spaces
- `&`
- `:`
    - seems that this always follows 'sold out'
    - not always artist: qualifyer (as in `jason lytle: an evening of acoustic`); sometimes it's the opposite, as in `cameron esposito: save yourself tour`
- `feat.`
- `featuring`
- `tour`
- `moved to...`
- `sold out`
    - always followed by a ':'
- `an evening with/of`
- `early show`, `late show`


- `dirty honey - rolling 7s tour`: the artist is 'dirty honey' and the `- rolling 7s tour` refers to the tour name
    - possible solution: watch to see if '-' characters indicate "extra" info about the event that can be removed for passing a string to the Spotify artist search
    - try splitting string before and after the special character, then searching both and seeing what hits better
    - look for reference to the string "tour" and do something like:
    ```
    if artist.contains('tour'):
        skip
    ```
- `dave hause & the mermaid`: the artist is `dave hause`, the mermaid is extra and does not help search function
    - same as above, but for '&' characters
- `casey neill & the norway rats`: the artist in this case is the full string, including the `&`
- `the mysti krewe of nimbus' mardi gras party feat. too loose cajun`: no results for any way of breaking this up
- `zydeco band`
- `moved to aladdin theater:`


# Spotify

In [108]:
# set API authorization vars and scope
client_id = os.environ.get('SPOTIPY_CLIENT_ID')
client_secret = os.environ.get('SPOTIPY_CLIENT_SECRET')
scope = 'user-library-read playlist-modify-private playlist-modify-public playlist-read-private'
username = '1237403078'

In [114]:
# authenticate
token = util.prompt_for_user_token(
        username=username,
        scope=scope,
        client_id=client_id,
        client_secret=client_secret,
        redirect_uri='http://localhost/')


sp = spotipy.Spotify(auth=token)

In [None]:
# create new playlist
#results = sp.user_playlist_create(username, "Mississippi")

In [None]:
pl_uri = results['uri']

In [None]:
pl_uri

In [21]:
# initialize list of tracks to be added to playlist
add_me = []
# copy headliners series into new variable for ease of typing
hl = shows['headliners'].copy()

In [30]:
miss_marquee = mississippi_shows['marquee']

In [None]:
# split double headliners indicated by " / "
hl = hl.str.split(' / ')
hl

In [31]:
miss_marquee = miss_marque.str.split(' / ')
miss_marquee

0              [trout steak revival, left coast country]
1                   [dustbowl revival, jared & the mill]
3                 [sexy pants, internet beef, rap class]
5                  [dave hause & the mermaid, northcote]
6      [kandace springs with special guest jimmie her...
                             ...                        
102              [the bellrays, slim cessna's auto club]
103                                        [bing & ruth]
104       [polaris (from the adventures of pete & pete)]
105                                    [archers of loaf]
106                                    [archers of loaf]
Name: marquee, Length: 81, dtype: object

In [None]:
type(hl)

In [None]:
# flatten list of lists into single list
hl_flat = [item for sublist in hl for item in sublist]
# convert to Series
hl_series = pd.Series(hl_flat)
hl_series

In [32]:
# flatten list of lists into single list
miss_marquee_flat = [item for sublist in miss_marquee for item in sublist]
# convert to Series
miss_series = pd.Series(miss_marquee_flat)
miss_series

0                               trout steak revival
1                                left coast country
2                                  dustbowl revival
3                                  jared & the mill
4                                        sexy pants
                           ...                     
142                         slim cessna's auto club
143                                     bing & ruth
144    polaris (from the adventures of pete & pete)
145                                 archers of loaf
146                                 archers of loaf
Length: 147, dtype: object

Note: I could just leave duplicates in and filter this later based on Spotify's search results. Might not be a bad idea to double check for duplicate artist URI's once all acquired.

In [None]:
# remove various notations of "early show" and "late show" (which escape duplicates)
pattern = r"\s?-?\s?[(]?(?:early|late) show[)]?"
double_shows = hl_series[hl_series.str.contains(pattern)]
print(double_shows)
hl_series = hl_series.str.replace(pattern, '')

In [33]:
# remove various notations of "early show" and "late show" (which escape duplicates)
pattern = r"\s?-?\s?[(]?(?:early|late) show[)]?"
double_shows = miss_series[miss_series.str.contains(pattern)]
miss_series = miss_series.str.replace(pattern, '')

47     jason lytle: an evening of acoustic piano and ...
48     jason lytle: an evening of acoustic piano and ...
133                         phoebe robinson - early show
134                          phoebe robinson - late show
dtype: object


In [None]:
# remove duplicates from list
hl_no_dups = [] 
[hl_no_dups.append(artist) for artist in hl_series if artist not in hl_no_dups] 
hl_no_dups

In [35]:
# remove duplicates from list
miss_no_dups = [] 
[miss_no_dups.append(artist) for artist in miss_series if artist not in miss_no_dups] 
miss_no_dups

['trout steak revival',
 'left coast country',
 'dustbowl revival',
 'jared & the mill',
 'sexy pants',
 'internet beef',
 'rap class',
 'dave hause & the mermaid',
 'northcote',
 'kandace springs with special guest jimmie herrod',
 "the mysti krewe of nimbus' mardi gras party feat. too loose cajun",
 'zydeco band',
 'a conversation with raphael saadiq',
 'siren and the sea',
 'blossom',
 'snugsworth',
 'slow hollows',
 'fox academy',
 'yacht rock revue unplugged',
 'eyelids',
 "talkin' to johnny",
 'zebra hunt',
 'carsie blanton',
 'jamie drake',
 'hikes',
 'childspeak',
 'lower dens',
 'ami dang',
 'habibi',
 'rudy de anda',
 'shopping',
 'automatic',
 'mope grooves',
 'blackwater holylight',
 'abronia',
 'night swim',
 'the ballroom thieves',
 'harlequin gold',
 'christopher paul stelling',
 'jacob miller',
 'the comedians following tool on tour: the tour featuring rory scovel and nick youssef',
 'advance base',
 'claire cronin',
 'ruth gabrus',
 'casey neill & the norway rats',
 'r

## Searching for Spotify artists, first pass

**The for loop**

In this block of code, I loop through the strings of the `hl_no_dups` list, passing each to the `sp.search` function and passing in `artist` for the `type` parameter, and limiting the number of returned search result hits to 1.

**The conditionals**

If the search produces at least one result, the top hit Artist Object (contained within `results['artists']['items'][0]`) is extracted and assigned to the `artists_found` dictionary, with the `uri` serving as the dictionary key. Even though I only need the artist URI to generate my playlist, I might want to analyze other artist parameters (e.g. popularity) later, so I might as well grab the whole thing for now.

In [None]:
artists_found = {}
not_found = []
artist_uris = []
# first pass:
for artist in hl_no_dups:
    # try searching unmodified string
    print('SEARCHING... ',artist)
    results = sp.search(artist,type='artist',limit=1)
    # if search produces artist hit(s)
    if results['artists']['total'] > 0:
        top_hit = results['artists']['items'][0]  # take top hit
        official_name = top_hit['name']  # extract name (mostly for verbose printing)
        uri = top_hit['uri']  # extract uri to be used as dictionary key
        artists_found[uri] = top_hit  # save top hit to dictionary w/ uri as key
        artist_uris.append(uri)  # save artist uri to list (to keep order)
        print('Found! ', official_name, uri)
    # if artist not found
    elif results['artists']['total'] == 0:
        not_found.append(artist)
        print('XXXXXX NOT found: ',artist)
        
        

In [36]:
artists_found = {}
not_found = []
artist_uris = []
# first pass:
for artist in miss_no_dups:
    # try searching unmodified string
    print('SEARCHING... ',artist)
    results = sp.search(artist,type='artist',limit=2)
    # if search produces artist hit(s)
    if results['artists']['total'] > 0:
        top_hit = results['artists']['items'][0]  # top hit
        second_hit = results['artists']['items'][1]  # second hit
        # check that artist names for first two hits are not exactly the same
        if top_hit['name'] == second_hit['name']:
            print('########## CHECK SECOND HIT ##########')
        official_name = top_hit['name']  # extract name (mostly for verbose printing)
        uri = top_hit['uri']  # extract uri to be used as dictionary key
        artists_found[uri] = top_hit  # save top hit to dictionary w/ uri as key
        artist_uris.append(uri)  # save artist uri to list (to keep order)
        print('Found! ', official_name, uri)
    # if artist not found
    elif results['artists']['total'] == 0:
        not_found.append(artist)
        print('XXXXXX NOT found: ',artist)
        
        

SEARCHING...  trout steak revival
Found!  Trout Steak Revival spotify:artist:7gf4unCQOlGg7UD38XzqPd
SEARCHING...  left coast country
Found!  Left Coast Country spotify:artist:0lrjVsOuOG3lqrhDzsJN6x
SEARCHING...  dustbowl revival
Found!  Dustbowl Revival spotify:artist:3cB0nIIeIGSuMlw6rnu1dm
SEARCHING...  jared & the mill
Found!  Jared & The Mill spotify:artist:0GklSybv01PPje5GlXFq2i
SEARCHING...  sexy pants
XXXXXX NOT found:  sexy pants
SEARCHING...  internet beef
Found!  Internet Beef spotify:artist:1gyvvV8gEt85zp189XdyaO
SEARCHING...  rap class
Found!  Rap Class spotify:artist:73yoLoaMB8o5AvG3vDd498
SEARCHING...  dave hause & the mermaid
XXXXXX NOT found:  dave hause & the mermaid
SEARCHING...  northcote
Found!  Northcote spotify:artist:0238bOScDVbMG0RBP2NguG
SEARCHING...  kandace springs with special guest jimmie herrod
XXXXXX NOT found:  kandace springs with special guest jimmie herrod
SEARCHING...  the mysti krewe of nimbus' mardi gras party feat. too loose cajun
XXXXXX NOT found:

Found!  Post Animal spotify:artist:4iaDWP59Z3e62DW7YWDbIE
SEARCHING...  twen
Found!  Twenty One Pilots spotify:artist:3YQKmKGau1PzlVlkL1iodx
SEARCHING...  acid mothers temple & the melting paraiso u.f.o.
Found!  Acid Mothers Temple & The Melting Paraiso U.F.O. spotify:artist:4lSb7hjm2q4WTwdpjwcar7
SEARCHING...  my education
Found!  My Education spotify:artist:4paz9ZhZMz5YhQEzWfBQnb
SEARCHING...  an evening with dan mangan
XXXXXX NOT found:  an evening with dan mangan
SEARCHING...  ben watt
Found!  Ben Watt spotify:artist:4RIOH6XCUt1Xr8NLUj2u66
SEARCHING...  whitmer thomas
Found!  Whitmer Thomas spotify:artist:6conzu32k403L1Zr3xWTr1
SEARCHING...  u.s. girls
Found!  U.S. Girls spotify:artist:3AHFDfqhSqPBecjQDIOIJA
SEARCHING...  bonjay
Found!  Bonjay spotify:artist:12Jc0PCNtdYxqlBev0EqGJ
SEARCHING...  quasi
Found!  Quasimoto spotify:artist:1rJkz5vopfGxTUGFNB3o4G
SEARCHING...  the cave singers
Found!  The Cave Singers spotify:artist:4SjCvf9Ctuz369ZKAnjkZP
SEARCHING...  somesurprises
Found!

Let's take a look at the official names of the artists we managed to find in Spotify:

In [37]:
for each in artists_found.values():
    print(each['name'])

Trout Steak Revival
Left Coast Country
Dustbowl Revival
Jared & The Mill
Internet Beef
Rap Class
Northcote
The Soileau Zydeco Band
Siren and the Sea
Blossoms
Snugsworth
Slow Hollows
Fox Academy
Eyelids
Zebra Hunt
Carsie Blanton
Jamie Drake
Hikes
Childspeak
Lower Dens
Ami Dang
Habibi
Rudy De Anda
Shopping
Automatic
Mope Grooves
Blackwater Holylight
Abronia
Night Swims
The Ballroom Thieves
Harlequin Gold
Christopher Paul Stelling
Jacob Miller
Advance Base
Claire Cronin
Ruth Garbus
Casey Neill & The Norway Rats
Califone
Marisa Anderson
Blackbird Blackbird
Megan Diana
Evan Thomas Way & The Phasers
Isabeau Waia'u Walker
Roselit Bone
Frazey Ford
Alec Shaw
Gladie
Delicate Steve
Al Lover
Potty Mouth
Sir Babygirl
Ata Kak
Town Mountain
Laney Lou and the Bird Dogs
Good Morning
Vita and the Woolf
Stealing Sheep
Vundabar
Great Grandpa
Dumbo Gets Mad
Sea Moya
Ratboys
Ellise
The Mauskovic Dance Band
Orquestra Pacifico Tropical
Beach Slang
The Aquadolls
Social Animals
Rainbow Girls
Goodnight, Texas
Ez

In [41]:
print(len(miss_no_dups))
print(len(artist_uris))
print(len(not_found))

142
115
27


### Check point: how many artists were recovered?

What was our success rate of finding artists?

In [None]:
found_count = len(hl_no_dups)
missing_count = len(not_found)
total_count = found_count + missing_count
search_success_rate = found_count/(found_count + missing_count)*100
success_string = "A first pass identified Spotify data for {} out of {} headliners, a {:.2f}% success rate."
print(success_string.format(found_count, total_count, search_success_rate))

In [39]:
found_count = len(miss_no_dups)
missing_count = len(not_found)
total_count = found_count + missing_count
search_success_rate = found_count/(found_count + missing_count)*100
success_string = "A first pass identified Spotify data for {} out of {} artists, a {:.2f}% success rate."
print(success_string.format(found_count, total_count, search_success_rate))

A first pass identified Spotify data for 142 out of 169 artists, a 84.02% success rate.


That's pretty good! 

For now, I'll complete the functionality of this script and add 1 song for each of these artists to a list of `tracks_to_add`.

In [None]:
artist_uris[:3]

In [None]:
# pull top tracks 
tracks_to_add = []
for artist_uri in artist_uris:
    # retrieve list of top tracks
    top_tracks = sp.artist_top_tracks(artist_uri)['tracks']
    # find first track in top tracks that is credited to a single artist
    for track in top_tracks:
        if (len(track['artists'])) > 1:
            continue
        else:
            top_track_uri = track['uri']
            break
    tracks_to_add.append(top_track_uri)

tracks_to_add

In [142]:
# debugging / testing
check_artists = ['spotify:artist:4RIOH6XCUt1Xr8NLUj2u66', 'spotify:artist:1a6tqLJPUs4DBAnNUZkr2O']
tracks_to_add = []
for artist_uri in check_artists:
    # retrieve list of top tracks
    top_tracks = sp.artist_top_tracks(artist_uri)['tracks']
    # find first track in top tracks that is credited to a single artist
    # (this avoids remixes and one-offs and hopefully better captures an artist's sound)
    for track in top_tracks:
        if (len(track['artists'])) > 1:
            continue
        else:
            top_track_uri = track['uri']
            print(track['name'])
            break
    tracks_to_add.append(top_track_uri)

tracks_to_add

North Marine Drive
Call The Captain


['spotify:track:23SGoB6pbBXx2qcrM1VEYU',
 'spotify:track:7xsjI11alpcfweV1y75dSs']

In [None]:
# replace all tracks with tracks_to_add
playlist_id = 'spotify:user:1237403078:playlist:1ndO4967sqMKFsMapRUVXe'
user = username
result = sp.user_playlist_replace_tracks(user, playlist_id, tracks = tracks_to_add)

In [None]:

# get top tracks' uris for artist
top_tracks = sp.artist_top_tracks(artist_uri)['tracks']
top_tracks_uri = [track['uri'] for track in top_tracks]
top_tracks_uri
# add top tracks to 'add_me' list
for each in top_tracks_uri[0:2]:
    add_me.append(each)
# once add_me is complete,
# replace all tracks with add_me
playlist_id = pl_uri
user = username
tracks = add_me
result = sp.user_playlist_replace_tracks(user, playlist_id, tracks)


# Debugging

I listened to the playlist and found a few cases that were weird:
1. Steep Canyon Rangers - the track features Steve Martin, who is not on tour with them
2. Ben Watt - featured track "Bright Star - Sunset Mix" is a collab with 2 other artists and sounds nothing like Ben Watt's singer songwriter-y stuff.

Debug strategy: Check URI's for these songs and get track objects. Inside the track object, extract the 'artists' array. If array > 1, try second most popular track.

In [124]:
bright_star_uri = 'spotify:track:0kdli9bGgWufvIgeKtmtF5'
ben_watt_uri = 'spotify:artist:4RIOH6XCUt1Xr8NLUj2u66'
steep_canyon_rangers_uri = 'spotify:artist:1a6tqLJPUs4DBAnNUZkr2O'
check_artists = ['spotify:artist:4RIOH6XCUt1Xr8NLUj2u66', 'spotify:artist:1a6tqLJPUs4DBAnNUZkr2O']

In [122]:
result = sp.track(bright_star_uri)
num_artists_on_track = len(result['artists'])

In [141]:
artist_uri = ben_watt_uri
result = sp.artist_top_tracks(artist_uri)['tracks']

# find first track in top tracks that is only the artist
for each in result:
    if (len(each['artists'])) > 1:
        continue
    else:
        print(each['uri'], '\t', each['name'])
        break

spotify:track:23SGoB6pbBXx2qcrM1VEYU 	 North Marine Drive


In [137]:
for each in result:
    print(each['uri'])

spotify:track:0kdli9bGgWufvIgeKtmtF5
spotify:track:23SGoB6pbBXx2qcrM1VEYU
spotify:track:0FmXimDKUUtcP2aodW828S
spotify:track:1IwTgB0lEWnQWNXkng9wU7
spotify:track:4TbQsImMHpfAI0XzSuWO3d
spotify:track:0h6OiVfh9ICxDfv01XVgO4
spotify:track:3Uzg5xASF1RMBngOgBk3iH
spotify:track:4p843hcUcVM2rZgQvhoZOL
spotify:track:5DtnDC47CQuGiGHVZ92qx0
spotify:track:48YemRTa3pBlnmE1gnXomX


## Additional string cleaning to increase search result success rate

That leaves 24 strings with no Spotify artist results, at least searching as is. We'll need to get into the nitty gritty to address these with a general strategy of:
- identifying patterns that might explain search failure
- manually checking a fair number
- developing a strategy to address edge cases in iterative search passes

In [None]:
not_found

Some of these strings simply do not have a Spotify entry. Take, for example, the boy band "Sexy Pants". This is a real band, but they do not appear to have any Spotify data.

Here's what no results look like:

In [None]:
artist = 'sexy pants'
results = sp.search(artist, type='artist')
results['artists']

Another is Phoebe Robinson, who has a podcast called "Sooo Many White Guys" but does not have a Spotify artist URI.

But some of these like `'stumpfest ix: pallbearer'` contain an artist (in this case, Pallbearer), and some sort of modifier, often followed or preceeded by ':'. In this case, 'Stumpfest IX' refers to a local music festival. Another similar case is `'jason lytle: acoustic piano and guitar'`. As we can see from these two examples, sometimes the artist appears before the ':', and sometimes after.

Here are some other observations of strings we might expect to see regularly on a billing but reduce our search success, as well as strategies to handle them:

string | strategy | example
---|---|---
`unplugged` | remove string | `yacht rock revue unplugged`
`with (special guest)` | search either side of | `kandace springs with special guest jimmie herrod`, `caitlin canty with noam pikelny`
`solo` | remove string | `hiss golden messenger (solo)`
`acoustic` | remove string | `joe henry solo & acoustic`
`&` | search either side of | `dave hause & the mermaid`
`:` | search either side of | `stumfest ix: pallbearer`
`moved to` | skip and remove event - has been relocated | `moved to aladdin theater: beach bunny`
`(...)` | uncertain, see below | `polaris (from the adventures of pete & pete)`
`feat.` or `featuring` | search either side of | "`he mysti krewe of nimbus' mardi gras party feat. too loose cajun`

It's important to remember that for cases where we break the string up ('dave hause & the mermaid'), we might get results for each side, which may be a true collaboration, or simply a title with 1 artist and other stuff (as in the case of 'dave hause & the mermaid'). The best thing to do here may be to flag these cases and manually check them after the program runs each week.

Combinations also exist, such as `'joe henry solo & acoustic'`, which requires us to think about the order of operations we should attempt here.

There are also cases, like `'brent amaker deathsquad'`, where the artist ("brent amaker") is searchable, but the other word(s) ("deathsquad") throw off the search. Another example of this is: `'danny carey trio'` We might want to save these for a "last pass" that involves iteratively searching pairs of words, or perhaps using NLTK to identify "proper nouns" and searching those.

Sometimes, trimming our search by removing certain strings (e.g. text within parentheses) may produce a hit to a different band than intended, as with `polaris (from the adventures of pete & pete)`. The top search hit on Spotify for "Polaris" is psych rock band, not the Polaris billed here. However, using the second half of the search string, perhaps in a general search, would likely turn up the correct artist. I tested this string in the general seach below, and do retrieve a correct album hit, from which I can extract the artist URI.

In [36]:
results = sp.search('adventures of pete & pete')
results

NameError: name 'sp' is not defined

### The Not Founds: Breaking up complex strings and dealing with special characters

In [None]:
# searching strings w/ colons ":"
still_not_found = []
add_to_found = []

for artist in not_found:
    search_terms = []
    
    if ':' in artist:
        print("SEARCHING...:",artist)
        search_terms = artist.split(':')  # split string at colon
        
        for search_str in search_terms: # for each string
            good_match = False
            # search spotify
            results = sp.search(search_str,type='artist')
            # if result is found
            if results['artists']['total'] > 0:
                good_match = True  # indicate a match has been found
                official_name = results['artists']['items'][0]['name']
                add_to_found.append(official_name)
                break
            else:
                print("No results for: ",search_str)
            
        if good_match == True:
            print("FOUND: ", official_name)
        else:
            print("NOT FOUND: ", artist)
            still_not_found.append(artist)

In [None]:
still_not_found

In [None]:
for search_str in not_found:
    print(search_str)
    search_terms = []
    if ':' in artist:
        search_terms = artist.split(':')
    elif ' - ' in artist:
        search_terms = artist.split(' - ')
    else:
        search_terms.append(artist)
    print(search_terms)

In [None]:
not_found = []
found_artists = []

for artist in hl_flat:
    search_str = artist
    results = sp.search(search_str,type='artist')
    if results['artists']['total'] == 0:
        print("Artist not found: ",search_str)
        not_found.append(search_str)
    else:
        print("FOUND: ",results['artists']['items'][0]['name'])
        found_artists.append(results['artists']['items'][0]['name'])

In [None]:
not_found

# Scratch paper

In [None]:
## get artist uri
search_str = 'Incubus'
results = sp.search(search_str,type='artist')
artist_uri = results['artists']['items'][0]['uri']
artist_uri

In [None]:
# inspect search results for artist search
search_results = result['items']

for result in search_results:
    print(result['name'])

In [None]:
# get top tracks' uris for artist
top_tracks = sp.artist_top_tracks(artist_uri)['tracks']
top_tracks_uri = [track['uri'] for track in top_tracks]
top_tracks_uri

In [None]:
# get top tracks' uris for artist
top_tracks = sp.artist_top_tracks(artist_uri)['tracks']
top_tracks_uri = [track['uri'] for track in top_tracks]
top_tracks_uri
# add top tracks to 'add_me' list
for each in top_tracks_uri[0:2]:
    add_me.append(each)

In [None]:
add_me

In [None]:
# get top tracks' uris for artist
top_tracks = sp.artist_top_tracks(artist_uri)['tracks']
top_tracks_uri = [track['uri'] for track in top_tracks]
top_tracks_uri
# add top tracks to 'add_me' list
for each in top_tracks_uri[0:2]:
    add_me.append(each)
# once add_me is complete,
# replace all tracks with add_me
playlist_id = pl_uri
user = username
tracks = add_me
result = sp.user_playlist_replace_tracks(user, playlist_id, tracks)