In [1]:
import pandas
import re
import numpy as np
import urllib
from bs4 import BeautifulSoup as bs
from pprint import pprint
import gc
pandas.set_option('display.max_colwidth', -1)
datadir = '/home/idies/workspace/Storage/raddick/musical_folksonomy/data/firstpass/'
intermediate_datadir = datadir + 'intermediate_data/'

debug = 1

print('imported')

imported


In [2]:
music_df = pandas.read_csv(datadir+'cleaned_audio_data.csv', low_memory=False)
print('read {0:,.0f} rows...'.format(len(music_df)))
music_df = music_df.rename(columns = {'ID': 'id', 'Source': 'tracker_site', 'Sample File Name': 'filename'})
#music_df.groupby('id').size().sort_values(ascending=False) # id is unique
music_df = music_df.set_index('id')
#music_df.sample(3)
music_df[music_df['filename'].apply(lambda x: '.cue' in x)]

print('Step 0: remove that .cue file per Scott on 2018-02-08...')
music_df = music_df[~music_df['filename'].apply(lambda x: '.cue' in x)]
print('Remaining filenames: {0:,.0f}'.format(len(music_df)))

print('Parsing into stem and filetype...')
music_df = music_df.assign(filetype = music_df['filename'].apply(lambda x: x.lower().split('.')[-1].strip()))
music_df = music_df.assign(stem = music_df['filename'].apply(lambda x: x[0:x.rfind('.')]))

print('backing up...')
music_df_bk = music_df
print('Done!')

music_df.sample(3)

read 2,973 rows...
Step 0: remove that .cue file per Scott on 2018-02-08...
Remaining filenames: 2,972
Parsing into stem and filetype...
backing up...
Done!


Unnamed: 0_level_0,tracker_site,filename,filetype,stem
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
587800.html,Etree,wsp2000-04-24s2t02.flac,flac,wsp2000-04-24s2t02
587513.html,Etree,hayley2016-07-06d01t08.flac,flac,hayley2016-07-06d01t08
127214.html,TradersDen,pj1998-09-19d2t06.flac,flac,pj1998-09-19d2t06


In [3]:
print('retrieving from backup...')
music_df = music_df_bk
relict = []
relict.append({'fmt': 'yyyy-mm-dd', 're': re.compile('(19|20)[\d]{2}\-[\d]{2}\-[\d]{2}')})
relict.append({'fmt': 'yy-mm-dd', 're': re.compile('[\d]{2}\-[\d]{2}\-[\d]{2}')})
relict.append({'fmt': 'eightnumbers', 're': re.compile('[\d]{8}')})
relict.append({'fmt': 'otherseps', 're': re.compile('[\d]{1,4}[_,\.\-\s]+[\d]{1,4}[_,\.\-\s]+[\d]{1,4}')})

#relict.append({'fmt': 'hasmos', 're': re.compile('[\d]*[,\.\-\s]*([j|J]an|[f|F]eb|[m|M]ar|[a|A]pr|[m|M]ay|[j|J]un|[j|J]ul|[a|A]ug|[s|S]ep|[o|O]ct|[n|N]ov|[d|D]ec)[a-z]*[\d]*[,\.\-\s]*')})
#re.compile('([j|J]anuary|[f|F]ebruary|[m|M]arch|[a|A]pril|[m|M]ay|[j|J]une|[j|J]uly|[a|A]ugust|[s|S]eptember|[o|O]ctober|[n|N]ovember|[d|D]ecember)[,\.\-\s]*[\d]+[st|nd|rd|th]*[,\.\-\s]*[\d]{2,4}')}

print('thinking...')

df = pandas.DataFrame()
for thisre in relict:
    if (debug > 0):
        print('Assigning dates with format {:}...'.format(thisre['fmt']))
    dfi = pandas.DataFrame(music_df['stem'][music_df['stem'].apply(lambda x: re.search(thisre['re'], x) != None)])
    dfi = dfi.drop(dfi[dfi.index.map(lambda x: x in df.index)].index, axis=0)
    dfi = dfi.assign(date_string = dfi['stem'].apply(lambda x: x[re.search(thisre['re'], x).start():re.search(thisre['re'], x).end()]))
    if (thisre['fmt'] == 'yy-mm-dd'):
        dfi = dfi.assign(thedate = pandas.to_datetime(dfi['date_string'], yearfirst=True, errors='coerce'))
    else:
        dfi = dfi.assign(thedate = pandas.to_datetime(dfi['date_string'], errors='coerce'))
    dfi = dfi[dfi['thedate'].notnull()].assign(date_format = thisre['fmt'])
    df = pandas.concat((df, dfi[dfi['date_format'] == thisre['fmt']]), axis=0)
    
# change some by hand
if (debug > 0):
    print('assigning full dates by hand...')
df.loc['566789.html', ['date_string', 'thedate', 'date_format']] = ['06-14-1980', pandas.to_datetime('1980-06-14'), 'byhand']
df.loc['5178afc019031e66162eb54e11ece20b3a1d03b6', ['date_string', 'thedate', 'date_format']] = ['07-25-1998', pandas.to_datetime('1998-07-25'), 'byhand']
df.loc['567322.html', ['date_string', 'thedate', 'date_format']] = [np.nan, pandas.NaT, 'byhand']
df.loc['565628.html', ['date_string', 'thedate', 'date_format']] = ['8 mar 10', pandas.to_datetime('2010-08-10'), 'byhand']
df.loc['567156.html', ['date_string', 'thedate', 'date_format']] = ['26 apr 14', pandas.to_datetime('2014-04-26'), 'byhand']
df.loc['565970.html', ['date_string', 'thedate', 'date_format']] = ['June 6 2016', pandas.to_datetime('2016-06-06'), 'byhand']
df.loc['565970.html', ['date_string', 'thedate', 'date_format']] = ['160717', pandas.to_datetime('2016-07-17'), 'byhand']
df.loc['567639.html', ['date_string', 'thedate', 'date_format']] = ['75-0103', pandas.to_datetime('1975-01-03'), 'byhand']
df.loc['567578.html', ['date_string', 'thedate', 'date_format']] = ['160708', pandas.to_datetime('2016-07-08'), 'byhand']
df.loc['567872.html', ['date_string', 'thedate', 'date_format']] = ['160730', pandas.to_datetime('2016-07-30'), 'byhand']
df.loc['3c39e32bf0918ec3190ecda08442eba0d8b2faa8', ['date_string', 'thedate', 'date_format']] = ['1997June22', pandas.to_datetime('1997-06-22'), 'byhand']
df.loc['566223.html', ['date_string', 'thedate', 'date_format']] = ['880709', pandas.to_datetime('1988-07-09'), 'byhand']
df.loc['567671.html', ['date_string', 'thedate', 'date_format']] = ['891119', pandas.to_datetime('1989-11-19'), 'byhand']
df.loc['565762.html', ['date_string', 'thedate', 'date_format']] = ['160703', pandas.to_datetime('2016-07-03'), 'byhand']
df.loc['565781.html', ['date_string', 'thedate', 'date_format']] = ['160703', pandas.to_datetime('2016-07-03'), 'byhand']
df.loc['566104.html', ['date_string', 'thedate', 'date_format']] = ['June 25, 2016', pandas.to_datetime('2016-06-25'), 'byhand']
df.loc['566252.html', ['date_string', 'thedate', 'date_format']] = ['July 9th 2016', pandas.to_datetime('2016-07-09'), 'byhand']
df.loc['566783.html', ['date_string', 'thedate', 'date_format']] = ['July 15, 2016', pandas.to_datetime('2016-07-15'), 'byhand']
df.loc['565664.html', ['date_string', 'thedate', 'date_format']] = ['053116', pandas.to_datetime('2016-05-31'), 'byhand']
df.loc['566132.html', ['date_string', 'thedate', 'date_format']] = ['070616', pandas.to_datetime('2016-07-06'), 'byhand']
df.loc['566680.html', ['date_string', 'thedate', 'date_format']] = ['071116', pandas.to_datetime('2007-11-16'), 'byhand']
df.loc['565954.html', ['date_string', 'thedate', 'date_format']] = ['April 29, 2016', pandas.to_datetime('2016-04-29'), 'byhand']
df.loc['587640.html', ['date_string', 'thedate', 'date_format']] = ['7-8-16', pandas.to_datetime('2016-07-08'), 'byhand']
df.loc['565793.html', ['date_string', 'thedate', 'date_format']] = ['6 27 81', pandas.to_datetime('1981-06-27'), 'byhand']
df.loc['566234.html', ['date_string', 'thedate', 'date_format']] = ['18 Nov 79', pandas.to_datetime('1979-11-18'), 'byhand']
df.loc['567140.html', ['date_string', 'thedate', 'date_format']] = ['July 20th 2016', pandas.to_datetime('2016-07-20'), 'byhand']
df.loc['587588.html', ['date_string', 'thedate', 'date_format']] = ['7-10-16', pandas.to_datetime('2016-07-10'), 'byhand']
df.loc['566114.html', ['date_string', 'thedate', 'date_format']] = ['June 25, 2016', pandas.to_datetime('2016-06-25'), 'byhand']
df.loc['565891.html', ['date_string', 'thedate', 'date_format']] = ['Feb18-1989', pandas.to_datetime('1989-02-18'), 'byhand']
df.loc['566219.html', ['date_string', 'thedate', 'date_format']] = ['Feb17-1990', pandas.to_datetime('1990-02-17'), 'byhand']
df.loc['566264.html', ['date_string', 'thedate', 'date_format']] = ['July 10, 2016', pandas.to_datetime('2016-07-10'), 'byhand']
df.loc['567616.html', ['date_string', 'thedate', 'date_format']] = ['1992-Sep-20', pandas.to_datetime('1992-09-20'), 'byhand']
df.loc['567617.html', ['date_string', 'thedate', 'date_format']] = ['1995-Apr-01', pandas.to_datetime('1995-04-01'), 'byhand']
df.loc['567621.html', ['date_string', 'thedate', 'date_format']] = ['1995-Apr-20', pandas.to_datetime('1995-04-20'), 'byhand']
df.loc['567627.html', ['date_string', 'thedate', 'date_format']] = ['1995-Apr-27', pandas.to_datetime('1995-04-27'), 'byhand']
df.loc['567642.html', ['date_string', 'thedate', 'date_format']] = ['1995-Apr-28', pandas.to_datetime('1995-04-28'), 'byhand']
df.loc['126880.html', ['date_string', 'thedate', 'date_format']] = ['88101', pandas.to_datetime('1988-10-01'), 'byhand']
df.loc['567790.html', ['date_string', 'thedate', 'date_format']] = ['2016-07.26', pandas.to_datetime('2016-07-26'), 'byhand']
df.loc['566127.html', ['date_string', 'thedate', 'date_format']] = ['Mar 7 1980', pandas.to_datetime('1980-03-07'), 'byhand']
df.loc['587454.html', ['date_string', 'thedate', 'date_format']] = ['1-30-16', pandas.to_datetime('2016-01-30'), 'byhand']
df.loc['587339.html', ['date_string', 'thedate', 'date_format']] = ['2-19-16', pandas.to_datetime('2016-02-19'), 'byhand']
df.loc['587337.html', ['date_string', 'thedate', 'date_format']] = ['6-3-16', pandas.to_datetime('2016-06-03'), 'byhand']
df.loc['587572.html', ['date_string', 'thedate', 'date_format']] = ['7-9-16', pandas.to_datetime('2016-07-09'), 'byhand']
df.loc['566157.html', ['date_string', 'thedate', 'date_format']] = ['24Nov79', pandas.to_datetime('2016-07-09'), 'byhand']
df.loc['565553.html', ['date_string', 'thedate', 'date_format']] = ['1997-6-27', pandas.to_datetime('1997-06-27'), 'byhand']
df.loc['566148.html', ['date_string', 'thedate', 'date_format']] = ['121106', pandas.to_datetime('2012-11-06'), 'byhand']
df.loc['566331.html', ['date_string', 'thedate', 'date_format']] = ['121106', pandas.to_datetime('2012-11-06'), 'byhand']
df.loc['566657.html', ['date_string', 'thedate', 'date_format']] = ['July 7th 2016', pandas.to_datetime('2016-07-07'), 'byhand']
df.loc['566657.html', ['date_string', 'thedate', 'date_format']] = ['September 2nd 2011', pandas.to_datetime('2011-09-02'), 'byhand']
df.loc['565923.html', ['date_string', 'thedate', 'date_format']] = ['July 5th 1997', pandas.to_datetime('1997-07-05'), 'byhand']
df.loc['587606.html', ['date_string', 'thedate', 'date_format']] = ['2016.Jul.11', pandas.to_datetime('2016-07-11'), 'byhand']
df.loc['566066.html', ['date_string', 'thedate', 'date_format']] = ['July 7, 2016', pandas.to_datetime('2016-07-07'), 'byhand']
df.loc['567376.html', ['date_string', 'thedate', 'date_format']] = ['81775_04', pandas.to_datetime('1975-08-17'), 'byhand']
df.loc['567291.html', ['date_string', 'thedate', 'date_format']] = ['000225', pandas.to_datetime('2000-02-25'), 'byhand']
df.loc['567291.html', ['date_string', 'thedate', 'date_format']] = ['201618', pandas.to_datetime('2016-01-08'), 'byhand']
df.loc['567291.html', ['date_string', 'thedate', 'date_format']] = ['201618', pandas.to_datetime('2016-01-08'), 'byhand']
df.loc['567750.html', ['date_string', 'thedate', 'date_format']] = ['2016_07_23', pandas.to_datetime('2016-07-23'), 'byhand']
df.loc['567513.html', ['date_string', 'thedate', 'date_format']] = ['2016_07_22', pandas.to_datetime('2016-07-22'), 'byhand']
df.loc['565561.html', ['date_string', 'thedate', 'date_format']] = ['6 sept 11', pandas.to_datetime('2011-09-06'), 'byhand']
df.loc['567003.html', ['date_string', 'thedate', 'date_format']] = ['160717', pandas.to_datetime('2016-07-17'), 'byhand']
df.loc['567220.html', ['date_string', 'thedate', 'date_format']] = ['July 2, 2016', pandas.to_datetime('2016-07-02'), 'byhand']
df.loc['567053.html', ['date_string', 'thedate', 'date_format']] = ['1981Aug20', pandas.to_datetime('1981-08-20'), 'byhand']
df.loc['567318.html', ['date_string', 'thedate', 'date_format']] = ['September 2nd 2011', pandas.to_datetime('2011-09-02'), 'byhand']
df.loc['567662.html', ['date_string', 'thedate', 'date_format']] = ['2016_07_23', pandas.to_datetime('2016-07-23'), 'byhand']
df.loc['567649.html', ['date_string', 'thedate', 'date_format']] = ['2016_07_23', pandas.to_datetime('2016-07-23'), 'byhand']
df.loc['567572.html', ['date_string', 'thedate', 'date_format']] = ['2016_07_22', pandas.to_datetime('2016-07-22'), 'byhand']
df.loc['566210.html', ['date_string', 'thedate', 'date_format']] = ['201618', pandas.to_datetime('2016-01-08'), 'byhand']


if (debug > 0):
    print('assigning dates with year only by hand...')
ydf = pandas.DataFrame(data=None, columns=['date_string', 'justyear', 'date_format'])
ydf.loc['567683.html', ['date_string', 'justyear', 'date_format']] = ['2017', 2017, 'justyear']
ydf.loc['567810.html', ['date_string', 'justyear', 'date_format']] = ['2016', 2016, 'justyear']
ydf.loc['567271.html', ['date_string', 'justyear', 'date_format']] = ['2016', 2016, 'justyear']
ydf.loc['567323.html', ['date_string', 'justyear', 'date_format']] = ['2017', 2017, 'justyear']
ydf.loc['c23154fca319e57ed53f15a6a84e87a539312555', ['date_string', 'justyear', 'date_format']] = ['1968-xx-xx', 1968, 'justyear']
ydf.loc['567686.html', ['date_string', 'justyear', 'date_format']] = ['1968-xx-xx', 1968, 'justyear']
ydf.loc['17a8a1fb659a7ccbc3df9d710ca2b1be9504c5c4', ['date_string', 'justyear', 'date_format']] = ['1968-xx-xx', 1968, 'justyear']
ydf.loc['567687.html', ['date_string', 'justyear', 'date_format']] = ['1968-xx-xx', 1968, 'justyear']
ydf.loc['566034.html', ['date_string', 'justyear', 'date_format']] = ['1978-xx-xx', 1978, 'justyear']
ydf.loc['127033.html', ['date_string', 'justyear', 'date_format']] = ['1991-xx-xx', 1991, 'justyear']
ydf.loc['587815.html', ['date_string', 'justyear', 'date_format']] = ['2016', 2016, 'justyear']
ydf.loc['565740.html', ['date_string', 'justyear', 'date_format']] = ['2016', 2016, 'justyear']
ydf.loc['8a7be6c05e405264de580672fbe6619a79955c43', ['date_string', 'justyear', 'date_format']] = ['1975-00-00', 1975, 'justyear']
ydf.loc['jg1975-xx-xx.boswell-smith.flac16', ['date_string', 'justyear', 'date_format']] = ['1975-00-00', 1975, 'justyear']
ydf.loc['566119.html', ['date_string', 'justyear', 'date_format']] = ['2016', 2016, 'justyear']
ydf.loc['127062.html', ['date_string', 'justyear', 'date_format']] = ['2017', 2017, 'justyear']
ydf.loc['e059e8d39da4b75145b1e434b0569df9c8f59367', ['date_string', 'justyear', 'date_format']] = ['95', 1995, 'justyear']

df = df.assign(justyear = np.nan)
df = df.append(ydf, sort=False)

if (debug > 0):
    print('assigning dates with year and month only by hand...')
ymdf = pandas.DataFrame(data=None, columns=['date_string', 'justyear', 'justmonth', 'date_format'])
ymdf.loc['566208.html', ['date_string', 'justyear', 'justmonth', 'date_format']] = ['1972_01', 1972, 1, 'justyearmonth']
ymdf.loc['566814.html', ['date_string', 'justyear', 'justmonth', 'date_format']] = ['2016_01', 2016, 1, 'justyearmonth']
ymdf.loc['566756.html', ['date_string', 'justyear', 'justmonth', 'date_format']] = ['2011_01', 2011, 1, 'justyearmonth']
ymdf.loc['566758.html', ['date_string', 'justyear', 'justmonth', 'date_format']] = ['19910700', 1991, 7, 'justyearmonth']
ymdf.loc['567394.html', ['date_string', 'justyear', 'justmonth', 'date_format']] = ['1986-May', 1986, 5, 'justyearmonth']
ymdf.loc['566506.html', ['date_string', 'justyear', 'justmonth', 'date_format']] = ['2016-07', 2016, 7, 'justyearmonth']
ymdf.loc['587452.html', ['date_string', 'justyear', 'justmonth', 'date_format']] = ['1975-03-xx', 1975, 3, 'justyearmonth']
ymdf.loc['566072.html', ['date_string', 'justyear', 'justmonth', 'date_format']] = ['1985-10-xx', 1985, 10, 'justyearmonth']
ymdf.loc['567186.html', ['date_string', 'justyear', 'justmonth', 'date_format']] = ['1986-06-xx', 1986, 6, 'justyearmonth']
ymdf.loc['567117.html', ['date_string', 'justyear', 'justmonth', 'date_format']] = ['1986-06-xx', 1986, 6, 'justyearmonth']
df = df.assign(justmonth = np.nan)
df = df.append(ymdf, sort=False)

#if (debug > 0):
#    print('assigning date format of nodate to all other stems...')
#nddf = pandas.DataFrame()
#nddf = nddf.assign(stem = music_df['stem'][music_df.index.map(lambda x: x not in df.index.tolist())])
#nddf = nddf.assign(date_format = 'nodate')
#df = df.append(nddf, sort=False)

if (debug > 0):
    print('Joining date parsing info back to main dataframe...')
df.index.name = 'id'
music_df = music_df.join(df[['date_string', 'thedate', 'justyear', 'justmonth', 'date_format']])

print('backing up...')
music_df_bk = music_df

showorder = ['yyyy-mm-dd', 'yy-mm-dd', 'eightnumbers', 'otherseps', 'byhand', 'justyear', 'justyearmonth']
print('\n',df.groupby('date_format').size().reindex(showorder))

print('writing out intermediate data, then dropping...')
df.to_csv(intermediate_datadir + 'date_parsing.csv', encoding='utf-8')
df = df.reset_index().iloc[0:0].drop(df.reset_index().columns.tolist(), axis=1)  # trick to kill a dataframe: https://stackoverflow.com/questions/39173992/drop-all-data-in-a-pandas-dataframe

gc.collect()
print('Done!')

#df[['stem', 'date_string', 'thedate', 'date_format']][df['date_format'] == 'otherseps'].sort_values('thedate')
# there were 1019 unique values to check, then 949, ends at 919...
#music_df['stem'][music_df.index.map(lambda x: x not in df.index.tolist())
#                ].drop_duplicates().sort_values()#[940:]


retrieving from backup...
thinking...
Assigning dates with format yyyy-mm-dd...
Assigning dates with format yy-mm-dd...
Assigning dates with format eightnumbers...
Assigning dates with format otherseps...
assigning full dates by hand...
assigning dates with year only by hand...
assigning dates with year and month only by hand...
Joining date parsing info back to main dataframe...
backing up...

 date_format
yyyy-mm-dd       1175
yy-mm-dd         131 
eightnumbers     97  
otherseps        56  
byhand           63  
justyear         17  
justyearmonth    10  
dtype: int64
writing out intermediate data, then dropping...
Done!


# Parse band strings into bands

## First, get what comes before and after the date

### Functions to parse directly before and after date string

In [4]:
def get_predate(row):
    if (str(row['date_string']) == 'nan'):
        return np.nan
    else:
        return row['stem'][0:row['stem'].find(row['date_string'])]

def get_postdate(row):
    if (str(row['date_string']) == 'nan'):
        return np.nan
    else:
        x = row['stem'][row['stem'].find(row['date_string']):]
        y = x[len(row['date_string']):]
        return y
print('functions created!')


functions created!


In [5]:
print('getting from backup...')
music_df = music_df_bk

startstrip = re.compile('^(\s|\-|_)+')
starts_with_number = re.compile('^[\d]+[\.\-\_\ \(\)\s]*')
starts_with_d_or_t = re.compile('^(d|D|t|T)+[\.\-\(\)\s]*[\d]+[t\d\_\.\s\-]*')
detailsfinder = re.compile('([@]+|\s*\-+\s+|[L|l]ore?ley)+')

if (debug > 0):
    print('clearing out old df...')
else:
    print('thinking...')

if (debug > 0):
    print('parsing around date strings...')
    
df = pandas.DataFrame(data=music_df[['stem', 'date_string']], columns=['stem', 'date_string'])
df = df.assign(predate = df.apply(lambda row: get_predate(row), axis=1))
df = df.assign(postdate = df.apply(lambda row: get_postdate(row), axis=1))

if (debug > 0):
    print('\nThere are date strings in {0:,.0f} filenames...'.format(len(df['date_string'].dropna())))

df = df.assign(postdate_no_predate = 
               df['postdate'][df['predate'] == '']
              )
if (debug > 0):
    print('\tThere are {0:,.0f} filenames with blank predate and real postdate...'.format(len(df['postdate_no_predate'].dropna())))#.drop_duplicates().sort_values())
    
df['postdate_no_predate'][
    (df['postdate_no_predate'].apply(lambda x: re.search(startstrip, str(x)) != None))
].apply(lambda x: x[re.search(startstrip, str(x)).end():].strip()).drop_duplicates().sort_values()

if (debug > 0):
    print('\tStripping startspace from {0:,.0f} unique supposed band names in {1:,.0f} filenames...'.format(
        len(df['postdate_no_predate'][df['postdate_no_predate'].apply(lambda x: re.search(startstrip, str(x)) != None)].drop_duplicates()),
        len(df['postdate_no_predate'][df['postdate_no_predate'].apply(lambda x: re.search(startstrip, str(x)) != None)])
    ))

df = df.assign(postdate_no_predate_stripped = 
               df['postdate_no_predate'][
                   df['postdate_no_predate'].apply(lambda x: re.search(startstrip, str(x)) != None)
               ].apply(lambda x: x[re.search(startstrip, str(x)).end():].strip())
)

df.loc[df['postdate_no_predate'].notnull() & df['postdate_no_predate_stripped'].isnull(), 'postdate_no_predate_stripped'] = df['postdate_no_predate']

df = df.assign(maybe_band_string = 
               df['postdate_no_predate_stripped'][df['postdate_no_predate_stripped'].notnull()]
              )
df.loc[df['maybe_band_string'].isnull(), 'maybe_band_string'] = df['predate']

# Add filenames with no dates
df.loc[df['maybe_band_string'].isnull(), 'maybe_band_string'] = df['stem']

print('\nWe have identified {0:,.0f} unique values of maybe band strings in {1:,.0f} filenames...\n'.format(
    len(df['maybe_band_string'].dropna().drop_duplicates()),
    len(df['maybe_band_string'].dropna())
))

print('backing up...')
df_bk = df

print('done!')




getting from backup...
clearing out old df...
parsing around date strings...

There are date strings in 1,548 filenames...
	There are 49 filenames with blank predate and real postdate...
	Stripping startspace from 37 unique supposed band names in 44 filenames...

We have identified 1,789 unique values of maybe band strings in 2,972 filenames...

backing up...
done!


In [6]:
print('getting from backup...')
df = df_bk
music_df = music_df_bk

# STRIP OFF STARTING NUMBERS...
if (debug > 0):
    print('If the proposed band name starts with number(s) and the like, strip...')
    if (debug > 1):
        print('\tLe regex: {:}'.format(starts_with_number))
    print('\tFound {0:,.0f} such unique strings in {1:,.0f} filenames...'.format(
        len(df['maybe_band_string'][df['maybe_band_string'].apply(lambda x: re.search(starts_with_number, str(x)) != None)].drop_duplicates()),
        len(df['maybe_band_string'][df['maybe_band_string'].apply(lambda x: re.search(starts_with_number, str(x)) != None)])
    ))

df = df.assign(maybe_band_string_no_startnumber = 
               df['maybe_band_string'][
                   df['maybe_band_string'].apply(lambda x: re.search(starts_with_number, str(x)) != None)
               ].apply(lambda x: x[re.search(starts_with_number, x).end():])
              )
df.loc[df['maybe_band_string_no_startnumber'].isnull(), 'maybe_band_string_no_startnumber'] = df['maybe_band_string']


# STRIP OFF STARTING D/T...
if (debug > 0):
    print('If the proposed band name starts with d or t and maybe numbers, strip...')
    if (debug > 1):
        print('\tLe regex: {:}'.format(starts_with_d_or_t))
    print('\tFound {0:,.0f} such unique strings in {1:,.0f} filenames...'.format(
        len(df['maybe_band_string_no_startnumber'][df['maybe_band_string_no_startnumber'].apply(lambda x: re.search(starts_with_d_or_t, str(x)) != None)].drop_duplicates()),
        len(df['maybe_band_string_no_startnumber'][df['maybe_band_string_no_startnumber'].apply(lambda x: re.search(starts_with_d_or_t, str(x)) != None)])
    ))

df = df.assign(maybe_band_string_no_startnumber_no_td = 
               df['maybe_band_string_no_startnumber'][df['maybe_band_string_no_startnumber'].apply(lambda x: re.search(starts_with_d_or_t, str(x)) != None)
                                                     ].apply(lambda x: x[re.search(starts_with_d_or_t, x).end():].strip())
              )
df.loc[df['maybe_band_string_no_startnumber_no_td'].isnull(), 'maybe_band_string_no_startnumber_no_td'] = df['maybe_band_string_no_startnumber']

# STRIP OFF EXTRA DETAIL WEIRDNESS FROM THE END
#loreley = re.compile('[L|l]ore?ley')

if (debug > 0):
    print('Look for possible extra detail after band name or other weirdness...')
    if (debug > 1):
        print('\tLe regex: {:}'.format(detailsfinder))
    print('\tFound {0:,.0f} such unique strings in {1:,.0f} filenames...'.format(
        len(df['maybe_band_string_no_startnumber_no_td'][df['maybe_band_string_no_startnumber_no_td'].apply(lambda x: re.search(detailsfinder, str(x)) != None)].drop_duplicates()),
        len(df['maybe_band_string_no_startnumber_no_td'][df['maybe_band_string_no_startnumber_no_td'].apply(lambda x: re.search(detailsfinder, str(x)) != None)])
    ))

df = df.assign(maybe_band_string_no_startnumber_no_td_no_extras = 
               df['maybe_band_string_no_startnumber_no_td'][df['maybe_band_string_no_startnumber_no_td'].apply(lambda x: re.search(detailsfinder, str(x)) != None)
                                                           ].apply(lambda x: x[:re.search(detailsfinder, str(x)).start()].strip())
              )
df.loc[df['maybe_band_string_no_startnumber_no_td_no_extras'].isnull(), 'maybe_band_string_no_startnumber_no_td_no_extras'] = df['maybe_band_string_no_startnumber_no_td']


endstrip = re.compile('(\s|\-|_|@)+$')

if (debug > 0):
    print('Strip ending spacer characters...')
    if (debug > 1):
        print('\tLe regex: {:}'.format(endstrip))
    print('\tFound {0:,.0f} such unique strings in {1:,.0f} filenames...'.format(
        len(df['maybe_band_string_no_startnumber_no_td_no_extras'][df['maybe_band_string_no_startnumber_no_td_no_extras'].apply(lambda x: re.search(endstrip, str(x)) != None)].drop_duplicates()),
        len(df['maybe_band_string_no_startnumber_no_td_no_extras'][df['maybe_band_string_no_startnumber_no_td_no_extras'].apply(lambda x: re.search(endstrip, str(x)) != None)])
    ))

df = df.assign(maybe_band_string_no_startnumber_no_td_no_extras_no_endspacers = 
               df['maybe_band_string_no_startnumber_no_td_no_extras'][df['maybe_band_string_no_startnumber_no_td_no_extras'].apply(lambda x: re.search(endstrip, str(x)) != None)
                                                                     ].apply(lambda x: x[0:re.search(endstrip, str(x)).start()].strip())
              )
df.loc[df['maybe_band_string_no_startnumber_no_td_no_extras_no_endspacers'].isnull(), 'maybe_band_string_no_startnumber_no_td_no_extras_no_endspacers'] = df['maybe_band_string_no_startnumber_no_td_no_extras']


wordtrack = re.compile('^([T|t]rack)+[\.\-\(\)\s]*[\d]+')

df = df.assign(maybe_band_string_no_startnumber_no_td_no_extras_no_endspacers_notrack = df['maybe_band_string_no_startnumber_no_td_no_extras_no_endspacers'][
    df['maybe_band_string_no_startnumber_no_td_no_extras_no_endspacers'].apply(lambda x: re.search(wordtrack, x) != None)
].apply(lambda x: (x[0:re.search(wordtrack, x).start()]+x[re.search(wordtrack, x).end():]).strip())
              )
df.loc[df['maybe_band_string_no_startnumber_no_td_no_extras_no_endspacers_notrack'].isnull(), 'maybe_band_string_no_startnumber_no_td_no_extras_no_endspacers_notrack'] = df['maybe_band_string_no_startnumber_no_td_no_extras_no_endspacers']


df = df.assign(byhand = np.nan)
if (debug > 0):
    print('Doing stuff by hand...')

# starts with 807 unique values to look through
df.loc['567003.html', 'byhand'] = ''
df.loc['567194.html', 'byhand'] = 'ACDC'
df.loc['566252.html', 'byhand'] = 'Casting Crowns'
df.loc['567047.html', 'byhand'] = 'Charlie Cunningham'
df.loc['566895.html', 'byhand'] = 'David Mallett Positano'
df.loc['567771.html', 'byhand'] = 'DayWave'
df.loc['c2b7c10aa42aa6f55c8537ece72ed5726ae6b1b3', 'byhand'] = 'DeadCo'
df.loc['566278.html', 'byhand'] = 'Dizzy Mizz Lizzy'
df.loc['567146.html', 'byhand'] = 'Dust Bowl'
df.loc['567271.html', 'byhand'] = 'Editors'
df.loc['566864.html', 'byhand'] = 'Frau, ich freu mich'
df.loc['567469.html', 'byhand'] = 'Front'
df.loc['565793.html', 'byhand'] = 'GORDON GATTON'
df.loc['566951.html', 'byhand'] = "Goin' Down"
df.loc['565793.html', 'byhand'] = 'How Many More Years'
df.loc['566814.html', 'byhand'] = 'JMJarre'
df.loc['566526.html', 'byhand'] = 'Jam'
df.loc['567140.html', 'byhand'] = 'KISS'
df.loc['566136.html', 'byhand'] = 'Karma To Burn'
df.loc['566798.html', 'byhand'] = 'Keith Jarrett'
df.loc['565529.html', 'byhand'] = 'Keith Jarrett'
df.loc['566810.html', 'byhand'] = 'Lucie Vitkova'
df.loc['566756.html', 'byhand'] = 'Marillion'
df.loc['567879.html', 'byhand'] = 'ModernBaseball'
df.loc['566106.html', 'byhand'] = 'Sally Barker'
df.loc['565750.html', 'byhand'] = 'Mutemath'
df.loc['567058.html', 'byhand'] = 'Never Give All Your Heart'
df.loc['567454.html', 'byhand'] = 'No Good Place for the Lonely'
df.loc['567592.html', 'byhand'] = 'OP'
df.loc['566219.html', 'byhand'] = 'PDP'
df.loc['567323.html', 'byhand'] = 'Peter Gabriel'
df.loc['587888.html', 'byhand'] = 'Phish'
df.loc['567639.html', 'byhand'] = 'RRK'
df.loc['566017.html', 'byhand'] = 'Joe Jackson'
df.loc['567758.html', 'byhand'] = 'Radiohead'
df.loc['567760.html', 'byhand'] = 'Radiohead'
df.loc['565628.html', 'byhand'] = 'Red Is The Rose'
df.loc['567795.html', 'byhand'] = 'Rob Brown Quartet'
df.loc['127033.html', 'byhand'] = 'Humble Pie'
df.loc['566724.html', 'byhand'] = 'Slow Train'
df.loc['566879.html', 'byhand'] = 'Slow Train'
df.loc['567156.html', 'byhand'] = 'The Finish Line'
df.loc['566127.html', 'byhand'] = 'The Jam'
df.loc['566155.html', 'byhand'] = 'The Jam'
df.loc['565852.html', 'byhand'] = 'The Obsessed'
df.loc['566119.html', 'byhand'] = 'Iggy Pop'
df.loc['e059e8d39da4b75145b1e434b0569df9c8f59367', 'byhand'] = 'The Ramones'
df.loc['565970.html', 'byhand'] = 'The Waterboys'
df.loc['566231.html', 'byhand'] = 'The_Jam'
df.loc['566157.html', 'byhand'] = 'The_Jam'
df.loc['566144.html', 'byhand'] = 'The_Jam'
df.loc['567566.html', 'byhand'] = 'Tiger in Your Tank'
df.loc['565958.html', 'byhand'] = 'Twenty One Pilots'
df.loc['587769.html', 'byhand'] = 'WSP'
df.loc['566150.html', 'byhand'] = 'Waterboys'
df.loc['566294.html', 'byhand'] = 'Wayne Shorter Quartet'
df.loc['567220.html', 'byhand'] = "Where It's At - Good Times - China Girl - Pocket Calculator - 1999"
df.loc['567368.html', 'byhand'] = 'Woke Up Dreaming'
df.loc['567336.html', 'byhand'] = 'YLT'
df.loc['567438.html', 'byhand'] = 'colt silvers orchstral'
df.loc['565740.html', 'byhand'] = 'davidosky'
df.loc['565934.html', 'byhand'] = 'l-ficher-g-baton'
df.loc['567466.html', 'byhand'] = 'sales'
df.loc['565959.html', 'byhand'] = 'sideshow'

#found in round 2
df.loc['567822.html', 'byhand'] = ''
df.loc['566375.html', 'byhand'] = 'Barry Melton &amp; The Green Ray'
df.loc['3c39e32bf0918ec3190ecda08442eba0d8b2faa8', 'byhand'] = 'BeauSoliel'
df.loc['567813.html', 'byhand'] = ''
df.loc['567240.html', 'byhand'] = 'Boris'
df.loc['587570.html', 'byhand'] = 'BruceHornsby'
df.loc['567534.html', 'byhand'] = ''
df.loc['6fb667c4dedf80bdb7417c9e4ccdf097baa47f34', 'byhand'] = 'CCR'
df.loc['565748.html', 'byhand'] = 'Clawhammer'
df.loc['567442.html', 'byhand'] = 'Wayne Shorter Quartet'
df.loc['566147.html', 'byhand'] = 'Drinks'
df.loc['587360.html', 'byhand'] = 'GSBG'
df.loc['567748.html', 'byhand'] = 'How Many More Years'
df.loc['566546.html', 'byhand'] = 'Jam'
df.loc['566890.html', 'byhand'] = 'Jam'
df.loc['565561.html', 'byhand'] = 'Jimmy Mo Mhile'
df.loc['565752.html', 'byhand'] = ''
df.loc['565900.html', 'byhand'] = ''
df.loc['565900.html', 'byhand'] = 'Radiohead'
df.loc['567775.html', 'byhand'] = 'Savages'
df.loc['567578.html', 'byhand'] = 'Trembling Bells'
df.loc['566973.html', 'byhand'] = 'chess smith trio'
df.loc['e0ebb4678c72816bdcd690e00210885d5b2be043', 'byhand'] = 'jlyys'
df.loc['8f71073ff291fa22657e549bbdff1363ec350a32', 'byhand'] = 'lrb'
df.loc['566695.html', 'byhand'] = 'lrb'
df.loc['566103.html', 'byhand'] = 'neil young'
df.loc['566578.html', 'byhand'] = ''
df.loc['126968.html', 'byhand'] = ''
df.loc['7d66aee7bb5139e86af56f8bb6045f020027bd05', 'byhand'] = 'shiny'
df.loc['0adf1d1ecfe11bf43ccc6140a01d0f0487dd9c9e', 'byhand'] = 'maniacs'

# round 2: still 781 to look through
df.loc['567064.html', 'byhand'] = 'Radiohead'
df.loc['587786.html', 'byhand'] = 'ph'
df.loc['566329.html', 'byhand'] = 'Ministry'
df.loc['565531.html', 'byhand'] = "Maggie's Farm"
df.loc['565891.html', 'byhand'] = 'Nick Cave'
df.loc['567402.html', 'byhand'] = 'Nick Cave'
df.loc['567810.html', 'byhand'] = 'Matthew Shipp'
df.loc['566374.html', 'byhand'] = 'tr'

# round 3: added those from files with no date string
df.loc['565969.html', 'byhand'] = 'Enter Stage'
df.loc[['565558.html', '567162.html'], 'byhand'] = ''
df.loc['566317.html', 'byhand'] = 'Rainy Day Women #12 &amp; 35'
df.loc['566525.html', 'byhand'] = 'Stratovarius'
df.loc['566563.html', 'byhand'] = 'Give Me What I Want And Give It'
df.loc['567333.html', 'byhand'] = 'Silver Machine and Waiting For The Man'
df.loc['127094.html', 'byhand'] = 'Slip Away'
df.loc['566773.html', 'byhand'] = '10CC'
df.loc['567222.html', 'byhand'] = 'Heroes'
df.loc['567222.html', 'byhand'] = "Theme-Esther's Nose Job"
df.loc['1c5dc8fe10c17dcf42a786c638d6107939b81ef1', 'byhand'] = 'Rocket'
df.loc['567110.html', 'byhand'] = "11 O'clock tick tock"
df.loc['566581.html', 'byhand'] = 'Elvin'
df.loc[['565997.html', '567087.html'], 'byhand'] = ''
df.loc['566050.html', 'byhand'] = 'Black Sabbath'
df.loc['567198.html', 'byhand'] = 'Stay Hungry'
df.loc['567153.html', 'byhand'] = "What You Don't Know"
df.loc['566754.html', 'byhand'] = 'AH All Stars'
df.loc['567521.html', 'byhand'] = 'After The Gold Rush'
df.loc['567524.html', 'byhand'] = 'After The Gold Rush'
df.loc['567504.html', 'byhand'] = 'After The Gold Rush'
#df.loc['566920.html', 'byhand'] = 'Apex Predator & Easy Meat'
df.loc['13ee4f3cdfe96b0a6892052b9224df0751ebb3be', 'byhand'] = 'AudioTrack'
df.loc['565549.html', 'byhand'] = 'BC'
df.loc['5a3b55bc30d8dabb1023d442759c34dd70edebb1', 'byhand'] = 'BR'
df.loc['567391.html', 'byhand'] = 'Bell'
df.loc['566929.html', 'byhand'] = 'Bimhuis'
df.loc['566762.html', 'byhand'] = 'BonaStadt'
df.loc[['566967.html', '566991.html'], 'byhand'] = 'Boston'
df.loc['567647.html', 'byhand'] = 'Breathe'
df.loc['566727.html', 'byhand'] = 'Buben'
df.loc['566572.html', 'byhand'] = 'Bul'
df.loc[['566105.html', '127037.html', '127043.html', '127074.html', '127069.html', '127073.html', '127048.html', '566496.html'], 'byhand'] = ''
df.loc['565791.html', 'byhand'] = 'Life on Mars'
df.loc['566867.html', 'byhand'] = 'Comfortably Numb'
df.loc['567361.html', 'byhand'] = 'Coral Creek'
df.loc['565859.html', 'byhand'] = 'Corb Lund'
df.loc['567161.html', 'byhand'] = 'Cue'
df.loc[['565674.html', '565541.html', '567697.html']] = ''
df.loc['567599.html', 'byhand'] = 'Dialogue'
df.loc[['126997.html', '566779.html', '567042.html'], 'byhand'] = ''
df.loc['566903.html', 'byhand'] = 'Dixie Chicks'
df.loc['d1ca063ad51903c84abffda9efea7e2f71de3a4f', 'byhand'] = 'All By Myself'
df.loc['566886.html', 'byhand'] = 'Drones'
df.loc['567208.html', 'byhand'] = 'Drumming'
df.loc['567139.html', 'byhand'] = 'DWZappa'
df.loc['127228.html', 'byhand'] = 'Facelift-Slightly All The Time-Moon In June'
df.loc['566421.html', 'byhand'] = 'Fame'
df.loc['566214.html', 'byhand'] = 'Fashion'
df.loc['567163.html', 'byhand'] = 'FatalitÃ'
df.loc['566175.html', 'byhand'] = 'Fay Hield'
df.loc['566130.html', 'byhand'] = 'Fireworks Medley'
df.loc['566498.html', 'byhand'] = 'Follow Me Through'
df.loc['567234.html', 'byhand'] = 'GNR'
df.loc['566728.html', 'byhand'] = 'GS'
df.loc['565884.html', 'byhand'] = 'Gershwin'
df.loc['567346.html', 'byhand'] = 'Giant'
df.loc['567207.html', 'byhand'] = 'Gregory Porter'
df.loc['567358.html', 'byhand'] = 'HIM'
df.loc['566930.html', 'byhand'] = 'Carla Bley and Steve Swallow'
df.loc['566932.html', 'byhand'] = 'Ches Smith'
df.loc[['567867.html', '566753.html'], 'byhand'] = 'JM'
df.loc['566850.html', 'byhand'] = 'Kate Rusby'
df.loc['566351.html', 'byhand'] = 'Leaping Dance'
df.loc['567202.html', 'byhand'] = 'LK'
df.loc['567734.html', 'byhand'] = 'Linda Thompson'
df.loc[['566170.html', '566743.html'], 'byhand'] = 'MP'
df.loc['566896.html', 'byhand'] = 'Macca'
df.loc['567381.html', 'byhand'] = 'Marker'
df.loc[['566377.html', '567068.html']] = 'Marque'
df.loc['566093.html', 'byhand'] = 'Marriage tree'
df.loc['565938.html', 'byhand'] = 'Matrix'
df.loc['566731.html', 'byhand'] = 'Mr. Ray'
df.loc[['f808ec4b46eb140fca6b9b73f6cbff76ee0e6641', '127023.html'], 'byhand'] = 'Neil Young'
df.loc['565714.html'] = 'Ozzy'
df.loc['565684.html', 'byhand'] = 'PM'
df.loc['567203.html', 'byhand'] = 'Pianotrio'
df.loc[['567364.html', '565826.html'], 'byhand'] = 'R'
df.loc['566560.html', 'byhand'] = 'RF'
df.loc['566844.html', 'byhand'] = 'RTBand'
df.loc['566820.html', 'byhand'] = 'Redford'
df.loc['ebb2e1757bdb2a01716acefca2697fd718d9a8c1', 'byhand'] = 'Rush'
df.loc['567204.html', 'byhand'] = 'SO'
df.loc[['567700.html', '565675.html', '565583.html'], 'byhand'] = 'STE'
df.loc['566808.html', 'byhand'] = 'Seal my Fate'
df.loc['565585.html', 'byhand'] = 'Shoot A Hole Into The Sun'
df.loc['87ec619615924cd0858add14592b79bc0d4a2fad', 'byhand'] = ''
df.loc['126807.html', 'byhand'] = 'Spur'
df.loc['567417.html', 'byhand'] = 'Strauss'
df.loc['565676.html', 'byhand'] = 'Sweet Liberties'
df.loc['587838.html', 'byhand'] = 'T.Flyer'
df.loc['567570.html', 'byhand'] = 'INTRO'
df.loc[['567001.html', '127042.html'], 'byhand'] = ''
df.loc['567704.html', 'byhand'] = 'WE WISH YOU WELL'
df.loc['566882.html', 'byhand'] = 'The_Killer_Is_Me'
df.loc['566320.html', 'byhand'] = 'TTB'
df.loc['ff3fe93793664814e48139fbd51022e73347ec86', 'byhand'] = 'The Ramones'
df.loc['566981.html', 'byhand'] = 'The Seventh Seal'
df.loc[['566550.html', '566557.html'], 'byhand'] = 'Thick As A Brick'
df.loc[['566232.html', '565848.html', '566936.html'], 'byhand'] = ''
df.loc[['566091.html', '566092.html', '566338.html', '566339.html', '566842.html', '566843.html', '567288.html', '567290.html', '567418.html', '567419.html', '126837.html', '126839.html', '126840.html', '126841.html', '126842.html', '126843.html', '126894.html', '126895.html', '127003.html', '127004.html', '127015.html', '127016.html', '127105.html', '127106.html', '127127.html', '127128.html'], 'byhand'] = ''
df.loc['567309.html', 'byhand'] = 'U2'
df.loc[['566801.html', '566806.html', '567759.html', '567768.html', '567783.html', '567844.html', '567846.html'], 'byhand'] = 'Unknown Artist'
df.loc[['565654.html', '565896.html'], 'byhand'] = 'Whitesnake'
df.loc['565901.html', 'byhand'] = 'Whitford'
df.loc['566654.html', 'byhand'] = 'bp'
df.loc[['565591.html', '565972.html', '566117.html', '566333.html', '566456.html', '566552.html', '566623.html', '566904.html', '567032.html', '567106.html', '567175.html', '567806.html', '566345.html', '126936.html'], 'byhand'] = ''
df.loc['567366.html', 'byhand'] = 'costbrist'
df.loc[['567693.html', '567589.html', '566901.html'], 'byhand'] = ''
df.loc['566485.html', 'byhand'] = 'dori theme'
df.loc['567487.html', 'byhand'] = 'flying over the odra valley'
df.loc['566417.html', 'byhand'] = 'interiors'
df.loc['565922.html', 'byhand'] = 'kyrie aus der messe in h-moll'
df.loc['566636.html', 'byhand'] = 'little liza jane'
df.loc['567509.html', 'byhand'] = 'love ship'
df.loc['587704.html', 'byhand'] = ''
df.loc['566982.html', 'byhand'] = 'minor mood'
df.loc['566604.html', 'byhand'] = ''
df.loc['567048.html', 'byhand'] = 'nacht'
df.loc['565929.html', 'byhand'] = 'prometheus'
df.loc['565606.html', 'byhand'] = 'prrp'
df.loc['565981.html', 'byhand'] = 'scollin'
df.loc['edf25ec0ac4fd08874c7627bc527420091ef1747', 'byhand'] = 'intro'
df.loc['567526.html', 'byhand'] = 'sly eyes'
df.loc['567431.html', 'byhand'] = 'the three marias'
df.loc['567478.html', 'byhand'] = 'these are the god days'
df.loc['565966.html', 'byhand'] = 'tribute to keith jarrett'
df.loc['566178.html', 'byhand'] = 'unbenannt'
df.loc['567479.html', 'byhand'] = 'unknown title'
df.loc['ddbaea0b694a4c0deaa8f75156446b8e7adea2d6', 'byhand'] = 'JG MS'
df.loc['567404.html', 'byhand'] = 'JOEJ'
df.loc['567681.html', 'byhand'] = 'JWalsh'
df.loc['565764.html', 'byhand'] = 'Jackson Browne'
df.loc['566933.html', 'byhand'] = 'Mahler'
df.loc['567338.html', 'byhand'] = 'Reptile'
df.loc['566989.html', 'byhand'] = 'Richard Shindell'
df.loc['565694.html', 'byhand'] = 'Ross Miller'
df.loc['566261.html', 'byhand'] = 'Scythia'
df.loc['566350.html', 'byhand'] = 'Ten secret doors'
df.loc['567731.html', 'byhand'] = 'The Brew'
df.loc['566280.html', 'byhand'] = 'Thunderstone'
df.loc['566059.html', 'byhand'] = 'SpunkRock'
df.loc['567062.html', 'byhand'] = 'as'
df.loc[['566707.html', '567167.html', '567505.html'], 'byhand'] = 'ff'
df.loc['566210.html', 'byhand'] = 'ja'
df.loc['566594.html', 'byhand'] = 'kv'
df.loc['587748.html', 'byhand'] = 'mcooleyt'
df.loc['566266.html', 'byhand'] = 'ode to satie'

if (debug > 0):
    print('Edit some values by hand...')
    print('\tFound {0:,.0f} such unique strings in {1:,.0f} filenames...'.format(
        len(df['byhand'].dropna().drop_duplicates()),
        len(df['byhand'].dropna())
    ))
    
df.loc[(df['byhand'].isnull()), 'byhand'] = df['maybe_band_string_no_startnumber_no_td_no_extras_no_endspacers_notrack']

df = df.assign(band_string = np.nan)
df.loc[(df['band_string'].isnull()), 'band_string'] = df['byhand']

df = df.assign(band_string = np.nan)
df.loc[(df['band_string'].isnull()), 'band_string'] = df['byhand']

# round 3: now back to 806 for some reason
print('\nEnding with {0:,.0f} unique band strings encompassing {1:,.0f} filenames!'.format(
    len(df['band_string'].drop_duplicates()),
    len(df['band_string'])
))

if (debug > 0):
    print('\nAssigning band_string_right_before_date_string = True to {0:,.0f} filenames...'.format(
        len(df[df['predate'] == df['band_string']])
    ))
    
df['band_string_right_before_date_string'] = False
df.loc[df['predate'] == df['band_string'], 'band_string_right_before_date_string'] = True
df.loc[df['predate'].isnull(), 'band_string_right_before_date_string'] = False

if (debug > 0):
    print('joining...')
music_df = music_df.join(df[['band_string', 'band_string_right_before_date_string']], how='left')

print('backing up...')
music_df_bk = music_df

print('writing out intermediate data, then dropping...')
df.to_csv(intermediate_datadir + 'band_string_parsing.csv', encoding='utf-8')
df = df.reset_index().iloc[0:0].drop(df.reset_index().columns.tolist(), axis=1)  # trick to kill a dataframe: https://stackoverflow.com/questions/39173992/drop-all-data-in-a-pandas-dataframe
gc.collect()

print('done')
#df['band_string'].drop_duplicates().sort_values()#[1520:]   # DONE UP TO 500!!!!!!!



getting from backup...
If the proposed band name starts with number(s) and the like, strip...
	Found 775 such unique strings in 1,088 filenames...
If the proposed band name starts with d or t and maybe numbers, strip...
	Found 47 such unique strings in 63 filenames...
Look for possible extra detail after band name or other weirdness...
	Found 131 such unique strings in 161 filenames...
Strip ending spacer characters...
	Found 348 such unique strings in 430 filenames...
Doing stuff by hand...
Edit some values by hand...
	Found 210 such unique strings in 320 filenames...

Ending with 1,506 unique band strings encompassing 2,972 filenames!

Assigning band_string_right_before_date_string = True to 1,009 filenames...
joining...
backing up...
writing out intermediate data, then dropping...
done


## Get list of official band abbreviations

1. Get from etree
2. Rotate into a guide file
3. Get archive.org entries also
4. Combine into one guide file

In [7]:
print('reading etree abbreviations...')
abbrev_df = pandas.read_csv('data/abbreviations.csv')
abbrev_df = abbrev_df.fillna('')
abbrev_df = abbrev_df.set_index('rownumber')

canonical_abbrevs = abbrev_df['abbrev'][abbrev_df['abbrev'] != ''].drop_duplicates().tolist()
#canonical_abbrevs

# Get variant abbreviations
variant_abbrevs = []
for x in abbrev_df['otherabbrev'].drop_duplicates().astype('str'):
    if (',' in x):
        for y in x.split(','):
            variant_abbrevs.append(y.strip())
    else:
        if (x.strip() != ''):
            variant_abbrevs.append(x.strip())

# Fix wilco by moving it from variant to canonical
canonical_abbrevs.append('wilco')
variant_abbrevs.remove('wilco')

print('Parsed {0:,.0f} canonical abbreviations and {1:,.0f} variant abbreviations from etree...'.format(len(canonical_abbrevs), len(variant_abbrevs)))
abbrev_df.sample(3)


# ROTATE ABBREVIATIONS INTO GUIDE FILE
print('rotating etree abbreviations into guide file...')
guide_df = pandas.DataFrame(data=abbrev_df['abbrev'][abbrev_df['abbrev'] != ''].drop_duplicates(), columns=['abbrev'])
guide_df = guide_df.merge(abbrev_df, how='left', on='abbrev')
guide_df = guide_df[['abbrev', 'name']]
guide_df.loc[guide_df['abbrev'] == '(see fb below)', 'abbrev'] = 'claypool'
guide_df = guide_df[guide_df['name'] != 'Willy Porter Band']

guide_df = guide_df.assign(canonical = True)
guide_df = guide_df.set_index('abbrev')
guide_df = guide_df.sort_index()
#guide_df

#for thisabbrev in variant_abbrevs:
#    print(thisabbrev, abbrev_df['name'][abbrev_df['otherabbrev'].apply(lambda x: thisabbrev in x)].values)
for thisrow in abbrev_df['otherabbrev'][abbrev_df['otherabbrev'] != ''].drop_duplicates().tolist():
    for thisabbrev in thisrow.split(','):
        thisabbrev = thisabbrev.strip()
        thisband = abbrev_df['name'][abbrev_df['otherabbrev'].apply(lambda x: thisabbrev in x)].values[0]
        if thisabbrev not in guide_df.index.values:
            guide_df.loc[thisabbrev] = [thisband, False]

guide_df = guide_df.sort_index()

### change index value of kdub
indexaslist = guide_df.index.tolist()
kdubvalue = indexaslist.index('kdub (for k-double-u)')
indexaslist[kdubvalue] = 'kdub'
stringcheesevalue = indexaslist.index('keller williams w- string cheese incident')
indexaslist[stringcheesevalue] = 'kwi'
guide_df.index = indexaslist
guide_df.index.name = 'abbrev'


## Clean up some weird-ass values
guide_df.loc['beanland', 'name'] = 'Beanland'
guide_df.loc['buho', 'canonical'] = True
guide_df.loc['logic', 'canonical'] = True
guide_df.loc['gtb', 'canonical'] = True
guide_df.loc['ig', 'canonical'] = True
guide_df.loc['jgb', 'name'] = 'Jerry Garcia Band'
guide_df.loc['jgb', 'canonical'] = True
guide_df.loc['jt', 'canonical'] = True
guide_df.loc['ls', 'canonical'] = True
guide_df.loc['los', 'canonical'] = True
guide_df.loc['marlow', 'canonical'] = True
guide_df.loc['moe', 'name'] = 'Moe'
guide_df.loc['pmb', 'canonical'] = True
guide_df.loc['amendola', 'canonical'] = True
guide_df.loc['hip', 'canonical'] = True
guide_df.loc['wilco', 'canonical'] = True
guide_df.loc['spod', 'name'] = 'Serial Pod'

guide_df.index.name = 'band_string'
guide_df = guide_df.rename(columns={'name': 'band'})
guide_df = guide_df.assign(abbrev_type = np.nan)
guide_df.loc[guide_df['canonical'] == True, 'abbrev_type'] = 'canonical'
guide_df.loc[guide_df['canonical'] == False, 'abbrev_type'] = 'variant etree'
guide_df = guide_df.drop('canonical', axis=1)

print('Retrieving from archive.org...')
pageurl = 'https://archive.org/audio/etree-band-abbrevs.php'
page = urllib.request.urlopen(pageurl).read()

soup = bs(page, "html.parser")

tables = soup.find_all('table')

bandtable = tables[0]
bands = bandtable.find_all('tr')
allbands = []

print('Thinking...')
for thisband in bands[2:]:
    #pprint(thisband)
    theirdata = thisband.find_all('td')
    #pprint(theirdata)
    oneband = {}
    oneband['name'] = theirdata[0].text
    oneband['abbrevs'] = theirdata[1].text
    allbands.append(oneband)

archive_org_abbrevs_df = pandas.DataFrame.from_records(allbands)
archive_org_abbrevs_df = archive_org_abbrevs_df.set_index('name')

archive_org_abbrevs_df = archive_org_abbrevs_df.assign(abbrev_list = '')
for idx, thisrow in archive_org_abbrevs_df.iterrows():
    archive_org_abbrevs_df.loc[idx, 'abbrev_list'] = [x.lower().strip() for x in archive_org_abbrevs_df['abbrevs'].loc[idx].split(';')]
    
allabbrevdict = {}
for idx, thisrow in archive_org_abbrevs_df.iterrows():
    for i in range(0, len(thisrow['abbrev_list'])):
        allabbrevdict[thisrow['abbrev_list'][i]] = idx
#allabbrevdict
archive_org_guide_df = pandas.DataFrame(list(allabbrevdict.items()), columns=['abbrev', 'band_archive_org'])
archive_org_guide_df = archive_org_guide_df.set_index('abbrev')
archive_org_guide_df.index.name = 'band_string'
archive_org_guide_df = archive_org_guide_df.drop('', axis=0)
#archive_org_guide_df = archive_org_guide_df.sort_index()
#archive_org_guide_df = archive_org_guide_df.reset_index().set_index('abbrev').sort_index()
archive_org_guide_df = archive_org_guide_df.rename(columns = {'band_archive_org': 'band'})
archive_org_guide_df = archive_org_guide_df.assign(abbrev_type = 'variant archive.org')

print('Parsed {0:,.0f} variant abbreviations from archive.org...'.format(len(archive_org_guide_df)))
print('Removing {0:,.0f} rows of archive.org duplicated from etree...'.format(len(archive_org_guide_df[archive_org_guide_df.index.isin(guide_df.index.tolist())])))
archive_org_guide_df = archive_org_guide_df.loc[~archive_org_guide_df.index.isin(guide_df.index.tolist())]

print('combining...')
guide_df = guide_df.append(archive_org_guide_df)
print('Found {0:,.0f} band strings for guide file!'.format(len(guide_df)))
guide_df = guide_df.sort_index()

guide_df.groupby('abbrev_type').size()


reading etree abbreviations...
Parsed 156 canonical abbreviations and 61 variant abbreviations from etree...
rotating etree abbreviations into guide file...
Retrieving from archive.org...
Thinking...
Parsed 2,821 variant abbreviations from archive.org...
Removing 118 rows of archive.org duplicated from etree...
combining...
Found 2,916 band strings for guide file!


abbrev_type
canonical              169 
variant archive.org    2703
variant etree          44  
dtype: int64

# Figure out which band names are canonical, incorrect, etc.

In [33]:
print('retrieving from backup...')
music_df = music_df_bk

# Getting band strings
df = pandas.DataFrame()

print('joining archive guide file to band name file...')
df = df.assign(band_string = music_df['band_string'])
df = df.reset_index().merge(guide_df.reset_index(), how='left').set_index('id')

df.loc[df['band_string'] == '', ['band', 'abbrev_type']] = [np.nan, 'blank']

print('Getting band names for bands in the etree and archive.org list...')
lookup_and_assign = []
lookup_and_assign += [{'band_string': 'AS', 'band': 'Acoustic Syndicate', 'abbrev_type': 'variant archive.org'}]
lookup_and_assign += [{'band_string': 'BB', 'band': 'Brother Bean', 'abbrev_type': 'variant archive.org'}]
lookup_and_assign += [{'band_string': 'BOB', 'band': 'Breathe Owl Breathe', 'abbrev_type': 'variant archive.org'}]
lookup_and_assign += [{'band_string': 'BeauSoliel',  'band': 'JP Beausoleil', 'abbrev_type': 'incorrect'}]
lookup_and_assign += [{'band_string': 'BelaFleckFlecktones', 'band': 'Bela Fleck & The Flecktones', 'abbrev_type': 'incorrect'}]
lookup_and_assign += [{'band_string': 'BruceHornsby', 'band': 'Bruce Hornsby', 'abbrev_type': 'incorrect'}]
lookup_and_assign += [{'band_string': 'CS', 'band': 'Captain Soularcat', 'abbrev_type': 'incorrect'}]
lookup_and_assign += [{'band_string': 'DGBX', 'band': "Darwin's Grab Bag", 'abbrev_type': 'incorrect'}]
lookup_and_assign += [{'band_string': 'DZ', 'band': 'David Zollo and the Body Electric', 'abbrev_type': 'variant archive.org'}]
lookup_and_assign += [{'band_string': 'GD', 'band': 'Grateful Dead', 'abbrev_type': 'canonical'}]
lookup_and_assign += [{'band_string': 'Gov`t Mule', 'band': 'Government Mule', 'abbrev_type': 'incorrect'}]
lookup_and_assign += [{'band_string': 'GSBG', 'band': 'Greensky Bluegrass', 'abbrev_type': 'canonical'}]
lookup_and_assign += [{'band_string': 'IT', 'band': 'In Trasnit', 'abbrev_type': 'variant archive.org'}]
lookup_and_assign += [{'band_string': 'Jefferson Starship', 'band': 'Jefferson Starship', 'abbrev_type': 'incorrect'}]
lookup_and_assign += [{'band_string': 'Jefferson Startship', 'band': 'Jefferson Starship', 'abbrev_type': 'incorrect'}]
lookup_and_assign += [{'band_string': 'JJ', 'band': 'Jack Johnson', 'abbrev_type': 'canonical'}]
lookup_and_assign += [{'band_string': 'LS', 'band': 'Leftover Salmon', 'abbrev_type': 'canonical'}]
lookup_and_assign += [{'band_string': 'Matisyahu', 'band': 'Matisyahu', 'abbrev_type': 'variant archive.org'}]
lookup_and_assign += [{'band_string': 'MMW', 'band': 'Medeski, Martin, & Wood', 'abbrev_type': 'variant archive.org'}]
lookup_and_assign += [{'band_string': 'NS', 'band': 'Nayas', 'abbrev_type': 'variant archive.org'}]
lookup_and_assign += [{'band_string': 'OP', 'band': 'Owen Plant', 'abbrev_type': 'incorrect'}]
lookup_and_assign += [{'band_string': 'PF', 'band': 'Phil Lesh & Friends', 'abbrev_type': 'variant etree'}]
lookup_and_assign += [{'band_string': 'PH', 'band': 'Phish', 'abbrev_type': 'canonical'}]
lookup_and_assign += [{'band_string': 'PJ', 'band': 'Pearl Jam', 'abbrev_type': 'canonical'}]
lookup_and_assign += [{'band_string': 'PL&amp;F', 'band': 'Phil Lesh & Friends', 'abbrev_type': 'incorrect'}]
lookup_and_assign += [{'band_string': 'Phish', 'band': 'Phish', 'abbrev_type': 'incorrect'}]
lookup_and_assign += [{'band_string': 'SB', 'band': 'Seismic', 'abbrev_type': 'variant archive.org'}]
lookup_and_assign += [{'band_string': 'SS', 'band': 'Stockholm Syndrome', 'abbrev_type': 'canonical'}]
lookup_and_assign += [{'band_string': 'Standing O-Tenacious D', 'band': 'Tenacious D', 'abbrev_type': 'incorrect'}]
lookup_and_assign += [{'band_string': 'TAB', 'band': 'Trey Anastasio (Band)', 'abbrev_type': 'variant etree'}]
lookup_and_assign += [{'band_string': 'TL', 'band': 'Toby Lightman', 'abbrev_type': 'incorrect'}]
lookup_and_assign += [{'band_string': 'TP', 'band': 'Tristan Prettyman', 'abbrev_type': 'variant archive.org'}]
lookup_and_assign += [{'band_string': 'TR', 'band': 'Tim Reynolds', 'abbrev_type': 'canonical'}]
lookup_and_assign += [{'band_string': 'TTS', 'band': 'Two Ton Shoe', 'abbrev_type': 'variant archive.org'}]
lookup_and_assign += [{'band_string': 'Tenacious_D', 'band': 'Tenacious D', 'abbrev_type': 'incorrect'}]
lookup_and_assign += [{'band_string': 'W.Haynes', 'band': 'Warren Haynes', 'abbrev_type': 'incorrect'}]
lookup_and_assign += [{'band_string': 'WSP', 'band': 'Widespread Panic', 'abbrev_type': 'canonical'}]
lookup_and_assign += [{'band_string': 'Ween', 'band': 'Ween', 'abbrev_type': 'canonical'}]
lookup_and_assign += [{'band_string': 'Wilco', 'band': 'Wilco', 'abbrev_type': 'canonical'}]
lookup_and_assign += [{'band_string': 'YMSB', 'band': 'Yonder Mountain String Band', 'abbrev_type': 'canonical'}]
lookup_and_assign += [{'band_string': 'Zero', 'band': 'Zero', 'abbrev_type': 'canonical'}]
lookup_and_assign += [{'band_string': 'belafleckandtheflecktones', 'band': 'Bela Fleck & The Flecktones', 'abbrev_type': 'incorrect'}]
lookup_and_assign += [{'band_string': 'deadfeat', 'band': 'Grateful Dead', 'abbrev_type': 'incorrect'}]
lookup_and_assign += [{'band_string': 'delmccouryband', 'band': 'The Del McCoury Band', 'abbrev_type': 'incorrect'}]
lookup_and_assign += [{'band_string': 'greenskybluegrass', 'band': 'Greensky Bluegrass', 'abbrev_type': 'incorrect'}]
lookup_and_assign += [{'band_string': 'haynes_warren', 'band': 'Warren Haynes', 'abbrev_type': 'incorrect'}]
lookup_and_assign += [{'band_string': 'jackie', 'band': 'Jackie Blue', 'abbrev_type': 'incorrect'}]
lookup_and_assign += [{'band_string': 'lake', 'band': 'Lake Trout', 'abbrev_type': 'incorrect'}]
lookup_and_assign += [{'band_string': 'pearljam', 'band': 'Pearl Jam', 'abbrev_type': 'incorrect'}]
lookup_and_assign += [{'band_string': 'ph&amp;tl', 'band': 'Phish', 'abbrev_type': 'incorrect'}]
lookup_and_assign += [{'band_string': 'phish', 'band': 'Phish', 'abbrev_type': 'incorrect'}]
lookup_and_assign += [{'band_string': 'rh', 'band': 'Robyn Hitchcock', 'abbrev_type': 'incorrect'}]
lookup_and_assign += [{'band_string': 'tobylightman', 'band': 'Toby Lightman', 'abbrev_type': 'incorrect'}]
lookup_and_assign += [{'band_string': 'warrenhaynes', 'band': 'Warren Haynesw', 'abbrev_type': 'incorrect'}]
lookup_and_assign += [{'band_string': 'widespreadpanic', 'band': 'Widespread Panic', 'abbrev_type': 'incorrect'}]
lookup_and_assign += [{'band_string': 'zak', 'band': 'Zak Winnick', 'abbrev_type': 'incorrect'}]
lookup_and_assign += [{'band_string': 'BC', 'band': 'Black Crowes', 'abbrev_type': 'canonical'}]
lookup_and_assign += [{'band_string': 'BR', 'band': 'Benevento/Russo', 'abbrev_type': 'variant archive.org'}]
lookup_and_assign += [{'band_string': 'Burning Spear', 'band': 'Burning Spear', 'abbrev_type': 'incorrect'}]
lookup_and_assign += [{'band_string': 'GS', 'band': 'Greenstreet', 'abbrev_type': 'variant archive.org'}]
lookup_and_assign += [{'band_string': 'HW', 'band': 'Honey White', 'abbrev_type': 'variant archive.org'}]
lookup_and_assign += [{'band_string': 'JM', 'band': 'Jason Mraz', 'abbrev_type': 'variant archive.org'}]
lookup_and_assign += [{'band_string': 'MP', 'band': 'Mustard Plug', 'abbrev_type': 'variant archive.org'}]
lookup_and_assign += [{'band_string': 'PM', 'band': 'Pat Metheny', 'abbrev_type': 'variant etree'}]
lookup_and_assign += [{'band_string': 'SO', 'band': 'Stolen Ogre', 'abbrev_type': 'variant archive.org'}]
lookup_and_assign += [{'band_string': 'T.Flyer', 'band': 'Terrapin Flyer', 'abbrev_type': 'incorrect'}]
lookup_and_assign += [{'band_string': 'Think', 'band': 'Think', 'abbrev_type': 'variant archive.org'}]

for thisband in lookup_and_assign:
    df.loc[df['band_string'] == thisband['band_string'], 'band'] = thisband['band']
    df.loc[df['band_string'] == thisband['band_string'], 'abbrev_type'] = thisband['abbrev_type']

print('Getting band names for bands NOT in the etree and archive.org list...')
lookup = []
lookup += [{'band_string': 'ACDC', 'band': 'AC/DC'}]
lookup += [{'band_string': 'AM', 'band': 'Against Me!'}]
lookup += [{'band_string': 'Abercrombie', 'band': 'John Abercrombie'}]
lookup += [{'band_string': 'Albert Collins _ the Icebreakers', 'band': 'Albert Collins & the Icebreakers'}]
lookup += [{'band_string': 'Ali_Akbar_Khan-Zakir_Hussain', 'band': 'Ali Akbar Khan & Zakir Hussain'}]
lookup += [{'band_string': 'AmjadAliKhan', 'band': 'Amjad Ali Khan'}]
lookup += [{'band_string': 'AnaPopovic', 'band': 'Ana Popovic'}]
lookup += [{'band_string': 'Anders', 'band': 'Anders Osborne'}]
lookup += [{'band_string': 'AnindyaBanerjeeBobbySingh', 'band': 'Anindya Banerjee & Bobby Singh'}]
lookup += [{'band_string': 'Avett Brothers', 'band': 'The Avett Brothers'}]
lookup += [{'band_string': 'BBCPhil', 'band': 'BBC Philharmonic'}]
lookup += [{'band_string': 'BBCSO', 'band': 'BBC Symphony Orchestra'}]
lookup += [{'band_string': 'BECK', 'band': 'Beck'}]
lookup += [{'band_string': 'BUDDY GUY', 'band': 'Buddy Guy'}] 
lookup += [{'band_string': 'Barry Melton &amp; The Green Ray', 'band': 'Barry Melton & The Green Ray'}]
lookup += [{'band_string': 'BaselSO', 'band': 'Basel Symphony Orchestra'}]
lookup += [{'band_string': 'Bears Den', 'band': "Bears' Den", 'abbrev_type': 'notfound'}]
lookup += [{'band_string': 'BelaFleck_ChrisThile', 'band': 'Bela Fleck & Chris Thile'}]
lookup += [{'band_string': 'BenSparacoBand', 'band': 'Ben Sparaco Band'}]
lookup += [{'band_string': 'BerlinPhilO', 'band': 'Berlin Philharmonic Orchestra'}]
lookup += [{'band_string': 'BoF', 'band': 'Andrew Bird'}]
lookup += [{'band_string': 'BoH', 'band': 'Band of Horses'}]
lookup += [{'band_string': 'BobbySingh', 'band': 'Bobby Singh'}]
lookup += [{'band_string': 'BorromeoQuartet', 'band': 'Borromeo Quartet'}]
lookup += [{'band_string': 'BostonPops', 'band': 'Boston Pops'}]
lookup += [{'band_string': 'Boston_Pops', 'band': 'Boston Pops', 'abbrev_type': 'notfound'}]
lookup += [{'band_string': 'BrothersComatose', 'band': 'Brothers Comatose'}]
lookup += [{'band_string': 'Byrds&amp;Flying Burrito Bros', 'band': 'The Byrds & The Flying Burrito Brothers'}]
lookup += [{'band_string': 'CCR', 'band': 'Creedence Clearwater Revival'}]
lookup += [{'band_string': 'Cats Eyes', 'band': 'Cats\' Eyes', 'abbrev_type': 'notfound'}]
lookup += [{'band_string': 'Claypool_Lennon', 'band': 'The Claypool Lennon Delerium'}]
lookup += [{'band_string': 'Cleverlys', 'band': 'The Cleverlys', 'abbrev_type': 'notfound'}]
lookup += [{'band_string': 'CometControl', 'band': 'Comet Control'}]
lookup += [{'band_string': 'Cris Jacobs &amp; Anders Osborne', 'band': 'Cris Jacobs & Anders Osborne'}]
lookup += [{'band_string': 'DAD', 'band': 'D.A.D.'}]
lookup += [{'band_string': 'DANGER DANGER', 'band': 'Danger Danger'}]
lookup += [{'band_string': 'DEADCo', 'band': 'Dead & Company'}]
lookup += [{'band_string': 'DL&amp;Q', 'band': 'Doyle Lawson & Quicksilver'}]
lookup += [{'band_string': 'DaCaptainTrips', 'band': 'Da Captain Trips'}]
lookup += [{'band_string': 'David Mallett Positano', 'band': 'David Mallet'}]
lookup += [{'band_string': 'DayWave', 'band': 'Daywave'}]
lookup += [{'band_string': 'Dead and Company', 'band': 'Dead & Company'}]
lookup += [{'band_string': 'Dead&Co', 'band': 'Dead & Company'}]
lookup += [{'band_string': 'Dead&amp;Company', 'band': 'Dead & Company'}]
lookup += [{'band_string': 'DeadAndCompany', 'band': 'Dead & Company'}]
lookup += [{'band_string': 'DeadCo', 'band': 'Dead & Company'}]
lookup += [{'band_string': 'DeadandCo', 'band': 'Dead & Company'}]
lookup += [{'band_string': 'DeadandCompany', 'band': 'Dead & Company'}]
lookup += [{'band_string': 'Debashish', 'band': 'Debashih Bhattacharya'}]
lookup += [{'band_string': 'DebashishBhattacharya', 'band': 'Debashih Bhattacharya'}]
lookup += [{'band_string': 'Dust Bowl', 'band': 'Dust Bowl Revival'}]
lookup += [{'band_string': 'Dylan', 'band': 'Bob Dylan'}]
lookup += [{'band_string': 'EC', 'band': 'Exit Clov'}]
lookup += [{'band_string': 'EHG', 'band': 'Eyehategod'}]
lookup += [{'band_string': 'ElectricWizard', 'band': 'Electric Wizard'}]
lookup += [{'band_string': 'EricBurdon', 'band': 'Eric Burdon'}]
lookup += [{'band_string': 'Glen HANSARD', 'band': 'Glen Hansard'}]
lookup += [{'band_string': 'GomerPyle', 'band': 'Gomer Pyle'}]
lookup += [{'band_string': 'Gulcan_Kaya', 'band': 'Gulcan Kaya'}]
lookup += [{'band_string': 'HaasKowertTice', 'band': 'Haas Kowert Tice'}]
lookup += [{'band_string': 'HarryManx', 'band': 'Harry Manx'}]
lookup += [{'band_string': 'Henry&amp;Martin', 'band': 'Henry & Martin'}]
lookup += [{'band_string': 'Hidalgo_Perez', 'band': 'Hidalgo Perez'}]
lookup += [{'band_string': 'HristoVitchev', 'band': 'Hristo Vitchev'}]
lookup += [{'band_string': 'IronMaiden', 'band': 'Iron Maiden'}]
lookup += [{'band_string': 'JMJarre', 'band': 'Jean-Michel Jarre'}]
lookup += [{'band_string': 'JMedeskiMadSkillet', 'band': "John Medeski's Mad Skillet"}]
lookup += [{'band_string': 'JOE WALSH', 'band': 'Joe Walsh'}]
lookup += [{'band_string': 'Jam', 'band':'The Jam'}]
lookup += [{'band_string': 'James Gang', 'band':'The James Gang'}]
lookup += [{'band_string': 'Joao Gilberto', 'band':'João Gilberto'}]
lookup += [{'band_string': 'JoeJackson', 'band':'Joe Jackson'}]
lookup += [{'band_string': 'JoeEly', 'band':'Joe Ely'}]
lookup += [{'band_string': 'Joewarner', 'band':'Joe Warner'}]
lookup += [{'band_string': 'JohnnyWinter', 'band':'Johnny Winter'}]
lookup += [{'band_string': 'Jonas Blue and JP Cooper', 'band':'Jonas Blue & J.P. Cooper'}]
lookup += [{'band_string': 'KamasiWashington', 'band':'Kamasi Washington'}]
lookup += [{'band_string': 'KatieMelua', 'band':'Katie Melua'}]
lookup += [{'band_string': 'Keith jarrett', 'band':'Keith Jarrett'}]
lookup += [{'band_string': 'LOTNC', 'band':'Lords of the New Church'}]
lookup += [{'band_string': 'Larry_McCray', 'band':'Larry McCray'}]
lookup += [{'band_string': 'LeavesEyes', 'band':"Leaves' Eyes"}]
lookup += [{'band_string': 'Lifeofagony', 'band':'Life of Agony'}]
lookup += [{'band_string': 'LionShepherd', 'band':'Lion Shepherd'}]
lookup += [{'band_string': 'LondonSO', 'band': 'London Symphony Orchestra'}]
lookup += [{'band_string': 'LosLobos', 'band': 'Los Lobos'}]
lookup += [{'band_string': 'MAKU_ Soundsystem', 'band': 'M.A.K.U. Soundsystem'}]
lookup += [{'band_string': 'MS_JGB', 'band': 'Merl Saunders & the Jerry Garcia Band'}]
lookup += [{'band_string': 'MandolinOrange', 'band': 'Mandolin Orange'}]
lookup += [{'band_string': 'MarkLaneganBand', 'band': 'Mark Lanegan Band'}]
lookup += [{'band_string': 'McCartney', 'band': 'Paul McCartney'}]
lookup += [{'band_string': 'MickTaylor', 'band': 'Mick Taylor'}]
lookup += [{'band_string': 'MimiJonesCamilleThurmanQuartet', 'band': 'The Mimi Jones & Camille Thurman Quartet'}]
lookup += [{'band_string': 'ModernBaseball', 'band': 'Modern Baseball'}]
lookup += [{'band_string': 'MonteMontgomery', 'band': 'Monte Montgomery'}]
lookup += [{'band_string': 'MorsePortnoy', 'band': 'Morse Portnoy'}]
lookup += [{'band_string': 'MostlyAutumn', 'band': 'Mostly Autumn'}]
lookup += [{'band_string': 'MotherEngine', 'band': 'Mother Engine'}]
lookup += [{'band_string': 'MumfordAndSons', 'band': 'Mumford And Sons'}]
lookup += [{'band_string': 'MySleepingKarma', 'band': 'My Sleeping Karma'}]
lookup += [{'band_string': 'NY', 'band': 'Neil Young'}]
lookup += [{'band_string': 'NeilYoung', 'band': 'Neil Young'}]
lookup += [{'band_string': 'NickCave', 'band': 'Nick Cave'}]
lookup += [{'band_string': 'OtisRush', 'band': 'Otis Rush'}]
lookup += [{'band_string': 'PDP', 'band': 'P.D.P.'}]
lookup += [{'band_string': 'Peter &amp; the Test Tube Babies', 'band': 'Peter & the Test Tube Babies'}]
lookup += [{'band_string': 'Peter, Bjorn &amp; John', 'band': 'Peter, Bjorn & John'}]
lookup += [{'band_string': 'Philharmonia_Orch', 'band': 'Philharmonia Orchestra'}]
lookup += [{'band_string': 'PoWoW', 'band': 'Pow Wow'}]
lookup += [{'band_string': 'Polica', 'band': 'Poliça'}]
lookup += [{'band_string': 'PowderForPigeons', 'band': 'Powder For Pigeons'}]
lookup += [{'band_string': 'PrairieHomeCompanion', 'band': 'The Prairie Home Companion'}]
lookup += [{'band_string': 'Pretenders', 'band': 'The Pretenders'}]
lookup += [{'band_string': 'RAMONES', 'band': 'The Ramones'}]
lookup += [{'band_string': 'REM', 'band': 'R.E.M.'}]
lookup += [{'band_string': 'RHCP', 'band': 'Red Hot Chili Peppers'}]
lookup += [{'band_string': 'RONiiA', 'band': 'Roniia'}]
lookup += [{'band_string': 'RRK', 'band': 'Red Red Krovvy'}]
lookup += [{'band_string': 'Rag N Bone Man', 'band': "Rag 'n' Bone Man"}]
lookup += [{'band_string': 'Ralph_Stanley', 'band': 'Ralph Stanley'}]
lookup += [{'band_string': 'Ramones', 'band': 'The Ramones'}]
lookup += [{'band_string': 'RasaDuende', 'band': 'Rasa Duende'}]
lookup += [{'band_string': 'RaviShankar', 'band': 'Ravi Shankar'}]
lookup += [{'band_string': 'Ravi_Shankar', 'band': 'Ravi Shankar'}]
lookup += [{'band_string': 'Richard Galliano &amp; Philip Catherine', 'band': 'Richard Galliano & Philip Catherine'}]
lookup += [{'band_string': 'RoToR', 'band': 'Rotor'}]
lookup += [{'band_string': 'Rob Brown Quartet', 'band': 'The Rob Brown Quartet'}]
lookup += [{'band_string': 'S Marriott', 'band': 'Steve Marriott'}]
lookup += [{'band_string': 'SNC', 'band': 'Straight No Chaser'}]
lookup += [{'band_string': 'SSJ', 'band': 'Southside Johnny'}]
lookup += [{'band_string': 'SamBush', 'band': 'Sam Bush'}]
lookup += [{'band_string': 'SangeetMishraBobbySingh', 'band': 'Sangeet Mishra & Bobby Singh'}]
lookup += [{'band_string': 'SarahPeacock', 'band': 'Sarah Peacock'}]
lookup += [{'band_string': 'SevenThatSpells', 'band': 'Seven That Spells'}]
lookup += [{'band_string': 'Slydogs', 'band': 'Sly Dogs'}]
lookup += [{'band_string': 'SomaliYachtClub', 'band': 'Somali Yacht Club'}]
lookup += [{'band_string': 'SonataArctica', 'band': 'Sonata Arctica'}]
lookup += [{'band_string': 'StingGabriel', 'band': 'Sting & Peter Gabriel'}]
lookup += [{'band_string': 'Strgdstrs', 'band': 'The Infamous Stringdusters'}]
lookup += [{'band_string': 'Stu_Allen_Mars_Hotel', 'band': 'Stu Allen & Mars Hotel'}]
lookup += [{'band_string': 'TA', 'band': 'Transamerica'}]
lookup += [{'band_string': 'TTB', 'band': 'Tedeschi Trucks Band'}]
lookup += [{'band_string': 'Tegan &amp; Sara', 'band': 'Tegan & Sara'}]
lookup += [{'band_string': 'TheBathSaltZombies', 'band': 'The Bath Salt Zombies'}]
lookup += [{'band_string': 'TheClash', 'band': 'The Clash'}]
lookup += [{'band_string': 'TheDyes', 'band': 'The Dyes'}]
lookup += [{'band_string': 'TheJayhawks', 'band': 'The Jayhawks'}]
lookup += [{'band_string': 'The_Jam', 'band': 'The Jam'}]
lookup += [{'band_string': 'Tom Petty &amp; Mudcrutch', 'band': 'Tom Petty & Mudcrutch'}]
lookup += [{'band_string': 'TonerLow', 'band': 'Toner Low'}]
lookup += [{'band_string': 'TruckstopHoneymoon', 'band': 'Truckstop Honeymoon'}]
lookup += [{'band_string': 'VAN MORRISON', 'band': 'Van Morrison'}]
lookup += [{'band_string': 'VM', 'band': 'The Virginmarys'}]
lookup += [{'band_string': 'WAY', 'band': 'The Way'}]
lookup += [{'band_string': 'Wasilewski', 'band': 'The Marcin Wasilewski Trio'}]
lookup += [{'band_string': 'Waterboys', 'band': 'The Waterboys'}]
lookup += [{'band_string': 'Wayne Shorter Quartet', 'band': 'The Wayne Shorter Quartet'}]
lookup += [{'band_string': 'WeHuntBuffalo', 'band': 'We Hunt Buffalo'}]
lookup += [{'band_string': 'WoFat', 'band': 'Wofat'}]
lookup += [{'band_string': 'YLT', 'band': 'Yo La Tengo'}]
lookup += [{'band_string': 'aaronkamm', 'band': 'Aaron Kamm'}]
#lookup += [{'band_string': 'america', 'band': 'America'}]
lookup += [{'band_string': 'arcs', 'band': 'The Arcs'}]
lookup += [{'band_string': 'bbking', 'band': 'B.B. King'}]
lookup += [{'band_string': 'bd', 'band': 'Bob Dylan'}]
lookup += [{'band_string': 'biffyclyro', 'band': 'Biffy Clryo'}]
lookup += [{'band_string': 'big.daddy.wilson', 'band': 'Big Daddy Wilson'}]
lookup += [{'band_string': 'bigwreck', 'band': 'Big Wreck'}]
#lookup += [{'band_string': 'birdman', 'band': 'Birdman'}]
lookup += [{'band_string': 'bob collins + full nelson', 'band': 'Bob Collins & Full Nelson'}]
lookup += [{'band_string': 'bobdylan', 'band': 'Bob Dylan'}]
lookup += [{'band_string': 'bonjovi', 'band': 'Bon Jovi'}]
lookup += [{'band_string': 'bonnieraitt', 'band': 'Bonnie Raitt'}]
lookup += [{'band_string': 'bookertjones', 'band': 'Booker T. Jones'}]
#lookup += [{'band_string': 'boris', 'band': 'Boris'}]
lookup += [{'band_string': 'braitt', 'band': 'Bonnie Raitt'}]
lookup += [{'band_string': 'brandicarlile', 'band': 'Brandi Carlile'}]
lookup += [{'band_string': 'bromberg', 'band': 'Brett Romberg'}]
lookup += [{'band_string': 'bstenson', 'band': 'Bryan Stenson'}]
#lookup += [{'band_string': 'bush', 'band': 'Bush'}]
lookup += [{'band_string': 'cbley', 'band': 'Carla Bley'}]
lookup += [{'band_string': 'ccde', 'band': 'Chinese Connection'}]
lookup += [{'band_string': 'charlie_hunter_trio', 'band': 'Charlie Hunter Trio'}]
lookup += [{'band_string': 'cheaptrick', 'band': 'Cheap Trick'}]
lookup += [{'band_string': 'chess smith trio', 'band': 'Ches Smith Trio'}]
lookup += [{'band_string': 'chevelle', 'band': 'Chevelle'}]
lookup += [{'band_string': 'chiefbroom', 'band': 'Chief Broom'}]
lookup += [{'band_string': 'claim', 'band': 'The Claim'}]
lookup += [{'band_string': 'colt silvers orchstral', 'band': 'Colt Silvers Orchestral'}]
lookup += [{'band_string': 'crb', 'band': 'Chris Robinson Brotherhood'}]
#lookup += [{'band_string': 'creeper', 'band': 'Creeper'}]
lookup += [{'band_string': 'cure', 'band': 'The Cure'}]
lookup += [{'band_string': 'dac', 'band': 'Dead & Company'}]
lookup += [{'band_string': 'dandc', 'band': 'Dead & Company'}]
lookup += [{'band_string': 'dank', 'band': 'The Dank'}]
lookup += [{'band_string': 'danlebo', 'band': 'Dan Lebo'}]
lookup += [{'band_string': 'davidgilmour', 'band': 'David Gilmour'}]
lookup += [{'band_string': 'davidrawlingsmachine', 'band': 'The David Rawlings Machine'}]
lookup += [{'band_string': 'dawes', 'band': 'Dawes'}]
lookup += [{'band_string': 'dc&gn', 'band': 'Dead & Company'}]
lookup += [{'band_string': 'dead&amp;co', 'band': 'Dead & Company'}]
lookup += [{'band_string': 'dead.and.co', 'band': 'Dead & Company'}]
lookup += [{'band_string': 'deadandco', 'band': 'Dead & Company'}]
lookup += [{'band_string': 'deadandcompany', 'band': 'Dead & Company'}]
lookup += [{'band_string': 'deadco', 'band': 'Dead & Company'}]
lookup += [{'band_string': 'deanmonkey&amp;thedropouts', 'band': 'Dean Monkey & The Dropouts'}]
lookup += [{'band_string': 'deanmonkey&thedropouts', 'band': 'Dean Monkey & The Dropouts'}]
lookup += [{'band_string': 'desertdwellers', 'band': 'Desert Dwellers'}]
lookup += [{'band_string': 'dirk.darmstaedter', 'band': 'Dirk Darmstaedter'}]
lookup += [{'band_string': 'dirty_three', 'band': 'Dirty Three'}]
lookup += [{'band_string': 'dml', 'band': 'DangerousMuteLunatic'}]
lookup += [{'band_string': 'dnc', 'band': 'Dead & Company'}]
lookup += [{'band_string': 'dubapocalypse', 'band': 'Dubapocalypse'}]
lookup += [{'band_string': 'dustbowlrevival', 'band': 'Dust Bowl Revival'}]
lookup += [{'band_string': 'dylan', 'band': 'Bob Dylan'}]
lookup += [{'band_string': 'ekb', 'band': 'Eric Krasno Band'}]
lookup += [{'band_string': 'elephantrevival', 'band': 'Elephant Revival'}]
lookup += [{'band_string': 'elh', 'band': "Einstein's Little Homunculus"}]
lookup += [{'band_string': 'elise', 'band': 'Elise Testone'}]
lookup += [{'band_string': 'elisetestone', 'band': 'Elise Testone'}]
lookup += [{'band_string': 'eltonjohn', 'band': 'Elton John'}]
lookup += [{'band_string': 'elviscostello', 'band': 'Elvis Costello'}]
lookup += [{'band_string': 'emmylou', 'band': 'Emmy Lou Harris'}]
lookup += [{'band_string': 'emmylouharris', 'band': 'Emmy Lou Harris'}]
lookup += [{'band_string': 'ericburdon', 'band': 'Eric Burdon'}]
lookup += [{'band_string': 'erickrasnoband', 'band': 'Eric Krasno Band'}]
lookup += [{'band_string': 'erikdeutsch', 'band': 'Erik Deutsch'}]
lookup += [{'band_string': 'fingereleven', 'band': 'Finger Eleven'}]
lookup += [{'band_string': 'frankcarter', 'band': 'Frank Carter'}]
lookup += [{'band_string': 'friendsofthefamily', 'band': 'Friends of the Family'}]
lookup += [{'band_string': 'fuguemill', 'band': 'Fugue Mill'}]
lookup += [{'band_string': 'funkfellowship', 'band': 'Funk Fellowship'}]
lookup += [{'band_string': 'futurememory', 'band': 'Future Memory'}]
lookup += [{'band_string': 'ga&rd', 'band': 'Georgia Avenue & Ratdog'}]
lookup += [{'band_string': 'gaf', 'band': 'Greg Allman & Friends'}]
lookup += [{'band_string': 'godoys', 'band': 'The Godoys'}]
lookup += [{'band_string': 'grahamnash', 'band': 'Graham Nash'}]
lookup += [{'band_string': 'gnr', 'band': "Guns 'n' Roses"}]
lookup += [{'band_string': 'gurf.morlix', 'band': 'Gurf Morlix'}]
lookup += [{'band_string': 'halfmoon', 'band': 'Half Moon'}]
lookup += [{'band_string': 'hayley', 'band': 'Hayley Jane'}]
lookup += [{'band_string': 'hayleyjane', 'band': 'Hayley Jane'}]
lookup += [{'band_string': 'hazmat.modine', 'band': 'Hazmat Modine'}]
lookup += [{'band_string': 'headforthehills', 'band': 'Head for the Hills'}]
lookup += [{'band_string': 'hendryx-carbone-zeiner', 'band': 'Hendryx, Carbone, & Zeiner'}]
lookup += [{'band_string': 'hounddog', 'band': 'Hound Dog'}]
lookup += [{'band_string': 'houseband', 'band': 'House Band'}]
lookup += [{'band_string': 'hueylewis', 'band': 'Huey Lewis'}]
lookup += [{'band_string': 'hugo largo', 'band': 'Hugo Largo'}]
lookup += [{'band_string': 'ian mcculloch', 'band': 'Ian McCulloch'}]
lookup += [{'band_string': 'inyang', 'band': 'Inyang Massey'}]
lookup += [{'band_string': 'jamemurrellstanley', 'band': 'Jane Murrell Stanley'}]
lookup += [{'band_string': 'jeffbeck', 'band': 'Jeff Beck'}]
lookup += [{'band_string': 'jefftobias', 'band': 'Jeff Tobias'}]
lookup += [{'band_string': 'jerryjoseph', 'band': 'Jerry Joseph'}]
lookup += [{'band_string': 'jg&ms', 'band': 'Jerry Garcia & Merl Saunders'}]
lookup += [{'band_string': 'jg+ms', 'band': 'Jerry Garcia & Merl Saunders'}]
lookup += [{'band_string': 'jg-ms', 'band': 'Jerry Garcia & Merl Saunders'}]
lookup += [{'band_string': 'jgms', 'band': 'Jerry Garcia & Merl Saunders'}]
lookup += [{'band_string': 'jim mccarthy', 'band': 'Jim McCarthy'}]
lookup += [{'band_string': 'jjbh', 'band': 'Joan Jett & The Blackhearts'}]
lookup += [{'band_string': 'joejackson', 'band': 'Joe Jackson'}]
lookup += [{'band_string': 'johnprine', 'band': 'John Prine'}]
lookup += [{'band_string': 'josborne', 'band': 'Joan Osborne'}]
lookup += [{'band_string': 'jpj', 'band': 'John Paul Jones'}]
lookup += [{'band_string': 'jsurman', 'band': 'John Surman'}]
lookup += [{'band_string': 'julierhodes', 'band': 'Julie Rhodes'}]
lookup += [{'band_string': 'jwalsh', 'band': 'Joe Walsh'}]
lookup += [{'band_string': 'kathbloom', 'band': 'Kath Bloom'}]
lookup += [{'band_string': 'kottke', 'band': 'Leo Kottke'}]
lookup += [{'band_string': 'l-ficher-g-baton', 'band': 'Laura Fischer & Grand Baton'}]
lookup += [{'band_string': 'larrycampbell_teresawilliams', 'band': 'Larry Campbell & Teresa Wililams'}]
lookup += [{'band_string': 'len price 3', 'band': 'Len Price'}]
lookup += [{'band_string': 'lilsmokies', 'band': "Lil' Smokies"}]
lookup += [{'band_string': 'lizzwright', 'band': 'Lizz Wright'}]
lookup += [{'band_string': 'loslobos', 'band': 'Los Lobos'}]
lookup += [{'band_string': 'loverevisited', 'band': 'Love Revisited'}]
lookup += [{'band_string': 'lucindawilliams', 'band': 'Lucinda Williams'}]
lookup += [{'band_string': 'mando', 'band': 'Mando Diao'}]
lookup += [{'band_string': 'marcuskingband', 'band': 'The Marcus King Band'}]
lookup += [{'band_string': 'marksbrothers', 'band': 'The Marks Brothers'}]
lookup += [{'band_string': 'marthascanlan', 'band': 'Martha Scanlan'}]
lookup += [{'band_string': 'matthires', 'band': 'Matt Hires'}]
lookup += [{'band_string': 'mavis', 'band': 'Mavis Staples'}]
lookup += [{'band_string': 'mavisstaples', 'band': 'Mavis Staples'}]
lookup += [{'band_string': 'mccartney', 'band': 'Paul McCartney'}]
lookup += [{'band_string': 'melvins', 'band': 'The Melvins'}]
lookup += [{'band_string': 'mikecooley', 'band': 'Mike Cooley'}]
lookup += [{'band_string': 'mississippi bigfoot', 'band': 'Mississippi Bigfoot'}]
lookup += [{'band_string': 'mostlyautumn', 'band': 'Mostly Autumn'}]
lookup += [{'band_string': 'neil young', 'band': 'Neil Young'}]
lookup += [{'band_string': 'nineinchnails', 'band': 'Nine Inch Nails'}]
lookup += [{'band_string': 'nma', 'band': 'North Mississippi Allstars'}]
lookup += [{'band_string': 'noelgallagher', 'band': 'Noel Gallagher'}]
lookup += [{'band_string': 'ny', 'band': 'Neil Young'}]
lookup += [{'band_string': 'nyphil', 'band': 'New York Philharmonic'}]
lookup += [{'band_string': 'patd', 'band': 'Panic! at the Disco'}]
lookup += [{'band_string': 'paulsimon', 'band': 'Paul Simon'}]
lookup += [{'band_string': 'peter astor', 'band': 'Peter Astor'}]
lookup += [{'band_string': 'pixies', 'band': 'The Pixies'}]
lookup += [{'band_string': 'pressroomtrio', 'band': 'The Press Room Trio'}]
lookup += [{'band_string': 'ptf', 'band': 'Praise the Fallen'}]
lookup += [{'band_string': 'raitt', 'band': 'Bonnie Raitt'}]
lookup += [{'band_string': 'raydavies', 'band': 'Ray Davies'}]
lookup += [{'band_string': 'rem', 'band': 'R.E.M.'}]
lookup += [{'band_string': 'revtorband', 'band': 'Rev Tor'}]
lookup += [{'band_string': 'rhcp', 'band': 'Red Hot Chili Peppers'}]
lookup += [{'band_string': 'rhythmfuture', 'band': 'Rhythm Future'}]
lookup += [{'band_string': 'richrobinson', 'band': 'Rich Robinson'}]
lookup += [{'band_string': 'robertplant', 'band': 'Robert Plant'}]
lookup += [{'band_string': 'samholtband', 'band': 'The Sam Holt Band'}]
lookup += [{'band_string': 'sbb', 'band': 'SBB'}]
lookup += [{'band_string': 'seldomscene', 'band': 'Seldom Scene'}]
lookup += [{'band_string': 'senior service', 'band': 'Senior Service'}]
lookup += [{'band_string': 'sherylcrow', 'band': 'Sheryl Crow'}]
lookup += [{'band_string': 'sigur_ros', 'band': 'Sigur Rós'}]
lookup += [{'band_string': 'skypilot', 'band': 'Sky Pilot'}]
lookup += [{'band_string': 'smiths', 'band': 'The Smiths'}]
lookup += [{'band_string': 'southside.johnny', 'band': 'Southside Johnny'}]
lookup += [{'band_string': 'spottedtiger', 'band': 'Spotted Tiger'}]
lookup += [{'band_string': 'stuallen', 'band': 'Spotted Tiger'}]
lookup += [{'band_string': 'studs brooks', 'band': 'Studs Brooks'}]
lookup += [{'band_string': 'sundays', 'band': 'The Sundays'}]
lookup += [{'band_string': 'tabata+kamekawa+hikari', 'band': 'Tabata Mitsuru, Kamekawa Chiyo, & Hikari'}]
lookup += [{'band_string': 'tbc', 'band': 'TBC'}]
lookup += [{'band_string': 'tboh', 'band': 'The Band of Heathens'}]
lookup += [{'band_string': 'tedeschitrucksband', 'band': 'Tedeschi Trucks Band'}]
lookup += [{'band_string': 'the.steepwater.band', 'band': 'The Steepwater Band'}]
lookup += [{'band_string': 'theweight', 'band': 'The Weight'}]
lookup += [{'band_string': 'thirdeyeblind', 'band': 'Third Eye Blind'}]
lookup += [{'band_string': 'timobrien', 'band': "Tim O'Brien"}]
lookup += [{'band_string': 'tromboneshorty', 'band': 'Trombone Shorty'}]
lookup += [{'band_string': 'ttb', 'band': 'Tedeschi Trucks Band'}]
lookup += [{'band_string': 'vanmorrison', 'band': 'Van Morrison'}]
lookup += [{'band_string': 'vh', 'band': 'Van Halen'}]
lookup += [{'band_string': 'virginmarys', 'band': 'The Virginmarys'}]
lookup += [{'band_string': 'willienelson', 'band': 'Willie Nelson'}]
lookup += [{'band_string': 'wnelson', 'band': 'Willie Nelson'}]
lookup += [{'band_string': 'youngbloods', 'band': 'The Youngbloods'}]
lookup += [{'band_string': 'zodiacmindwarp', 'band': 'Zodiac Mindwarp'}]
lookup += [{'band_string': 'zombies', 'band': 'The Zombies'}]
lookup += [{'band_string': '5 A.M', 'band': '5 A.M.'}]
lookup += [{'band_string': '5 AM', 'band': '5 A.M.'}]
lookup += [{'band_string': 'Adams, Greg', 'band': 'Greg Adams'}]
lookup += [{'band_string': 'AH All Stars', 'band': 'Alan Hertz All-Stars'}]
lookup += [{'band_string': 'AlejandroEscovedo', 'band': 'Alejandro Escovedo'}]
lookup += [{'band_string': 'BLues Man', 'band': 'Blues Man'}]
lookup += [{'band_string': 'Burdon, Eric', 'band': 'Eric Burdon'}]
lookup += [{'band_string': 'Caldwell, Bobby', 'band': 'Bobby Caldwell'}]
lookup += [{'band_string': 'Carlton, Larry', 'band': 'Larry Carlton'}]
lookup += [{'band_string': 'Cold Rain &amp; Snow', 'band': 'Cold Rain & Snow'}]
lookup += [{'band_string': 'Crosby, David', 'band': 'David Crosby'}]
lookup += [{'band_string': 'DWZappa', 'band': 'Dweezil Zappa'}]
lookup += [{'band_string': 'GNR', 'band': "Guns 'n' Roses"}]
lookup += [{'band_string': 'Great expectations', 'band': 'Great Expectations'}]
lookup += [{'band_string': 'Hargrove, Roy', 'band': 'Roy Hargrove'}]
lookup += [{'band_string': 'Iron fist', 'band': 'Iron Fist'}]
lookup += [{'band_string': 'JG MS', 'band': 'Jerry Garcia & Merl Saunders'}]
lookup += [{'band_string': 'JWalsh', 'band': 'Joe Walsh'}]
lookup += [{'band_string': 'Lee, Albert', 'band': 'Albert Lee'}]
lookup += [{'band_string': 'Manfred Manns Earthband', 'band': "Manfred Mann's Earth Band"}]
lookup += [{'band_string': 'Marys Place', 'band': "Mary's Place"}]
lookup += [{'band_string': 'Natures Way', 'band': "Nature's Way"}]
lookup += [{'band_string': 'Ozzy', 'band': 'Ozzy Osbourne'}]
lookup += [{'band_string': 'Rock &amp; Roll Band', 'band': 'Rock & Roll Band'}]
lookup += [{'band_string': 'SatansMillenium', 'band': "Satan's Millenium"}]
lookup += [{'band_string': 'Ten secret doors', 'band': 'Ten Secret Doors'}]
lookup += [{'band_string': 'The Rhythm Of The Heat', 'band': 'The Rhythm Of the Heat'}]
lookup += [{'band_string': 'This is my god', 'band': 'This Is My God'}]
lookup += [{'band_string': 'Vera, Billy', 'band': 'Billy Vera'}]
lookup += [{'band_string': 'WAR OF KINGS', 'band': 'War of Kings'}]
lookup += [{'band_string': 'Waltzinblack', 'band': 'Waltz in Black'}]
lookup += [{'band_string': 'SpunkRock', 'band': 'Spunk Rock'}]
lookup += [{'band_string': 'badreligion', 'band': 'Bad Religion'}]
lookup += [{'band_string': 'bliss avenue', 'band': 'Bliss Avenue'}]
lookup += [{'band_string': 'kamasiwashington', 'band': 'Kamasi Washington'}]
lookup += [{'band_string': 'sly eyes', 'band': 'Sly Eyes'}]

for thisband in lookup:
    df.loc[df['band_string'] == thisband['band_string'], 'band'] = thisband['band']
    df.loc[df['band_string'] == thisband['band_string'], 'abbrev_type'] = 'notfound'
    
print('Identifying band strings with unknown abbreviations...')
unknown_bands = ['B&amp;B', 'CBP', 'EOC &amp; Friends', 'ER', 'FE', 'FL', 'Feldman', 'Fight', 'Grim', 'Healy']
unknown_bands += ['LL', 'LP', 'ReinaCollinsBandCCR', 'SR', 'Schmid-Hagen-Frohn-Haering', 'acornacopia', 'aghosh']
unknown_bands += ['aosw', 'bandb', 'blair', 'blythe', 'bsb', 'cornell', 'crbh', 'davidosky', 'ej_bj', 'high']
unknown_bands += ['hotcreekinacoustics', 'in2ph', 'iq', 'isd', 'jason', 'jlyys', 'jmt', 'julia', 'katie', 'kwmtal']
unknown_bands += ['le', 'lhb', 'lofs', 'rf', 'rm', 'scbb', 'sp', 'tth', 'wj&amp;nl', 'yusuf']
unknown_bands += ['Buben', 'Bul', 'HIM', 'JOEJ', 'LK', 'PAWS', 'R', 'RF', 'RTBand', 'STE', 'costbrist', 'kv']
unknown_bands += ['mcooleyt', 'prrp']

for thisband in unknown_bands:
    df.loc[df['band_string'] == thisband, 'band'] = 'unknown'
    df.loc[df['band_string'] == thisband, 'abbrev_type'] = 'unknown'    

likely_songs = ['Frau, ich freu mich', "Goin' Down", 'How Many More Years', "Maggie's Farm"]
likely_songs += ['Never Give All Your Heart', 'No Good Place for the Lonely', 'Open All Night']
likely_songs += ['Pay Me My Money Down', 'Red Is The Rose', 'Religion', 'The Finish Line']
likely_songs += ['This News Is Captured!', "Where It's At - Good Times - China Girl - Pocket Calculator - 1999"]
likely_songs += ['Woke Up Dreaming']
likely_songs += ["11 O'clock tick tock", "12-8 Theme-Esther's Nose Job", '25 Or 6 To 4', '3 Stooges Intro']
likely_songs += ['A Head Full Of Dreams', 'A House is not a Motel', 'After Midnight']
likely_songs += ['After The Gold Rush', 'After The Goldrush', 'After the Gold Rush', "Ain't I Woman"]
likely_songs += ['All By Myself', 'All Fired Up', 'Always There Anyway', "Apex Predator &acirc;\x80\x93 Easy Meat"]
likely_songs += ["Baby Please Don't Leave", 'Black Magic Woman_Gypsy Queen', 'Blow Your Trumpets Gabriel']
likely_songs += ['Born On The Bayou', "Can't Explain", 'Careful With That Axe, Eugene', 'Carry That Weight']
likely_songs += ['Caught in a mosh', 'Chasing The Wind', 'Come And Get It', 'Comfortably Numb', 'Coming Back To Life']
likely_songs += ['Crane Wife', 'Dames Booze Chains And Boots', 'Dames, Booze, Chains &amp; Boots']
likely_songs += ['Dames, Booze, Chains And Boots', 'Dames, Booze, Chains, &amp; Boots']
likely_songs += ["Damn Right I've Got The Blues", 'DanceOnAVolcano', 'Dancing With the Moonlit Knight']
likely_songs += ['Dazed And Confused', 'Dazzle Ships Part 2, 3 &amp; 7', "Don't Cry no Tears"]
likely_songs += ["Don't Talk To Strangers", 'Earthrise-First Night-Rhayder-Uneven Song']
likely_songs += ['End of my line', 'End of the Line', 'Errant Girl for Rhythm', 'Every Step Of The Way']
likely_songs += ["Everybody's On The Run", "Everybody's on the Run", 'Facelift-Slightly All The Time-Moon In June']
likely_songs += ['Falling In Love Again', 'Finish Your Job', 'Fireworks Medley', "Floyd's Guitar Blues"]
likely_songs += ['Fly Like An Eagle', 'Garden State Parkway Blues', 'Get Right Back', 'Getchall In The Mood']
likely_songs += ['Girls Just Wanna Have Fun', 'Give Me What I Want And Give It', 'Heart Of Gold', 'Hells Bells']
likely_songs += ["Help, I'm A Rock", "Help, I'm a Rock", 'Henery Parsons Died', 'HighballWithTheDevil']
likely_songs += ['Highway 61 Revisited', "Hold On, I'm Coming", 'I Love Being Here With You']
likely_songs += ['I Wanna Be Adored', 'I Want More-Gomorrha-Improvisation-Vernal Equinox']
likely_songs += ['I Wonder (Why Are You So Mean To Me)', "I Wouldn't Need You", 'I am the sea']
likely_songs += ['I wanna hold your hand', "I'm Always in Love", "I'm coming from you"]
likely_songs += ["I've Been Waiting For You", 'I`m On Your Side', 'If I Keep My Heart Out Of Sight']
likely_songs += ['If You Leave', 'Impaled Intro', 'Improvisation-Stone Strike', 'In the long long run']
likely_songs += ['Iron Butterfly Theme', 'Is It My Body', 'Is She Really Going Out With Him', "It's No Use"]
likely_songs += ["It's electric", 'Jesus is Just Alright', 'Jungleland', 'Keep Your Lamp Trimmed And Burning']
likely_songs += ['King of contradiction', 'Lafayette Waltz-Jumbalaya', 'Layla', 'Let It Rock']
likely_songs += ['Life on Mars', 'Light My Fire', 'Like A Rolling Stone', "Livin' On The Edge"]
likely_songs += ["Main Title from 'Terminator 2'", 'Make Me Smile', 'Mama Knows']
likely_songs += ['Miss You', "Mojito Moon (Michaela's Song)", 'Moma Dance', 'Mr. Tambourine Man']
likely_songs += ['Muleskinner Blues', 'My Love Will Not Let You Down', 'Mysterious Ways (feat. Sexual Healing)']
likely_songs += ['Never Gonna Change', 'Never Let Go', 'Nice, Nice, Very Nice', 'Ode to Sad Disco']
likely_songs += ['Once Bitten Twice Shy', "One Of These Days I'll Get An Early Night"]
likely_songs += ['Only You Know And I Know', 'Our Soundtrack', 'Over the Rainbow']
likely_songs += ['Questions 67 &amp; 68', 'Quinn The Eskimo (The Mighty Quinn)']
likely_songs += ['Rainy Day Women #12 &amp; 35', 'Rat Salad with Drum Solo', 'Reptile']
likely_songs += ['Rockin In The Free World', "Rockin' Down The Highway", "Rockin' Down the Highway"]
likely_songs += ["Rockin' In The Free World", 'Second Ballad', 'Second Time Around']
likely_songs += ["Separate Ways (Worlds Apart)", 'Session Musian-Jingles Cabaret Merseybeat']
likely_songs += ['Set The Controls For The Heart Of The Sun', 'Set The House Ablaze', 'Seven Seas Of Rhye']
likely_songs += ['She Belongs To Me', 'She Belongs to Me', 'Shelter Me', 'Shining Star']
likely_songs += ['Shoot A Hole Into The Sun', 'Shout', 'Silver Machine and Waiting For The Man']
likely_songs += ['Smoke On The Water', 'So There', "Some Fella's Heartbreaker", "Somebody's Knocking"]
likely_songs += ['Something In the Way She Moves', 'Somethings Happening', 'Sometimes']
likely_songs += ['Sometimes', 'Son of a Son of a Sailor', 'Son of a gun']
likely_songs += ["Space Truckin' ~ Woman From Tokyo ~ Paint It Black", 'Star Spangled Banner']
likely_songs += ['Stealing Time From The Faulty Plan', 'Suite In C', 'Sunday Bloody Sunday']
likely_songs += ['Synchronicity', 'THE CHANT HAS JUST BEGUN', "Take It All Why Dont'cha"]
likely_songs += ['Take The Night Off', 'Take Us Back', 'Tangled Up In Blue']
likely_songs += ['Tear My Stillhouse Down', 'The Power Of Love', 'The Revolution Will Be Televised']
likely_songs += ['The Sun Roars Into View', 'The_Killer_Is_Me', "Theme-Esther's Nose Job"]
likely_songs += ["There's No Underwear in Space", "Thing's Going On", 'Things Can Only Get Better']
likely_songs += ['Things Have Changed', 'Thick As A Brick', 'Thus Spoke Zarathustra']
likely_songs += ['Tomorrow Is A Long Time', 'WE WISH YOU WELL']
likely_songs += ['Wake Up Little Suzie (The Everly Brothers cover)', 'Walk This Way']
likely_songs += ["Wasn't Yesterday Great", "We're A Happy Family", "We're Not Gonna Take It"]
likely_songs += ["What You Don't Know", 'When The Saints Go Marching']
likely_songs += ['What People Are Made Of', "When You've Got A Good Friend", "When my baby's beside me"]
likely_songs += ["Where It's At", "White Horse Gyspy Sally's", 'Who Do You Love', 'Whole Lotta Love']
likely_songs += ['Wild Night', 'Will You Say You Will', 'William Tell Overture']
likely_songs += ["You Can't Always Get What You Want", "You Don't Love Me", 'You Give Love A Bad Name']
likely_songs += ['You Gotta Believe', "You're My Love Interest", "You're So Fine", 'after the gold rush']
likely_songs += ['doctor doctor', 'dori theme', 'down in the tubestation at midnight']
likely_songs += ["everbody's_on_the_run", 'flying over the odra valley']
likely_songs += ['hey that_s no way to say goodbye', "i'll take care of you"]
likely_songs += ['kyrie aus der messe in h-moll', 'ode to satie', 'pay me my money down']

print('finding band strings which are more likely song names...')
for thisband in likely_songs:
    df.loc[df['band_string'] == thisband, 'band'] = 'notband'
    df.loc[df['band_string'] == thisband, 'abbrev_type'] = 'likelysong'
# this needs to be a special case because REASONS:
df.loc['566920.html', ['band', 'abbrev_type']] = ['notband', 'likelysong']

print('Identifying band strings that are likely not bands or songs or anything useful to our study...')
not_bands = ['Bottom', 'Century Hall', 'M.O', 'Memorial for Pat and Rick', 'Miami', 'OrchestFestLucerne']
not_bands += ['Simpsons', 'StuPhil', 'TBPFS', 'maniacs', 'padded', 'tatadios', 'thanks']
not_bands += ['Announcement', 'Applause', 'Audience', 'Audio Track', 'AudioTrack', 'Band Intro', 'Band Introduction']
not_bands += ['Banter', 'Conversation-Solo Bass', 'Crew Soundchecking Instruments &amp; Mics Pre Show', 'Crowd', 'Cue']
not_bands += ['Dedication To Dick Latvala', 'Dialogue', 'Drumming', 'Encore', 'End', 'Enter Stage', 'First Set']
not_bands += ['INTRO', 'Instrumental', 'Interview', 'Intro', 'Intro #1', 'Intro ( Loony Tunes', 'Intro (Drill Sergeant)']
not_bands += ["Intro (Hybrid Mix of ''Procession'' &amp; ''Flash (Concert Intro Version)'')", 'Intro (Marc Riley)']
not_bands += ['Intro (Sounds!)', 'Intro 2', 'Intro DJ', 'Intro Supertzar', 'Intro Terminator Title Credits']
not_bands += ['Intro and Violet Hill', 'Intro by Michael Tearson', 'Intro pt. 1', 'Intro pt.1', 'Intro.']
not_bands += ['Introduction', 'Matrix', 'Medley', 'Medley-Different For Girls+ISRGOWH-fixed']
not_bands += ['Opening S.E.', 'Overture', 'PART TWO', 'Para Lennon e McCartney', 'Poem', 'Pop music']
not_bands += ['Presentation', 'Punk Rock And Roll', 'Radio Intro', 'Radio intro', 'Sample 2']
not_bands += ['Seattle2Track01', 'Set 1', 'Taking the stage', 'The drum thing', 'Track one', 'TrackNo01']
not_bands += ['TransitAndSidinglyrics', 'Tuning', 'Unknown Track', 'WPIX Introduction']
not_bands += ['[Intro]', '[Soundcheck]', '[audience]', '[venue announcements]', 'applause-talk']
not_bands += ['arrival', 'artist', 'banter', 'commentary', 'crowd', 'crowd banter', 'fixed']
not_bands += ['intro', 'intro (video', 'intro)', 'introduction', 'opening', 'radiointro', 'the blues banter']
not_bands += ['stage chat', 'tribute to keith jarrett', 'unknown title', 'unknown)', 'welcome applause']

for thisband in not_bands:
    df.loc[df['band_string'] == thisband, 'band'] = 'notband'
    df.loc[df['band_string'] == thisband, 'abbrev_type'] = 'notband'

print('finding band strings which are more likely composers...')
composers = ['Brahms', 'Debussy', 'John Cage', 'Mahler', 'Mozart', 'Strauss', 'Teleman', 'Wagner']
for thisband in composers:
    df.loc[df['band_string'] == thisband, 'band'] = 'notband'
    df.loc[df['band_string'] == thisband, 'abbrev_type'] = 'composer'

print('the rest of the band strings are full band names, so set the band value and set abbrev_type to notfound...')
df.loc[(df['abbrev_type'].isnull()) & 
       (df['band_string'].notnull()) & 
       (df['band_string'].apply(lambda x: str(x).islower() == True)), 
      'band'] = df['band_string'][(df['abbrev_type'].isnull()) & 
       (df['band_string'].notnull()) & 
       (df['band_string'].apply(lambda x: str(x).islower() == True))].apply(lambda x: x.capitalize())

df.loc[
    (df['band_string'].notnull()) & 
    (df['abbrev_type'].isnull()) & 
    (df['band_string'].apply(lambda x: str(x).islower())) &
    (df['band_string'].apply(lambda x: str(x).capitalize()) == df['band'])
, 'abbrev_type'] = 'notfound'

df.loc[df['band_string'].notnull() & df['abbrev_type'].isnull(), 'band'] = df['band_string'][df['band_string'].notnull() & df['abbrev_type'].isnull()]
df.loc[df['band_string'].notnull() & df['abbrev_type'].isnull(), 'abbrev_type'] = 'notfound'    


print('\nFound {0:.0f} unique band strings; assigned {1:.0f} unique bands; processed {2:.0f} rows!'.format(
    len(df['band_string'].dropna().drop_duplicates()), 
    len(df['band'].dropna().drop_duplicates()), 
    len(df['band_string'].dropna())))

#print('\nband strings yet to be identified (n = {0:,.0f}):'.format(len(df['band_string'][df['abbrev_type'].isnull()].drop_duplicates())))
#df['band_string'][df['abbrev_type'].isnull()].drop_duplicates().sort_values()[600:] 

showorder = ['canonical', 'variant etree', 'variant archive.org', 'incorrect']
showorder += ['notfound', 'unknown', 'composer', 'likelysong', 'notband', 'blank']

print('\n',df.groupby('abbrev_type').size().reindex(showorder))

print('Joining this work to the music_df dataframe...')
music_df = music_df.join(df[['band', 'abbrev_type']])

print('writing out intermediate data, then dropping...')
df.to_csv(intermediate_datadir + 'band_identifying.csv', encoding='utf-8')
df = df.reset_index().iloc[0:0].drop(df.reset_index().columns.tolist(), axis=1)  # trick to kill a dataframe: https://stackoverflow.com/questions/39173992/drop-all-data-in-a-pandas-dataframe
gc.collect()

print('backing up...')
music_df_bk = music_df

print('done')


retrieving from backup...
joining archive guide file to band name file...
Getting band names for bands in the etree and archive.org list...
Getting band names for bands NOT in the etree and archive.org list...
Identifying band strings with unknown abbreviations...
finding band strings which are more likely song names...
Identifying band strings that are likely not bands or songs or anything useful to our study...
finding band strings which are more likely composers...
the rest of the band strings are full band names, so set the band value and set abbrev_type to notfound...

Found 1506 unique band strings; assigned 991 unique bands; processed 2972 rows!

 abbrev_type
canonical              351 
variant etree          20  
variant archive.org    97  
incorrect              87  
notfound               1392
unknown                77  
composer               11  
likelysong             259 
notband                238 
blank                  440 
dtype: int64
Joining this work to the music_d

In [34]:
print('retrieving from backup...')
music_df = music_df_bk

print('writing out final file...')
music_df.to_csv(datadir+'filenames_firstpass_complete.csv', encoding='utf-8')

print("Done!")

retrieving from backup...
writing out final file...
Done!


In [35]:
z = pandas.read_csv(datadir+'filenames_firstpass_complete.csv', encoding='utf-8', index_col='id')
z

Unnamed: 0_level_0,tracker_site,filename,filetype,stem,date_string,thedate,justyear,justmonth,date_format,band_string,band_string_right_before_date_string,band,abbrev_type
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
332193a21424db9e6014b3192b365516db5a3de9,LosslessLegs,jgb1974-11-02.gs.aud.castelli.motb.d1t01.flac,flac,jgb1974-11-02.gs.aud.castelli.motb.d1t01,1974-11-02,1974-11-02,,,yyyy-mm-dd,jgb,True,Jerry Garcia Band,canonical
96e76265597a62e9f3a9556776737c473116b3f7,LosslessLegs,gd1992-12-06.AKG451.t01.flac,flac,gd1992-12-06.AKG451.t01,1992-12-06,1992-12-06,,,yyyy-mm-dd,gd,True,Grateful Dead,canonical
f87760707cc1f41d6ca4cf566995e49d397e79cf,LosslessLegs,02 Tear My Stillhouse Down.flac,flac,02 Tear My Stillhouse Down,,,,,,Tear My Stillhouse Down,False,notband,likelysong
6dd5d5a349fea7b0a59fc5fa35a2070008c9498e,LosslessLegs,smiths1984-02-24t04.flac,flac,smiths1984-02-24t04,1984-02-24,1984-02-24,,,yyyy-mm-dd,smiths,True,The Smiths,notfound
cd1c8a96322419332b10e785012c7c15e97ee44c,LosslessLegs,StingGabriel2016-06-26Camden.t18.flac,flac,StingGabriel2016-06-26Camden.t18,2016-06-26,2016-06-26,,,yyyy-mm-dd,StingGabriel,True,Sting & Peter Gabriel,notfound
63522bdcfb22394970ebdf48c9f757be909369c8,LosslessLegs,TrackNo01.flac,flac,TrackNo01,,,,,,TrackNo01,False,notband,notband
27b5823ccebd72da0c4816eccef6be226b3924e8,LosslessLegs,widespreadpanic2016-06-21t01.cm300s.cp1s.16bit.flac,flac,widespreadpanic2016-06-21t01.cm300s.cp1s.16bit,2016-06-21,2016-06-21,,,yyyy-mm-dd,widespreadpanic,True,Widespread Panic,incorrect
ba7d86ba173e83f002eee50095f03e75785ec626,LosslessLegs,jg+ms1974-11-05a-new-d1t01.shn,shn,jg+ms1974-11-05a-new-d1t01,1974-11-05,1974-11-05,,,yyyy-mm-dd,jg+ms,True,Jerry Garcia & Merl Saunders,notfound
9184a411bc37d4eef784a4f6aea29a2e3b7e41fb,LosslessLegs,garcia74-11-05d1t01.flac,flac,garcia74-11-05d1t01,74-11-05,1974-11-05,,,yy-mm-dd,garcia,True,Jerry Garcia,canonical
7e6510d783b6e191a3946c8e808e51b896cdbd27,LosslessLegs,gd1992-12-06.AKG451.t01.flac,flac,gd1992-12-06.AKG451.t01,1992-12-06,1992-12-06,,,yyyy-mm-dd,gd,True,Grateful Dead,canonical
