## Pearl Jam Setlist Webscraper

In [None]:
########################################
#Program: Setlist_Scraper.py
#Author: Seth Thompson
#Date: 9/17/2019
#Description: Gathers the date, location, venue
#             and setlist of every Pearl Jam show
########################################
#***************************************
#           DISCLAIMER!!!
# This program is not intended for any 
# commercial use and shall NOT be used
# for any monetary gain or commercial
# reasons. This is merely an exercise 
# in web scraping and the data is 
# intended for personal use only.
#***************************************

In [2]:
## Import statements
import numpy as np
import pandas as pd
import urllib.request
import requests
import time
from bs4 import BeautifulSoup

In [3]:
## Base URL of a single show page; the numeric show id is appended to it
## when looping through all shows
link = "http://www.livefootsteps.org/setlist/?show_id="

In [5]:
## Scrape every show page and collect one record per show.
## Each record is a tuple (date, location, venue, setlist), where setlist
## is a list of song-title strings taken from the page's <h6> elements.
records = []
num_shows = 1037

for i in range(1, num_shows + 1):
    url = link + str(i)
    ## A timeout keeps one stalled connection from hanging the whole run
    response = requests.get(url, timeout=30)
    soup = BeautifulSoup(response.text, 'html.parser')

    results = soup.find_all('h6')
    date = soup.find('h1').get_text().strip()
    location = soup.find('h3', style="margin-left:20px;color:#959595;").get_text()
    venue = soup.find('h3', style="margin-bottom:1px;").get_text()
    #print(i) ##Simply used to see what iteration is currently being processed

    ## The first and last <h6> elements are skipped (presumably they are
    ## not songs -- TODO confirm against the page markup), and the
    ## 'LIVE DEBUT' marker is discarded because it is not a song title.
    titles = (tag.get_text() for tag in results[1:-1])
    setlist = [title for title in titles if title != 'LIVE DEBUT']

    ## A populated setlist is stored as-is. It must never be appended to
    ## itself, which would leave a self-referential element in the record.
    if not setlist:
        setlist.append('No setlist information available...') ## Some setlist info not available

    records.append((date, location, venue, setlist)) ## append a tuple to records
    time.sleep(1) ## Delay as to not overwhelm website


In [6]:
## Tabulate the scraped tuples, one row per show and one column per field
df = pd.DataFrame(
    data=records,
    columns=['date', 'location', 'venue', 'setlist'],
)
## Label the rows 1..N instead of pandas' default 0-based index
df.index = np.arange(1, df.shape[0] + 1)

In [7]:
## Preview the first five rows of the table
df.head(5)

Unnamed: 0,date,location,venue,setlist
1,"October 22, 1990","Seattle, WA",Off Ramp Café,"[Release, Alone, Alive, Once, Even Flow, Black..."


In [99]:
## Preview the last five rows of the table
df.tail(5)

Unnamed: 0,date,location,venue,setlist
1033,"August 13, 2018","Missoula, MT",Washington-Grizzly Stadium,"[Pendulum, Low Light, Go, Do The Evolution, Ar..."
1034,"August 18, 2018","Chicago, IL",Wrigley Field,"[Wash, Low Light, Elderly Woman Behind The Cou..."
1035,"August 20, 2018","Chicago, IL",Wrigley Field,"[Given To Fly, Why Go, Go, Last Exit, Mind You..."
1036,"September 2, 2018","Boston, MA",Fenway Park,"[Sometimes, Release, Low Light, Elderly Woman ..."
1037,"September 4, 2018","Boston, MA",Fenway Park,"[Given To Fly, Animal, Save You, Arms Aloft, L..."


In [100]:
## Write the table to disk so later processing can start from the file;
## the row index is written as a column named 'Show #'
df.to_csv(
    'PJ_Setlists.csv',
    index=True,
    index_label='Show #',
    encoding='utf-8',
)