In [84]:
import pandas as pd

In [85]:
pd.set_option('display.max_rows', None)

In [86]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

#Create a dictionary for our URLs to for loop through, each year t's filing is for fiscal year t-1
urls = {
    2006: "https://investors.livenationentertainment.com/sec-filings/annual-reports/content/0000950129-07-001097/h44068e10vk.htm",
    2007: "https://investors.livenationentertainment.com/sec-filings/annual-reports/content/0001193125-08-043193/d10k.htm",
    2008: "https://investors.livenationentertainment.com/sec-filings/annual-reports/content/0001193125-09-045320/d10k.htm",
    2009: "https://investors.livenationentertainment.com/sec-filings/annual-reports/content/0001193125-10-040804/d10k.htm",
    2010: "https://investors.livenationentertainment.com/sec-filings/annual-reports/content/0001193125-11-050243/d10k.htm",
    2011: "https://investors.livenationentertainment.com/sec-filings/annual-reports/content/0001193125-12-075895/d277780d10k.htm",
    2012: "https://investors.livenationentertainment.com/sec-filings/annual-reports/content/0001193125-13-077102/d466140d10k.htm",
    2013: "https://investors.livenationentertainment.com/sec-filings/annual-reports/content/0001335258-14-000027/lyv-20131231x10k.htm",
}

#Create an empty list to append onto
df_list = []

#Start out the for loop to go through each URL
for i, (year, url) in enumerate(urls.items()):
    #Write note to inform of what is being scraped for each year
    print(f"Scraping Live Nation venue control data for {year}...")

    #Get the code from the website
    response = requests.get(url)
    soup = BeautifulSoup(response.content, "html.parser")

    #Only going to look through tables, because that's where the venue control data is
    tables = soup.find_all("table")

    #Due to slight differences in tables, need to select for differing columns based on the year's
    #venue control data
    
    if i == 0:
        cols = [0, 4, 6, 9]  #First year
    elif 1 <= i <= 3:
        cols = [0, 4, 6, 8]  #Next 3 years
    elif 4 <= i <= 6:
        cols = [0, 4, 6, 9]  #Next 3 years
    else:
        cols = [0, 1, 2, 3]  #Last year

    stop_scraping = False

    #Look through each table
    for table in tables:
        if stop_scraping:
            break
        #Store the rows
        rows = table.find_all("tr")
        #Loop through each of the rows
        for row in rows:
            if stop_scraping:
                break
            #Take table data and table header
            cells = row.find_all(["td", "th"])
            #Store the data from the table data and header
            cell_texts = [cell.get_text(strip=True).lower() for cell in cells]

            #Look for the word "amphitheater" in the row 
            if any("amphitheater" in text for text in cell_texts):
                #Create a list of the cleaned up elements from the table data
                row_data = [cell.get_text(strip=True) for cell in cells]
                
                if len(row_data) > max(cols):
                    #Stop scraping when it gets to a row that has just "amphitheater" in the amphitheater name 
                    #because the rows including and after that are irrelevant information
                    if row_data[0].strip().lower() == "amphitheater":
                        stop_scraping = True
                        break
                    #Create rows of our data
                    selected_data = [year] + [row_data[col] for col in cols]
                    df_list.append(selected_data)

#Create column names for our dataset
columns = ["Year", "Amphitheater Name", "Venue Type", "Venue Control", "Venue Capacity"]

#Convert to a dataframe
df = pd.DataFrame(df_list, columns=columns)

#Set the index to be Year to track venues over time
df = df.set_index("Year")


Scraping Live Nation venue control data for 2006...
Scraping Live Nation venue control data for 2007...
Scraping Live Nation venue control data for 2008...
Scraping Live Nation venue control data for 2009...
Scraping Live Nation venue control data for 2010...
Scraping Live Nation venue control data for 2011...
Scraping Live Nation venue control data for 2012...
Scraping Live Nation venue control data for 2013...


In [87]:
#Create venue control modality indicator variables
control_types = {
    'Venue Owned': 'owned',
    'Venue EBR': 'booking agreement',
    'Venue Leased': 'lease',
    'Venue Operated': r"license|management"
}

#For loop to go through each variable, making the new variable equal to 1 if the Venue Control variable has
#that phrase in it
for col, pattern in control_types.items():
    df[col] = df['Venue Control'].astype(str).str.contains(pattern, case=False, na=False).astype(int)

import re

#Creating variable "Expiration Year" which will tell us what year the venue control expires
df["Expiration Year"] = df["Venue Control"].astype(str).apply(
    lambda x: int(max(re.findall(r"\d{4}", x))) if re.findall(r"\d{4}", x) else None
)

#Convert from a float to an integer, so we don't have 2024.0 but 2024
df["Expiration Year"] = df["Expiration Year"].astype('Int64') 

#View the resulting dataframe
df

Unnamed: 0_level_0,Amphitheater Name,Venue Type,Venue Control,Venue Capacity,Venue Owned,Venue EBR,Venue Leased,Venue Operated,Expiration Year
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2006,PNC Bank Arts Center,Amphitheater,"22-year lease that expires October 31, 2017",17500,0,0,1,0,2017.0
2006,Nikon at Jones Beach Theater,Amphitheater,20-year license agreement that expiresDecember...,14400,0,0,0,1,2019.0
2006,Randall’s Island,Amphitheater,Booking agreement,20000,0,1,0,0,
2006,Hyundai Pavilion at Glen Helen,Amphitheater,"25-year lease that expires June 30, 2018",65000,0,0,1,0,2018.0
2006,Verizon Wireless Amphitheater,Amphitheater,"20-year lease that expires February 28, 2017",16300,0,0,1,0,2017.0
2006,Gibson Amphitheatre,Amphitheater,"15-year lease that expires September 9, 2014",6185,0,0,1,0,2014.0
2006,First Midwest Bank Amphitheatre,Amphitheater,Owned,28600,1,0,0,0,
2006,Charter One Pavilion at\nNortherly Island,Amphitheater,"3-year lease that expires December 31, 2007",8500,0,0,1,0,2007.0
2006,Tweeter Center at the Waterfront,Amphitheater,"31-year lease that expires September 29, 2025",25000,0,0,1,0,2025.0
2006,Shoreline Amphitheater at\nMountain View,Amphitheater,"20-year lease that expires December 31, 2025",22000,0,0,1,0,2025.0
