In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime, timedelta
import time
import logging

# Set up logging.
# force=True replaces any handlers already attached to the root logger;
# without it basicConfig() is a silent no-op when the root logger has been
# configured before (e.g. by the notebook kernel or a previous run of this cell).
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    force=True,
)

def extract_text_safely(element, selector):
    """Return the stripped text of the first node matching ``selector``.

    Args:
        element: Object exposing ``select_one(selector)`` (for example a
            BeautifulSoup tag).
        selector: CSS selector string passed to ``select_one``.

    Returns:
        The matched node's ``.text`` with surrounding whitespace removed, or
        ``None`` when the lookup fails with ``AttributeError`` (for instance
        because ``select_one`` returned ``None``).
    """
    try:
        node = element.select_one(selector)
        cleaned = node.text.strip()
    except AttributeError:
        # No match (or an object without the expected attributes): report "missing".
        cleaned = None
    return cleaned

def scrape_billboard_hot_100(start_date, end_date, request_timeout=30, delay_seconds=2):
    """Scrape Billboard Hot 100 chart pages at weekly steps between two dates.

    Starting at ``start_date`` and advancing seven days at a time while the
    date is ``<= end_date``, requests ``<base_url><YYYY-MM-DD>`` and extracts
    rank, title and artist from every chart row found on the page. Pages that
    fail to download are logged and skipped; rows missing any of the three
    fields are logged as warnings and skipped.

    Args:
        start_date: ``datetime`` of the first page to request.
        end_date: ``datetime``; no page dated after this is requested.
        request_timeout: Seconds to wait for each HTTP response before the
            request is abandoned (a timeout is a ``requests.RequestException``
            and is handled like any other fetch failure).
        delay_seconds: Pause between consecutive requests, applied after
            failed requests as well as successful ones.

    Returns:
        ``pandas.DataFrame`` with the columns ``date``, ``rank``, ``song`` and
        ``artist``. ``date`` is the *requested* date string, not a date read
        from the page. The columns are present even when no rows were scraped.
    """
    base_url = "https://www.billboard.com/charts/hot-100/"
    columns = ['date', 'rank', 'song', 'artist']
    data = []

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    current_date = start_date
    while current_date <= end_date:
        date_str = current_date.strftime("%Y-%m-%d")
        url = base_url + date_str
        logging.info(f"Scraping data for {date_str}")

        try:
            # Without a timeout a stalled connection would block the loop forever.
            response = requests.get(url, headers=headers, timeout=request_timeout)
            response.raise_for_status()

            soup = BeautifulSoup(response.text, 'html.parser')
            chart_items = soup.select("ul.o-chart-results-list-row")

            logging.info(f"Found {len(chart_items)} chart items on the page")

            for item in chart_items:
                try:
                    rank = extract_text_safely(item, "span.c-label.a-font-primary-bold-l")
                    title = extract_text_safely(item, "h3#title-of-a-story")
                    # NOTE(review): "a-no-trucate" is kept exactly as written;
                    # presumably it mirrors the class name used by the site - confirm.
                    artist = extract_text_safely(item, "span.c-label.a-no-trucate.a-font-primary-s")

                    if rank and title and artist:
                        data.append({
                            'date': date_str,
                            'rank': rank,
                            'song': title,
                            'artist': artist
                        })
                        logging.info(f"Successfully scraped: Rank {rank} - {title} by {artist}")
                    else:
                        logging.warning(f"Incomplete data: Rank: {rank}, Title: {title}, Artist: {artist}")

                except Exception as e:
                    # Best effort: one malformed row must not abort the whole page.
                    logging.error(f"Error processing chart item: {e}")

        except requests.RequestException as e:
            logging.error(f"Error fetching data for {date_str}: {e}")

        current_date += timedelta(days=7)  # Billboard charts update weekly

        # Pause before the next request whether or not this one succeeded, so
        # a run of failures does not turn into back-to-back requests. No pause
        # is needed once the last page has been handled.
        if current_date <= end_date:
            time.sleep(delay_seconds)

    # Explicit columns keep the frame's schema stable when nothing was scraped.
    return pd.DataFrame(data, columns=columns)

# Date range to collect: every weekly step within calendar year 2023
start_date = datetime(year=2023, month=1, day=1)
end_date = datetime(year=2023, month=12, day=31)

# Run the scraper over that range
billboard_df = scrape_billboard_hot_100(start_date, end_date)

# Persist the scraped rows and report how many were collected
billboard_csv_path = 'billboard_hot_100_2023.csv'
billboard_df.to_csv(billboard_csv_path, index=False)
logging.info("Data saved to %s", billboard_csv_path)
print(f"Total songs scraped: {len(billboard_df)}")

# Preview the first few rows of the dataframe
print(billboard_df.head())

Total songs scraped: 2600
         date rank                               song        artist
0  2023-01-01    1    All I Want For Christmas Is You  Mariah Carey
1  2023-01-01    2  Rockin' Around The Christmas Tree    Brenda Lee
2  2023-01-01    3                   Jingle Bell Rock   Bobby Helms
3  2023-01-01    4                     Last Christmas         Wham!
4  2023-01-01    5            A Holly Jolly Christmas     Burl Ives


In [None]:
pip install spotipy pandas tqdm

Collecting spotipy
  Downloading spotipy-2.24.0-py3-none-any.whl.metadata (4.9 kB)
Collecting redis>=3.5.3 (from spotipy)
  Downloading redis-5.0.8-py3-none-any.whl.metadata (9.2 kB)
Collecting async-timeout>=4.0.3 (from redis>=3.5.3->spotipy)
  Downloading async_timeout-4.0.3-py3-none-any.whl.metadata (4.2 kB)
Downloading spotipy-2.24.0-py3-none-any.whl (30 kB)
Downloading redis-5.0.8-py3-none-any.whl (255 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m255.6/255.6 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading async_timeout-4.0.3-py3-none-any.whl (5.7 kB)
Installing collected packages: async-timeout, redis, spotipy
Successfully installed async-timeout-4.0.3 redis-5.0.8 spotipy-2.24.0


In [None]:
pip install selenium webdriver_manager feedparser

Collecting selenium
  Downloading selenium-4.24.0-py3-none-any.whl.metadata (7.1 kB)
Collecting webdriver_manager
  Downloading webdriver_manager-4.0.2-py2.py3-none-any.whl.metadata (12 kB)
Collecting feedparser
  Downloading feedparser-6.0.11-py3-none-any.whl.metadata (2.4 kB)
Collecting trio~=0.17 (from selenium)
  Downloading trio-0.26.2-py3-none-any.whl.metadata (8.6 kB)
Collecting trio-websocket~=0.9 (from selenium)
  Downloading trio_websocket-0.11.1-py3-none-any.whl.metadata (4.7 kB)
Collecting python-dotenv (from webdriver_manager)
  Downloading python_dotenv-1.0.1-py3-none-any.whl.metadata (23 kB)
Collecting sgmllib3k (from feedparser)
  Downloading sgmllib3k-1.0.0.tar.gz (5.8 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting outcome (from trio~=0.17->selenium)
  Downloading outcome-1.3.0.post0-py2.py3-none-any.whl.metadata (2.6 kB)
Collecting wsproto>=0.14 (from trio-websocket~=0.9->selenium)
  Downloading wsproto-1.2.0-py3-none-any.whl.metadata (5.6 kB)
Col

In [None]:
import pandas as pd
from datetime import datetime, timedelta
import random

def generate_concert_data(artists, venues, start_date, end_date, seed=None):
    """Generate synthetic concert records for a list of artists.

    Each distinct artist receives between 3 and 15 events. Every event gets a
    random date between ``start_date`` and ``end_date`` (both inclusive), a
    random ``(venue, location)`` pair from ``venues`` and an event type that
    is ``'Concert'`` with probability 0.8 and ``'Festival'`` otherwise.

    Args:
        artists: Iterable of artist names. Repeated names are collapsed
            (first occurrence wins) so no artist exceeds the 15-event cap.
        venues: Sequence of ``(venue_name, location)`` pairs.
        start_date: ``datetime`` of the earliest possible event date.
        end_date: ``datetime`` of the latest possible event date.
        seed: Optional seed. When given, a private ``random.Random(seed)`` is
            used so the output is reproducible and the global ``random`` state
            is untouched; when ``None`` the module-level ``random`` generator
            is used.

    Returns:
        ``pandas.DataFrame`` with the columns ``Artist``, ``Date``
        (``YYYY-MM-DD`` string), ``Venue``, ``Location`` and ``Event_Type``.
        The columns are present even when ``artists`` is empty.

    Raises:
        ValueError: If ``end_date`` is earlier than ``start_date`` and there
            is at least one artist (raised by ``randint`` on an empty range).
        IndexError: If ``venues`` is empty and there is at least one artist
            (raised by ``choice`` on an empty sequence).
    """
    columns = ['Artist', 'Date', 'Venue', 'Location', 'Event_Type']
    rng = random.Random(seed) if seed is not None else random

    # Loop-invariant: number of whole days in the allowed date window.
    span_days = (end_date - start_date).days

    concert_data = []

    # dict.fromkeys drops repeated names while preserving their original order.
    for artist in dict.fromkeys(artists):
        # Generate 3-15 concerts for each artist
        num_concerts = rng.randint(3, 15)
        for _ in range(num_concerts):
            concert_date = start_date + timedelta(days=rng.randint(0, span_days))
            venue, location = rng.choice(venues)

            concert_data.append({
                'Artist': artist,
                'Date': concert_date.strftime("%Y-%m-%d"),
                'Venue': venue,
                'Location': location,
                'Event_Type': 'Concert' if rng.random() < 0.8 else 'Festival'
            })

    # Explicit columns keep the frame's schema stable when no rows were generated.
    return pd.DataFrame(concert_data, columns=columns)

# Extended list of artists (including original 92 and additional global artists).
# Every name appears exactly once: a repeated name would be processed twice by
# the per-artist generation loop and could exceed the intended per-artist cap.
artists = [
    "Ariana Grande", "Armani White", "Bad Bunny", "Bailey Zimmerman", "Billie Eilish",
    "Brenda Lee", "Brett Young", "Burna Boy", "Carly Pearce", "Chris Brown",
    "Chris Stapleton", "Chuck Berry", "Cody Johnson", "Cole Swindell", "DaBaby",
    "Dan + Shay", "David Kushner", "Dean Martin", "Dierks Bentley", "Doja Cat",
    "Donny Hathaway", "Drake", "Dua Lipa", "Eagles", "Ed Sheeran",
    "Elvis Presley", "Falling In Reverse", "Future", "Gabby Barrett", "Gunna",
    "Halsey", "Harry Styles", "Hozier", "JVKE", "Jack Harlow",
    "Jason Aldean", "Jax", "Jimmy Buffett", "Jon Pardi", "Jonas Brothers",
    "Jordan Davis", "Kane Brown", "Labrinth", "Lady Gaga", "Lauren Spencer Smith",
    "Lewis Capaldi", "Lil Baby", "Lil Nas X", "Linkin Park", "Lizzo",
    "Luke Bryan", "Luke Combs", "Mac DeMarco", "Manuel Turizo", "Megan Moroney",
    "Melanie Martinez", "Miley Cyrus", "Mitski", "Morgan Wallen", "Myke Towers",
    "NF", "Nat King Cole", "Niall Horan", "Nicki Minaj", "Olivia Rodrigo",
    "OneRepublic", "Parker McCollum", "Parmalee", "Paul McCartney", "Russell Dickerson",
    "Sabrina Carpenter", "Sam Smith", "Scotty McCreery", "Selena Gomez", "Shakira",
    "Sia", "Stephen Sanchez", "TWICE", "Tate McRae", "Taylor Swift",
    "The Beach Boys", "The Kid LAROI", "Thomas Rhett", "Tim McGraw", "Travis Scott",
    "Troye Sivan", "Tyler Childers", "Tyler Hubbard", "Vince Guaraldi Trio", "Wham!",
    "Zach Bryan", "blink-182",
    # Additional global artists ("Lady Gaga" is already in the first group)
    "BTS", "BlackPink", "J Balvin", "Rosalía", "Wizkid", "Aya Nakamura", "Rammstein",
    "Anitta", "Måneskin", "Dimash Kudaibergen", "Cesária Évora", "Daddy Yankee",
    "Andrea Bocelli", "Babymetal", "Céline Dion", "Stromae", "Karol G", "Maluma",
    "Adele", "Elton John", "Coldplay", "U2", "Metallica", "Red Hot Chili Peppers",
    "Beyoncé", "Rihanna", "Justin Bieber", "Eminem", "Kanye West", "Jay-Z",
    "Bruno Mars", "The Weeknd", "Katy Perry", "P!nk", "Maroon 5"
]

# Extended list of global venues as (venue name, location) pairs.
# Each physical venue is listed once so that uniform random sampling does not
# over-weight it. NOTE(review): "The O2" (London, UK) was dropped because it
# appears to be the same arena as "O2 Arena" (London, UK) - confirm.
venues = [
    ("Madison Square Garden", "New York, NY, USA"),
    ("O2 Arena", "London, UK"),
    ("AccorHotels Arena", "Paris, France"),
    ("Tokyo Dome", "Tokyo, Japan"),
    ("Sydney Opera House", "Sydney, Australia"),
    ("Wembley Stadium", "London, UK"),
    ("Staples Center", "Los Angeles, CA, USA"),
    ("Mercedes-Benz Arena", "Shanghai, China"),
    ("The SSE Hydro", "Glasgow, UK"),
    ("Ziggo Dome", "Amsterdam, Netherlands"),
    ("Olympiastadion", "Berlin, Germany"),
    ("Rogers Centre", "Toronto, ON, Canada"),
    ("Maracanã Stadium", "Rio de Janeiro, Brazil"),
    ("Arena Ciudad de México", "Mexico City, Mexico"),
    ("Mall of Asia Arena", "Manila, Philippines"),
    ("Royal Albert Hall", "London, UK"),
    ("Red Rocks Amphitheatre", "Morrison, CO, USA"),
    ("Budokan", "Tokyo, Japan"),
    ("Palau Sant Jordi", "Barcelona, Spain"),
    ("The Colosseum at Caesars Palace", "Las Vegas, NV, USA"),
    ("Etihad Stadium", "Manchester, UK"),
    ("Soldier Field", "Chicago, IL, USA"),
    ("Allianz Parque", "São Paulo, Brazil"),
    ("Rod Laver Arena", "Melbourne, Australia"),
    ("3Arena", "Dublin, Ireland"),
    ("Tauron Arena", "Kraków, Poland"),
    ("Barclays Center", "Brooklyn, NY, USA"),
    ("Mercedes-Benz Superdome", "New Orleans, LA, USA"),
    ("Ericsson Globe", "Stockholm, Sweden"),
    ("Lanxess Arena", "Cologne, Germany"),
    ("Scotiabank Arena", "Toronto, ON, Canada"),
    ("Spark Arena", "Auckland, New Zealand"),
    ("Hallenstadion", "Zurich, Switzerland"),
    ("Foro Sol", "Mexico City, Mexico"),
    ("Gelora Bung Karno Stadium", "Jakarta, Indonesia"),
    ("Stade de France", "Saint-Denis, France"),
    ("Estadio Monumental", "Buenos Aires, Argentina"),
    ("Hollywood Bowl", "Los Angeles, CA, USA"),
    ("Taipei Arena", "Taipei, Taiwan")
]

# Seed the global random generator so that re-running the notebook from a
# fresh kernel regenerates exactly the same synthetic data set.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)

# Generate data for the year 2023
start_date = datetime(2023, 1, 1)
end_date = datetime(2023, 12, 31)

concert_df = generate_concert_data(artists, venues, start_date, end_date)

# Save to CSV (the path is named once so the file and the message cannot drift apart)
concert_csv_path = 'global_concert_data_2023.csv'
concert_df.to_csv(concert_csv_path, index=False)

print(f"Data saved to {concert_csv_path}")
print(f"Total concerts generated: {len(concert_df)}")

# Display the first few rows and some basic stats
print(concert_df.head())
print(concert_df.describe())

# Show number of concerts per artist
artist_concert_counts = concert_df['Artist'].value_counts()
print("\nNumber of concerts per artist:")
print(artist_concert_counts.head(20))  # Showing top 20 for brevity

# Show most common venues
venue_counts = concert_df['Venue'].value_counts()
print("\nTop 20 most common venues:")
print(venue_counts.head(20))

# Show distribution of concert types
event_type_counts = concert_df['Event_Type'].value_counts()
print("\nDistribution of event types:")
print(event_type_counts)

Data saved to global_concert_data_2023.csv
Total concerts generated: 1166
          Artist        Date                      Venue  \
0  Ariana Grande  2023-03-15              Soldier Field   
1  Ariana Grande  2023-06-18  Gelora Bung Karno Stadium   
2  Ariana Grande  2023-02-24             Olympiastadion   
3  Ariana Grande  2023-01-17              Lanxess Arena   
4  Ariana Grande  2023-12-09                Spark Arena   

                Location Event_Type  
0       Chicago, IL, USA   Festival  
1     Jakarta, Indonesia    Concert  
2        Berlin, Germany    Concert  
3       Cologne, Germany    Concert  
4  Auckland, New Zealand   Festival  
           Artist        Date    Venue    Location Event_Type
count        1166        1166     1166        1166       1166
unique        127         343       40          33          2
top     Lady Gaga  2023-01-01  Budokan  London, UK    Concert
freq           17           8       41         107        914

Number of concerts per artist:
A