In [1]:
import requests
from bs4 import BeautifulSoup
from datetime import datetime, timedelta
import csv

# Base URL for ARIA Singles Chart; each weekly chart lives at <base_url><YYYY-MM-DD>/
base_url = "https://www.aria.com.au/charts/singles-chart/"

# Seconds to wait on a single HTTP request before giving up, so one stalled
# connection cannot hang the whole crawl
REQUEST_TIMEOUT = 30

# Generate weekly dates, stepping 7 days from start_date through end_date (inclusive)
start_date = datetime(2019, 7, 1)
end_date = datetime(2022, 12, 31)  # Adjust as needed
dates = []
current_date = start_date
while current_date <= end_date:
    dates.append(current_date.strftime('%Y-%m-%d'))
    current_date += timedelta(weeks=1)

# Open a CSV file to store the results (one row per chart entry)
with open('aria_singles_chart_2019_to_2022.csv', mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(['Date', 'Rank', 'Song', 'Artist'])

    # Loop through each date and scrape data
    for chart_date in dates:
        url = f"{base_url}{chart_date}/"
        print(f"Fetching data for {chart_date}")

        # A network-level failure (DNS, connection reset, timeout) skips this
        # week instead of aborting the remaining dates
        try:
            response = requests.get(url, timeout=REQUEST_TIMEOUT)
        except requests.RequestException as exc:
            print(f"Failed to fetch data for {chart_date} ({exc})")
            continue

        if response.status_code != 200:
            print(f"Failed to fetch data for {chart_date} (Status Code: {response.status_code})")
            continue  # Skip to the next date

        soup = BeautifulSoup(response.text, 'html.parser')

        # Find all song titles and artists
        song_titles = soup.find_all('a', class_='c-chart-item__title')
        artist_names = soup.find_all('a', class_='c-chart-item__artist')

        # Titles and artists are paired by position, so unequal counts would
        # silently misalign rows; skip the week instead
        if len(song_titles) != len(artist_names):
            print(f"Mismatch in number of songs and artists for {chart_date}. Skipping...")
            continue

        # Zero matches on both sides passes the check above but means nothing
        # was scraped; report it rather than logging a false success
        if not song_titles:
            print(f"No chart entries found for {chart_date}. Skipping...")
            continue

        # Write each song and artist to the CSV file; rank is the 1-based page order
        for rank, (song, artist) in enumerate(zip(song_titles, artist_names), start=1):
            song_name = song.get_text(strip=True)
            artist_name = artist.get_text(strip=True)
            writer.writerow([chart_date, rank, song_name, artist_name])

        print(f"Scraped data for {chart_date}")


Fetching data for 2019-07-01
Scraped data for 2019-07-01
Fetching data for 2019-07-08
Scraped data for 2019-07-08
Fetching data for 2019-07-15
Scraped data for 2019-07-15
Fetching data for 2019-07-22
Scraped data for 2019-07-22
Fetching data for 2019-07-29
Scraped data for 2019-07-29
Fetching data for 2019-08-05
Scraped data for 2019-08-05
Fetching data for 2019-08-12
Scraped data for 2019-08-12
Fetching data for 2019-08-19
Scraped data for 2019-08-19
Fetching data for 2019-08-26
Scraped data for 2019-08-26
Fetching data for 2019-09-02
Scraped data for 2019-09-02
Fetching data for 2019-09-09
Scraped data for 2019-09-09
Fetching data for 2019-09-16
Scraped data for 2019-09-16
Fetching data for 2019-09-23
Scraped data for 2019-09-23
Fetching data for 2019-09-30
Scraped data for 2019-09-30
Fetching data for 2019-10-07
Scraped data for 2019-10-07
Fetching data for 2019-10-14
Scraped data for 2019-10-14
Fetching data for 2019-10-21
Scraped data for 2019-10-21
Fetching data for 2019-10-28
Sc

KeyboardInterrupt: 

In [None]:
import csv

# Source CSV (comma-delimited) and the semicolon-delimited copy to produce
input_file = 'aria_singles_chart_2019_to_2022.csv'  # Replace with your current file name
output_file = 'aria_singles_chart_2019_to_2022_semicolon.csv'  # New file with ; delimiter

# Stream every row from the source into the destination; the csv module
# takes care of quoting any field that itself contains a ';'
with open(input_file, mode='r', newline='', encoding='utf-8') as infile, \
     open(output_file, mode='w', newline='', encoding='utf-8') as outfile:
    reader = csv.reader(infile)
    writer = csv.writer(outfile, delimiter=';')
    writer.writerows(reader)

print(f"File has been converted and saved as '{output_file}'.")


In [None]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
import time
import os

def construct_genius_url(song, artist):
    """Build a guessed Genius lyrics-page URL from a song title and artist name.

    Both values are stripped and lower-cased, then turned into hyphenated slugs:

    * artist: '&' becomes 'and'; '.', ',' and '!' are dropped; whitespace runs
      become '-'.
    * title: parenthesised text is dropped; whitespace is collapsed; '!', '?',
      '.' and ',' are dropped; whitespace becomes '-'; apostrophes are dropped.

    Args:
        song: Song title as it appears in the chart data.
        artist: Artist name as it appears in the chart data.

    Returns:
        A string of the form "https://genius.com/<artist>-<title>-lyrics".
    """
    # Ordered (pattern, replacement) substitutions for the artist slug
    artist_rules = (
        (r'[&]', 'and'),
        (r'[.,!]', ''),
        (r'\s+', '-'),
    )
    artist_slug = artist.strip().lower()
    for pattern, replacement in artist_rules:
        artist_slug = re.sub(pattern, replacement, artist_slug)

    # Drop "(...)" groups first, then tidy the whitespace they leave behind
    title_slug = re.sub(r'\([^)]*\)', '', song.strip().lower())
    title_slug = re.sub(r'\s+', ' ', title_slug).strip()

    # Ordered substitutions for the remainder of the title slug
    title_rules = (
        (r'[!?.,]', ''),
        (r'\s+', '-'),
    )
    for pattern, replacement in title_rules:
        title_slug = re.sub(pattern, replacement, title_slug)
    title_slug = title_slug.replace("'", '')

    return f"https://genius.com/{artist_slug}-{title_slug}-lyrics"

def get_lyrics(genius_url, timeout=30):
    """Fetch lyrics from a Genius song URL.

    Downloads the page, collects every ``<div data-lyrics-container="true">``
    element and returns their text line by line.

    Args:
        genius_url: Full URL of the Genius lyrics page to download.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The lyrics as a single string of stripped, non-empty lines joined by
        newlines (possibly empty), or None when the request or parsing fails
        or no lyrics container is present on the page.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    try:
        # Without a timeout a stalled connection would block the caller forever
        response = requests.get(genius_url, headers=headers, timeout=timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # A page may hold more than one lyrics container; read them all rather
        # than only the first
        lyrics_containers = soup.find_all('div', {'data-lyrics-container': 'true'})
        if not lyrics_containers:
            print("Lyrics container not found.")
            return None

        lyrics = []
        for container in lyrics_containers:
            # Turn <br> tags into real newlines so line structure survives
            # text extraction
            for line_break in container.find_all('br'):
                line_break.replace_with('\n')

            # Extract the container's full text once. Walking <a>/<span> tags
            # individually would repeat text for nested tags and would miss
            # text that sits outside any such tag.
            for line in container.get_text().splitlines():
                line = line.strip()
                if line:
                    lyrics.append(line)

        return '\n'.join(lyrics)
    except Exception as e:
        # Deliberately best-effort: any failure for one song is reported and
        # turned into None so a long scraping run keeps going
        print(f"Error fetching lyrics: {e}")
        return None

def scrape_songs_with_lyrics(billboard_file, output_file, start_row=0, end_row=None, existing_songs=None, delay=3):
    """Scrape lyrics for songs from the given dataset.

    Reads the chart CSV, keeps rows whose 'Rank' is between 1 and 20
    (inclusive), slices that selection with ``start_row:end_row`` and fetches
    lyrics once per distinct song title. Results are written to ``output_file``
    after every 5 newly processed songs and again at the end.

    Args:
        billboard_file: Path to a CSV with 'Date', 'Rank', 'Song' and 'Artist'
            columns.
        output_file: Path of the CSV that receives 'Date', 'Rank', 'Song',
            'Artist' and 'Lyrics' columns.
        start_row: Positional start index into the rank-filtered rows.
        end_row: Positional end index (exclusive) into the rank-filtered rows;
            None means up to the length of the unfiltered dataset.
        existing_songs: Optional set of song titles to skip. Titles processed
            here are added to this set in place. Matching is by title only, so
            same-titled songs by different artists are treated as one.
        delay: Seconds to sleep after each song to avoid being blocked.

    Returns:
        None. The results are written to ``output_file``.
    """
    billboard_df = pd.read_csv(billboard_file)

    if end_row is None:
        end_row = len(billboard_df)

    top_songs_df = billboard_df[(billboard_df['Rank'] >= 1) & (billboard_df['Rank'] <= 20)].iloc[start_row:end_row]

    if existing_songs is None:
        existing_songs = set()

    # When resuming (there are songs to skip and an earlier output exists),
    # carry the earlier rows forward. Every save below rewrites output_file
    # from lyrics_data, so without this the skipped songs' rows would be lost.
    lyrics_data = []
    if existing_songs and os.path.exists(output_file):
        try:
            lyrics_data = pd.read_csv(output_file).to_dict('records')
        except pd.errors.EmptyDataError:
            lyrics_data = []

    new_rows = 0  # songs processed in this run; drives the periodic checkpoint

    for index, row in top_songs_df.iterrows():
        song = row['Song']
        artist = row['Artist']

        if song in existing_songs:
            print(f"Skipping already processed song: {song} by {artist}")
            continue

        print(f"Fetching lyrics for: {song} by {artist}")

        genius_url = construct_genius_url(song, artist)
        print(f"Generated URL: {genius_url}")

        lyrics = get_lyrics(genius_url)
        if lyrics:
            print(f"Lyrics fetched successfully. Preview: {lyrics[:100]}...")
        else:
            print("Lyrics not found.")

        # The row is recorded even when no lyrics were found, so the song is
        # not attempted again on a later run
        lyrics_data.append({
            'Date': row['Date'],
            'Rank': row['Rank'],
            'Song': song,
            'Artist': artist,
            'Lyrics': lyrics
        })
        new_rows += 1

        existing_songs.add(song)

        # Checkpoint every 5 new songs so an interruption loses little work
        if new_rows % 5 == 0:
            lyrics_df = pd.DataFrame(lyrics_data)
            lyrics_df.to_csv(output_file, index=False)
            print(f"Saved progress to {output_file}")

        time.sleep(delay)  # Delay to prevent blocking

    # Only write when this run added rows; otherwise the file is already current
    if new_rows:
        lyrics_df = pd.DataFrame(lyrics_data)
        lyrics_df.to_csv(output_file, index=False)
        print(f"Final lyrics saved to {output_file}")

# Chart CSV to read and the lyrics CSV to write
billboard_file = "billboard_hot_100_2019_to_2022.csv"
output_file = "uss_with_lyrics.csv"

# If a previous run left an output file behind, collect the song titles it
# already contains so they can be skipped; otherwise nothing is skipped
existing_songs = None
if os.path.exists(output_file):
    existing_data = pd.read_csv(output_file)
    existing_songs = set(existing_data['Song'].tolist())

# Run the lyrics scrape over the chart file
scrape_songs_with_lyrics(
    billboard_file,
    output_file,
    existing_songs=existing_songs,
)


In [None]:
import pandas as pd

# Source file (read with pandas' default comma delimiter) and the
# semicolon-delimited copy to write
input_path = "uss_with_lyrics.csv"  # Replace with your file path
output_path = "delimiter_uss.csv"  # Replace with desired output file name

# Load the data, then write it back out using ';' as the separator and
# without the DataFrame index column
df = pd.read_csv(input_path)
df.to_csv(output_path, sep=';', index=False)

print("File saved with semicolon as the delimiter.")