## Setting up the Environment

**Load the required modules and packages**

### 1. Environment Setup

In [64]:
import time
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import NoSuchElementException, StaleElementReferenceException
from webdriver_manager.chrome import ChromeDriverManager

# Chrome options: switch off the flags and extension that advertise an automated browser
options = webdriver.ChromeOptions()
options.add_argument('--disable-blink-features=AutomationControlled')
options.add_experimental_option("excludeSwitches", ["enable-automation"])
options.add_experimental_option("useAutomationExtension", False)

driver = webdriver.Chrome(options=options)

# Hide navigator.webdriver from page scripts.
# execute_script only patches the document that is loaded right now, so on its own the
# override disappears at the first driver.get(). Registering the same snippet through the
# DevTools protocol makes Chrome evaluate it at the start of every new document as well.
HIDE_WEBDRIVER_JS = "Object.defineProperty(navigator, 'webdriver', {get: () => undefined})"
driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {"source": HIDE_WEBDRIVER_JS})
driver.execute_script(HIDE_WEBDRIVER_JS)

### 2. Load Existing Song Links Database

In [66]:
# Seed the collection of known links from songs.csv when a previous run left one behind.
try:
    df_links = pd.read_csv("songs.csv")
except FileNotFoundError:
    print("songs.csv not found. Starting fresh.")
    song_links = set()
else:
    # A set gives de-duplication and fast membership checks while scraping.
    song_links = set(df_links["link"].tolist())

In [150]:
# Report the current size of the link collection.
print(f"Total song links in database: {len(song_links)}")

Total song links in database: 379


### 3. Scrape Spotify for New Song Links

In [72]:
def scrape_song_links(
    driver,
    song_links,
    artist_url='https://open.spotify.com/artist/06HL4z0CvFAxyc27GXpf02/discography/all',
    load_wait=60,
    implicit_wait=30,
):
    """Collect track links from an artist's discography page.

    Args:
        driver: Selenium WebDriver used to load and query the page.
        song_links: Iterable of links that are already known. It is not modified;
            a set or a list are both accepted.
        artist_url: Discography page to open. Defaults to the artist this notebook tracks.
        load_wait: Seconds to sleep after navigation so dynamic content can load.
        implicit_wait: Implicit wait, in seconds, set on the driver.

    Returns:
        A new set containing the known links plus every link found on the page.
    """
    # Normalise to a set: a later cell rebinds song_links to a list, and a list has no
    # .union(), so re-running the scrape after that cell used to raise AttributeError.
    known_links = set(song_links)

    driver.get(artist_url)
    driver.implicitly_wait(implicit_wait)
    time.sleep(load_wait)  # Wait for dynamic content to load

    elements = driver.find_elements("xpath", "//div[@class='hb8C1VAjyUg0VMxrwpixac']/a[@class='eYJgrgW01l7dHKuMJidG']")

    new_links = set()
    for elem in elements:
        try:
            url = elem.get_attribute("href")
        except StaleElementReferenceException:
            # The element left the DOM between lookup and read; skip it.
            continue
        if url and url not in known_links:
            new_links.add(url)

    print(f"New links found: {len(new_links)}")
    return known_links | new_links

In [74]:
# Merge the links found on the page into the known collection, then report the new total.
song_links = scrape_song_links(driver, song_links)
print(f"Total song links in database: {len(song_links)}")

New links found: 0
Total song links in database: 379


In [76]:
# Save updated links.
# A set has no stable iteration order, so the rows are sorted before writing; songs.csv
# then comes out identical from run to run when the links themselves have not changed.
# key=str keeps the sort well-defined even if a non-string value slipped into the data.
pd.DataFrame({"link": sorted(song_links, key=str)}).to_csv("songs.csv", index=False, header=True)

### 4. Load Existing Song Details Database

In [None]:
# Load the details written by an earlier run so those songs can be skipped when scraping.
try:
    df_details = pd.read_csv("songsdetails.csv")
except FileNotFoundError:
    print("songsdetails.csv not found. No previously scraped details.")
    df_details, details_scraped = pd.DataFrame(), set()
else:
    # Links that already have a details row.
    details_scraped = set(df_details["SongLink"].tolist())

### 5. Scrape Song Details (for new songs)

In [88]:
# Set implicit wait (seconds) for element lookups
driver.implicitly_wait(10)

# Seconds to pause after each navigation before reading the page
PAGE_LOAD_PAUSE = 10

# XPath anchors on the track page; the class names are copied from the page markup
TITLE_XPATH = "//div[@class='c55UACltdzzDDQVfoF18']/span[2]/span/h1"
METADATA_XPATH = "//div[@class='JWDnag2Mepdf9QE0cNbg']"
LYRICS_XPATH = "//p[@class='e-91000-text encore-text-body-medium NqaDCx7q6vMeN6tO4Kpf']"

# Convert song_links to a list so it can be sliced into batches
song_links = list(song_links)

# Column-oriented store for this run's rows: one list per output column, always equal length
songsdic = {"SongName": [], "AlbumName": [], "AlbumLink": [], "Year": [], "Runtime": [], "PlayCount": [], "Lyrics": [], "SongLink": []}

# Make sure details_scraped is a set for fast lookups
details_scraped = set(details_scraped)

# Define batch size
batchsize = 50
numsongs = len(song_links)


def _text_or_default(browser, xpath, default):
    """Return the stripped textContent of the first element matching xpath, else default."""
    try:
        return browser.find_element("xpath", xpath).get_attribute("textContent").strip()
    except NoSuchElementException:
        return default


def _album_name_and_link(browser):
    """Return (album name, album link) from the metadata row, or placeholders if it is absent."""
    try:
        album_elem = browser.find_element("xpath", METADATA_XPATH + "/span[2]/a")
        return album_elem.get_attribute("textContent").strip(), album_elem.get_attribute("href")
    except NoSuchElementException:
        return "No Album Name", "No Album Link"


def _lyrics_or_default(browser):
    """Return the lyric paragraphs joined by newlines, or "No Lyrics" when none can be read."""
    try:
        paragraphs = browser.find_elements("xpath", LYRICS_XPATH)
        lyrics = "\n".join([p.get_attribute("textContent").strip() for p in paragraphs])
    except Exception:
        # Deliberately broad: lyrics are best-effort and must not abort the whole run.
        return "No Lyrics"
    return lyrics or "No Lyrics"


# Batches only partition the iteration; nothing is written to disk between them.
for batchstart in range(0, numsongs, batchsize):
    batchend = min(batchstart + batchsize, numsongs)
    songbatch = song_links[batchstart:batchend]

    for song_url in songbatch:
        # Skip already-scraped songs
        if song_url in details_scraped:
            continue

        driver.get(song_url)
        time.sleep(PAGE_LOAD_PAUSE)

        songname = _text_or_default(driver, TITLE_XPATH, "No Name")
        albumname, albumlink = _album_name_and_link(driver)
        year = _text_or_default(driver, METADATA_XPATH + "/span[4]", "No Year")
        runtime = _text_or_default(driver, METADATA_XPATH + "/span[6]", "No Runtime")
        playcount = _text_or_default(driver, METADATA_XPATH + "/span[8]", "0")
        lyrics = _lyrics_or_default(driver)

        # Append all fields together, only after every lookup has finished, so the
        # column lists can never end up with different lengths.
        record = {
            "SongName": songname,
            "AlbumName": albumname,
            "AlbumLink": albumlink,
            "Year": year,
            "Runtime": runtime,
            "PlayCount": playcount,
            "Lyrics": lyrics,
            "SongLink": song_url,
        }
        for column, value in record.items():
            songsdic[column].append(value)


In [90]:
# Save new details: fold this run's rows into the details table.
new_details = pd.DataFrame(songsdic)
if df_details.empty:
    df_details = new_details
elif not new_details.empty:
    # Only concatenate when something was scraped; concatenating an empty frame adds
    # nothing and goes through pandas' deprecated empty-entry concat behaviour.
    df_details = pd.concat([df_details, new_details], ignore_index=True)
    # Re-running this cell would append the same rows a second time, so keep a single
    # row per song link.
    df_details = df_details.drop_duplicates(subset="SongLink", keep="first", ignore_index=True)

In [92]:
# Write the details table to disk; index=False leaves the row index out of the file.
df_details.to_csv("songsdetails.csv", index=False)

### 6. Data Cleaning and Feature Engineering

In [124]:
# Work on a copy so that changes made to df do not alter df_details.
df = df_details.copy()
df.head()

Unnamed: 0,SongName,AlbumName,AlbumLink,Year,Runtime,PlayCount,Lyrics,SongLink
0,The Fate of Ophelia - Alone In My Tower Acoust...,The Fate of Ophelia (Alone In My Tower Acousti...,https://open.spotify.com/album/60pPcuYWCIAdcEr...,2025,3:45,1574606,I heard you calling on the megaphone You wanna...,https://open.spotify.com/track/3Z2nTctPsTsgqqJ...
1,The Fate of Ophelia,The Fate of Ophelia (Alone In My Tower Acousti...,https://open.spotify.com/album/60pPcuYWCIAdcEr...,2025,3:46,371448377,I heard you calling On the megaphone You wanna...,https://open.spotify.com/track/5cKBWgDjB5IXZ5j...
2,The Fate of Ophelia,The Life of a Showgirl,https://open.spotify.com/album/4a6NzYL1YHRUgx9...,2025,3:46,371448377,I heard you calling On the megaphone You wanna...,https://open.spotify.com/track/53iuhJlwXhSER5J...
3,You’re Losing Me (From The Vault),You're Losing Me (From The Vault),https://open.spotify.com/album/5q3jthpn2h59P7p...,2023,4:37,356015345,"You say, ""I don't understand, "" and I say, ""I ...",https://open.spotify.com/track/3CWq0pAKKTWb0K4...
4,Mine (Taylor's Version),Speak Now (Taylor's Version),https://open.spotify.com/album/5AEDGbliTTfjOB8...,2023,3:51,261765959,"Ah-ah, ah ♪ Ah-ah, ah ♪ You were in college, w...",https://open.spotify.com/track/7G0gBu6nLdhFDPR...


In [126]:
# Show the column labels of the working frame.
df.columns

Index(['SongName', 'AlbumName', 'AlbumLink', 'Year', 'Runtime', 'PlayCount',
       'Lyrics', 'SongLink'],
      dtype='object')

In [128]:
# Rows carrying the 'No Play Count' placeholder get a play count of 0.
missing_playcount = df['PlayCount'].eq('No Play Count')
df.loc[missing_playcount, 'PlayCount'] = 0


In [130]:
def _runtime_to_seconds(runtime):
    """Convert a runtime such as "3:45" or "1:02:03" into whole seconds.

    With one or two colon-separated fields the first is minutes and the second
    (if present) is seconds. With three or more, the last three are read as
    hours, minutes and seconds. A field that is not a plain number counts as 0,
    so a placeholder such as "No Runtime" or a missing value yields 0.
    """
    fields = [int(part) if part.strip().isdecimal() else 0 for part in str(runtime).split(":")]
    if len(fields) >= 3:
        hours, minutes, seconds = fields[-3:]
    else:
        hours, minutes = 0, fields[0]
        seconds = fields[1] if len(fields) == 2 else 0
    return hours * 3600 + minutes * 60 + seconds


# errors='ignore' means: if any value cannot be converted, the column is left as it was.
df["Year"] = df["Year"].astype(int, errors='ignore')

# Duration in seconds, parsed row by row. Splitting into exactly two columns would raise
# as soon as one runtime had an hours field (or none contained a colon). The guard makes
# the cell safe to re-run after Runtime has already been dropped.
if "Runtime" in df.columns:
    df["Duration"] = df["Runtime"].map(_runtime_to_seconds)
    df = df.drop(columns=["Runtime"])

# Play counts: strip thousands separators and parse. Text that is not a number becomes
# <NA> instead of raising, and integer input is never routed through float.
playcount_text = df["PlayCount"].astype(str).str.replace(",", "", regex=False)
df["PlayCount"] = pd.to_numeric(playcount_text, errors="coerce").round(0).astype("Int64")

# Text columns: missing values become empty strings, everything else is cast to str.
for text_column in ("Lyrics", "SongName", "AlbumName"):
    df[text_column] = df[text_column].fillna("").astype(str)

In [None]:
# Count rows whose Lyrics value already occurred in an earlier row.
repeated_lyrics = df["Lyrics"].duplicated()
num_duplicates = repeated_lyrics.sum()
print("Number of duplicate entries based on Lyrics:", num_duplicates)

In [132]:
# Remove duplicates based on Lyrics.
# Songs without real lyrics all share the same placeholder text ("No Lyrics", or "" for a
# missing value), so a plain drop_duplicates would treat every lyric-less song as a copy
# of the first one and discard it. Only rows that actually have lyrics are de-duplicated;
# lyric-less rows are all kept.
has_lyrics = ~df["Lyrics"].isin(["", "No Lyrics"])
is_repeat = df.duplicated(subset="Lyrics", keep="first")
df = df[~(has_lyrics & is_repeat)]

In [134]:
# Save cleaned data to cleanedsongs.csv (index=False leaves the row index out of the file)
df.to_csv("cleanedsongs.csv", index=False)

In [136]:
# Print summary: the number of rows left in df
print(f"{len(df)} unique songs after cleaning and removing duplicates.")

309 unique songs after cleaning and removing duplicates.


### 7. Data Overview and Example Outputs

In [139]:
# Preview the first rows of the cleaned frame.
df.head()

Unnamed: 0,SongName,AlbumName,AlbumLink,Year,PlayCount,Lyrics,SongLink,Duration
0,The Fate of Ophelia - Alone In My Tower Acoust...,The Fate of Ophelia (Alone In My Tower Acousti...,https://open.spotify.com/album/60pPcuYWCIAdcEr...,2025,1574606,I heard you calling on the megaphone You wanna...,https://open.spotify.com/track/3Z2nTctPsTsgqqJ...,225
1,The Fate of Ophelia,The Fate of Ophelia (Alone In My Tower Acousti...,https://open.spotify.com/album/60pPcuYWCIAdcEr...,2025,371448377,I heard you calling On the megaphone You wanna...,https://open.spotify.com/track/5cKBWgDjB5IXZ5j...,226
3,You’re Losing Me (From The Vault),You're Losing Me (From The Vault),https://open.spotify.com/album/5q3jthpn2h59P7p...,2023,356015345,"You say, ""I don't understand, "" and I say, ""I ...",https://open.spotify.com/track/3CWq0pAKKTWb0K4...,277
4,Mine (Taylor's Version),Speak Now (Taylor's Version),https://open.spotify.com/album/5AEDGbliTTfjOB8...,2023,261765959,"Ah-ah, ah ♪ Ah-ah, ah ♪ You were in college, w...",https://open.spotify.com/track/7G0gBu6nLdhFDPR...,231
5,Sparks Fly (Taylor’s Version),Speak Now (Taylor's Version),https://open.spotify.com/album/5AEDGbliTTfjOB8...,2023,176827364,"The way you move is like a full-on rainstorm, ...",https://open.spotify.com/track/3MytWN8L7shNYzG...,261
