In [51]:
# Imports

import json
import os
import re
from json.decoder import JSONDecodeError
from urllib.parse import quote

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

In [172]:
# Functions

def write(df, filename):
    """Save a DataFrame as ``data/<filename>.csv`` without the index column.

    The target directory is created on demand, so the first save in a fresh
    checkout does not fail because ``data/`` is missing.

    Args:
        df: DataFrame to persist.
        filename: File name without the ``.csv`` extension, relative to ``data/``.
    """
    path = f"data/{filename}.csv"
    # DataFrame.to_csv does not create missing parent directories.
    os.makedirs(os.path.dirname(path), exist_ok=True)
    df.to_csv(path, index=False)
    
def read(filename):
    """Load ``data/<filename>.csv`` into a DataFrame.

    Args:
        filename: File name without the ``.csv`` extension, relative to ``data/``.

    Returns:
        The DataFrame parsed by ``pandas.read_csv``.
    """
    csv_path = f"data/{filename}.csv"
    frame = pd.read_csv(csv_path)
    return frame


# Collecting songs

def get_songs_of_year(year):
    """Fetch the kworb.net Spotify songs page for ``year``.

    Args:
        year: Year used to build the page URL.

    Returns:
        The first table that ``pandas.read_html`` finds on the page.
    """
    page_url = f"https://kworb.net/spotify/songs_{year}.html"
    tables = pd.read_html(page_url)
    return tables[0]

def get_all_songs(start_year=2010, end_year=2023):
    """Download the yearly song tables and stack them into one DataFrame.

    Args:
        start_year: First year to fetch (inclusive). Defaults to 2010.
        end_year: Last year to fetch (inclusive). Defaults to 2023.

    Returns:
        One DataFrame holding every yearly table plus a ``year`` column, with
        a fresh 0..n-1 index.

    Raises:
        ValueError: If ``start_year`` is greater than ``end_year``.
    """
    if start_year > end_year:
        raise ValueError(
            f"start_year ({start_year}) must not exceed end_year ({end_year})"
        )

    yearly_frames = []
    for year in tqdm(range(start_year, end_year + 1)):
        # assign() returns a new frame, so the fetched table is not mutated.
        yearly_frames.append(get_songs_of_year(year).assign(year=year))

    # Each yearly table has its own 0-based index; renumber so that row
    # labels are unique in the combined frame.
    return pd.concat(yearly_frames, ignore_index=True)

def separate_artist_title(artist_title):
    """Split an ``"Artist - Title"`` string at the first ``" - "``.

    Args:
        artist_title: Combined string such as ``"Eminem - Love The Way You Lie"``.

    Returns:
        An ``(artist, title)`` tuple. Only the first separator splits, so a
        title may itself contain ``" - "``. When the separator is missing the
        whole string is returned as the artist and the title is ``""``
        (previously this raised IndexError).
    """
    # partition() always yields three parts, so a missing separator cannot
    # cause an out-of-range index.
    artist, _separator, title = artist_title.partition(" - ")
    return artist, title

def clean_songs(df):
    """Build a tidy ``artist`` / ``title`` / ``year`` table from the raw songs.

    The input frame is left untouched; the previous version added helper
    columns to the caller's DataFrame as a side effect.

    Args:
        df: Raw songs frame with an ``"Artist and Title"`` column and a
            ``year`` column.

    Returns:
        A new DataFrame with the columns ``artist``, ``title`` and ``year``,
        sharing the input's index.
    """
    pairs = df["Artist and Title"].apply(separate_artist_title)

    # Start from an explicit copy so the result is not a view of ``df``.
    cleaned = df[["year"]].copy()
    cleaned["artist"] = [pair[0] for pair in pairs]
    cleaned["title"] = [pair[1] for pair in pairs]
    return cleaned[["artist", "title", "year"]]

def get_artist_list(df):
    """List the distinct artists in ``df``.

    Args:
        df: Frame with an ``artist`` column.

    Returns:
        The unique artist names in the order produced by ``value_counts``
        (most frequent first).
    """
    song_counts = df["artist"].value_counts()
    return song_counts.index.tolist()

def get_wiki_url(artist):
    """Build the English Wikipedia URL for an artist name.

    The name is title-cased, spaces become underscores, and the result is
    percent-encoded. Without the encoding, a ``?``, ``#`` or ``%`` in the name
    would be read as URL syntax and cut the page title short.

    Args:
        artist: Artist name as it appears in the songs table.

    Returns:
        The page URL as a string.
    """
    # NOTE: str.title() also capitalizes the letter after an apostrophe
    # ("Destiny's Child" -> "Destiny'S Child"). It is kept as-is so that
    # lookups stay comparable with results already gathered.
    page_title = artist.title().replace(" ", "_")
    # "/" stays literal so names such as "AC/DC" keep their slash.
    return f"https://en.wikipedia.org/wiki/{quote(page_title, safe='/')}"

def get_wiki_excerpt(artist, max_chars=500, timeout=10):
    """Fetch the start of the paragraph text on an artist's Wikipedia page.

    Args:
        artist: Artist name passed to ``get_wiki_url``.
        max_chars: Number of leading characters to keep. Defaults to 500.
        timeout: Seconds to wait for the HTTP request. Defaults to 10.

    Returns:
        The first ``max_chars`` characters of all ``<p>`` text concatenated,
        or ``None`` when the response status is not 200.

    Raises:
        requests.exceptions.RequestException: On network failures, including
            the timeout expiring.
    """
    url = get_wiki_url(artist)
    # Without an explicit timeout a stalled connection blocks forever.
    response = requests.get(url, timeout=timeout)

    # If no page is found, return None
    if response.status_code != 200:
        return None

    soup = BeautifulSoup(response.content, "html.parser")

    # Join once instead of growing a string paragraph by paragraph.
    all_paragraphs_text = "".join(
        para.get_text(separator=" ") for para in soup.find_all("p")
    )
    return all_paragraphs_text[:max_chars]

def only_alpha(text):
    """Drop every character that is neither a letter nor whitespace.

    Args:
        text: Input string.

    Returns:
        ``text`` with digits, punctuation and other symbols removed.
    """
    def _keep(ch):
        return ch.isalpha() or ch.isspace()

    return "".join(filter(_keep, text))

def determine_artist_info(artist):
    """Guess artist type, gender and a start year from the Wikipedia excerpt.

    This is a keyword-count heuristic over the text from ``get_wiki_excerpt``.
    Keyword matching is case-sensitive.

    Args:
        artist: Artist name to look up.

    Returns:
        A tuple ``(artist_type, gender, start_date)``:

        * ``artist_type`` is ``"individual"`` when individual keywords
          outnumber group keywords, otherwise ``"group"``.
        * ``gender`` is ``"male"`` or ``"female"`` according to which pronoun
          set occurs more often. It is ``None`` for groups and when the two
          counts are equal (including no pronouns at all); ties were
          previously reported as ``"female"``.
        * ``start_date`` is the first standalone four-digit number in the
          excerpt, as a string, or ``None`` if there is none.

        ``(None, None, None)`` is returned when no excerpt is available.
    """
    # Define keywords (sets give constant-time membership tests)
    individual_keywords = {"He", "She", "born", "singer", "songwriter", "rapper"}
    group_keywords = {"band", "group", "duo", "trio", "They", "formed", "collective"}

    male_keywords = {"He", "him", "his"}
    female_keywords = {"She", "her", "hers"}

    # Get wiki excerpt
    excerpt = get_wiki_excerpt(artist)
    if not excerpt:
        return None, None, None

    # split() without an argument splits on any whitespace run. The previous
    # split(" ") left newlines and tabs attached to words, so a keyword next
    # to a line break was never counted.
    excerpt_words = only_alpha(excerpt).split()

    # Count the occurrences of these keywords in the excerpt
    individual_ct = sum(word in individual_keywords for word in excerpt_words)
    group_ct = sum(word in group_keywords for word in excerpt_words)
    male_ct = sum(word in male_keywords for word in excerpt_words)
    female_ct = sum(word in female_keywords for word in excerpt_words)

    # Evaluate counts and decide; a type tie falls back to "group" (unchanged).
    artist_type = "individual" if individual_ct > group_ct else "group"

    if artist_type == "group" or male_ct == female_ct:
        # No gender for groups, and no guess without a majority of evidence.
        gender = None
    elif male_ct > female_ct:
        gender = "male"
    else:
        gender = "female"

    # Find start date: exactly four digits not embedded in a longer digit run
    # (the previous pattern matched "1500" inside "15000").
    start_match = re.search(r"(?<!\d)(\d{4})(?!\d)", excerpt)
    start_date = start_match.group(1) if start_match else None

    # Return info
    return artist_type, gender, start_date

def cache_artist_info(artist, file):
    """Look up one artist and store the result in a JSON cache file.

    The cache maps each artist name to a dict with the keys ``type``,
    ``gender`` and ``start_date``. Artists already in the cache are skipped,
    so an interrupted run can be resumed.

    Args:
        artist: Artist name to look up via ``determine_artist_info``.
        file: Path of the JSON cache file.
    """
    data = {}

    # Try to read the file
    try:
        with open(file, "r", encoding="utf-8") as f:
            data = json.load(f)
    except (FileNotFoundError, JSONDecodeError):
        # Missing, empty or invalid cache: deliberately start from an empty
        # dictionary.
        pass

    # Check if the artist is already in the data
    if artist in data:
        return

    # Determine artist info and add it to the data
    artist_type, gender, start_date = determine_artist_info(artist)
    data[artist] = {
        "type": artist_type,
        "gender": gender,
        "start_date": start_date
    }

    parent = os.path.dirname(file)
    if parent:
        os.makedirs(parent, exist_ok=True)

    # Write to a sibling temp file and swap it in with os.replace. Truncating
    # the cache in place meant an interrupted write left invalid JSON, which
    # the handler above would then silently treat as an empty cache.
    tmp_file = f"{file}.tmp"
    with open(tmp_file, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=4)
    os.replace(tmp_file, file)
        
def cache_all_artist_info(artists, filename):
    """Cache the looked-up info of every artist in ``data/<filename>.json``.

    Args:
        artists: Iterable of artist names.
        filename: Cache file name without the ``.json`` extension.
    """
    cache_path = f"data/{filename}.json"
    for name in tqdm(artists):
        cache_artist_info(name, cache_path)
        
def read_cache(filename):
    """Load ``data/<filename>.json`` as a table with one row per top-level key.

    Args:
        filename: Cache file name without the ``.json`` extension.

    Returns:
        The parsed JSON transposed, with the former row labels moved into a
        column named ``index``.
    """
    by_key = pd.read_json(f"data/{filename}.json")
    per_row = by_key.transpose()
    return per_row.reset_index()

def join_songs_artists(songs, artists):
    """Attach artist attributes to each song with a left join on ``artist``.

    Args:
        songs: Songs frame with an ``artist`` column.
        artists: Artist frame with an ``artist`` column.

    Returns:
        Every row of ``songs`` with the matching columns of ``artists``;
        songs without a match get missing values.
    """
    return pd.merge(songs, artists, how="left", on="artist")

def strip_title(title):
    """Reduce a song title to its main part.

    Text from the first ``" - "`` onward is dropped, as is any parenthesized
    or bracketed segment; leading and trailing whitespace is then trimmed.

    Args:
        title: Raw song title.

    Returns:
        The stripped title.
    """
    noise = re.compile(r' - .*|\(.*?\)|\[.*?\]')
    return noise.sub('', title).strip()

def determine_casing(title):
    """Classify a title as ``"lower"``, ``"upper"`` or ``"mixed"``.

    Titles without any ASCII letter are always ``"mixed"``. Otherwise the
    whole title is compared with its lower- and upper-cased forms.

    Args:
        title: Song title.

    Returns:
        ``"lower"``, ``"upper"`` or ``"mixed"``.
    """
    has_ascii_letter = any(ch.isascii() and ch.isalpha() for ch in title)
    if has_ascii_letter:
        if title.lower() == title:
            return "lower"
        if title.upper() == title:
            return "upper"
    return "mixed"

def is_uniform_casing(casing):
    """Collapse a casing label into ``"mixed"`` versus ``"uniform"``.

    Args:
        casing: Label such as ``"lower"``, ``"upper"`` or ``"mixed"``.

    Returns:
        ``"mixed"`` when ``casing`` equals ``"mixed"``, otherwise ``"uniform"``.
    """
    return "mixed" if casing == "mixed" else "uniform"

def get_release_year(date):
    """Read the year from a date string that starts with four digits.

    Args:
        date: Date string whose first four characters are the year.

    Returns:
        The year as an ``int``.
    """
    year_text = date[:4]
    return int(year_text)

In [8]:
# Execution of get_all_songs (10-15 sec)
# Reuse the saved CSV when it exists, so Restart & Run All does not re-scrape
# kworb.net and df_songs is still defined for the following cells.
df_songs = read("songs") if os.path.exists("data/songs.csv") else get_all_songs()

100%|███████████████████████████████████████████| 14/14 [00:09<00:00,  1.44it/s]


In [14]:
# Save the collected songs to data/songs.csv
write(df_songs, "songs")

In [150]:
# Reload the saved songs, tidy them, and collect the distinct artists
df_songs = clean_songs(read("songs"))
artists = get_artist_list(df_songs)

In [94]:
# Execution of cache_all_artist_info (10-20 min)
cache_all_artist_info(artists, "artists")

100%|███████████████████████████████████████| 2675/2675 [10:55<00:00,  4.08it/s]


In [103]:
# Convert the JSON cache into a table and save it to data/artists.csv.
# The CSV helper defined in this notebook is write(); save() does not exist.
df_artists = read_cache("artists")
df_artists = df_artists.rename(columns={"index": "artist"})
write(df_artists, "artists")

In [161]:
# Set up data_5000, first dataset for analysis
# Built as one chain so that no column is assigned on a filtered slice
# (which triggers pandas' SettingWithCopyWarning).

df_artists = read("artists")
df_full = (
    join_songs_artists(df_songs, df_artists)
    # Keep solo artists with a detected gender.
    .loc[lambda d: d["type"] == "individual"]
    .loc[lambda d: d["gender"].isin(["male", "female"])]
    # start_date is a four-digit number scraped from the Wikipedia excerpt;
    # it is treated as the birth year and limited to a plausible range.
    .loc[lambda d: d["start_date"].between(1950, 2005)]
    .rename(columns={"start_date": "birth_year"})
    .loc[:, ["artist", "title", "year", "gender", "birth_year"]]
    .assign(title=lambda d: d["title"].apply(strip_title))
    .assign(casing=lambda d: d["title"].apply(determine_casing))
    .assign(is_uniform=lambda d: d["casing"].apply(is_uniform_casing))
)
df_full

Unnamed: 0,artist,title,year,gender,birth_year,casing,is_uniform
0,Bruno Mars,Just the Way You Are,2010,male,1985.0,mixed,mixed
1,Don Omar,Danza Kuduro,2010,male,1978.0,mixed,mixed
2,Eminem,Love The Way You Lie,2010,male,1972.0,mixed,mixed
5,Bruno Mars,Talking to the Moon,2010,male,1985.0,mixed,mixed
6,Bruno Mars,Grenade,2010,male,1985.0,mixed,mixed
...,...,...,...,...,...,...,...
9667,KAROL G,MAÑANA SERÁ BONITO,2023,female,1991.0,upper,uniform
9668,Peso Pluma,CARNAL,2023,male,1999.0,upper,uniform
9670,Kylie Minogue,Padam Padam,2023,female,1968.0,mixed,mixed
9672,Joel Corry,What Would You Do?,2023,male,1989.0,mixed,mixed


In [162]:
# Save to data/data_5000.csv (the notebook's CSV helper is write(), not save())
write(df_full, "data_5000")

In [182]:
# Set up data_billboard, second dataset for analysis
# Built as one chain so that no column is assigned on a filtered slice
# (which triggers pandas' SettingWithCopyWarning).

df_bb = (
    read("billboard_advanced")
    .assign(release_year=lambda d: d["release_date"].apply(get_release_year))
    # Keep rows whose `year` and release year are both 2010 or later.
    .loc[lambda d: (d["year"] >= 2010) & (d["release_year"] >= 2010)]
    .loc[:, ["title", "artist", "release_year", "explicit", "energy", "valence"]]
    .assign(title=lambda d: d["title"].apply(strip_title))
    .assign(casing=lambda d: d["title"].apply(determine_casing))
    .assign(is_uniform=lambda d: d["casing"].apply(is_uniform_casing))
)
df_bb

Unnamed: 0,title,artist,release_year,explicit,energy,valence,casing,is_uniform
1000,TiK ToK,Kesha,2010,False,0.837,0.714,mixed,mixed
1001,Need You Now,Lady A,2010,False,0.622,0.231,mixed,mixed
1002,"Hey, Soul Sister",Train,2010,False,0.886,0.795,mixed,mixed
1003,California Gurls,Katy Perry,2012,False,0.754,0.425,mixed,mixed
1004,OMG,USHER,2010,False,0.745,0.326,upper,uniform
...,...,...,...,...,...,...,...,...
2295,Flower Shops,ERNEST,2021,False,0.461,0.227,mixed,mixed
2296,TO THE MOON,JNR CHOI,2021,True,0.650,0.386,upper,uniform
2297,Unholy,Sam Smith,2022,False,0.472,0.238,mixed,mixed
2298,One Mississippi,Kane Brown,2022,False,0.840,0.575,mixed,mixed


In [183]:
# Save to data/data_billboard.csv (the notebook's CSV helper is write(), not save())
write(df_bb, "data_billboard")