# 📘 Notebook Summary – 02_Data_Cleaning_and_Preprocessing.ipynb

This notebook prepares the collected Billboard + lyrics dataset for downstream analysis. It focuses on consolidating data from multiple years, handling missing or malformed entries, and performing essential text preprocessing to ready the lyrics for NLP tasks like sentiment analysis, topic modeling, and word frequency.

## Key Features:

* **Dataset Consolidation:** Merges year-wise CSV files into a single DataFrame for unified processing.

* **Missing Data Identification:** Detects songs with missing or empty lyrics and summarizes their count.

* **Text Cleaning:** Applies lowercasing, punctuation removal, stopword filtering, and lemmatization for each lyrics entry.

* **NLP Readiness:** Prepares the data for sentiment and topic analysis by ensuring consistent formatting and structure.

This notebook marks the transition from raw collection to refined, structured text analysis.

In [2]:
# nltk.download('all')

import requests
from bs4 import BeautifulSoup
import pandas as pd
import aiohttp
import asyncio
import re
import time
import logging
from tqdm.asyncio import tqdm_asyncio
import nest_asyncio
import os
import json
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk
import zipfile
from google.colab import files
from tqdm import tqdm
from tenacity import retry, stop_after_attempt, wait_exponential, retry_if_exception
from functools import partial
import signal
from getpass import getpass

## 📂 Loading & Merging Yearly CSVs

Loops through all year-based .csv files (hot100_1959.csv to hot100_2024.csv) and appends them into one large DataFrame.

Uses error handling to skip missing files.

Result: all_years DataFrame containing all songs and lyrics from 1959 to 2024



In [3]:
uploaded = files.upload()

with zipfile.ZipFile('hot100_data_all_years.zip', 'r') as zip_ref:
    zip_ref.extractall('.')

print("✅ Files extracted!")

Saving hot100_data_all_years.zip to hot100_data_all_years.zip
✅ Files extracted!


In [4]:
# --------------------------
# STEP 1: LOAD AND COMBINE DATA
# --------------------------

def load_all_data(years):

    """Load all yearly CSV files into a single DataFrame"""

    all_data = []

    for year in years:
        file_path = f"hot100_{year}.csv"
        if os.path.exists(file_path):
            df = pd.read_csv(file_path)
            df['Year'] = year
            all_data.append(df)

    if not all_data:
        raise ValueError("No data files found")

    return pd.concat(all_data, ignore_index=True)

In [5]:
all_years = load_all_data(list(range(1959, 2025)))

## ❓ Identifying Missing Lyrics

Creates a subset DataFrame (df_missing_lyrics) with songs where lyrics are either:

NaN, or empty strings ("")

Outputs the shape to summarize how many lyrics are missing.

Useful for tracking what remains to be filled manually or via other means.



In [6]:
df_missing_lyrics = all_years[all_years['Lyrics'].isna() | (all_years['Lyrics'].str.strip() == "")]

df_missing_lyrics.shape

(1088, 5)

In [None]:
try:
    for idx, row in df_missing_lyrics.iterrows():
        title = row['Title']
        artist = row['Artist']

        print(f"\n🔍 {title} {artist} lyrics")
        lyrics = input("📋 Paste the lyrics here:\n")

        all_years.at[idx, 'Lyrics'] = lyrics
        print(f"✅ Lyrics added for '{title}' by {artist} (index {idx})")

except KeyboardInterrupt:
    print("\n🚫 Stopped by user. Progress saved.")


🔍 Come Softly to Me The Fleetwoods lyrics

🚫 Stopped by user. Progress saved.


## Step 3: Clean Lyrics Text

Here we:

* Validate the dataset to make sure there aren't any duplicates and the ranking is in order (No ranks outside the expected range 0-100).

* Clean the lyrics by removing punctuation, converting text to lowercase, and eliminating irrelevant characters (e.g., line breaks, extra spaces etc.).

* Normalize the text for consistent tokenization and modeling.

In [7]:
#Check for duplicates
duplicates = all_years.duplicated(subset=['Title','Artist','Year'], keep=False).sum()
print(f"Found {duplicates} duplicate song entries")

Found 0 duplicate song entries


In [8]:
#Validate ranks (should be 1-100)
invalid_ranks = all_years[(all_years['Rank'] < 1) | (all_years['Rank'] > 100)]
print(f"Found {len(invalid_ranks)} entries with invalid ranks")


Found 0 entries with invalid ranks


In [9]:
# Artist name standardization
def clean_artist_name(name):
    """Standardize artist names with handling of 'feat.' and special cases"""

    if not isinstance(name, str):
        return name

    # Remove content in parentheses (common for disambiguation)
    name = re.sub(r'\([^)]*\)', '', name)

    # Standardization with word boundaries and space preservation
    name = re.sub(r'\b(ft|feat|featuring|with)\b[\.]*\s*', 'feat. ', name, flags=re.IGNORECASE)

    # Preserve essential special characters (apostrophes, hyphens, commas, periods)
    name = re.sub(r"[^a-zA-Z0-9\s',\.\-&]", '', name)

    # Trim whitespaces and convert to title case
    name = name.strip().title()

    # Ensure space after feat.

    name = re.sub(r'feat\.(\S)', r'feat. \1', name)  # Add space if missing
    name = re.sub(r'\s+', ' ', name)  # Remove extra spaces
    name = re.sub(r'\bAnd\b', 'and', name)  # Lowercase conjunctions

    return name

# Apply cleaning
all_years['Artist'] = all_years['Artist'].apply(clean_artist_name)

# Verify fixes
print("Fixed artist name examples:")
fixed_samples = [
    ('Rihanna featuring Jay-Z', 'Rihanna Feat. Jay-Z'),
    ('Lil\' Kim featuring Da Brat, Left Eye, Missy Elliott and Angie Martinez',
     'Lil\' Kim Feat. Da Brat, Left Eye, Missy Elliott and Angie Martinez'),
    ('Ja Rule featuring Ashanti', 'Ja Rule Feat. Ashanti')
]

for original, expected in fixed_samples:
    cleaned = clean_artist_name(original)
    print(f"Original: '{original}'")
    print(f"Cleaned:  '{cleaned}'")
    print(f"Status:   {'✅' if cleaned == expected else '❌'} Expected '{expected}'")
    print()

Fixed artist name examples:
Original: 'Rihanna featuring Jay-Z'
Cleaned:  'Rihanna Feat. Jay-Z'
Status:   ✅ Expected 'Rihanna Feat. Jay-Z'

Original: 'Lil' Kim featuring Da Brat, Left Eye, Missy Elliott and Angie Martinez'
Cleaned:  'Lil' Kim Feat. Da Brat, Left Eye, Missy Elliott and Angie Martinez'
Status:   ✅ Expected 'Lil' Kim Feat. Da Brat, Left Eye, Missy Elliott and Angie Martinez'

Original: 'Ja Rule featuring Ashanti'
Cleaned:  'Ja Rule Feat. Ashanti'
Status:   ✅ Expected 'Ja Rule Feat. Ashanti'



## Remove Stopwords and Tokenize

* Commonly used words (e.g., 'the', 'and') that don't contribute much semantic meaning are removed. This helps improve the performance of downstream models like topic modeling and sentiment analysis.

* Lyrics are split into individual tokens to prepare the data for vectorization and allow the model to analyze frequency and context of each word.

In [15]:
def timeout_handler(signum, frame):
    raise TimeoutError("Lyrics processing timed out")

# Initialize NLP tools
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def clean_lyrics(lyrics, remove_stopwords=True):
    """
    Core lyrics cleaning function with complete implementation
    """
    if pd.isna(lyrics) or not isinstance(lyrics, str) or lyrics.strip() == "":
        return ""

    try:
        # Remove song structure markers
        lyrics = re.sub(r'\[.*?\]', ' ', lyrics)  # Remove [Verse], [Chorus], etc.
        lyrics = re.sub(r'\(.*?\)', ' ', lyrics)   # Remove (spoken), (backing vocals)

        # Handle newlines and special whitespace
        lyrics = re.sub(r'\r\n|\n|\r', ' ', lyrics)  # Convert newlines to spaces
        lyrics = re.sub(r'\s+', ' ', lyrics)         # Collapse multiple spaces

        # Standardize contractions
        contractions = {
            "i'm": "i am", "you're": "you are", "he's": "he is", "she's": "she is",
            "it's": "it is", "we're": "we are", "they're": "they are", "that's": "that is",
            "ain't": "am not", "aren't": "are not", "can't": "cannot", "could've": "could have",
            "didn't": "did not", "doesn't": "does not", "don't": "do not", "hadn't": "had not",
            "hasn't": "has not", "haven't": "have not", "he'd": "he would", "he'll": "he will",
            "i'd": "i would", "i'll": "i will", "i've": "i have", "isn't": "is not",
            "let's": "let us", "might've": "might have", "must've": "must have", "needn't": "need not",
            "she'd": "she would", "she'll": "she will", "should've": "should have", "that'd": "that would",
            "there's": "there is", "they'd": "they would", "they'll": "they will", "they've": "they have",
            "wasn't": "was not", "we'd": "we would", "we'll": "we will", "we've": "we have",
            "weren't": "were not", "what'll": "what will", "what're": "what are", "what's": "what is",
            "where's": "where is", "who's": "who is", "won't": "will not", "would've": "would have",
            "you'd": "you would", "you'll": "you will", "you've": "you have"
        }
        for cont, expanded in contractions.items():
            lyrics = re.sub(r'\b' + cont + r'\b', expanded, lyrics, flags=re.IGNORECASE)

        # Remove non-essential characters
        lyrics = re.sub(r"[^a-zA-Z0-9' ]", " ", lyrics)  # Keep alphanumeric + apostrophes
        lyrics = re.sub(r"\s+", " ", lyrics).strip()     # Clean whitespace

        # Case normalization
        lyrics = lyrics.lower()

        # Tokenization and NLP processing
        tokens = nltk.word_tokenize(lyrics)

        if remove_stopwords:
            tokens = [word for word in tokens if word not in stop_words]

        # Lemmatization with POS consideration (verb focus)
        pos_tags = nltk.pos_tag(tokens)
        lemmatized_tokens = []
        for word, tag in pos_tags:
            if tag.startswith('V'):  # Verbs
                lemma = lemmatizer.lemmatize(word, pos='v')
            else:  # Default to noun
                lemma = lemmatizer.lemmatize(word)
            lemmatized_tokens.append(lemma)

        return " ".join(lemmatized_tokens)

    except Exception as e:
        logging.error(f"Error cleaning lyrics: {str(e)}")
        return ""

def clean_lyrics_with_timeout(lyrics, timeout=30):
    """Wrapper with timeout for clean_lyrics"""
    try:
        signal.signal(signal.SIGALRM, timeout_handler)
        signal.alarm(timeout)
        result = clean_lyrics(lyrics)
        signal.alarm(0)
        return result
    except TimeoutError:
        print(f"⚠️ Timeout processing lyrics (truncated): {lyrics[:50]}...")
        return ""
    except Exception as e:
        print(f"⚠️ Error processing lyrics: {str(e)}")
        return ""

# Apply with progress tracking and interrupt handling
try:
    print("Starting lyrics cleaning (press Ctrl+C to cancel)...")

    # Initialize column if not exists
    if 'Lyrics_Cleaned' not in all_years.columns:
        all_years['Lyrics_Cleaned'] = None

    # Process in chunks for better control
    chunk_size = 500
    total_chunks = len(all_years) // chunk_size + 1

    for i in range(total_chunks):
        start_idx = i * chunk_size
        end_idx = min((i + 1) * chunk_size, len(all_years))
        print(f"Processing chunk {i+1}/{total_chunks} (rows {start_idx}-{end_idx})...")

        # Process current chunk with timeout protection
        all_years.loc[start_idx:end_idx, 'Lyrics_Cleaned'] = (
            all_years.loc[start_idx:end_idx, 'Lyrics']
            .apply(clean_lyrics_with_timeout)
        )

    print("✅ Lyrics cleaning completed!")

except KeyboardInterrupt:
    print("\n🚫 Processing interrupted by user! Partial results saved.")
    # Calculate how many were processed
    processed = all_years['Lyrics_Cleaned'].notna().sum()
    print(f"Processed {processed}/{len(all_years)} records")

except Exception as e:
    print(f"\n❌ Fatal error: {str(e)}")
    raise
finally:
    # Ensure any partial results are preserved
    print("Saving current progress...")
    all_years.to_csv('billboard_hot100_partial.csv', index=False)
    print("Partial results saved to 'billboard_hot100_partial.csv'")

Starting lyrics cleaning (press Ctrl+C to cancel)...
Processing chunk 1/14 (rows 0-500)...
Processing chunk 2/14 (rows 500-1000)...
Processing chunk 3/14 (rows 1000-1500)...
Processing chunk 4/14 (rows 1500-2000)...
Processing chunk 5/14 (rows 2000-2500)...
Processing chunk 6/14 (rows 2500-3000)...
Processing chunk 7/14 (rows 3000-3500)...
Processing chunk 8/14 (rows 3500-4000)...
Processing chunk 9/14 (rows 4000-4500)...
Processing chunk 10/14 (rows 4500-5000)...
Processing chunk 11/14 (rows 5000-5500)...
Processing chunk 12/14 (rows 5500-6000)...
Processing chunk 13/14 (rows 6000-6500)...
Processing chunk 14/14 (rows 6500-6500)...
✅ Lyrics cleaning completed!
Saving current progress...
Partial results saved to 'billboard_hot100_partial.csv'


## Genre Extraction using DeepSeek API
In the initial stage, I attempted to retrieve genre information using the Spotify API. However, the Spotify API only provides genres at the artist level, not the song level. This introduced two key limitations:

* Incomplete Coverage: Many songs had missing genre data because the artist's genre wasn't always available via the API.

* Granularity Issues: Artists are often associated with multiple genres, making it challenging to pinpoint a single genre for individual songs. Furthermore, these genres are based on the artist's overall work rather than the specific song, reducing accuracy.

Given these shortcomings, I decided to use deepseek api for more precise and complete genre classification. This approach provided:

* Song-specific genre tagging, which is more accurate than relying on artist-level metadata.

* Higher coverage, especially for older songs or niche tracks not well-covered by Spotify’s API.

This allowed me to classify genres more contextually and accurately, resolving the issues encountered with the Spotify API.

In [17]:
url = "https://api.deepseek.com/models"

payload={}
headers = {
  'Accept': 'application/json',
  'Authorization': 'Bearer sk-e76b7e9c82ce4aeb94fb19e41c9fe29b'
}

response = requests.request("GET", url, headers=headers, data=payload)

print(response.text)

{"object":"list","data":[{"id":"deepseek-chat","object":"model","owned_by":"deepseek"},{"id":"deepseek-reasoner","object":"model","owned_by":"deepseek"}]}


In [18]:
nest_asyncio.apply()

# Configure these constants
API_KEY = "sk-e76b7e9c82ce4aeb94fb19e41c9fe29b"
API_URL = "https://api.deepseek.com/v1/chat/completions"
CONCURRENCY_LIMIT = 30  # Higher since no strict rate limit
REQUEST_TIMEOUT = 300  # 5 minutes for slow responses
MAX_ATTEMPTS = 5

@retry(
    stop=stop_after_attempt(MAX_ATTEMPTS),
    wait=wait_exponential(multiplier=1, min=2, max=30),
    retry=retry_if_exception(lambda e: isinstance(e, (aiohttp.ClientError, asyncio.TimeoutError))))


async def fetch_genre(session, artist, title, identifier):

    prompt = (
        f"Predict the most likely music genre for '{title}' by {artist}. "
        "Output ONLY the genre name with no additional text."
    )

    headers = {"Authorization": f"Bearer {API_KEY}"}

    payload = {
        "model": "deepseek-chat",
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": 8,
        "temperature": 0.1,
        "stop": ["\n"]
    }

    try:
        async with session.post(
            API_URL,
            json=payload,
            headers=headers,
            timeout=aiohttp.ClientTimeout(total=REQUEST_TIMEOUT)
        ) as resp:
            # Handle potential long delays
            start_time = time.time()
            while True:
                if resp.content:
                    try:
                        data = await resp.json()
                        return (identifier, data['choices'][0]['message']['content'].strip())
                    except (aiohttp.ContentTypeError, json.JSONDecodeError):
                        # Handle keep-alive comments
                        raw = await resp.text()
                        if 'choices' in raw:
                            json_start = raw.find('{')
                            if json_start != -1:
                                try:
                                    data = json.loads(raw[json_start:])
                                    return (identifier, data['choices'][0]['message']['content'].strip())
                                except json.JSONDecodeError:
                                    pass

                # Timeout check
                if time.time() - start_time > REQUEST_TIMEOUT:
                    raise asyncio.TimeoutError("Response timeout")

                # Wait before checking again
                await asyncio.sleep(0.5)

    except (aiohttp.ClientError, asyncio.TimeoutError) as e:
        error_type = "[TIMEOUT]" if "timed out" in str(e) else "[ERROR]"
        return (identifier, error_type)
    except Exception as e:
        return (identifier, f"[EXCEPTION] {str(e)}")

async def process_song(session, semaphore, row, identifier):
    async with semaphore:
        return await fetch_genre(session, row['Artist'], row['Title'], identifier)

async def get_genres_for_df(df):
    semaphore = asyncio.Semaphore(CONCURRENCY_LIMIT)
    tasks = []
    results = []

    # Create unique identifier for each row (artist + title + index)
    df['_identifier'] = df.apply(
        lambda row: f"{row['Artist']}|{row['Title']}|{row.name}",
        axis=1
    )

    # Create tasks
    async with aiohttp.ClientSession() as session:
        for idx, row in df.iterrows():
            task = asyncio.create_task(
                process_song(session, semaphore, row, row['_identifier'])
            )
            tasks.append(task)

        # Setup progress bar
        with tqdm(total=len(tasks), desc="Processing songs") as pbar:
            for future in asyncio.as_completed(tasks):
                try:
                    result = await future
                    results.append(result)
                except Exception as e:
                    # Shouldn't happen due to retry, but safe guard
                    identifier = "UNKNOWN_ID"
                    if hasattr(future, '_identifier'):
                        identifier = future._identifier
                    results.append((identifier, f"[UNHANDLED] {str(e)}"))
                finally:
                    pbar.update(1)
                    pbar.set_postfix_str(f"Processed: {pbar.n}/{pbar.total}")

    # Create mapping from identifier to genre
    genre_map = {identifier: genre for identifier, genre in results}

    # Apply results to DataFrame using identifiers
    df['genre'] = df['_identifier'].map(genre_map)
    df.drop(columns=['_identifier'], inplace=True)

    return df

def run_genre_fetch(df):
    total_songs = len(df)

    # Detailed cost estimation
    est_cost = total_songs * 0.000035  # Based on $0.035/1000 songs
    approval = input(
        f"About to process {total_songs} songs\n"
        f"Estimated cost: ${est_cost:.5f}\n"
        f"Concurrency: {CONCURRENCY_LIMIT} requests\n"
        "Proceed? (yes/no): "
    ).strip().lower()

    if approval != 'yes':
        print("Operation cancelled")
        return df

    print("Starting processing... Press Ctrl+C to cancel and save partial results")
    try:
        start_time = time.time()
        df = asyncio.run(get_genres_for_df(df))
        elapsed = time.time() - start_time
        print(f"\nCompleted in {elapsed/60:.1f} minutes")
    except KeyboardInterrupt:
        print("\nProcess interrupted by user - saving partial results")
        return df

    # Analysis
    success_count = df['genre'].apply(
        lambda x: x not in ["", "[ERROR]", "[TIMEOUT]", "[EXCEPTION]", "[UNHANDLED]"] and not x.startswith("[")
    ).sum()
    error_count = total_songs - success_count

    print(f"\nResults: {success_count} succeeded | {error_count} failed")
    if error_count > 0:
        error_types = df[df['genre'].str.startswith('[')]['genre'].value_counts()
        print("Error breakdown:")
        print(error_types.to_string())

    return df


In [None]:
all_years_genre = run_genre_fetch(all_years)

In [22]:
# Filter the rows where genre is '[ERROR]'
error_df = all_years_genre[all_years_genre['genre'] == '[ERROR]'].copy()

# Re-run genre classification on those rows
fixed_genres_df = run_genre_fetch(error_df)

all_years_genre.loc[fixed_genres_df.index, 'genre'] = fixed_genres_df['genre']

About to process 0 songs
Estimated cost: $0.00000
Concurrency: 30 requests
Proceed? (yes/no): yes
Starting processing... Press Ctrl+C to cancel and save partial results


Processing songs: 0it [00:00, ?it/s]


Completed in 0.0 minutes

Results: 0.0 succeeded | 0.0 failed





In [27]:
all_years_genre[all_years_genre['genre'] == '[ERROR]']

all_years_genre = all_years_genre.drop(columns=['Unnamed: 0', 'Unnamed: 0.1'])

Unnamed: 0,Rank,Title,Artist,Lyrics,Year,Lyrics_Cleaned,profanity_words,profanity_count,contains_profanity,genre,consolidated_genre
0,1,The Battle of New Orleans,Johnny Horton,The Battle Of New Orleans\n\n(Jimmy Driftwood)...,1959,battle new orleans 1814 take little trip along...,[],0.0,False,folk,Folk
1,2,Mack the Knife,Bobby Darin,"Oh, the shark, babe, has such teeth, dear\r\nA...",1959,oh shark babe teeth dear show pearly white jac...,"['tawdry', 'tawdry']",2.0,True,Jazz,Jazz
2,3,Personality,Lloyd Price,Over and over\rI tried to prove my love to you...,1959,try prove love friend say fool fool cause get ...,[],0.0,False,rhythm and blues,R&B
3,4,Venus,Frankie Avalon,Hey Venus oh Venus\r\nVenus if you will\r\nPle...,1959,hey venus oh venus venus please send little gi...,[],0.0,False,rock and roll,Rock
4,5,Lonely Boy,Paul Anka,"I'm just a lonely boy, lonely and blue\r\nI'm ...",1959,lonely boy lonely blue alone nothin ' get ever...,[],0.0,False,Rock and Roll,Rock
...,...,...,...,...,...,...,...,...,...,...,...
6495,96,Bulletproof,Nate Smith,Paroles de la chanson Bulletproof par Nate Smi...,2024,parole de la chanson bulletproof par nate smit...,[],0.0,False,country,Country
6496,97,Fe!n,Travis Scott Feat. Playboi Carti,,2024,,,,False,trap,Hip-Hop
6497,98,The Painter,Cody Johnson,She talks about the future like she's flippin'...,2024,talk future like flippin ' magazine find beaut...,[],0.0,False,country,Country
6498,99,Down Bad,Taylor Swift,Did you really beam me up?\r\nIn a cloud of sp...,2024,really beam cloud sparkle dust experiment tell...,"['fuck', 'fuck', 'fuck', 'fuck', 'naked', 'fuc...",18.0,True,pop,Pop


## Generating and Consolidating Genre Data
After obtaining raw genre predictions from DeepSeek, I exported all of the genres into a CSV file (genres.csv). Since the genre outputs from both methods included variations (e.g., synonyms, subgenres, inconsistent casing), I needed to consolidate these into a smaller, cleaner set of genres.

To do this:

* I used ChatGPT to obtain raw genre labels, which I used to map the genres into a consolidated set of main genres.

* This helped normalize entries like "hip hop", "Hip-Hop", and "rap" under a unified label such as "Hip-Hop".

This cleaned and consolidated genre dataset provides a solid foundation for further analysis, such as visualizing genre trends over time or analyzing the relationship between lyrical themes and genre.

In [37]:
genres= pd.DataFrame(all_years_genre['genre'])

Unnamed: 0,Rank,Title,Artist,Lyrics,Year,Lyrics_Cleaned,profanity_words,profanity_count,contains_profanity,genre,consolidated_genre
0,1,The Battle of New Orleans,Johnny Horton,The Battle Of New Orleans\n\n(Jimmy Driftwood)...,1959,battle new orleans 1814 take little trip along...,[],0.0,False,folk,Folk
1,2,Mack the Knife,Bobby Darin,"Oh, the shark, babe, has such teeth, dear\r\nA...",1959,oh shark babe teeth dear show pearly white jac...,"['tawdry', 'tawdry']",2.0,True,Jazz,Jazz
2,3,Personality,Lloyd Price,Over and over\rI tried to prove my love to you...,1959,try prove love friend say fool fool cause get ...,[],0.0,False,rhythm and blues,R&B
3,4,Venus,Frankie Avalon,Hey Venus oh Venus\r\nVenus if you will\r\nPle...,1959,hey venus oh venus venus please send little gi...,[],0.0,False,rock and roll,Rock
4,5,Lonely Boy,Paul Anka,"I'm just a lonely boy, lonely and blue\r\nI'm ...",1959,lonely boy lonely blue alone nothin ' get ever...,[],0.0,False,Rock and Roll,Rock
...,...,...,...,...,...,...,...,...,...,...,...
6495,96,Bulletproof,Nate Smith,Paroles de la chanson Bulletproof par Nate Smi...,2024,parole de la chanson bulletproof par nate smit...,[],0.0,False,country,Country
6496,97,Fe!n,Travis Scott Feat. Playboi Carti,,2024,,,,False,trap,Hip-Hop
6497,98,The Painter,Cody Johnson,She talks about the future like she's flippin'...,2024,talk future like flippin ' magazine find beaut...,[],0.0,False,country,Country
6498,99,Down Bad,Taylor Swift,Did you really beam me up?\r\nIn a cloud of sp...,2024,really beam cloud sparkle dust experiment tell...,"['fuck', 'fuck', 'fuck', 'fuck', 'naked', 'fuc...",18.0,True,pop,Pop


In [None]:
genres.to_csv('genres.csv')

In [29]:
def consolidate_genre(genre):
    genre = str(genre).lower().strip()

    if any(x in genre for x in ['r&b', 'rnb', 'rhythm and blues']):
        return 'R&B'
    elif 'pop' in genre:
        return 'Pop'
    elif 'rock' in genre:
        return 'Rock'
    elif 'hip hop' in genre or 'rap' in genre or 'hip-hop'in genre or 'trap' in genre:
        return 'Hip-Hop'
    elif 'folk' in genre:
        return 'Folk'
    elif 'country' in genre:
        return 'Country'
    elif 'dance' in genre or 'edm' in genre or 'dubstep' in genre or 'house' in genre or 'electronic' in genre or 'afrobeat' in genre or 'afro-beat' in genre or 'techno' in genre:
        return 'Dance/Electronic'
    elif 'metal' in genre:
        return 'Metal'
    elif 'jazz' in genre:
        return 'Jazz'
    elif 'blues' in genre and 'rhythm' not in genre:
        return 'Blues'
    elif 'soul' in genre:
        return 'Soul'
    elif 'gospel' in genre or 'christian' in genre:
        return 'Gospel/Christian'
    elif 'classical' in genre:
        return 'Classical'
    elif 'reggae' in genre or 'reggaeton' in genre:
        return 'Reggae'
    elif genre in ['[error]', 'unknown', '', 'none']:
        return 'Unknown'
    else:
        return genre.title()

In [30]:
all_years_genre['consolidated_genre'] = all_years_genre['genre'].apply(consolidate_genre)

In [31]:
all_years_genre[
    (all_years_genre['consolidated_genre'] == 'Unknown') |
    (all_years_genre['consolidated_genre'] == 'None') |
    (all_years_genre['consolidated_genre'] == '')
]

Unnamed: 0,Rank,Title,Artist,Lyrics,Year,Lyrics_Cleaned,profanity_words,profanity_count,contains_profanity,genre,consolidated_genre


In [8]:
genre_counts = all_years_genre['consolidated_genre'].value_counts()
genre_counts

Unnamed: 0_level_0,count
consolidated_genre,Unnamed: 1_level_1
Pop,1718
Rock,1520
R&B/Soul,1452
Hip-Hop/Rap,718
Country/Folk,499
Other,217
Electronic/Dance,196
Reggae/Ska,72
Jazz/Blues,63
Gospel/Christian,20


91 genres still feels like a lot so we will further consolidate the genres

In [7]:
# Further Consolidating genres
genre_mapping = {
    # Rock
    "Rock": "Rock", "Grunge": "Rock", "Metal": "Rock", "New Wave": "Rock",
    "Post-Hardcore": "Rock", "Glam Rock": "Rock", "Punk": "Rock", "Merseybeat": "Rock",

    # Pop
    "Pop": "Pop", "Synthpop": "Pop", "Adult Contemporary": "Pop", "Easy Listening": "Pop",
    "Schlager": "Pop", "Bubblegum Pop": "Pop",

    # R&B/Soul
    "R&B": "R&B/Soul", "Soul": "R&B/Soul", "Motown": "R&B/Soul", "Funk": "R&B/Soul",
    "Doo-Wop": "R&B/Soul", "New Jack Swing": "R&B/Soul", "Quiet Storm": "R&B/Soul",

    # Hip-Hop
    "Hip-Hop": "Hip-Hop/Rap", "Rap": "Hip-Hop/Rap", "Crunk": "Hip-Hop/Rap",
    "Trap": "Hip-Hop/Rap", "Drill": "Hip-Hop/Rap", "G-Funk": "Hip-Hop/Rap",

    # Country/Folk
    "Country": "Country/Folk", "Folk": "Country/Folk", "Bluegrass": "Country/Folk",
    "Americana": "Country/Folk", "Honky Tonk": "Country/Folk",

    # Jazz/Blues
    "Jazz": "Jazz/Blues", "Blues": "Jazz/Blues", "Big Band": "Jazz/Blues",
    "Swing": "Jazz/Blues", "Dixieland": "Jazz/Blues", "Bossa Nova": "Jazz/Blues",

    # Electronic
    "Electronic": "Electronic/Dance", "Disco": "Electronic/Dance", "House": "Electronic/Dance",
    "Techno": "Electronic/Dance", "Trance": "Electronic/Dance", "Synthwave": "Electronic/Dance",
    "Freestyle": "Electronic/Dance", "Dubstep": "Electronic/Dance",

    # Reggae
    "Reggae": "Reggae/Ska", "Ska": "Reggae/Ska", "Dancehall": "Reggae/Ska",

    # Latin
    "Salsa": "Latin", "Mambo": "Latin", "Bachata": "Latin", "Soca": "Latin",

    # Classical/Soundtrack
    "Classical": "Classical/Soundtrack", "Film Score": "Classical/Soundtrack",
    "Video Game Music": "Classical/Soundtrack", "Musical Theatre": "Classical/Soundtrack",
    "James Bond Theme": "Classical/Soundtrack",

    # Other
    "Gospel/Christian": "Gospel/Christian",
    "Children'S Music": "Children's",
    "Christmas Music": "Holiday",
    "World Music": "World",
    "Exotica": "World",
    "Novelty": "Other",
    "Comedy": "Other",
}

# Map genres to broader categories
all_years_genre["consolidated_genre"] = all_years_genre["consolidated_genre"].map(genre_mapping).fillna("Other")

In [None]:
genre_counts = all_years_genre['consolidated_genre'].value_counts()
genre_counts

## Profanity Detection and Annotation
To enhance the analysis and enable filtering or profiling of lyrical content, I implemented a profanity detection mechanism using two comprehensive open-source lists:

The [zacanger/profane-words JSON list](https://github.com/zacanger/profane-words).

The [LDNOOBW plaintext list](https://github.com/LDNOOBW/List-of-Dirty-Naughty-Obscene-and-Otherwise-Bad-Words).

These lists were combined and normalized into a single set of profane words.

To account for obfuscated or stylized profanity (e.g., "f@ck", "sh1t", "b!tch"), I created a substitution map to fuzzify the spelling of common profane terms. Each word was transformed into a regex pattern capable of matching various leetspeak variants.

The following annotations were added to the dataset:

* profanity_words: a list of all detected profane words in each song's cleaned lyrics.

* profanity_count: a numeric count of profane words per song.

* contains_profanity: a Boolean indicating whether the song contains any profanity.

This will help support both:

* Exploratory analysis (e.g., trends in explicit content over decades).

* Downstream modeling as an additional input feature, such as filtering before topic modeling or correlating with genre or sentiment.

In [38]:
# Fetch the JSON profane-words list
def fetch_profane_words_json():
    url = "https://raw.githubusercontent.com/zacanger/profane-words/master/words.json"
    try:
        response = requests.get(url)
        response.raise_for_status()
        return response.json()
    except Exception as e:
        print("Error fetching profane-words list:", e)
        return []

# Fetch the plaintext LDNOOBW list
def fetch_ldnoobw_list():
    url = "https://raw.githubusercontent.com/LDNOOBW/List-of-Dirty-Naughty-Obscene-and-Otherwise-Bad-Words/master/en"
    try:
        response = requests.get(url)
        response.raise_for_status()
        return response.text.strip().splitlines()
    except Exception as e:
        print("Error fetching LDNOOBW list:", e)
        return []

# Combine both lists into a set
profane_words = fetch_profane_words_json()
ldnoobw_words = fetch_ldnoobw_list()
combined_profanity = set(word.lower() for word in profane_words + ldnoobw_words)

In [40]:
# Substitution mapping
substitution_map = {
    'a': ['a', '@', '4'],
    'b': ['b', '8'],
    'c': ['c', '(', '{', '[', '<'],
    'e': ['e', '3'],
    'g': ['g', '9'],
    'i': ['i', '1', '!', '|'],
    'l': ['l', '1', '|', '!', '£'],
    'o': ['o', '0'],
    's': ['s', '$', '5'],
    't': ['t', '7', '+'],
    'z': ['z', '2']
}

def fuzzify_word(word):
    pattern = ''
    for char in word.lower():
        if char in substitution_map:
            subs = ''.join(re.escape(c) for c in substitution_map[char])
            pattern += f'[{subs}]'
        else:
            pattern += re.escape(char)
    return pattern

def compile_fuzzy_profanity_pattern(word_list):
    fuzzy_patterns = [fuzzify_word(word) for word in word_list]
    return re.compile(r'\b(?:' + '|'.join(fuzzy_patterns) + r')\b', flags=re.IGNORECASE)

profanity_pattern = compile_fuzzy_profanity_pattern(combined_profanity)

def count_profanity(text):
    if not isinstance(text, str) or not text.strip():
        return None
    return len(profanity_pattern.findall(text))

def extract_profanities(text):
    if not isinstance(text, str) or not text.strip():
        return None
    return profanity_pattern.findall(text)

all_years_genre['profanity_words'] = all_years_genre['Lyrics_Cleaned'].apply(extract_profanities)

all_years_genre['profanity_count'] = all_years_genre['profanity_words'].apply(lambda x: len(x) if isinstance(x, list) else None)

all_years_genre['contains_profanity'] = all_years_genre['profanity_count'].apply(lambda x: x is not None and x > 0)

In [50]:
all_years_genre.to_csv('billboard_hot100_cleaned.csv', index=False)

In [1]:
# Save the cleaned dataset to the repository directory
all_years_genre.to_csv('/content/billboard-lyrics-trend-analysis/billboard_hot100_cleaned.csv', index=False)

NameError: name 'all_years_genre' is not defined