In [1]:
# Import the necessary libraries
import string

import nltk
import pandas as pd
from gensim.models import KeyedVectors
from nltk.corpus import wordnet

# Download the 'wordnet' dataset from NLTK, which is useful for finding word synonyms.
# This is the corpus that the `wordnet` reader imported above queries for synsets.
# The call logs its progress and, as the cell output shows, evaluates to True when the
# package is already present and up to date.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/armanenginsucu/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
# Step 1: Read both input CSV files into DataFrames (song lyrics first, then the profanity list)
lyrics_df = pd.read_csv('billboard-lyrics-spotify.csv')
profanity_df = pd.read_csv('profanity_en.csv')

# Preview the first rows of each table to get a sense of the data.
# sep="\n" puts each heading on its own line directly above the frame it describes.
print("Lyrics Data:", lyrics_df.head(), sep="\n")
print("\nProfanity Data:", profanity_df.head(), sep="\n")

Lyrics Data:
            artist_all          artist_base  rank                       song  \
0          percy faith          percy faith     1  theme from a summer place   
1           jim reeves           jim reeves     2           he'll have to go   
2  the everly brothers  the everly brothers     3              cathy's clown   
3       johnny preston       johnny preston     4               running bear   
4         mark dinning         mark dinning     5                 teen angel   

   year artist_featured                 song_clean         artist_clean  \
0  1960             NaN  theme from a summer place          percy faith   
1  1960             NaN            hell have to go           jim reeves   
2  1960             NaN               cathys clown  the everly brothers   
3  1960             NaN               running bear       johnny preston   
4  1960             NaN                 teen angel         mark dinning   

                                              lyrics  a

In [3]:
# Step 2: Create dictionaries for sexual, racial, and religious content from 'profanity_en.csv'
def build_dictionaries(profanity_df):
    """Build the sexual, racial and religious word sets from the profanity table.

    Each row may carry up to three category labels (``category_1`` .. ``category_3``)
    and up to three spellings of the word (``canonical_form_1`` .. ``canonical_form_3``).
    A row contributes all of its non-missing spellings to a category's set when the
    category keyword occurs as a substring of any of the row's category labels.

    Parameters
    ----------
    profanity_df : pandas.DataFrame
        Table with the category and canonical-form columns named above. Columns
        that are absent are treated as if every value in them were missing.

    Returns
    -------
    tuple of (set, set, set)
        ``(sexual_content_dict, racial_content_dict, religious_content_dict)``,
        in that order.
    """
    category_columns = ('category_1', 'category_2', 'category_3')
    form_columns = ('canonical_form_1', 'canonical_form_2', 'canonical_form_3')

    # Keyword searched for inside the category labels -> word set collected for it
    word_sets = {'sexual': set(), 'racial': set(), 'religious': set()}

    for _, row in profanity_df.iterrows():
        # row.get() yields None for an absent column, which pd.notna() filters out
        # together with NaN cells; str() keeps the substring test safe for
        # non-string labels.
        labels = [
            str(label)
            for label in (row.get(column) for column in category_columns)
            if pd.notna(label)
        ]

        # Every canonical form gets the same missing-value guard, including the
        # first one, so NaN can never end up inside a word set.
        forms = [
            form
            for form in (row.get(column) for column in form_columns)
            if pd.notna(form)
        ]

        for keyword, words in word_sets.items():
            if any(keyword in label for label in labels):
                words.update(forms)

    return word_sets['sexual'], word_sets['racial'], word_sets['religious']

# Build the dictionaries using the function.
# build_dictionaries returns the sets in the order (sexual, racial, religious),
# so the unpacking targets below must stay in that same order.
sexual_content_dict, racial_content_dict, religious_content_dict = build_dictionaries(profanity_df)


In [4]:
# Step 3: Use Word2Vec to extend the dictionaries with related words
# Load the pre-trained Word2Vec model (GoogleNews vectors).
# binary=True tells gensim that the file is in the binary word2vec format.
# NOTE(review): this reads the whole embedding file, so it is presumably the slowest
# step of the notebook — confirm before re-running the cell unnecessarily.
word2vec_model = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)

# Function to find top n synonyms for a given word using Word2Vec
def find_synonyms_word2vec(word, model, topn=10):
    """Return the ``topn`` words that ``model`` ranks as most similar to ``word``.

    ``model.most_similar`` is expected to yield ``(word, score)`` pairs; only the
    words are kept, in the order the model returned them. A ``KeyError`` (the
    word is not in the model's vocabulary) results in an empty list.
    """
    neighbours = []
    try:
        for neighbour, _similarity in model.most_similar(word, topn=topn):
            neighbours.append(neighbour)
    except KeyError:
        # Unknown word: nothing to suggest
        return []
    return neighbours

# Function to extend the dictionary with synonyms using Word2Vec
def extend_dictionary_with_word2vec(dictionary, model):
    """Return a new set holding ``dictionary`` plus the Word2Vec neighbours of each entry.

    The input is not modified. Neighbours are looked up only for the original
    entries, not for the neighbours that get added along the way.
    """
    return set(dictionary).union(
        *(find_synonyms_word2vec(entry, model) for entry in dictionary)
    )

# Step 4: Use WordNet (NLTK) to further extend the dictionaries with synonyms
# Function to find synonyms using WordNet (lexical database)
def find_wordnet_synonyms(word):
    """Return the set of lemma names found across every WordNet synset of ``word``.

    Names are collected exactly as WordNet reports them; no case or separator
    normalisation is applied here. A word without synsets gives an empty set.
    """
    return {
        lemma.name()
        for synset in wordnet.synsets(word)
        for lemma in synset.lemmas()
    }

# Function to extend the dictionary with synonyms from WordNet
def extend_dictionary_with_wordnet(dictionary):
    """Return a new set holding ``dictionary`` plus the WordNet synonyms of each entry.

    The input is not modified. Synonyms are looked up only for the entries that
    were already in ``dictionary``, not for the synonyms added along the way.
    """
    return set(dictionary).union(*map(find_wordnet_synonyms, dictionary))

# Grow each category dictionary in two passes: Word2Vec neighbours are added first
# (inner call), then WordNet synonyms of everything collected so far (outer call).

# Sexual content
extended_sexual_content_dict = extend_dictionary_with_wordnet(
    extend_dictionary_with_word2vec(sexual_content_dict, word2vec_model)
)

# Racial content
extended_racial_content_dict = extend_dictionary_with_wordnet(
    extend_dictionary_with_word2vec(racial_content_dict, word2vec_model)
)

# Religious content
extended_religious_content_dict = extend_dictionary_with_wordnet(
    extend_dictionary_with_word2vec(religious_content_dict, word2vec_model)
)


In [5]:
# Step 6: Define a function to check if lyrics contain words from a specific dictionary
def check_category(lyrics, dictionary):
    """Count how many tokens of ``lyrics`` occur in ``dictionary``.

    The lyrics are lower-cased and split on whitespace. A token counts as a match
    when it is in ``dictionary`` either as-is or after punctuation has been
    stripped from both of its ends, so "hell," and "(damn)" are recognised while
    entries that themselves contain punctuation (e.g. "a$$") still match verbatim.
    Every occurrence is counted, not just distinct words.

    Parameters
    ----------
    lyrics : str or missing value
        Song text. NaN/None yields 0; other non-string values are converted
        with ``str()`` before tokenising.
    dictionary : set of str
        Words to look for. Lower-case entries are expected, because the tokens
        are lower-cased before the lookup.

    Returns
    -------
    int
        Number of matching tokens.
    """
    if pd.isna(lyrics):  # Missing lyrics cannot contain any match
        return 0

    matches = 0
    for token in str(lyrics).lower().split():
        if token in dictionary:
            matches += 1
            continue
        # Retry without surrounding punctuation; skip tokens that were pure
        # punctuation so an empty string is never looked up.
        bare = token.strip(string.punctuation)
        if bare and bare in dictionary:
            matches += 1
    return matches

# Step 7: Add new features to the DataFrame based on the extended dictionaries
# Count columns: for every song, the number of lyric words found in each extended dictionary
count_features = (
    ('sexual_content_count', extended_sexual_content_dict),
    ('racial_content_count', extended_racial_content_dict),
    ('religious_content_count', extended_religious_content_dict),
)
for count_column, vocabulary in count_features:
    lyrics_df[count_column] = lyrics_df['lyrics'].apply(check_category, args=(vocabulary,))

# Binary columns: 1 when the matching count column is greater than 0, otherwise 0
flag_features = (
    ('contains_sexual_content', 'sexual_content_count'),
    ('contains_racial_content', 'racial_content_count'),
    ('contains_religious_content', 'religious_content_count'),
)
for flag_column, source_column in flag_features:
    lyrics_df[flag_column] = lyrics_df[source_column].apply(lambda hits: 1 if hits > 0 else 0)


In [6]:
# Step 8: Save the updated DataFrame to a new CSV file
# This file will now include the new features based on the dictionaries we built
# (the three *_content_count columns and the three contains_*_content flags).
# index=False keeps the DataFrame's row index out of the written file.
lyrics_df.to_csv('updated_billboard_lyrics_extended.csv', index=False)

# Display the first few rows of the updated DataFrame to check if everything looks correct
print(lyrics_df.head())

            artist_all          artist_base  rank                       song  \
0          percy faith          percy faith     1  theme from a summer place   
1           jim reeves           jim reeves     2           he'll have to go   
2  the everly brothers  the everly brothers     3              cathy's clown   
3       johnny preston       johnny preston     4               running bear   
4         mark dinning         mark dinning     5                 teen angel   

   year artist_featured                 song_clean         artist_clean  \
0  1960             NaN  theme from a summer place          percy faith   
1  1960             NaN            hell have to go           jim reeves   
2  1960             NaN               cathys clown  the everly brothers   
3  1960             NaN               running bear       johnny preston   
4  1960             NaN                 teen angel         mark dinning   

                                              lyrics  acousticness  