In [1]:
# Import required libraries
import string

import nltk
import pandas as pd
from gensim.models import KeyedVectors
from nltk.corpus import wordnet

# Make sure the WordNet corpus is available locally for the `wordnet` reader
# imported above. The call logs its progress and reports when the package is
# already up to date.
nltk.download('wordnet')


[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/armanenginsucu/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
# Step 1: Load your data files
# Both CSVs are read from the current working directory.
lyrics_df = pd.read_csv('billboard-lyrics-spotify.csv')
profanity_df = pd.read_csv('profanity_en.csv')

# Preview the first rows of each dataset (heading line, then the frame).
print("Lyrics Data:", lyrics_df.head(), sep="\n")
print("\nProfanity Data:", profanity_df.head(), sep="\n")


Lyrics Data:
            artist_all          artist_base  rank                       song  \
0          percy faith          percy faith     1  theme from a summer place   
1           jim reeves           jim reeves     2           he'll have to go   
2  the everly brothers  the everly brothers     3              cathy's clown   
3       johnny preston       johnny preston     4               running bear   
4         mark dinning         mark dinning     5                 teen angel   

   year artist_featured                 song_clean         artist_clean  \
0  1960             NaN  theme from a summer place          percy faith   
1  1960             NaN            hell have to go           jim reeves   
2  1960             NaN               cathys clown  the everly brothers   
3  1960             NaN               running bear       johnny preston   
4  1960             NaN                 teen angel         mark dinning   

                                              lyrics  a

In [3]:
# Step 2: Build the dictionaries from 'profanity_en.csv'
def _present_values(row, columns):
    """Return the values of ``columns`` in ``row`` that exist and are not NaN/None."""
    values = (row.get(column) for column in columns)
    return [value for value in values if pd.notna(value)]


def build_dictionaries(profanity_df):
    """Collect canonical word forms per content category.

    Each row is inspected through its ``category_1``..``category_3`` columns.
    When any of them contains the substring ``'sexual'``, ``'racial'`` or
    ``'religious'``, every non-missing value among ``canonical_form_1``..
    ``canonical_form_3`` of that row is added to the matching set. A row may
    contribute to more than one set.

    Missing values (NaN/None) and absent columns are skipped, so NaN is never
    stored as a dictionary term.

    Parameters
    ----------
    profanity_df : pandas.DataFrame
        Frame holding the ``category_*`` and ``canonical_form_*`` columns.

    Returns
    -------
    tuple[set, set, set]
        ``(sexual_content_dict, racial_content_dict, religious_content_dict)``.
    """
    category_columns = ('category_1', 'category_2', 'category_3')
    form_columns = ('canonical_form_1', 'canonical_form_2', 'canonical_form_3')

    # Keyword searched inside the category labels -> terms collected for it.
    terms_by_keyword = {'sexual': set(), 'racial': set(), 'religious': set()}

    for _, row in profanity_df.iterrows():
        # str() keeps the substring test safe even for non-string labels.
        labels = [str(label) for label in _present_values(row, category_columns)]
        if not labels:
            continue
        forms = _present_values(row, form_columns)

        for keyword, terms in terms_by_keyword.items():
            if any(keyword in label for label in labels):
                terms.update(forms)

    return (
        terms_by_keyword['sexual'],
        terms_by_keyword['racial'],
        terms_by_keyword['religious'],
    )

# Build the three seed dictionaries (sexual, racial, religious) from the profanity table
(
    sexual_content_dict,
    racial_content_dict,
    religious_content_dict,
) = build_dictionaries(profanity_df)


In [4]:
# Step 3: Extend dictionaries using Word2Vec (Pre-trained Model)
# Read the pre-trained vectors from the binary word2vec-format file in the
# current working directory.
word2vec_model = KeyedVectors.load_word2vec_format(
    'GoogleNews-vectors-negative300.bin',
    binary=True,
)

def find_synonyms_word2vec(word, model, topn=10):
    """Return the ``topn`` terms the model ranks as most similar to ``word``.

    ``model.most_similar`` yields ``(term, score)`` pairs; only the terms are
    kept, in the order the model returned them. If the lookup raises
    ``KeyError``, an empty list is returned.
    """
    try:
        return [
            neighbour
            for neighbour, _similarity in model.most_similar(word, topn=topn)
        ]
    except KeyError:
        return []

def extend_dictionary_with_word2vec(dictionary, model):
    """Return a new set: ``dictionary`` plus the Word2Vec neighbours of each entry.

    Neighbours come from ``find_synonyms_word2vec`` with its default ``topn``.
    The input collection itself is not modified.
    """
    neighbour_lists = (find_synonyms_word2vec(term, model) for term in dictionary)
    return set(dictionary).union(*neighbour_lists)

# Step 4: Extend dictionaries using WordNet (NLTK)
def find_wordnet_synonyms(word):
    """Return the set of lemma names found across every WordNet synset of ``word``.

    Names are collected unmodified, exactly as ``lemma.name()`` reports them.
    The result is an empty set when WordNet has no synsets for the word.
    """
    return {
        lemma.name()
        for synset in wordnet.synsets(word)
        for lemma in synset.lemmas()
    }

def extend_dictionary_with_wordnet(dictionary):
    """Return a new set: ``dictionary`` plus the WordNet lemma names of each entry.

    Lemma names come from ``find_wordnet_synonyms``. The input collection
    itself is not modified.
    """
    lemma_sets = (find_wordnet_synonyms(term) for term in dictionary)
    return set(dictionary).union(*lemma_sets)

# Extend each dictionary with both Word2Vec and WordNet
def _extend_and_normalize(seed_terms, model):
    """Expand ``seed_terms`` with Word2Vec, then WordNet, and lower-case the result.

    Both sources can return case-sensitive entries (e.g. capitalised proper
    nouns). The lyric matcher compares against lower-cased tokens, so an entry
    containing upper-case letters could never match; lower-casing here makes
    those entries usable. Non-string entries are dropped.

    NOTE(review): multi-word entries (joined with underscores by the sources)
    are kept as-is and still cannot match a single whitespace-delimited token.
    """
    expanded = extend_dictionary_with_wordnet(
        extend_dictionary_with_word2vec(seed_terms, model)
    )
    return {term.lower() for term in expanded if isinstance(term, str)}


extended_sexual_content_dict = _extend_and_normalize(sexual_content_dict, word2vec_model)
extended_racial_content_dict = _extend_and_normalize(racial_content_dict, word2vec_model)
extended_religious_content_dict = _extend_and_normalize(religious_content_dict, word2vec_model)


In [5]:
# Step 6: Define a function to check lyrics against the extended dictionaries
def check_category(lyrics, dictionary):
    """Count the tokens in ``lyrics`` that appear in ``dictionary``.

    The text is lower-cased and split on whitespace. A token counts as a hit
    when it is in ``dictionary`` either verbatim or after stripping leading and
    trailing punctuation, so ``"word,"`` and ``"(word)"`` match the entry
    ``"word"``. The verbatim check runs first so entries that themselves
    contain punctuation keep matching.

    Parameters
    ----------
    lyrics : str or NaN/None
        Lyric text; missing values yield a count of 0.
    dictionary : collection of str
        Terms to look for. Only entries that are already lower-case can match.

    Returns
    -------
    int
        Number of matching tokens (repeated occurrences are all counted).
    """
    if pd.isna(lyrics):  # Handle NaN values
        return 0

    hits = 0
    for token in lyrics.lower().split():
        if token in dictionary:
            hits += 1
            continue
        # Retry without surrounding punctuation; skip tokens that were
        # punctuation only (they strip down to an empty string).
        bare = token.strip(string.punctuation)
        if bare and bare in dictionary:
            hits += 1
    return hits

# Step 7: Add new features to the lyrics DataFrame based on the extended dictionaries
# Per-song match counts: check_category receives each lyric plus the dictionary keyword.
lyrics_df['sexual_content_count'] = lyrics_df['lyrics'].apply(
    check_category, dictionary=extended_sexual_content_dict
)
lyrics_df['racial_content_count'] = lyrics_df['lyrics'].apply(
    check_category, dictionary=extended_racial_content_dict
)
lyrics_df['religious_content_count'] = lyrics_df['lyrics'].apply(
    check_category, dictionary=extended_religious_content_dict
)

# Add binary features for presence of content (1 when the count is positive, else 0)
lyrics_df['contains_sexual_content'] = lyrics_df['sexual_content_count'].gt(0).astype('int64')
lyrics_df['contains_racial_content'] = lyrics_df['racial_content_count'].gt(0).astype('int64')
lyrics_df['contains_religious_content'] = lyrics_df['religious_content_count'].gt(0).astype('int64')


In [6]:
# Step 8: Save the updated dataset with the extended dictionaries applied
# The row index is not written to the file.
lyrics_df.to_csv(
    path_or_buf='updated_billboard_lyrics_extended.csv',
    index=False,
)

# Display the first few rows of the updated dataset
print(lyrics_df.head(5))


            artist_all          artist_base  rank                       song  \
0          percy faith          percy faith     1  theme from a summer place   
1           jim reeves           jim reeves     2           he'll have to go   
2  the everly brothers  the everly brothers     3              cathy's clown   
3       johnny preston       johnny preston     4               running bear   
4         mark dinning         mark dinning     5                 teen angel   

   year artist_featured                 song_clean         artist_clean  \
0  1960             NaN  theme from a summer place          percy faith   
1  1960             NaN            hell have to go           jim reeves   
2  1960             NaN               cathys clown  the everly brothers   
3  1960             NaN               running bear       johnny preston   
4  1960             NaN                 teen angel         mark dinning   

                                              lyrics  acousticness  