In [1]:
from google.colab import drive
drive.mount('/drive', force_remount=True)

%cd '/drive/MyDrive/Colab Notebooks/INF2190/INF2190 Final Project/'

Mounted at /drive
/drive/MyDrive/Colab Notebooks/INF2190/INF2190 Final Project


In [11]:
# Song titles: using nltk TF and TF IDF to look at frequent words in song titles and in comparison to other song titles

# load the libraries
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from collections import Counter
from textblob import TextBlob
from nltk import pos_tag
from nltk.sentiment import SentimentIntensityAnalyzer as sia
from sklearn.feature_extraction.text import TfidfVectorizer

In [19]:
# Download NLTK resources if not already available
nltk.download('punkt_tab')
nltk.download('stopwords')

# Initialize stop words
stop_words = set(stopwords.words('english'))

# Load dataset
lyrics = pd.read_csv("data/hot-100-unique-lyrics.csv")

lyrics = lyrics.drop(['last_week'], axis=1).dropna()

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
def create_tokens(lyric):
    blob = TextBlob(lyric)
    tokens = blob.words

    # remove stop words and numbers
    word_tokens = [token for token in tokens if token.isalpha()]
    filtered_tokens = [word for word in word_tokens if word not in stop_words]

    # remove stemmed words
    porterStemmer = nltk.PorterStemmer()
    words_stemmed = [porterStemmer.stem(word) for word in filtered_tokens]

    return ' '.join(words_stemmed)

def calculate_stop_words_ratio(lyric):
    blob = TextBlob(lyric)
    tokens = blob.words

    stop_words = set(stopwords.words())
    word_tokens = [token for token in tokens if token.isalpha()]
    stop_tokens = [word for word in word_tokens if word in stop_words]

    return len(stop_tokens) / len(word_tokens)


lyrics['lyrics_tokenized'] = lyrics['lyrics'].apply(create_tokens)
lyrics['stop_words_ratio'] = lyrics['lyrics'].apply(calculate_stop_words_ratio)
lyrics['title_tokenized'] = lyrics['title'].apply(create_tokens)

lyrics

Unnamed: 0,chart_week,current_week,title,performer,peak_pos,wks_on_chart,lyrics,lyrics_tokenized,stop_words_ratio,title_tokenized
0,1989-01-07,1,Every Rose Has Its Thorn,Poison,1,11,We both lie silently still in the dead of the ...,we lie silent still dead night although lie cl...,0.508961,everi rose ha it thorn
1,1989-01-07,74,Peek-A-Boo,Siouxsie & The Banshees,53,13,Creeping up the backstairs\nSlinking into dark...,creep backstair slink dark stall shapeless slu...,0.490196,
2,1989-01-07,73,"Don't Worry, Be Happy (From ""Cocktail"")",Bobby McFerrin,1,24,Here's a little song I wrote\nYou might want t...,here littl song i wrote you might want sing no...,0.441441,do worri be happi from cocktail
3,1989-01-07,72,"Hippy Hippy Shake (From ""Cocktail"")",The Georgia Satellites,45,12,"For goodness' sake\nI got the hippy, hippy sha...",for good sake i got hippi hippi shake yeah i g...,0.379562,hippi hippi shake from cocktail
4,1989-01-07,71,I Beg Your Pardon,Kon Kan,71,3,Here we go\n\nThere once was a time and there ...,here go there time way we someth goin dismay a...,0.611842,i beg your pardon
...,...,...,...,...,...,...,...,...,...,...
12995,2022-01-01,57,Oh My God,Adele,5,5,I ain't got too much time to spend\nBut I'll m...,i ai got much time spend but i make time show ...,0.609524,oh my god
12996,2022-01-01,75,Have Mercy,Chloe,28,15,"Murda on the beat, so it's not nice\nBooty so ...",murda beat nice booti big work lord merci work...,0.545455,have merci
12997,2022-01-01,56,Better Days,NEIKED X Mae Muller X Polo G,29,10,"I've been feeling lonely, I need someone to ho...",i feel lone i need someon hold come babi hold ...,0.579618,better day
12998,2022-01-01,55,Merry Christmas,Ed Sheeran & Elton John,55,3,Build the fire and gather 'round the trees\nFi...,build fire gather tree fill glass mayb come si...,0.586751,merri christma


In [10]:
# apply compound sentiment score
nltk.download('vader_lexicon')
analyzer = sia()

def compoundScore(text):
   scores = analyzer.polarity_scores(text)
   return scores['compound']

lyrics['sentiment_compound_score'] = lyrics['lyrics_tokenized'].apply(compoundScore)
lyrics

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


Unnamed: 0,chart_week,current_week,title,performer,peak_pos,wks_on_chart,lyrics,lyrics_tokenized,stop_words_ratio,title_tokenized,sentiment_compound_score
0,1989-01-07,1,Every Rose Has Its Thorn,Poison,1,11,We both lie silently still in the dead of the ...,we lie silent still dead night although lie cl...,0.508961,everi rose ha it thorn,0.9604
1,1989-01-07,74,Peek-A-Boo,Siouxsie & The Banshees,53,13,Creeping up the backstairs\nSlinking into dark...,creep backstair slink dark stall shapeless slu...,0.490196,,-0.4215
2,1989-01-07,73,"Don't Worry, Be Happy (From ""Cocktail"")",Bobby McFerrin,1,24,Here's a little song I wrote\nYou might want t...,here littl song i wrote you might want sing no...,0.441441,do worri be happi from cocktail,0.8847
3,1989-01-07,72,"Hippy Hippy Shake (From ""Cocktail"")",The Georgia Satellites,45,12,"For goodness' sake\nI got the hippy, hippy sha...",for good sake i got hippi hippi shake yeah i g...,0.379562,hippi hippi shake from cocktail,0.9477
4,1989-01-07,71,I Beg Your Pardon,Kon Kan,71,3,Here we go\n\nThere once was a time and there ...,here go there time way we someth goin dismay a...,0.611842,i beg your pardon,0.9526
...,...,...,...,...,...,...,...,...,...,...,...
12995,2022-01-01,57,Oh My God,Adele,5,5,I ain't got too much time to spend\nBut I'll m...,i ai got much time spend but i make time show ...,0.609524,oh my god,0.9769
12996,2022-01-01,75,Have Mercy,Chloe,28,15,"Murda on the beat, so it's not nice\nBooty so ...",murda beat nice booti big work lord merci work...,0.545455,have merci,0.9960
12997,2022-01-01,56,Better Days,NEIKED X Mae Muller X Polo G,29,10,"I've been feeling lonely, I need someone to ho...",i feel lone i need someon hold come babi hold ...,0.579618,better day,0.9958
12998,2022-01-01,55,Merry Christmas,Ed Sheeran & Elton John,55,3,Build the fire and gather 'round the trees\nFi...,build fire gather tree fill glass mayb come si...,0.586751,merri christma,0.9979


In [17]:
# Initialize stop words and term frequency counter
stop_words = set(stopwords.words('english'))
term_frequency = Counter()

# Iterate over each row in the 'lyrics' column
for lyrics in lyrics['lyrics']:
    # Tokenize and clean up lyrics
    tokens = word_tokenize(lyrics.lower())  # Tokenize and lowercase
    filtered_tokens = [word for word in tokens if word.isalnum() and word not in stop_words]  # Remove non-alphanumeric and stopwords

    # Update the term frequency counter
    term_frequency.update(filtered_tokens)

# Print the top 20 most common terms
print("Top 20 Term Frequency:", term_frequency.most_common(20))

Top 20 Term Frequency: [('like', 38983), ('yeah', 36552), ('got', 34854), ('know', 34852), ('love', 33672), ('na', 28865), ('get', 26611), ('oh', 26422), ('baby', 23762), ('go', 20140), ('let', 19216), ('one', 16732), ('ai', 16476), ('wan', 15980), ('ca', 15588), ('want', 15364), ('gon', 14853), ('make', 14828), ('see', 14691), ('never', 14661)]


In [20]:
# Initialize stop words and term frequency counter
stop_words = set(stopwords.words('english'))
term_frequency = Counter()

# Iterate over each row in the 'lyrics' column
for title in lyrics['title']:
    # Tokenize and clean up lyrics
    tokens = word_tokenize(title.lower())  # Tokenize and lowercase
    filtered_tokens = [word for word in tokens if word.isalnum() and word not in stop_words]  # Remove non-alphanumeric and stopwords

    # Update the term frequency counter
    term_frequency.update(filtered_tokens)

# Print the top 20 most common terms
print("Top 20 Term Frequency:", term_frequency.most_common(20))

Top 20 Term Frequency: [('love', 636), ('like', 217), ('one', 165), ('get', 162), ('go', 159), ('girl', 151), ('na', 151), ('time', 139), ('heart', 130), ('back', 124), ('let', 118), ('night', 118), ('good', 116), ('know', 115), ('ca', 114), ('life', 114), ('got', 113), ('way', 113), ('baby', 111), ('want', 103)]


In [12]:
###### lyrics text analysis using the example code form course, but the output is strange and am unsure if an error

# Initialize TF-IDF vectorizer
tfidf = TfidfVectorizer(stop_words='english',
                        max_features=1000,  # Limit to top 1000 features
                        ngram_range=(1, 3))  # Use unigrams, bigrams, and trigrams

# Transform the 'text' column to TF-IDF features
transformed_text = tfidf.fit_transform(lyrics['lyrics_tokenized'])

# Get feature names
feature_names = tfidf.get_feature_names_out()

# Convert to DataFrame for better visualization
spotify_songs_df = pd.DataFrame(transformed_text.toarray(), columns=feature_names)

# Display the resulting DataFrame
print(spotify_songs_df.head())

   act  act like  actin  afraid   ah  ah ah  ah ah ah  ahead        ai  \
0  0.0       0.0    0.0     0.0  0.0    0.0       0.0    0.0  0.000000   
1  0.0       0.0    0.0     0.0  0.0    0.0       0.0    0.0  0.000000   
2  0.0       0.0    0.0     0.0  0.0    0.0       0.0    0.0  0.056419   
3  0.0       0.0    0.0     0.0  0.0    0.0       0.0    0.0  0.000000   
4  0.0       0.0    0.0     0.0  0.0    0.0       0.0    0.0  0.000000   

   ai gon  ...  yeah yeah  yeah yeah yeah  year  yesterday   yo  york  young  \
0     0.0  ...        0.0             0.0   0.0        0.0  0.0   0.0    0.0   
1     0.0  ...        0.0             0.0   0.0        0.0  0.0   0.0    0.0   
2     0.0  ...        0.0             0.0   0.0        0.0  0.0   0.0    0.0   
3     0.0  ...        0.0             0.0   0.0        0.0  0.0   0.0    0.0   
4     0.0  ...        0.0             0.0   0.0        0.0  0.0   0.0    0.0   

   yuh   作曲   作词  
0  0.0  0.0  0.0  
1  0.0  0.0  0.0  
2  0.0  0.0  0.0 

In [None]:
## code for the final data set which has year and song lyrics, to look at lyrics frequency count grouped by year

# Drop missing values in 'year' and 'lyrics' columns
spotify_lyrics = lyrics.dropna(subset=['year', 'lyrics'])

# Ensure year is treated as a string (or integer, depending on your dataset)
spotify_lyrics['year'] = spotify_lyrics['year'].astype(str)

# Initialize stop words and term frequency counter
stop_words = set(stopwords.words('english'))

# the term frequency dictionary lists for each year
yearly_term_frequencies = {}

# Iterate over each song's lyrics and group by year
for idx, row in spotify_lyrics.iterrows():
    year = row['year']
    lyrics = row['lyrics']

    tokens = word_tokenize(lyrics.lower())
    filtered_tokens = [word for word in tokens if word not in stop_words]

    if year not in yearly_term_frequencies:
        yearly_term_frequencies[year] = Counter()

    # Update the term frequency counter for that year
    yearly_term_frequencies[year].update(filtered_tokens)

# Print term frequency for each year
for year, tf in yearly_term_frequencies.items():
    print(f"Term Frequency for year {year}:")
    print(tf)
    print("\n")