In [1]:
# Data is obtained from https://www.kaggle.com/datasets/ishikajohari/taylor-swift-all-lyrics-30-albums

In [1]:
import pandas as pd
import os
import streamlit as st
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from dotenv import load_dotenv
from langchain_groq import ChatGroq
from langchain.prompts import PromptTemplate

In [2]:
def load_data(filename):
    df = pd.read_csv(filename)
    return df

# Data cleanning 

In [3]:
def filter_albums(df):
    substring = ['Deluxe','Disney', 'Live','Stadium','delux','International','Piano'\
                 , 'Version', 'anthology', '3am', 'Dawn',"Cruelest",'Japanese','Albatross',\
                 'Remixes','dropped','Exclusive','digitally','Bolter','Black','Afraid','Cassandra'\
                 ,'sweetest','Carolina','ladies','Chapter','ANTHOLOGY','Night']
    pattern = '|'.join(substring)

    filter = df['Albums'].str.contains(pattern) 
    filtered_df = df[~filter]
    return filtered_df

In [4]:
datapath = '/Users/suchitakulkarni/Dropbox/private/job_applications/industry/GenAI_course/Taylor_lyrics/data/'

In [5]:
df = load_data(os.path.join(datapath,'Albums.csv'))

In [6]:
df.head()

Unnamed: 0,ID,Albums
0,758025,Speak Now (Taylor’s Version)
1,1040217,Midnights (The Late Night Edition)
2,1040211,Midnights (The Til Dawn Edition)
3,1027134,folklore: the long pond studio sessions (Recor...
4,1013719,The More Red (Taylor’s Version) Chapter


In [7]:
print(filter_albums(df)['Albums'])

10                        Midnights
30                     Taylor Swift
31      Fearless (Platinum Edition)
33                              Red
34                             1989
35                       Reputation
36                            Lover
37                         Folklore
38                         Evermore
39                        Speak Now
55    THE TORTURED POETS DEPARTMENT
Name: Albums, dtype: object


## Lyrical retrieval 

In [8]:
song_text = []
song_names_list = []
i = 0
album_song_df = pd.DataFrame(columns = ['Album', 'Song_Name'])
for an_album in filter_albums(df)['Albums'].tolist():
    album_path = os.path.join(datapath,'Albums', an_album)
    if os.path.exists(album_path): 
        song_names = os.listdir(album_path)
        
        for name in song_names:
            album_song_df.loc[i] = [an_album, name.replace(".txt", '')]
            song_names_list.append(name.replace(".txt", ''))
            song_path = os.path.join(album_path,name)
            
            f = open(song_path, "r")
            s = f.read()
            s_begin = s[s.find('Lyrics'):]
            s_end = s_begin[:s_begin.find('Embed')]
            s_cleaned = s_end[:-2]    
            song_text.append(s_cleaned)
            i += 1
print(album_song_df)
album_song_df.to_csv('album_songnames.csv')

         Album           Song_Name
0    Midnights        MidnightRain
1    Midnights              Maroon
2    Midnights           Labyrinth
3    Midnights  YoureOnYourOwn_Kid
4    Midnights      SnowOnTheBeach
..         ...                 ...
109   Evermore      longstoryshort
110   Evermore            goldrush
111   Evermore      nobody_nocrime
112   Evermore   champagneproblems
113   Evermore         coneyisland

[114 rows x 2 columns]


# Testing GenAI concepts

### Recommendation using sentence embeddings 

In [9]:
@st.cache_resource
def load_model():
    return SentenceTransformer("all-MiniLM-L6-v2")

@st.cache_data
def get_embeddings(texts):
    model = load_model()
    return model.encode(texts, show_progress_bar=True)



In [10]:
embeddings = get_embeddings(song_text)
np.save(f"Taylor_song_lyrics_embeddings.npy", embeddings)

2025-06-14 19:21:35.338 
  command:

    streamlit run /opt/anaconda3/envs/genai/lib/python3.10/site-packages/ipykernel_launcher.py [ARGUMENTS]


Batches:   0%|          | 0/4 [00:00<?, ?it/s]



In [11]:
len(embeddings)

114

In [12]:
load_data = np.load(f"Taylor_song_lyrics_embeddings.npy")
load_data_frame = pd.read_csv("album_songnames.csv")
chosen_song = 0
similarities = cosine_similarity([load_data[chosen_song]], load_data)[0]
print(f"Chosen song is {load_data_frame['Song_Name'][chosen_song]} from album {load_data_frame['Album'][chosen_song]}")
for item in np.argsort(similarities)[::10]:
    print(f"Closest song is {load_data_frame['Song_Name'][item]} from album {load_data_frame['Album'][item]}")

Chosen song is MidnightRain from album Midnights
Closest song is marjorie from album Evermore
Closest song is ItsNicetoHaveaFriend from album Lover
Closest song is madwoman from album Folklore
Closest song is evermore from album Evermore
Closest song is dorothea from album Evermore
Closest song is tolerateit from album Evermore
Closest song is thisismetrying from album Folklore
Closest song is ThisLove from album 1989
Closest song is ivy from album Evermore
Closest song is Starlight from album Red
Closest song is IForgotThatYouExisted from album Lover
Closest song is Maroon from album Midnights


In [13]:
load_data = np.load(f"Taylor_song_lyrics_embeddings.npy")

chosen_song = 3
print(f"your chosen song is {song_names_list[chosen_song]}")
similarities = cosine_similarity([embeddings[chosen_song]], embeddings)[0]
for item in np.argsort(similarities)[::10]:
    print(f"Closest song is {album_song_df['Song_Name'][item]} from album {album_song_df['Album'][item]}")

your chosen song is YoureOnYourOwn_Kid
Closest song is Reputation_Prologue_ from album Reputation
Closest song is EverythingHasChanged from album Red
Closest song is Starlight from album Red
Closest song is SnowOnTheBeach from album Midnights
Closest song is MidnightRain from album Midnights
Closest song is willow from album Evermore
Closest song is DontBlameMe from album Reputation
Closest song is CorneliaStreet from album Lover
Closest song is Daylight from album Lover
Closest song is thisismetrying from album Folklore
Closest song is cardigan from album Folklore
Closest song is peace from album Folklore


## Converting song names in understandable dict

In [14]:
load_dotenv()

llm = ChatGroq(model="llama-3.3-70b-versatile") 

def few_shot_song_name_formatter(input_text):
    few_shot_prompt = PromptTemplate(
        input_variables=["input_text"],
        template="""
        Format the name of song into human understandble form. Provide your best guess.
        
        Examples:
        Text: MidnightRain
        Formatted name: Midnight Rain
        
        Text: SoItGoes___
        Formatted name: So It Goes

        Text: Maroon
        Formatted name: Maroon
        
        
        Now, classify the following:
        Text: {input_text}
        Formatted name:
        """
    )
    
    chain = few_shot_prompt | llm
    result = chain.invoke(input_text).content

    result = result.strip()
    if ':' in result:
        result = result.split(':')[1].strip()
    
    return result  

In [15]:
formatted_name = []
for sname in song_names_list: 
    result_name = few_shot_song_name_formatter(sname)
    print(result_name)
    formatted_name.append(result_name)

Midnight Rain
Maroon
Labyrinth
You're On Your Own Kid
Snow On The Beach
Question
Vigilante Shit
Karma
Formatted name
Sweet Nothing
Bejeweled
Formatted name
Anti Hero
Mastermind
Lavender Haze
Come Back Be Here
I Almost Do
Treacherous
The Last Time
All Too Well
State of Grace
Stay Stay Stay
Everything Has Changed
Starlight
Formatted name
The Lucky One
Holy Ground
WeAreNeverEverGettingBackTogether

Formatted name
Red
Sad Beautiful Tragic
Begin Again
I Knew You Were Trouble
22 (or Twenty-Two) 

Since "22" is a number, it can be either written as is or spelled out as "Twenty-Two". Without more context, it's difficult to determine which format is more suitable. However, in the context of song titles, it's common to see numbers written as is, so "22" is a reasonable choice.
Girl at Home
The Moment I Knew
Bad Blood
Blank Space
I Know Places
This Love
Out Of The Woods
Formatted name
Welcome to New York
All You Had to Do Was Stay
Shake It Off
Style
1989 Booklet
I Wish You Would
Clean
How You Get

In [16]:
import copy
formatted_name_safecopy = copy.deepcopy(formatted_name)

In [17]:
formatted_name_safecopy

['Midnight Rain',
 'Maroon',
 'Labyrinth',
 "You're On Your Own Kid",
 'Snow On The Beach',
 'Question',
 'Vigilante Shit',
 'Karma\nFormatted name',
 'Sweet Nothing',
 'Bejeweled\nFormatted name',
 'Anti Hero',
 'Mastermind',
 'Lavender Haze',
 'Come Back Be Here',
 'I Almost Do',
 'Treacherous',
 'The Last Time',
 'All Too Well',
 'State of Grace',
 'Stay Stay Stay',
 'Everything Has Changed',
 'Starlight\nFormatted name',
 'The Lucky One',
 'Holy Ground',
 'WeAreNeverEverGettingBackTogether\n\nFormatted name',
 'Red',
 'Sad Beautiful Tragic',
 'Begin Again',
 'I Knew You Were Trouble',
 '22 (or Twenty-Two) \n\nSince "22" is a number, it can be either written as is or spelled out as "Twenty-Two". Without more context, it\'s difficult to determine which format is more suitable. However, in the context of song titles, it\'s common to see numbers written as is, so "22" is a reasonable choice.',
 'Girl at Home',
 'The Moment I Knew',
 'Bad Blood',
 'Blank Space',
 'I Know Places',
 'This

In [18]:
for i in range(len(formatted_name)):
    if 'Formatted name' in formatted_name[i]: formatted_name[i] = formatted_name[i].replace('Formatted name', '').strip()
print(formatted_name)

['Midnight Rain', 'Maroon', 'Labyrinth', "You're On Your Own Kid", 'Snow On The Beach', 'Question', 'Vigilante Shit', 'Karma', 'Sweet Nothing', 'Bejeweled', 'Anti Hero', 'Mastermind', 'Lavender Haze', 'Come Back Be Here', 'I Almost Do', 'Treacherous', 'The Last Time', 'All Too Well', 'State of Grace', 'Stay Stay Stay', 'Everything Has Changed', 'Starlight', 'The Lucky One', 'Holy Ground', 'WeAreNeverEverGettingBackTogether', 'Red', 'Sad Beautiful Tragic', 'Begin Again', 'I Knew You Were Trouble', '22 (or Twenty-Two) \n\nSince "22" is a number, it can be either written as is or spelled out as "Twenty-Two". Without more context, it\'s difficult to determine which format is more suitable. However, in the context of song titles, it\'s common to see numbers written as is, so "22" is a reasonable choice.', 'Girl at Home', 'The Moment I Knew', 'Bad Blood', 'Blank Space', 'I Know Places', 'This Love', 'Out Of The Woods', '', 'Welcome to New York', 'All You Had to Do Was Stay', 'Shake It Off', 

In [19]:
len(formatted_name)

114

In [20]:
album_song_df['Song_Name']

0            MidnightRain
1                  Maroon
2               Labyrinth
3      YoureOnYourOwn_Kid
4          SnowOnTheBeach
              ...        
109        longstoryshort
110              goldrush
111        nobody_nocrime
112     champagneproblems
113           coneyisland
Name: Song_Name, Length: 114, dtype: object

In [21]:
album_song_df.insert(2, 'Formatted_name', formatted_name)

In [22]:
album_song_df.to_csv('song_names_formatted.csv')

### Sentiment analysis using few shot classification

This part uses prompting techniques discussed in the class. 

In [23]:
load_dotenv()

#llm = ChatGroq(model="llama-3.3-70b-versatile") 
llm = ChatGroq(model = "llama3-70b-8192")

def few_shot_sentiment_classification(input_text):
    few_shot_prompt = PromptTemplate(
        input_variables=["input_text"],
        template="""
        Classify the sentiment into primary and sub-category without any explanation. Use primary categories as Romantic, Non-Romantic. If primary category is Romantic use secondary categories as Heartbreak/Break-up, Romantic Optimism. If primary category is non-Romantic then use Introspection and Stories.
        
        Examples:
        Text: There I was again tonight Forcing laughter, faking smiles Same old tired, lonely place Walls of insincerity Shifting eyes and vacancy Vanished when I saw your face All I can say is it was enchanting to meet you
        Category: Romantic, Romantic Optimism
        
        Text: I don't like your little games Don't like your tilted stage The role you made me play Of the fool, no, I don't like you I don't like your perfect crime How you laugh when you lie You said the gun was mine Isn't cool, no, I don't like you (Oh)
        Category: Romantic, Heartbreak/Break-up
        
        Text: And I chose you The one I was dancing with In New York, no shoes Looked up at the sky and it was The burgundy on my t-shirt When you splashed your wine into me And how the blood rushed into my cheeks So scarlet, it was
        Category: Romantic, Heartbreak/Break-up
        
        Now, classify the following:
        Text: {input_text}
        Category:
        """
    )
    
    chain = few_shot_prompt | llm
    result = chain.invoke(input_text).content

    result = result.strip()
    if ':' in result:
        result = result.split(':')[1].strip()
    
    return result  

In [25]:
song_sentiment_primary = []
song_sentiment_secondary = []
song_text = []
song_names_list = []
for an_album in filter_albums(df)['Albums'].tolist():
    album_path = os.path.join(datapath,'Albums', an_album)
    if os.path.exists(album_path): 
        song_names = os.listdir(album_path)        
        for name in song_names:
            song_names_list.append(name.replace(".txt", ''))
            song_path = os.path.join(album_path,name)
            f = open(song_path, "r")
            test_text = f.read()
            sentiment_primary, sentiment_secondary = few_shot_sentiment_classification(test_text).split(',')
            print(f"{sentiment_primary, sentiment_secondary}\n")
            print('-------------------------')
            song_sentiment_primary.append(sentiment_primary)
            song_sentiment_secondary.append(sentiment_secondary)

('Romantic', ' Heartbreak/Break-up')

-------------------------
('Romantic', ' Heartbreak/Break-up')

-------------------------
('Romantic', ' Romantic Optimism')

-------------------------
('Romantic', ' Heartbreak/Break-up')

-------------------------
('Romantic', ' Romantic Optimism')

-------------------------
('Romantic', ' Heartbreak/Break-up')

-------------------------
('Non-Romantic', ' Introspection')

-------------------------
('Non-Romantic', ' Introspection')

-------------------------
('Romantic', ' Romantic Optimism')

-------------------------
('Romantic', ' Heartbreak/Break-up')

-------------------------
('Non-Romantic', ' Introspection')

-------------------------
('Romantic', ' Romantic Optimism')

-------------------------
('Romantic', ' Romantic Optimism')

-------------------------
('Romantic', ' Heartbreak/Break-up')

-------------------------
('Romantic', ' Heartbreak/Break-up')

-------------------------
('Romantic', ' Romantic Optimism')

--------------------

In [26]:
print(len(song_sentiment_primary))
print(len(song_sentiment_secondary))

114
114


In [27]:
album_song_df

Unnamed: 0,Album,Song_Name,Formatted_name
0,Midnights,MidnightRain,Midnight Rain
1,Midnights,Maroon,Maroon
2,Midnights,Labyrinth,Labyrinth
3,Midnights,YoureOnYourOwn_Kid,You're On Your Own Kid
4,Midnights,SnowOnTheBeach,Snow On The Beach
...,...,...,...
109,Evermore,longstoryshort,Long Story Short
110,Evermore,goldrush,Gold Rush
111,Evermore,nobody_nocrime,Nobody No Crime
112,Evermore,champagneproblems,Champagne Problems


In [28]:
album_song_df.insert(2, 'Song_sentiment_primary', song_sentiment_primary)
album_song_df.insert(3, 'Song_sentiment_secondary', song_sentiment_secondary)
album_song_df.to_csv('song_names_formatted_mod.csv')

In [29]:
print(album_song_df)

         Album           Song_Name Song_sentiment_primary  \
0    Midnights        MidnightRain               Romantic   
1    Midnights              Maroon               Romantic   
2    Midnights           Labyrinth               Romantic   
3    Midnights  YoureOnYourOwn_Kid               Romantic   
4    Midnights      SnowOnTheBeach               Romantic   
..         ...                 ...                    ...   
109   Evermore      longstoryshort               Romantic   
110   Evermore            goldrush               Romantic   
111   Evermore      nobody_nocrime           Non-Romantic   
112   Evermore   champagneproblems               Romantic   
113   Evermore         coneyisland               Romantic   

    Song_sentiment_secondary          Formatted_name  
0        Heartbreak/Break-up           Midnight Rain  
1        Heartbreak/Break-up                  Maroon  
2          Romantic Optimism               Labyrinth  
3        Heartbreak/Break-up  You're On Your Ow

In [30]:
album_song_df

Unnamed: 0,Album,Song_Name,Song_sentiment_primary,Song_sentiment_secondary,Formatted_name
0,Midnights,MidnightRain,Romantic,Heartbreak/Break-up,Midnight Rain
1,Midnights,Maroon,Romantic,Heartbreak/Break-up,Maroon
2,Midnights,Labyrinth,Romantic,Romantic Optimism,Labyrinth
3,Midnights,YoureOnYourOwn_Kid,Romantic,Heartbreak/Break-up,You're On Your Own Kid
4,Midnights,SnowOnTheBeach,Romantic,Romantic Optimism,Snow On The Beach
...,...,...,...,...,...
109,Evermore,longstoryshort,Romantic,Romantic Optimism,Long Story Short
110,Evermore,goldrush,Romantic,Heartbreak/Break-up,Gold Rush
111,Evermore,nobody_nocrime,Non-Romantic,Introspection,Nobody No Crime
112,Evermore,champagneproblems,Romantic,Heartbreak/Break-up,Champagne Problems


In [31]:
#album_song_df.insert(4, 'test_column', album_song_df['Song_sentiment'])

#album_song_df['test_column'] = album_song_df['test_column'].replace({'Confidence/Pride':'Confidence'})
#album_song_df['test_column'].unique()
#print(album_song_df.test_column.isin(['Confidence/Pride']))

KeyError: 'test_column'

#### Let's read the dataframes and combine them

In [33]:
df1 = pd.read_csv('song_names_formatted_mod.csv')

In [34]:
df2 = pd.read_csv('song_names_formatted.csv')
df_merged = pd.DataFrame(columns = ["Album", "Song_name", "primary_sentiment", "secondary_sentiment"])

df_merged["Album"] = df2['Album']
df_merged["Song_name"] = df2['Formatted_name']
df_merged["primary_sentiment"] = df1['Song_sentiment_primary']
df_merged["secondary_sentiment"] = df1['Song_sentiment_secondary']

print(df_merged)

         Album               Song_name primary_sentiment   secondary_sentiment
0    Midnights           Midnight Rain          Romantic   Heartbreak/Break-up
1    Midnights                  Maroon          Romantic   Heartbreak/Break-up
2    Midnights               Labyrinth          Romantic     Romantic Optimism
3    Midnights  You're On Your Own Kid          Romantic   Heartbreak/Break-up
4    Midnights       Snow On The Beach          Romantic     Romantic Optimism
..         ...                     ...               ...                   ...
109   Evermore        Long Story Short          Romantic     Romantic Optimism
110   Evermore               Gold Rush          Romantic   Heartbreak/Break-up
111   Evermore         Nobody No Crime      Non-Romantic         Introspection
112   Evermore      Champagne Problems          Romantic   Heartbreak/Break-up
113   Evermore            Coney Island          Romantic   Heartbreak/Break-up

[114 rows x 4 columns]


In [35]:
df_merged.to_csv('final_data.csv')