In [1]:
import pandas as pd

In [102]:
# Location of the lyrics dataset CSV, relative to the notebook's working directory.
CSV_FILEPATH = "data/data_clean_en_3.csv"
# Load the full dataset; other cells read its 'artist', 'description', 'title', 'id'
# and 'lyrics_clean_with_newline' columns.
df = pd.read_csv(CSV_FILEPATH)
# Show (rows, columns) as a quick sanity check of the load.
df.shape

(2558, 9)

In [101]:
# Few-shot prompt template. The {desc_N} / {lyrics_N} placeholders (N = 1..5) receive the
# retrieved example songs and {desc_user} receives the user's own song description; the
# `params` dicts built in other cells use exactly these eleven keys.
pre_prompt="""You are a English song Lyricist. Famous singers come to you with descriptions of the kind of song they want you to write.
They also give you some examples of the song lyrics based on the description. 
Output only the lyrics of the song that you should write. do not output anything else.
The avg number of words in lyrics should be around 500-600 and it should have some chorus and verses.

Format of the examples is:
-- Description: <description of the song>
-- Lyrics : <lyrics of the song>

----------------------------------- EXAMPLES START -----------------------------------
-- Description 1: ```{desc_1}``` 
-- Lyrics 1: ```{lyrics_1}``` 

-- Description 2: ```{desc_2}``` 
-- Lyrics 2: ```{lyrics_2}``` 

-- Description 3: ```{desc_3}``` 
-- Lyrics 3: ```{lyrics_3}``` 

-- Description 4: ```{desc_4}``` 
-- Lyrics 4: ```{lyrics_4}``` 

-- Description 5: ```{desc_5}``` 
-- Lyrics 5: ```{lyrics_5}``` 

----------------------------------- EXAMPLES END -----------------------------------

Note that the avg number of words in lyrics should be around 500-600 and it should have some repeating chorus and verses. 

-- Description: ```{desc_user}``` 

-- Lyrics : 

"""

In [97]:
#@title QA_Pipeline_TfIdf { form-width: "20%" }

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity 
import numpy as np
import sys



class QA_Pipeline_TfIdf:
    """TF-IDF retrieval helper: fit a vectorizer on a list of documents and
    return the documents most similar (by cosine similarity) to a query string.

    State is created by `createTfIdfTable`, which sets `self.vectorizer` and
    `self.tfidfTransformed_docs`; `calculate_similarity` and `RunTfIdf` require
    that it has been called first.
    """

    def __init__(self):
        """Create an empty pipeline; no TF-IDF state exists until `createTfIdfTable` runs."""
        pass

    def normalizeData(self, data, isNormalizeAgainstMax=False):
        """Scale the values of `data`.

        Args:
            data: sequence of numbers.
            isNormalizeAgainstMax: if True divide every value by `max(data)`,
                otherwise divide by `sum(data)` so the result sums to 1.

        Returns:
            `data` itself, unchanged, when its sum is 0; otherwise a new list
            of the scaled values.
        """
        total = sum(data)
        if total == 0:
            return data
        # Compute the divisor once rather than once per element.
        divisor = max(data) if isNormalizeAgainstMax else total
        return [float(value) / divisor for value in data]

    def create_tfidf_features(self, data, max_features=5000, max_df=0.85, min_df=2):
        """Fit a scikit-learn `TfidfVectorizer` on `data`.

        Args:
            data: iterable of document strings.
            max_features, max_df, min_df: forwarded to `TfidfVectorizer`.

        Returns:
            (tf-idf matrix of `data`, fitted vectorizer).
        """
        vectorizer = TfidfVectorizer(decode_error='replace', strip_accents='unicode', analyzer='word',
                                     stop_words='english', ngram_range=(1, 1), max_features=max_features,
                                     norm='l2', use_idf=True, smooth_idf=True, sublinear_tf=True,
                                     max_df=max_df, min_df=min_df)
        X = vectorizer.fit_transform(data)
        print('tfidf matrix successfully created.')
        return X, vectorizer

    def calculate_similarity(self, query, top_k=20):
        """Rank the fitted documents by cosine similarity to `query`.

        Args:
            query: iterable of query strings, as accepted by the vectorizer's
                `transform` (`RunTfIdf` passes a one-element list).
            top_k: number of best matches to keep.

        Returns:
            (indices, scores): document indices ordered from most to least
            similar, and their similarity scores divided by the best score
            (left unscaled when every score is 0).
        """
        # Vectorize the query into the same feature space as the documents.
        query_vec = self.vectorizer.transform(query)
        cosine_similarities = cosine_similarity(self.tfidfTransformed_docs, query_vec).flatten()

        # One sort gives the ranking; the scores are read back through the same
        # indices, so indices and scores cannot drift apart.
        most_similar_doc_indices = np.argsort(cosine_similarities, axis=0)[:-top_k-1:-1]
        top_scores = cosine_similarities[most_similar_doc_indices]

        top_scores = self.normalizeData(top_scores, isNormalizeAgainstMax=True)
        return most_similar_doc_indices, top_scores

    def RunTfIdf(self, question, top_n=20, getDocuments=False, data=None):
        """Find the `top_n` documents most similar to `question`.

        Args:
            question: query string.
            top_n: number of matches to return.
            getDocuments: when True also return the matching documents.
            data: the documents originally passed to `createTfIdfTable`;
                required when `getDocuments` is True.

        Returns:
            (indices, scores), or (indices, scores, documents) when
            `getDocuments` is True.

        Raises:
            ValueError: `getDocuments` is True but `data` is None. (Previously
                this called `sys.exit`, which is not appropriate inside a
                helper method.)
        """
        # `is None` instead of `== None`: the latter is element-wise, and therefore
        # ambiguous in a boolean context, for numpy arrays and pandas objects.
        if getDocuments and data is None:
            raise ValueError('getDocuments=True requires `data`, but data is None')

        top_idx, cosine_similarities = self.calculate_similarity([question], top_k=top_n)

        if not getDocuments:
            return top_idx, cosine_similarities

        retData = [data[i] for i in top_idx]
        return top_idx, cosine_similarities, retData

    def createTfIdfTable(self, data, maxFeatures=10000):
        """Fit the vectorizer on `data` and store the matrix and vectorizer on `self`."""
        self.tfidfTransformed_docs, self.vectorizer = self.create_tfidf_features(data, max_features=maxFeatures)

    def get_nearest_songs(self, data_df, text, top_n=5):
        """Return the songs whose descriptions are most similar to `text`.

        The TF-IDF table is re-fitted on `data_df['description']` on every call.

        Args:
            data_df: DataFrame with 'id', 'title', 'description' and
                'lyrics_clean_with_newline' columns.
            text: query string.
            top_n: maximum number of songs to return; fewer are returned when
                `data_df` has fewer rows.

        Returns:
            (ids, titles, descriptions, lyrics): four parallel lists ordered
            from most to least similar.
        """
        df_desc = data_df['description'].tolist()
        self.createTfIdfTable(df_desc, maxFeatures=10000)

        # Only the ranking is needed here; the rows are looked up below.
        top_idx, _ = self.RunTfIdf(text, top_n=top_n)

        # Work on positional lists: top_idx holds positions, not DataFrame index labels.
        df_titles = data_df['title'].tolist()
        df_ids = data_df['id'].tolist()
        df_lyrics = data_df['lyrics_clean_with_newline'].tolist()

        ids = [df_ids[i] for i in top_idx]
        titles = [df_titles[i] for i in top_idx]
        desc = [df_desc[i] for i in top_idx]
        lyrics = [df_lyrics[i] for i in top_idx]

        return ids, titles, desc, lyrics
    


In [98]:
# Keep only the rows whose 'artist' is Eminem; this subset is the retrieval corpus used below.
eminem_df = df[df['artist']=='Eminem']
# Plain-list views of the two text columns (not referenced by the other visible cells).
eminem_desc = eminem_df['description'].tolist()
eminem_lyrics = eminem_df['lyrics_clean_with_newline'].tolist()

    

In [99]:
# Retrieval helper; its TF-IDF table is fitted inside get_nearest_songs, not here.
tfidf = QA_Pipeline_TfIdf()

In [100]:
# Example user description: used as the retrieval query here and as the {desc_user}
# value when the prompt parameters are built.
text = "You are a liar. you lied to me. you lied to my kid. you are dead to me. I will never forget what you did. I will never forgive you."
# Retrieve the Eminem songs whose descriptions are closest to `text` (top_n defaults to 5).
ids, titles, desc, lyrics = tfidf.get_nearest_songs(eminem_df, text)

tfidf matrix successfully created.


In [None]:
# Map the five retrieved examples onto the template placeholders
# (desc_1..desc_5, then lyrics_1..lyrics_5), followed by the user's description.
params = {f"desc_{slot + 1}": desc[slot] for slot in range(5)}
params.update({f"lyrics_{slot + 1}": lyrics[slot] for slot in range(5)})
params["desc_user"] = text


In [None]:
template = pre_prompt
# Names of the placeholders used in the template. They must be strings: the bare
# identifiers desc_1, lyrics_1, ... are not defined as variables, so listing them
# unquoted raises NameError.
input_variables = [
    "desc_1", "lyrics_1",
    "desc_2", "lyrics_2",
    "desc_3", "lyrics_3",
    "desc_4", "lyrics_4",
    "desc_5", "lyrics_5",
    "desc_user",
]

In [None]:
#@title generate_lyrics function
def generate_lyrics(desc_user, data_df, prompt_template, input_variables, specific_artist=None):
    '''
    Generate song lyrics for a user's description with a few-shot prompt.

    The five songs whose descriptions are closest to `desc_user` are retrieved
    with QA_Pipeline_TfIdf and used to fill the prompt placeholders.

    Args:
        desc_user: the user's description of the desired song.
        data_df: DataFrame of songs; needs the columns read by
            QA_Pipeline_TfIdf.get_nearest_songs, plus 'artist' when
            `specific_artist` is given.
        prompt_template: template text with desc_1..desc_5, lyrics_1..lyrics_5
            and desc_user placeholders.
        input_variables: passed through to `initialize_model` together with the
            template.
        specific_artist: optional artist name. When given, a style instruction
            is inserted into the template and examples are drawn from that
            artist only, unless the artist has fewer than 5 songs, in which
            case the whole of `data_df` is used.

    Returns:
        Whatever `llm_chain.run(params)` returns.
        NOTE(review): `initialize_model` is not defined in the visible cells;
        it is assumed to return an object exposing `.run(dict)` - confirm.
    '''
    template = prompt_template
    sub_df = data_df

    if specific_artist is not None:
        # Insert the style instruction right before the "Format of the examples is:" line.
        marker = "Format of the examples is:"
        template = template.replace(marker, "the Lyrics should be in the style of " + specific_artist + ".\n" + marker)

        try:
            artist_df = data_df[data_df['artist']==specific_artist]
        except KeyError:
            # Only a missing 'artist' column is expected here; a bare `except`
            # would also hide unrelated failures and KeyboardInterrupt.
            print("Artist wasnt found. taking the whole data")
        else:
            # The prompt needs 5 examples; an unknown artist yields 0 rows and lands here too.
            if artist_df.shape[0] < 5:
                print("This artist has less than 5 songs. taking the whole data")
            else:
                sub_df = artist_df

    llm_chain = initialize_model(template, input_variables)

    tfidf = QA_Pipeline_TfIdf()
    _, _, desc, lyrics = tfidf.get_nearest_songs(sub_df, desc_user)

    # Fill desc_1..desc_5 and lyrics_1..lyrics_5 from the retrieved examples.
    params = {}
    for slot in range(5):
        params[f"desc_{slot + 1}"] = desc[slot]
        params[f"lyrics_{slot + 1}"] = lyrics[slot]
    params["desc_user"] = desc_user

    model_output = llm_chain.run(params)

    return model_output

In [None]:
# Manual run of the generation steps for Taylor Swift. The intermediate
# ids / desc / lyrics stay at notebook scope because other cells read them.
taylor_df = df[df['artist']=='Taylor Swift']

desc_user = "You are a liar. you lied to me. you lied to my kid. you are dead to me. I will never forget what you did. I will never forgive you."

tfidf = QA_Pipeline_TfIdf()

ids, titles, desc, lyrics = tfidf.get_nearest_songs(taylor_df, desc_user)

# Fill desc_1..desc_5 and lyrics_1..lyrics_5 from the retrieved examples.
params = {}
for slot in range(5):
    params[f"desc_{slot + 1}"] = desc[slot]
    params[f"lyrics_{slot + 1}"] = lyrics[slot]
params["desc_user"] = desc_user

# `llm_chain` is otherwise only a local variable inside generate_lyrics, so on a
# fresh kernel `llm_chain.run` raised NameError. Build the chain the same way
# generate_lyrics does.
# NOTE(review): `initialize_model` is not defined in the visible cells - confirm it is available.
prompt_variables = [name for slot in range(1, 6) for name in (f"desc_{slot}", f"lyrics_{slot}")]
prompt_variables.append("desc_user")
llm_chain = initialize_model(pre_prompt, prompt_variables)

model_output = llm_chain.run(params)



tfidf matrix successfully created.


In [76]:
# Whitespace-separated word count of the prompt text as it stands at this point.
len(pre_prompt.split())

91

In [77]:
# Append every retrieved example to the prompt text: its description line first,
# then its lyrics line followed by a blank line.
for example_no in range(len(ids)):
    description_line = "-- Description: " + desc[example_no] + "\n"
    lyrics_line = "-- Lyrics: " + lyrics[example_no] + "\n\n"
    pre_prompt += description_line + lyrics_line

In [78]:
# Word count of the prompt text again, to see how much the appended examples added.
len(pre_prompt.split())

5034

In [79]:
desc[0]

'The song "Rock Bottom" explores the struggles and hardships of a person who is living in poverty and facing constant financial difficulties. The lyrics depict a life filled with empty promises, broken dreams, and a constant feeling of hopelessness. The protagonist is trapped in a cycle of dead-end jobs with low pay, constantly being hired and fired, and living in a house without basic necessities. The song also touches upon the desperation and frustration that arises from wanting a better life but feeling unable to attain it. The lyrics convey a sense of anger and sadness, as the protagonist contemplates the idea of resorting to criminal activities to survive. Overall, "Rock Bottom" portrays the emotional turmoil and despair that can arise from living in poverty and the desire for a better life.'

In [80]:
# Close the examples section, then append the user's description and an empty
# lyrics slot for the model to complete.
pre_prompt += (
    "\n\n ------------------------------ EXAMPLES END ------------------------------  \n Now the user's actual query. Note that the avg number of words in lyrics should be around 500-600 and it should have some chorus and verses. \n "
    + "-- Description: " + text + "\n"
    + "-- Lyrics: " + " " + "\n\n"
)

In [81]:
# Word count of the prompt text after the user's description has been appended.
len(pre_prompt.split())

5098

In [82]:
# Save the assembled prompt text to a file for inspection.
output_filepath = "temp_preprompt.txt"

try:
    # Write as UTF-8 explicitly: the platform default encoding may be unable to
    # represent non-ASCII characters in the lyrics, and the resulting
    # UnicodeEncodeError is not an OSError, so it would escape the handler below.
    with open(output_filepath, 'w', encoding='utf-8') as file:
        file.write(pre_prompt)
    print("String saved to", output_filepath)
except OSError as e:  # IOError is an alias of OSError in Python 3
    print("An error occurred while saving the file:", str(e))

String saved to temp_preprompt.txt


In [114]:
# Check whether an artist is present in the dataset: a first dimension of 0 means no rows matched.
df[df['artist']=='Eric Clapton'].shape


(0, 9)