In [3]:
!pip install -q transformers einops accelerate langchain bitsandbytes
!pip install sentencepiece



In [None]:
#My huggingFace credentials:
# 
!huggingface-cli login



    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|
    
    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Token: 

In [None]:
from langchain import PromptTemplate,  LLMChain
from langchain import HuggingFacePipeline
from transformers import AutoTokenizer
import transformers
import torch
import re
import pandas as pd
import time
from tqdm import tqdm

In [None]:
#@title clean_text

def clean_text(text, remove_punct = False, remove_new_line = True):
    '''
    given a string of text, remove brackets, (remove puntuations, optional) and remove escape sequences

    @params:
        text : string
    @returns:
        string which is cleaned
    '''

    clean_text = re.sub(r'\([^)]*\)', '', text).strip()
    clean_text = re.sub(r'\{[^)]*\}', '', clean_text).strip()
    clean_text = re.sub(r'\[[^)]*\]', '', clean_text).strip()

    #remove punctuations using string.punctuation
    if remove_punct:
        clean_text = clean_text.translate(str.maketrans('', '', string.punctuation.replace("-", "")))

    #remove escape sequences
    if remove_new_line:
      clean_text = clean_text.replace('\n', ' ')
    clean_text = clean_text.replace('\t', ' ')
    clean_text = clean_text.replace('\r', ' ')
    clean_text = clean_text.replace('\x0c', ' ')

    #remove extra spaces
    clean_text = re.sub(' +', ' ', clean_text).strip()

    return clean_text

In [None]:
#@title model declaration
model = "meta-llama/Llama-2-7b-chat-hf"

tokenizer = AutoTokenizer.from_pretrained(model)

pipeline = transformers.pipeline(
    "text-generation", #task
    model=model,
    tokenizer=tokenizer,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
    device_map="auto",
    max_length=1000,
    do_sample=True,
    top_k=10,
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id
)

In [None]:
llm = HuggingFacePipeline(pipeline = pipeline, model_kwargs = {'temperature':0})

In [None]:
DATA_PATH = "/content/drive/MyDrive/MusicProject/data_clean.csv"
OUTPUT_PATH = "/content/drive/MyDrive/MusicProject/notebooks/output/"

df = pd.read_csv(DATA_PATH)

In [None]:
print(df.shape)
df_en = df[df['language'] == 'en']
print(df_en.shape)

(189866, 11)
(129306, 11)


In [None]:
sample_song = df_en[(df_en['artist'] == 'Taylor Swift') & (df_en['title'] == 'I Knew You Were Trouble.')]
sample_song_lyrics = clean_text(sample_song["lyrics_clean"].values[0] , remove_new_line = False)
sample_song_title = sample_song["title"].values[0]
sample_song_artist = sample_song["artist"].values[0]

sample_params = {
    "singer_name" : sample_song_artist,
    "song_name" : sample_song_title,
    "song_lyrics" : sample_song_lyrics,
}


In [None]:
pop_famous_artists = ["Taylor Swift", "Selena Gomez", "Dua Lipa", "Ariana Grande", "Justin Bieber", "Billie Eilish",
                      "Lady Gaga", "Adele" , "Britney Spears", "Katy Perry" , "Ed Sheeran", "Rihanna", "Bruno Mars",
                      "Demi Lovato", "Miley Cyrus", "Jennifer Lopez", "Charlie Puth"]

# get the rows in which the artist is in the list above and tag is pop
# df_pop_famous = df_en[(df_en['artist'].isin(pop_famous_artists)) & (df_en['tag'] == 'pop')]
# df_pop_famous.shape

df_pop_not_famous = df_en[(df_en['tag'] == 'pop') & (~df_en['artist'].isin(pop_famous_artists))]
df_pop_not_famous.shape


(32825, 11)

In [None]:
template = """
              You are a music analyst. You analyze the song lyrics and then write what the song is about.
              In this particular case, write the desciption of the song ```{song_name}``` by ```{singer_name}```
              The output description should be around 150 words.
              Do not mention the name of artist or the name of the song in the output description. Just directly explain what the song is about.

              The lyrics of the song are:
              ############## Lyrics start ##############
              ```{song_lyrics}```
              ############## Lyrics end ##############

              The output description should be around 150 words.
              Do not mention the name of artist or the name of the song in the output description. Just directly explain what the song is about.

              Description:
              """

input_variables = ["singer_name", "song_name", "song_lyrics"]

prompt = PromptTemplate( template = template, input_variables = input_variables)

llm_chain = LLMChain(prompt=prompt, llm=llm)

In [None]:
# iterate through df_en and generate the description for each song using the model llm_chain.run
# use params depending on the row that is being iterated
# create a new dataframe and store the description in it along with all the other columns in the original dataframe
# create a batch size of 20 and keep saving the dataframe in the file after every batch

new_df = pd.DataFrame(columns = df_pop_not_famous.columns.tolist() + ['description'])
batch_size = 20
batch_number = 1
total_time_taken = 0
actual_iteration_number = 0
# iterate through the whole dataframe df_en
for i in tqdm( range( 0, df_pop_not_famous.shape[0] ) ):
    # if i <= 1360 : #total batches done * 20
    #   batch_number = 69
    #   # check if the df_en.iloc[i]['artist'] contains one of the pop_famous_artists
        # if any( j in df_en.iloc[i]['artist'] for j in pop_famous_artists ):
          # print("\n found one")
          # print("i = ", i)
    #   continue


    if (i+1) % batch_size == 0:
        print("")
        print("Average time till now is ", total_time_taken / (actual_iteration_number + 1))
        # print("saving file for batch number : ", str(batch_number))
        new_df.to_csv(OUTPUT_PATH + str("data_clean_en_with_description_llma2_") + str(batch_number)+ "_popNotFamous" + str(".csv"))
        # new_df.to_csv("data/temp.csv")
        new_df = pd.DataFrame(columns = df_pop_not_famous.columns.tolist() + ['description'])
        batch_number += 1

    else:
        start_time = time.time()
        # create params dictionary using the columns artist, title, lyrics_clean
        params = {
            "singer_name" : df_pop_not_famous.iloc[i]['artist'],
            "song_name" : df_pop_not_famous.iloc[i]['title'],
            "song_lyrics" : df_pop_not_famous.iloc[i]['lyrics_clean']
        }
        model_output_description = llm_chain.run(params)
        model_output_description = clean_text (model_output_description)

        # generate the description using llm_chain.run(params)
        # model_output_description = llm_chain.run(params)
        # model_output_description= " description of the song"

        # add the description to the new_df along with all the other columns
        # new_df = new_df.append(df_en.iloc[i].append(pd.Series([model_output_description], index=['description'])), ignore_index=True)
        row_data = df_pop_not_famous.iloc[i].to_dict()
        row_data['description'] = model_output_description
        new_df = pd.concat([new_df, pd.DataFrame(row_data, index=[0])], ignore_index=True)


        end_time = time.time()
        total_time_taken += (end_time - start_time)
    actual_iteration_number += 1



  0%|          | 19/32825 [03:38<99:34:39, 10.93s/it] 


Average time till now is  10.89818263053894





Average time till now is  11.852512294054032


  0%|          | 40/32825 [07:55<108:12:04, 11.88s/it]


OutOfMemoryError: ignored

In [None]:
x = llm_chain.run(params)

In [None]:
clean_text(x)



In [None]:
text = """
They told him, "Don't you ever come around here
Don't wanna see your face, you better disappear"
The fire's in their eyes and their words are really clear
So beat it, just beat it (Ooh!)
You better run, you better do what you can (Ooh!)
Don't wanna see no blood, don't be a macho man (Ooh!)
You wanna be tough, better do what you can
So beat it, but you wanna be bad

Just beat it, beat it, beat it, beat it
No one wants to be defeated
Showin' how funky and strong is your fight
It doesn't matter who's wrong or right
Just beat it (Beat it)
Just beat it (Beat it)
Just beat it (Beat it)
Just beat it (Beat it, uh)

They're out to get you, better leave while you can
Don't wanna be a boy, you wanna be a man
You wanna stay alive, better do what you can
So beat it, just beat it (Ooh!)
You have to show them that you're really not scared (Ooh!)
You're playin' with your life, this ain't no truth or dare (Ooh!)
They'll kick you, then they'll beat you, then they'll tell you it's fair
So beat it, but you wanna be bad

Just beat it, beat it, beat it, beat it
No one wants to be defeated
Showin' how funky and strong is your fight
It doesn't matter who's wrong or right
Just beat it, beat it, beat it, beat it
No one wants to be defeated
Showin' how funky and strong is your fight
It doesn't matter who's wrong or right
Just beat it (Beat it, beat it, beat it)
Beat it (Beat it, beat it, ha, ha, ha, ha)
Beat it (Beat it, beat it)
Beat it (Beat it, beat it)

Beat it, beat it, beat it, beat it
No one wants to be defeated
Showin' how funky and strong is your fight
It doesn't matter who's wrong or right (Who's right)
Just beat it, beat it, beat it, beat it (Hoo-hoo!)
No one wants to be defeated (Oh, lord)
Showin' how funky (Hee-hee!) and strong is your fight (Hee-hee-hee!)
It doesn't matter who's wrong or right
Just beat it, beat it, beat it, beat it (Beat it!)
No one wants to be defeated (Oh-no!)
Showin' how funky (Hoo-hoo!) and strong is your fight (Hee-hee! Hoo!)
It doesn't matter who's wrong or right
Just beat it, beat it, beat it, beat it
No one wants to be defeated
Showin' how funky and strong is your fight
It doesn't matter who's wrong or right (Who's right)
Just beat it, beat it, beat it, beat it (Hoo-hoo!)
No one wants to be defeated
"""

x = llm_chain.run(text)

In [None]:
clean_text(x)

'The song "Beat It" by Michael Jackson is about standing up for oneself in the face of adversity. The lyrics describe a situation where someone is being bullied and told to leave, but the person refuses to back down. The song encourages the listener to show their strength and fight back against their oppressors. The lyrics also emphasize the importance of not being defeated and to keep on fighting.'