In [1]:
import pandas as pd 
import random
import re

In [2]:
finalSet = None

# reducing the filtered dataset to keep most popular songs
with pd.read_csv("../filtered_lyrics.csv", chunksize=10**6) as data:
    for chunk in data: 
        reduced = chunk[chunk["views"] >= 500]

        if finalSet is None:
            finalSet = reduced

        finalSet = pd.concat([reduced, finalSet])

In [3]:
# builds a query to the final dataframe based on the user's input 

def buildQuery(df, title, artist, genre, language): 
    if len(title.strip()) > 0: 
        if len(artist.strip()) > 0: 
            return df[(df["title"] == title) & (df["artist"] == artist) & (df["language"] == language)].drop_duplicates()
        else: 
            return df[(df["title"] == title) & (df["language"] == language)].drop_duplicates()
    elif len(artist.strip()) > 0: 
        return df[(df["artist"] == artist) & (df["tag"] == genre) & (df["language"] == language)]
    else: 
        return df[(df["tag"] == genre) & (df["language"] == language)]

In [4]:
# split the lyrics of the song into sentences (with word-based separation) and counts its total words

def splitLyrics(lyrics): 
    divison = lyrics.split("\n")
    sentences = []
    totalWords = 0

    for sentence in divison:
        if len(sentence) <= 0 or sentence[0] == "[": 
            continue

        words = re.findall(r'\w+[-\'’]*\w+[\'’]*|\S{1}', sentence)
        totalWords += len(words)

        sentences.append(words)

    return sentences, totalWords

In [5]:
# Remember to change the language and genre variables! They are hardcoded due to its low variability in value

songIds = []
wordCount = 0

with open("test_r&B_lyrics_es.conll", "w") as f: 
    while wordCount < 2500:  
        title = str(input("Type a song title to request it specifically...")) 
        artist = str(input("Insert an artist if you really want to..."))
        language = "es" # en & es
        genre = "rb" # rb, rap & pop

        query = buildQuery(finalSet, title, artist, genre, language)
        
        if query.shape[0] > 1:
            singleSong = query.sample(1)
            songData = list(singleSong[["id", "lyrics"]].values[0]) # extracting only id and lyrics from the built query for convenience
        else:
             songData = list(query[["id", "lyrics"]].values[0]) # extracting only id and lyrics from the built query for convenience
        
        songId, lyrics = songData[0], songData[1]

        if songId not in songIds: 
            songIds.append(songId)
            allSentences, newWords = splitLyrics(lyrics)

            for s in allSentences:
                    f.write("\t_\n".join(s))
                    f.write("\t_\n\n")

            wordCount += newWords
            print(wordCount)

439
809
1086
1454
2034
2434
2883


In [6]:
# see full data of extracted songs
extractedSongs = finalSet["id"].isin(songIds)

# pandas does a weird thing sometimes adding rows twice here (even when using id which is unique), thefore .drop_duplicates() kicks in
finalSet[extractedSongs].drop_duplicates() 

Unnamed: 0,title,tag,artist,year,views,features,lyrics,id,language_cld3,language_ft,language
2272993,Se Me Olvida,rb,Yoshi,2020,4856,"{""Jesse Baez""}","[Letra de ""Se Me Olvida"" ft. Jesse Baez]\n\n[I...",6139755,es,es,es
2513312,COMO CUANDO,rb,Feid,2021,2971,{},"[Letra de ""COMO CUANDO""]\n\n[Intro]\nUy, uh\nA...",6891977,es,es,es
1359878,Otra Noche en Miami,rb,Bad Bunny,2018,276854,{},"[Letra de ""Otra Noche en Miami""]\n\n[Intro]\nY...",3424254,es,es,es
1574288,Donde Estuviera,rb,Melymel,2018,715,{},"[Letra de ""Donde Estuviera""]\n\n[Intro]\nYo qu...",4069440,es,es,es
1651269,11:11,rb,Legarda,2018,2154,{},"[Letra de ""11:11""]\n\n[Verso 1]\nTengo la cert...",4303784,es,es,es
1850767,Amor,rb,Bele,2019,25172,{Beéle},"[Letra de ""Amor""]\n\n[Coro]\nIgnórame todo lo ...",4909066,es,es,es
203766,Déjame Enseñarte,rb,Joantony,2015,716,"{""Myke Towers""}","(Intro)\n\n(Joan Antonio)\nTe puedo recordar, ...",718727,es,es,es
