In [1]:
import pandas as pd 
import random
import re

In [2]:
# Reduce the filtered dataset to the most popular songs (more than 50,000 views).
# The CSV is read in chunks of one million rows; the matching rows of every chunk
# are collected in a list and concatenated once at the end, so that each row is
# kept exactly once (in file order) and the frame is not rebuilt on every iteration.
popularChunks = []

with pd.read_csv("../filtered_lyrics.csv", chunksize=10**6) as data:
    for chunk in data:
        popularChunks.append(chunk[chunk["views"] > 50000])

# finalSet stays None when the reader yields no chunk at all
finalSet = pd.concat(popularChunks) if popularChunks else None

In [None]:
# builds a query to the final dataframe based on the user's input 

def buildQuery(df, title, artist, genre):
    """Select the rows of ``df`` that match the user's request.

    Surrounding whitespace is removed from every argument before it is used,
    so that typed input such as " Stars " still matches.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with (at least) the columns "title", "artist" and "tag".
    title : str
        Exact song title. When non-blank it takes precedence and the other
        two arguments are ignored.
    artist : str
        Exact artist name. Used, together with ``genre``, only when ``title``
        is blank.
    genre : str
        Value compared against the "tag" column.

    Returns
    -------
    pandas.DataFrame
        The matching rows (possibly none).
    """
    title, artist, genre = title.strip(), artist.strip(), genre.strip()

    if title:
        return df[df["title"] == title]
    if artist:
        return df[(df["artist"] == artist) & (df["tag"] == genre)]
    return df[df["tag"] == genre]

In [58]:
# splits the lyrics of a song into sentences (one list of word/punctuation tokens per line) and counts the total number of tokens

def splitLyrics(lyrics):
    """Tokenise song lyrics line by line.

    Every line of ``lyrics`` becomes one sentence, i.e. a list of tokens.
    Blank (or whitespace-only) lines and lines starting with "[" (section
    headers such as "[Chorus]") are skipped, so no empty sentence is ever
    returned.

    Parameters
    ----------
    lyrics : str
        Full lyrics, with lines separated by "\\n".

    Returns
    -------
    tuple[list[list[str]], int]
        The list of sentences and the total number of tokens over all
        sentences (punctuation marks count as tokens).
    """
    sentences = []
    totalWords = 0

    for line in lyrics.split("\n"):
        # surrounding whitespace never produces a token, so it is dropped
        # before the blank-line and header checks
        line = line.strip()
        if not line or line[0] == "[":
            continue

        # a token is either a word of two or more characters (optionally with
        # inner hyphens/apostrophes and trailing apostrophes) or any single
        # non-whitespace character
        words = re.findall(r'\w+[-\'’]*\w+[\'’]*|\S{1}', line)
        totalWords += len(words)

        sentences.append(words)

    return sentences, totalWords

In [59]:
songIds = []
wordCount = 0

# CoNLL-U files are UTF-8 with LF line endings, so both are set explicitly
# instead of relying on the platform defaults.
with open("formatted_lyrics.conllu", "w", encoding="utf-8", newline="\n") as f:
    # keep asking for songs until at least 5000 tokens have been written
    while wordCount < 5000:
        title = input("Type a song title to request it specifically...")
        artist = input("Insert an artist if you haven't already requested a specific song by title...")
        genre = "rb" # rb, rap, pop...

        query = buildQuery(finalSet, title, artist, genre)

        # extracting only id and lyrics from the built query for convenience
        songData = query[["id", "lyrics"]].values

        if len(songData) == 0:
            # nothing matched: ask again instead of failing on an empty result
            print("No song matches this request, please try again.")
            continue

        # pick one matching song at random; randrange excludes the upper bound,
        # so the index is always valid (also when there is a single match)
        songId, lyrics = songData[random.randrange(len(songData))]

        songIds.append(songId)
        allSentences, newWords = splitLyrics(lyrics)

        # one token per line followed by a placeholder column, and a blank
        # line after every sentence
        for s in allSentences:
            if not s:
                # an empty sentence would be written as a token with no form
                continue
            f.write("\t_\n".join(s))
            f.write("\t_\n\n")

        wordCount += newWords

In [60]:
# Display the complete rows of every song whose id was collected in songIds.
extractedSongs = finalSet["id"].isin(songIds)

# drop_duplicates() collapses identical rows, in case a song is present more than once in finalSet
finalSet.loc[extractedSongs].drop_duplicates()

Unnamed: 0,title,tag,artist,year,views,features,lyrics,id,language_cld3,language_ft,language
1666141,Girls Need Love Remix,rb,Summer Walker & Drake,2019,923216,{},[Intro: Summer Walker]\nHonestly\n\n[Verse 1: ...,4348897,en,en,en
1969980,Luv Is Art,rb,A Boogie wit da Hoodie,2020,106305,"{""Lil Uzi Vert""}",[Intro: A Boogie wit da Hoodie & Lil Uzi Vert]...,5252999,en,en,en
2253,No Scrubs With Rap,rb,TLC,1999,1744368,"{""Lisa \\\""Left Eye\\\"" Lopes""}",[Intro: Chilli]\nOh-oh\n\n[Verse 1: Chilli & L...,2365,en,en,en
61622,Scream,rb,Usher,2012,73484,{},"[Intro]\nUsher, baby\nYeah, yeah\nDid it again...",72527,en,en,en
65551,Sierra Leone,rb,Frank Ocean,2012,554844,{},[Verse 1]\n(We're spendin' too much time alone...,78956,en,en,en
86193,I Should Have Cheated,rb,Keyshia Cole,2005,55055,{},"[Intro]\nOoh, ooh, oh\nBaby\n\n[Verse 1]\nFirs...",118692,en,en,en
142520,Pretty Thoughts,rb,Alina Baraz & Galimatias,2013,70464,{},"[Verse 1]\nYou looked at me with certainty\nI,...",371173,en,en,en
164263,Touchin Lovin,rb,Trey Songz,2014,374994,"{""Nicki Minaj""}","[Intro: Nicki Minaj & Trey Songz]\nOoh, woo, y...",459894,en,en,en
174848,Fuck Em Only We Know,rb,BANKS,2014,77179,{},"[Verse 1]\nI don't mind, don't mind if everybo...",507544,en,en,en
777181,Stars,rb,Nina Simone,2011,62537,{},"[Chorus]\nStars, they come and go\nThey come f...",1584195,en,en,en
