Pandas library

In [None]:
from collections import Counter

import pandas as pd

Load the CSV file, then drop the songs whose genre is `Not Available` or `Other`.

In [None]:
# Read the lyrics table and discard the rows labelled with a placeholder genre.
excluded_genres = ['Not Available', 'Other']
df = pd.read_csv('./data/lyrics_def.csv')
df = df[~df['genre'].isin(excluded_genres)]
# Remaining number of rows.
len(df)

216671

In [None]:
# Number of rows whose lyrics text already appeared on an earlier row.
int(df.duplicated(subset='lyrics').sum())

13962

In [None]:
# For every distinct lyrics text keep the row that comes first once the rows
# are ordered by ascending year, then restore the previous row order and
# renumber the index from 0.
df = (
    df.sort_values(by='year', ascending=True)
      .drop_duplicates(subset='lyrics', keep='first')
      .sort_index()
      .reset_index(drop=True)
)
df

Unnamed: 0,song,year,artist,genre,lyrics,language
0,ego-remix,2009,beyonce-knowles,Pop,"Oh baby, how you doing?\nYou know I'm gonna cu...",en
1,then-tell-me,2009,beyonce-knowles,Pop,"playin' everything so easy,\nit's like you see...",en
2,honesty,2009,beyonce-knowles,Pop,If you search\nFor tenderness\nIt isn't hard t...,en
3,you-are-my-rock,2009,beyonce-knowles,Pop,"Oh oh oh I, oh oh oh I\n[Verse 1:]\nIf I wrote...",en
4,black-culture,2009,beyonce-knowles,Pop,"Party the people, the people the party it's po...",en
...,...,...,...,...,...,...
202704,who-am-i-drinking-tonight,2012,edens-edge,Country,"I gotta say\nBoy, after only just a couple of ...",en
202705,liar,2012,edens-edge,Country,I helped you find her diamond ring\nYou made m...,en
202706,last-supper,2012,edens-edge,Country,Look at the couple in the corner booth\nLooks ...,en
202707,christ-alone-live-in-studio,2012,edens-edge,Country,When I fly off this mortal earth\nAnd I'm meas...,en


In [None]:
# Sanity check: count the rows whose lyrics text repeats an earlier row.
int(df.duplicated(subset='lyrics').sum())

0

In [None]:
# Number of rows in the frame at this point.
df.shape[0]

202709

Songs that share the same title are potential duplicates

In [None]:
# Group the rows by song title.
song_occurrences = df.groupby('song')
# Keep only the rows whose title is carried by at least two rows.
song_occurrences_over_1 = song_occurrences.filter(lambda group: group.shape[0] >= 2)
# Mapping: title -> index labels of all rows carrying that title.
duplicate_song_groups = song_occurrences_over_1.groupby('song').groups

In [None]:
# (number of distinct titles, number of titles carried by several rows)
n_titles = len(song_occurrences.groups)
n_repeated_titles = len(duplicate_song_groups)
n_titles, n_repeated_titles

(143677, 20968)

Each element of the dictionary includes the indexes of the songs with that specific title

A list with the "duplicated" indexes for each song is created and it is added as a new column in the dataframe.

We use this method to avoid any issues with the pandas library.

In [None]:
# For every row, the index labels of all rows sharing its title (the row's own
# label included), or an empty list when the title occurs only once.
duplicates_idx = [
    duplicate_song_groups[title].tolist() if title in duplicate_song_groups else []
    for title in df['song']
]

df['duplicate_idx'] = duplicates_idx

In [None]:
# Inspect every row titled 'honesty'.
df.loc[df['song'] == 'honesty']

Unnamed: 0,song,year,artist,genre,lyrics,language,duplicate_idx
2,honesty,2009,beyonce-knowles,Pop,If you search\nFor tenderness\nIt isn't hard t...,en,"[2, 8641, 33382, 63543, 98959, 109173, 119239]"
8641,honesty,2013,billy-joel,Rock,If you search for tenderness\nIt isn't hard to...,en,"[2, 8641, 33382, 63543, 98959, 109173, 119239]"
33382,honesty,2013,editors,Rock,Don't you want me to stay?\nMake a rocking hor...,en,"[2, 8641, 33382, 63543, 98959, 109173, 119239]"
63543,honesty,2005,alex-parks,Pop,Your face when I try to explain\nI didn't thin...,en,"[2, 8641, 33382, 63543, 98959, 109173, 119239]"
98959,honesty,2009,attack-attack,Rock,"Honesty didn't get me anywhere,\nI know cause ...",en,"[2, 8641, 33382, 63543, 98959, 109173, 119239]"
109173,honesty,2006,77s,Rock,Alright\nTell me what I said\nYou better watch...,en,"[2, 8641, 33382, 63543, 98959, 109173, 119239]"
119239,honesty,2006,alannah-myles,Rock,"Oh, did I strike the chord of honesty in you?\...",en,"[2, 8641, 33382, 63543, 98959, 109173, 119239]"


In [None]:
# Inspect every row titled 'slow-love'.
df.loc[df['song'] == 'slow-love']

Unnamed: 0,song,year,artist,genre,lyrics,language,duplicate_idx
8,slow-love,2009,beyonce-knowles,Pop,[Verse 1:]\nI read all of the magazines\nwhile...,en,"[8, 114832, 137184]"
114832,slow-love,2006,gino-vannelli,Pop,"I want some slow love, the kind that lasts for...",en,"[8, 114832, 137184]"
137184,slow-love,2009,brandy,Pop,[verse 1]\nI read of all the magazines while w...,en,"[8, 114832, 137184]"


### Tokenization

In [None]:
import nltk
from nltk import word_tokenize as tokenize

In [None]:
# Tokenizer that emits each maximal run of word characters as one token.
tokenizer = nltk.RegexpTokenizer(pattern=r"\w+")

In [None]:
# Lower-case every lyrics text and split it into tokens, one list per row.
lyrics_token = [tokenizer.tokenize(text.lower()) for text in df['lyrics']]

In [None]:
# Attach the token lists to the frame as the 'nlp_lyrics' column.
df = df.assign(nlp_lyrics=lyrics_token)

Two songs with the same title are considered equivalent if they share at least 7 of their first 10 tokens. For every song we then record the earliest year found among the song itself and its equivalent songs.

In [None]:
# For every row, compare the opening tokens of its lyrics with those of each
# other row that has the same title. Produces two lists aligned with df's rows:
#   min_year_list   - earliest year among the row and its matching rows
#   duplicated_list - 1 when at least one matching row exists, else 0
min_year_list = list()
duplicated_list = [0] * len(df)
num_token = 10
# Two lyrics match when they share at least 70% of the compared window.
min_shared = num_token * 0.7

for idx, row in df.iterrows():
    min_year = row.year
    head = Counter(row.nlp_lyrics[:num_token])

    for i in row.duplicate_idx:
        # duplicate_idx also contains the row's own label.
        if i == idx:
            continue

        other_head = Counter(df.loc[i, 'nlp_lyrics'][:num_token])
        # Multiset intersection: each token is counted at most as often as it
        # occurs in both windows, so the total never exceeds num_token.
        # Counting every matching (token, token2) pair instead would let a
        # repeated word ("oh oh oh ...") reach the threshold on its own.
        shared = sum((head & other_head).values())

        if shared >= min_shared:
            # idx is used as a list position here; this relies on df having a
            # 0..n-1 index (it was reset after the exact-duplicate removal).
            duplicated_list[idx] = 1
            min_year = min(min_year, df.loc[i, 'year'])

    min_year_list.append(min_year)

In [None]:
# Attach the per-row results as the 'min_year' and 'duplicate' columns.
df = df.assign(min_year=min_year_list, duplicate=duplicated_list)

In [None]:
# Inspect the new columns for the rows titled 'honesty'.
df.loc[df['song'] == 'honesty']

Unnamed: 0,song,year,artist,genre,lyrics,language,duplicate_idx,nlp_lyrics,min_year,duplicate
2,honesty,2009,beyonce-knowles,Pop,If you search\nFor tenderness\nIt isn't hard t...,en,"[2, 8641, 33382, 63543, 98959, 109173, 119239]","[if, you, search, for, tenderness, it, isn, t,...",2009,1
8641,honesty,2013,billy-joel,Rock,If you search for tenderness\nIt isn't hard to...,en,"[2, 8641, 33382, 63543, 98959, 109173, 119239]","[if, you, search, for, tenderness, it, isn, t,...",2009,1
33382,honesty,2013,editors,Rock,Don't you want me to stay?\nMake a rocking hor...,en,"[2, 8641, 33382, 63543, 98959, 109173, 119239]","[don, t, you, want, me, to, stay, make, a, roc...",2013,0
63543,honesty,2005,alex-parks,Pop,Your face when I try to explain\nI didn't thin...,en,"[2, 8641, 33382, 63543, 98959, 109173, 119239]","[your, face, when, i, try, to, explain, i, did...",2005,0
98959,honesty,2009,attack-attack,Rock,"Honesty didn't get me anywhere,\nI know cause ...",en,"[2, 8641, 33382, 63543, 98959, 109173, 119239]","[honesty, didn, t, get, me, anywhere, i, know,...",2009,0
109173,honesty,2006,77s,Rock,Alright\nTell me what I said\nYou better watch...,en,"[2, 8641, 33382, 63543, 98959, 109173, 119239]","[alright, tell, me, what, i, said, you, better...",2006,0
119239,honesty,2006,alannah-myles,Rock,"Oh, did I strike the chord of honesty in you?\...",en,"[2, 8641, 33382, 63543, 98959, 109173, 119239]","[oh, did, i, strike, the, chord, of, honesty, ...",2006,0


In [None]:
# Inspect the new columns for the rows titled 'slow-love'.
df.loc[df['song'] == 'slow-love']

Unnamed: 0,song,year,artist,genre,lyrics,language,duplicate_idx,nlp_lyrics,min_year,duplicate
8,slow-love,2009,beyonce-knowles,Pop,[Verse 1:]\nI read all of the magazines\nwhile...,en,"[8, 114832, 137184]","[verse, 1, i, read, all, of, the, magazines, w...",2009,1
114832,slow-love,2006,gino-vannelli,Pop,"I want some slow love, the kind that lasts for...",en,"[8, 114832, 137184]","[i, want, some, slow, love, the, kind, that, l...",2006,0
137184,slow-love,2009,brandy,Pop,[verse 1]\nI read of all the magazines while w...,en,"[8, 114832, 137184]","[verse, 1, i, read, of, all, the, magazines, w...",2009,1


We keep only one song (the oldest one) of each group of songs with the same title and the same lyrics

In [None]:
# Collect the index labels of the redundant copies. A row that is flagged as a
# duplicate and whose year equals its group's minimum year survives; the other
# flagged rows with the same title are marked for removal.
# A set gives constant-time membership tests and no repeated labels (the list
# used before was scanned once per row).
# NOTE(review): the survivor marks every flagged same-title row, including
# rows whose lyrics matched a different song of that title, so a second,
# unrelated duplicate cluster under one title would be removed entirely.
# Confirm whether that is intended.
labels_to_drop = set()

for idx, row in df.iterrows():
    # A row already marked for removal must not remove others.
    if idx in labels_to_drop:
        continue
    if row.year == row.min_year and row.duplicate == 1:
        for d in row.duplicate_idx:
            if d != idx and df.loc[d, 'duplicate'] == 1:
                labels_to_drop.add(d)

# Expose the result as a list of unique labels for the following cell.
idx_to_drop = sorted(labels_to_drop)

In [None]:
# Remove the collected rows (labels de-duplicated first).
df = df.drop(index=list(set(idx_to_drop)))

In [None]:
# Rows titled 'honesty' that remain after the removal.
df.loc[df['song'] == 'honesty']

Unnamed: 0,song,year,artist,genre,lyrics,language,duplicate_idx,nlp_lyrics,min_year,duplicate
2,honesty,2009,beyonce-knowles,Pop,If you search\nFor tenderness\nIt isn't hard t...,en,"[2, 8641, 33382, 63543, 98959, 109173, 119239]","[if, you, search, for, tenderness, it, isn, t,...",2009,1
33382,honesty,2013,editors,Rock,Don't you want me to stay?\nMake a rocking hor...,en,"[2, 8641, 33382, 63543, 98959, 109173, 119239]","[don, t, you, want, me, to, stay, make, a, roc...",2013,0
63543,honesty,2005,alex-parks,Pop,Your face when I try to explain\nI didn't thin...,en,"[2, 8641, 33382, 63543, 98959, 109173, 119239]","[your, face, when, i, try, to, explain, i, did...",2005,0
98959,honesty,2009,attack-attack,Rock,"Honesty didn't get me anywhere,\nI know cause ...",en,"[2, 8641, 33382, 63543, 98959, 109173, 119239]","[honesty, didn, t, get, me, anywhere, i, know,...",2009,0
109173,honesty,2006,77s,Rock,Alright\nTell me what I said\nYou better watch...,en,"[2, 8641, 33382, 63543, 98959, 109173, 119239]","[alright, tell, me, what, i, said, you, better...",2006,0
119239,honesty,2006,alannah-myles,Rock,"Oh, did I strike the chord of honesty in you?\...",en,"[2, 8641, 33382, 63543, 98959, 109173, 119239]","[oh, did, i, strike, the, chord, of, honesty, ...",2006,0


In [None]:
# Rows titled 'slow-love' that remain after the removal.
df.loc[df['song'] == 'slow-love']

Unnamed: 0,song,year,artist,genre,lyrics,language,duplicate_idx,nlp_lyrics,min_year,duplicate
8,slow-love,2009,beyonce-knowles,Pop,[Verse 1:]\nI read all of the magazines\nwhile...,en,"[8, 114832, 137184]","[verse, 1, i, read, all, of, the, magazines, w...",2009,1
114832,slow-love,2006,gino-vannelli,Pop,"I want some slow love, the kind that lasts for...",en,"[8, 114832, 137184]","[i, want, some, slow, love, the, kind, that, l...",2006,0


In [None]:
# Renumber the index from 0 and remove the working columns that were only
# needed for duplicate detection.
helper_columns = ['duplicate_idx', 'nlp_lyrics', 'min_year', 'duplicate']
df = df.reset_index(drop=True).drop(columns=helper_columns)
# Final number of rows.
len(df)

189393

In [None]:
# Preview the first five rows of the cleaned frame.
df.head(n=5)

Unnamed: 0,song,year,artist,genre,lyrics,language
0,ego-remix,2009,beyonce-knowles,Pop,"Oh baby, how you doing?\nYou know I'm gonna cu...",en
1,then-tell-me,2009,beyonce-knowles,Pop,"playin' everything so easy,\nit's like you see...",en
2,honesty,2009,beyonce-knowles,Pop,If you search\nFor tenderness\nIt isn't hard t...,en
3,you-are-my-rock,2009,beyonce-knowles,Pop,"Oh oh oh I, oh oh oh I\n[Verse 1:]\nIf I wrote...",en
4,black-culture,2009,beyonce-knowles,Pop,"Party the people, the people the party it's po...",en


In [None]:
# Write the cleaned frame to disk without the index column.
output_path = './data/lyrics_def_noDupl.csv'
df.to_csv(output_path, index=False)