In [1]:
import json
import random
import os
import re
import pandas as pd

In [2]:
json_path = "./data/jsons/"

# Removes every run of digits that is immediately followed by "Embed".
pattern_replace = r"\d+Embed"
# Matches a bracketed section tag (e.g. "[Verse 1: Luke]") plus any newlines
# around it. The capture group makes re.split keep the tag as its own piece.
pattern_split = r"\n*(\[.+\])\n*"
content = []

# Sorted so that row order -- and the seeded per-artist sampling done later --
# does not depend on the arbitrary order in which os.listdir returns entries.
for filename in sorted(os.listdir(json_path)):
    file_path = os.path.join(json_path, filename)
    if not os.path.isfile(file_path):
        # Skip sub-directories (e.g. a checkpoint folder next to the data).
        continue

    # Explicit encoding: the platform default is not UTF-8 everywhere, and the
    # lyrics contain non-ASCII characters.
    with open(file_path, 'r', encoding='utf-8') as f:
        songs = json.load(f)

    # splitext drops only the final extension, so dots inside an artist name
    # are preserved; underscores in the file name stand for spaces.
    artist = os.path.splitext(filename)[0].replace('_', ' ')
    for song_name, song in songs.items():
        tag = ""
        song_clean = re.sub(pattern=pattern_replace, string=song, repl="")

        # Pieces alternate between text and tags; each text piece is stored
        # with the tag that directly preceded it ("" when there was none).
        for part in re.split(pattern_split, song_clean):
            if re.match(pattern_split, part):
                tag = part
            else:
                content.append((artist, song_name, tag, part))
                tag = ""

In [3]:
# One row per (artist, song, tag, content) tuple collected while parsing.
df = pd.DataFrame.from_records(
    content,
    columns=["artist", "song", "tag", "content"],
)
df

Unnamed: 0,artist,song,tag,content
0,5 Seconds of Summer,Youngblood,,146 ContributorsTranslationsEspañolPortuguêsYo...
1,5 Seconds of Summer,Youngblood,[Verse 1: Luke],"Remember the words you told me\n""Love me till ..."
2,5 Seconds of Summer,Youngblood,[Pre-Chorus: Luke],You push and you push and I'm pullin' away\nPu...
3,5 Seconds of Summer,Youngblood,"[Chorus: Luke, All]","Youngblood\nSay you want me, say you want me o..."
4,5 Seconds of Summer,Youngblood,"[Verse 2: Luke, Luke & Ashton, All]","Lately, our conversations\nEnd like it's the l..."
...,...,...,...,...
10140,ZAYN,​wRoNg,"[Verse 2: Kehlani, Kehlani & ZAYN]",Here with you 'cause you got the right vibe\nS...
10141,ZAYN,​wRoNg,"[Chorus: Both, ZAYN, Kehlani]",You're looking in the wrong place for my love\...
10142,ZAYN,​wRoNg,[Refrain: ZAYN & Kehlani],"Don't stop what you're doin', what you're doin..."
10143,ZAYN,​wRoNg,[Bridge: ZAYN & Kehlani],"I don't, I don't really know (What you're doin..."


In [4]:
def clean(tag: str) -> str:
    """Reduce a raw section tag to a bare, bracketed section label.

    Digits are removed and surrounding whitespace is trimmed, so numbered
    sections collapse onto one label. For a tag of the form "[label: ...]"
    only the text before the first colon is kept.

    Examples: "[Verse 1: Luke]" -> "[Verse]", "[Verse 1]" -> "[Verse]",
    "[Chorus]" -> "[Chorus]", "" -> "".

    Args:
        tag: Raw tag text, either an empty string or a bracketed tag.

    Returns:
        The normalised tag. Always a string, never None; an empty input gives
        an empty string. Input that is not bracketed is returned with digits
        removed and whitespace trimmed.
    """
    def drop_digits(text):
        return ''.join(ch for ch in text if not ch.isdigit()).strip()

    # Non-greedy: stop at the FIRST colon so extra colons later in the tag
    # do not leak into the label.
    labelled = re.match(r"\[(.+?):", tag)
    if labelled:
        return '[' + drop_digits(labelled.group(1)) + ']'

    # No usable "[label:" prefix. Trim inside the brackets so "[Verse 1]"
    # yields "[Verse]", the same label the colon form produces.
    bracketed = re.fullmatch(r"\[(.*)\]", tag.strip(), flags=re.DOTALL)
    if bracketed:
        return '[' + drop_digits(bracketed.group(1)) + ']'

    return drop_digits(tag)

In [5]:
# Add the normalised section label, then keep only rows that have both a
# non-empty label and non-empty text.
df = df.assign(tag_2=df["tag"].map(clean))
df = df.loc[df["tag_2"].ne("") & df["content"].ne("")]

df

Unnamed: 0,artist,song,tag,content,tag_2
1,5 Seconds of Summer,Youngblood,[Verse 1: Luke],"Remember the words you told me\n""Love me till ...",[Verse]
2,5 Seconds of Summer,Youngblood,[Pre-Chorus: Luke],You push and you push and I'm pullin' away\nPu...,[Pre-Chorus]
3,5 Seconds of Summer,Youngblood,"[Chorus: Luke, All]","Youngblood\nSay you want me, say you want me o...",[Chorus]
4,5 Seconds of Summer,Youngblood,"[Verse 2: Luke, Luke & Ashton, All]","Lately, our conversations\nEnd like it's the l...",[Verse]
5,5 Seconds of Summer,Youngblood,"[Chorus: Luke, All]","Youngblood\nSay you want me, say you want me o...",[Chorus]
...,...,...,...,...,...
10140,ZAYN,​wRoNg,"[Verse 2: Kehlani, Kehlani & ZAYN]",Here with you 'cause you got the right vibe\nS...,[Verse]
10141,ZAYN,​wRoNg,"[Chorus: Both, ZAYN, Kehlani]",You're looking in the wrong place for my love\...,[Chorus]
10142,ZAYN,​wRoNg,[Refrain: ZAYN & Kehlani],"Don't stop what you're doin', what you're doin...",[Refrain]
10143,ZAYN,​wRoNg,[Bridge: ZAYN & Kehlani],"I don't, I don't really know (What you're doin...",[Bridge]


In [15]:
# Per column: does it contain any missing value?
df.loc[:, ["tag_2", "content"]].isna().any()

tag_2      False
content    False
dtype: bool

In [6]:
# Display the filtered frame (rows with an empty tag_2 or empty content removed).
df

Unnamed: 0,artist,song,tag,content,tag_2
1,5 Seconds of Summer,Youngblood,[Verse 1: Luke],"Remember the words you told me\n""Love me till ...",[Verse]
2,5 Seconds of Summer,Youngblood,[Pre-Chorus: Luke],You push and you push and I'm pullin' away\nPu...,[Pre-Chorus]
3,5 Seconds of Summer,Youngblood,"[Chorus: Luke, All]","Youngblood\nSay you want me, say you want me o...",[Chorus]
4,5 Seconds of Summer,Youngblood,"[Verse 2: Luke, Luke & Ashton, All]","Lately, our conversations\nEnd like it's the l...",[Verse]
5,5 Seconds of Summer,Youngblood,"[Chorus: Luke, All]","Youngblood\nSay you want me, say you want me o...",[Chorus]
...,...,...,...,...,...
10140,ZAYN,​wRoNg,"[Verse 2: Kehlani, Kehlani & ZAYN]",Here with you 'cause you got the right vibe\nS...,[Verse]
10141,ZAYN,​wRoNg,"[Chorus: Both, ZAYN, Kehlani]",You're looking in the wrong place for my love\...,[Chorus]
10142,ZAYN,​wRoNg,[Refrain: ZAYN & Kehlani],"Don't stop what you're doin', what you're doin...",[Refrain]
10143,ZAYN,​wRoNg,[Bridge: ZAYN & Kehlani],"I don't, I don't really know (What you're doin...",[Bridge]


In [7]:
# Hold out one randomly chosen song title per artist.
songs = df[["artist", "song"]].drop_duplicates()

# Fixed seed so the same titles are drawn on every run.
random.seed(0)

test_songs = [
    random.choice(songs.loc[songs["artist"] == artist, "song"].array)
    for artist in songs["artist"].unique()
]

test_songs[:10]

['Ghost of You',
 'Waterloo',
 'Bang!',
 'Boy In The Bubble',
 'Growing Pains',
 'Perfect',
 '\u200bbreak up with your girlfriend, i’m bored',
 'Not Your Barbie Girl',
 'No Place',
 'Oblivion']

In [8]:
from sklearn.model_selection import train_test_split

# Rows whose song title was drawn for the held-out test set. Matching is by
# title only, so a same-titled song from another artist is held out as well.
is_test = df["song"].isin(test_songs)

df_test = df[is_test]
# random_state pins the shuffle: train_test_split does not use the `random`
# module's seed, so without it the train/val split changes on every run.
# NOTE(review): this split is per row, so sections of one song can land in
# both train and val -- confirm that is intended.
df_train, df_val = train_test_split(df[~is_test], train_size=0.8, random_state=0)

In [9]:
# to_csv does not create missing parent directories, so make sure the output
# folder exists before writing.
os.makedirs("./data/datasets", exist_ok=True)

# Each split is written ordered by artist and song, without the index column.
df_train.sort_values(by=["artist", "song"]).to_csv("./data/datasets/train.csv", index=False)
df_test.sort_values(by=["artist", "song"]).to_csv("./data/datasets/test.csv", index=False)
df_val.sort_values(by=["artist", "song"]).to_csv("./data/datasets/val.csv", index=False)