# Main Goal:
The goal of this file is to split the dataset into a train/dev/test set. For now, we will do a split of 70/10/20. Also, in the future, it is possible that we will just do k-fold cross validation. In that case, the train and dev sets will be combined.

In [1]:
# mount google drive
from google.colab import drive
import os

drive.mount('/content/drive/')
os.chdir('/content/drive/Shareddrives/CS260-Project/data/')

Mounted at /content/drive/


In [4]:
# open the csv containing all the example pairs
# we will disregard genres and only focus on artist, lyric pair
import csv

compiled_dataset = []
X = []
Y = []
with open('./big-kaggle-dataset.csv') as datasetfile:
  line = 1
  reader = csv.reader(datasetfile, delimiter=",")
  for row in reader:
    if line == 1:
      print(row)
    else:
      artist = row[0]
      lyrics = row[1]
      X.append(artist)
      Y.append(lyrics)
      compiled_dataset.append([artist, lyrics])
    line += 1
print(compiled_dataset[0])

['artist', 'lyrics']
['ivete sangalo', "i feel so unsure as i take your hand and lead you to the dance floor as the music dies, something in your eyes calls to mind a silver screen and all those sad goodbyes  i'm never gonna dance again guilty feet have got no rhythm though it's easy to pretend i know you're not a fool  should've known better than to cheat a friend and waste the chance that i've been given so i'm never gonna dance again the way i danced with you  time can never mend the careless whispers of a good friend to the heart and mind ignorance is kind there's no comfort in the truth pain is all you'll find  i'm never gonna dance again guilty feet have got no rhythm though it's easy to pretend i know you're not a fool  should've known better than to cheat a friend and waste this chance that i've been given so i'm never gonna dance again the way i danced with you  never without your love  tonight the music seems so loud i wish that we could lose this crowd maybe it's better this

In [5]:
# sanity check: these should all be the same value

print(len(X))
print(len(Y))
print(len(compiled_dataset))

print(X[0])

225542
225542
225542
ivete sangalo


In [6]:
# stratified samplied requires that each artist have at least two examples (since we're splitting into two sets). So, we will remove the ones that don't meet this
# requirement before running it through the function call
artist_count_dict = {}
for artist in X:
  if artist in artist_count_dict:
    artist_count_dict[artist] += 1
  else:
    artist_count_dict[artist] = 1

del_artists = []
for artist, count in artist_count_dict.items():
  if count == 1:
    print(artist)
    del_artists.append(artist)

print(len(del_artists))

jammil e uma noites
luiz caldas
banda grafith
banda beijo
thalles roberto
seu jorge
baianasystem
raiz coral
negra li
banda morfina
nara leão
roberto menescal
mozart
josé & josué
thaeme e thiago
marília dutra
norman blake
west rocky
stromae
pabllo vittar
dulce maría
larissa manoela
milena stepanienco
dj pv
ale porto
malik mustache
dj ralk
hevo84
hori
agnela
santanna, o cantador
gatinha manhosa
bia socek
toni tornado
ludmilla
furacão 2000
mr. catra
buchecha
pancadão do caldeirão do huck
canção nova
fernanda brum
gabriela rocha
eyshila
mattos nascimento
soraya moraes
j. neto
sérgio lopes
regis danese
elaine de jesus
adriana arydes
suellen lima
kemuel
celina borges
vida reluz
prisma brasil
raquel mello
pg
voices
lydia moises
fernanda lara
jeanne mascarenhas
davi silva
rede ativa
mylla karvalho
melosweet
italo villar
sebhasttião alves
praise machine
laressa abreu
bem da hora
banda alien
cpm 22
dona mag
d.f.c.
let's go
gabriel o pensador
matuê
karol conka
starboi3
rylo rodriguez
marina sena


In [7]:
# dataset that removes the artists from the previous column
#import numpy as np
modified_X = []
modified_Y = []

for i in range(len(compiled_dataset)):
  artist = compiled_dataset[i][0]
  if artist not in del_artists:
    modified_X.append(compiled_dataset[i][0])
    modified_Y.append(compiled_dataset[i][1])


print(len(modified_X))
print(modified_X[50])
print(len(modified_Y))

223390
beyoncé
223390


In [12]:
# perform stratified sampling so we get an even-ish represention of each object we want for each set (in our case, we will split by artist)
# in other words, we want each artist represented evenly in the dataset
from sklearn.model_selection import StratifiedShuffleSplit
import numpy as np

#modified_X = np.asarray(modified_X)
#modified_Y = np.asarray(modified_Y)

print(len(modified_X))
print(len(modified_Y))

# first split between train/test
first_split = StratifiedShuffleSplit(n_splits=1, test_size=0.5, random_state=0)
X_train, X_test_val, X_test, X_val, Y_train, Y_test_val, Y_test, Y_val = [], [], [], [], [], [], [], []
for train_idx, test_val_idx in first_split.split(modified_Y, modified_X):
  # sanity check
  print(train_idx.shape)
  print(test_val_idx.shape)
  #print(train_idx.intersection(test_idx))
  #X_train, X_test = modified_X[train_idx], modified_X[test_idx]
  #Y_train, Y_test = modified_Y[train_idx], modified_Y[test_idx]
  for i in train_idx:
    X_train.append(modified_X[i])
    Y_train.append(modified_Y[i])
  for j in test_val_idx:
    X_test_val.append(modified_X[j])
    Y_test_val.append(modified_Y[j])

# second split between test/val
second_split = StratifiedShuffleSplit(n_splits=1, test_size=0.5, random_state=1)
for val_idx, test_idx in second_split.split(Y_test_val, X_test_val):
  for i in train_idx:
    X_val.append(X_test_val[i])
    Y_val.append(Y_test_val[i])
  for j in test_val_idx:
    X_test.append(X_test_val[j])
    Y_test.append(Y_test_val[j])

# print(X_train[0])
# print(Y_train[0])
# print(X_train[100])
# print(Y_train[100])
# print(X_test[100])
# print(Y_test[100])
# print(X_test[-1])
# print(Y_test[-1])

print(len(X_train))
print(len(Y_train))
print(len(X_test))
print(len(Y_test))
print(len(X_val))
print(len(Y_val))

223390
223390
(111695,)
(111695,)


ValueError: ignored

In [None]:
# add the miscellaneous samples from earlier into the training sample
"""
print(len(compiled_dataset))
print(del_artists)
for i, artist in enumerate(X):
  if artist in del_artists:
    print(artist)
    np.append(X_train, X[i])
    Y_train.append(Y[i])
print(len(X_train))
print(len(Y_train))
print(X_train[X_train.shape[0]-1])
print(Y_train[-1])
"""

'\nprint(len(compiled_dataset))\nprint(del_artists)\nfor i, artist in enumerate(X):\n  if artist in del_artists:\n    print(artist)\n    np.append(X_train, X[i])\n    Y_train.append(Y[i])\nprint(len(X_train))\nprint(len(Y_train))\nprint(X_train[X_train.shape[0]-1])\nprint(Y_train[-1])\n'

In [None]:
with open('./train/big-kaggle-train.csv', 'w') as trainfile:
  writer = csv.writer(trainfile)
  writer.writerow(['x', 'y'])
  for i in range(len(X_train)):
    writer.writerow([X_train[i], Y_train[i]])

with open('./test/big-kaggle-test.csv', 'w') as testfile:
  writer = csv.writer(testfile)
  writer.writerow(['x', 'y'])
  for i in range(len(X_test)):
    writer.writerow([X_test[i], Y_test[i]])

with open('./val/big-kaggle-val.csv', 'w') as testfile:
  writer = csv.writer(testfile)
  writer.writerow(['x', 'y'])
  for i in range(len(X_val)):
    writer.writerow([X_val[i], Y_val[i]])