In [3]:
import load_dataset
import network
import sys
import numpy as np
import keras as ks
from keras.utils import np_utils
import time

In [4]:
lyrics = load_dataset.load_dataset()

Loading dataset...
Done.
Extracting relevant data...
Removing invalid characters...
Done.


In [5]:
#lyrics.info()
genre_counts = lyrics["genre"].value_counts()
# Try to find an equal number of samples per genre,
# otherwise the result is always rock
sub_data_sets={}
for g in lyrics["genre"].unique():
    sub_data_sets[g] = lyrics[lyrics["genre"]==g]
    print("Genre {} has {} lyrics.".format(g, np.sum(lyrics["genre"]==g)))
min_lyrics_per_genre = 5935

tmp=None

for g in sub_data_sets:
    if len(sub_data_sets[g]) < min_lyrics_per_genre:
        print("Skipped genre {} due to insufficient data".format(g))
        continue
    if tmp is None:
        tmp = sub_data_sets[g][:min_lyrics_per_genre]
    else:
        tmp = tmp.append(sub_data_sets[g][:min_lyrics_per_genre])
        
lyrics = tmp.sample(frac=1)
lyrics.info()

Genre Pop has 49444 lyrics.
Genre Hip-Hop has 33965 lyrics.
Genre Rock has 131377 lyrics.
Genre Metal has 28408 lyrics.
Genre Country has 17286 lyrics.
Genre Jazz has 17147 lyrics.
Genre Electronic has 16205 lyrics.
Genre Folk has 3241 lyrics.
Genre R&B has 5935 lyrics.
Genre Indie has 5732 lyrics.
Skipped genre Folk due to insufficient data
Skipped genre Indie due to insufficient data
<class 'pandas.core.frame.DataFrame'>
Int64Index: 47480 entries, 233355 to 23243
Data columns (total 6 columns):
index     47480 non-null int64
song      47480 non-null object
year      47480 non-null int64
artist    47480 non-null object
genre     47480 non-null object
lyrics    47480 non-null object
dtypes: int64(2), object(4)
memory usage: 2.5+ MB


In [6]:
max_seq_length=100
max_seq_count=1000000
(tokenizer, data_input_train, data_labels_train, data_input_test, data_labels_test, label_classes_to_index) = load_dataset.preprocess_dataset(lyrics, max_seq_length, max_seq_count)
index_to_label_class = {v: k for k, v in label_classes_to_index.items()}

Tokenize dataset...
Done.
Loaded 47480 documents.
Convert docs to sequences of length 100...
Done.
Generated 945739 sequences from 27751 documents.
Skipped 19729 docs.


In [11]:
embeddings_vec_size = 32
embeddings_matrix = []

In [None]:
#(embeddings_words, embeddings_vec_size) = load_dataset.load_embeddings()
#(embeddings_matrix, idx_to_word_map) = load_dataset.glove_to_matrix(embeddings_words, tokenizer)

In [7]:
vocab_size = len(tokenizer.word_index) + 1
print("Vocabulary Size:" , vocab_size)
print("Inputs Shape:" , data_input_train.shape)
print("Labels Shape:" , data_labels_train.shape)
print("Inputs Shape:" , data_input_test.shape)
print("Labels Shape:" , data_labels_test.shape)
print("Classes are:")
print(label_classes_to_index.keys())

Vocabulary Size: 116733
Inputs Shape: (851165, 100)
Labels Shape: (851165, 8)
Inputs Shape: (94574, 100)
Labels Shape: (94574, 8)
Classes are:
dict_keys(['Metal', 'Rock', 'Pop', 'Country', 'Jazz', 'Electronic', 'R&B', 'Hip-Hop'])


In [16]:
name = "new_20_custom_embedd_32_subset_1000000_words_100"
model = network.get_network(max_seq_length, data_labels_train.shape[1], embeddings_matrix, tokenizer, embeddings_vec_size)
model.summary()

Building network...
Done.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 100, 32)           3735456   
_________________________________________________________________
lstm_2 (LSTM)                (None, 20)                4240      
_________________________________________________________________
dense_2 (Dense)              (None, 8)                 168       
Total params: 3,739,864
Trainable params: 4,408
Non-trainable params: 3,735,456
_________________________________________________________________


In [None]:

callbacks = [
    ks.callbacks.TensorBoard("./logs/{}".format(name), write_graph=True, write_grads=False),
    ks.callbacks.ModelCheckpoint("./models/{}".format(name + ".dat"), save_best_only=True)
]

# serialize model to JSON
model_json = model.to_json()
with open("./models/{}".format(name + ".json"), "w") as json_file:
    json_file.write(model_json)
    
model.fit(data_input_train, data_labels_train, epochs=15, batch_size=2048, validation_data=(data_input_test, data_labels_test), callbacks=callbacks)

Train on 851165 samples, validate on 94574 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15

In [15]:
# load json and create model
json_file = open("./models/{}".format(name + ".json"), 'r')
loaded_model_json = json_file.read()
json_file.close()
loaded_model = ks.models.model_from_json(loaded_model_json)

model.load_weights("./models/{}".format(name + ".dat"))
model.summary()

ValueError: Dimension 1 in both shapes must be equal, but are 80 and 160. Shapes are [32,80] and [32,160]. for 'Assign_2' (op: 'Assign') with input shapes: [32,80], [32,160].

In [None]:
# generate characters
for i in range(10):
    pattern_idx = np.random.randint(0, data_input_test.shape[0]-1)
    pattern = data_input_test[pattern_idx,:]
    print("")
    print("Input:")
    print(load_dataset.idx_vec_to_string(tokenizer.index_word, pattern))

    x = np.reshape(pattern, (1, pattern.shape[0]))

    prediction = np.squeeze(model.predict(x, verbose=0))
    for j in range(3):
        idx = np.argmax(prediction)
        print("Predicted class {} with probability {}: {}".format(j+1, prediction[idx], index_to_label_class[idx]))
        prediction[idx] = 0
        
    print("Actual class: {}".format(index_to_label_class[np.argmax(data_labels_test[pattern_idx])]))

In [None]:
predictions = model.predict(data_input_test, verbose=0, batch_size=2048)

In [None]:
classifications = np.argmax(predictions, axis=1)
classifications_correct = np.argmax(data_labels_test, axis=1)
is_correct = classifications==classifications_correct
for i in range(0,7):
    acc = np.mean(is_correct[classifications_correct == i].astype(int))
    print("Per Class accuracy for {} is {}".format(index_to_label_class[i],acc))