In [58]:
import pandas as pd
import string

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, SpatialDropout1D
from keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split

In [2]:
lyrics = pd.read_csv('./data/lyrics-data.csv').rename(columns={'ALink': 'Link'})
lyrics.head()

Unnamed: 0,Link,SName,SLink,Lyric,Idiom
0,/10000-maniacs/,More Than This,/10000-maniacs/more-than-this.html,I could feel at the time. There was no way of ...,ENGLISH
1,/10000-maniacs/,Because The Night,/10000-maniacs/because-the-night.html,"Take me now, baby, here as I am. Hold me close...",ENGLISH
2,/10000-maniacs/,These Are Days,/10000-maniacs/these-are-days.html,These are. These are days you'll remember. Nev...,ENGLISH
3,/10000-maniacs/,A Campfire Song,/10000-maniacs/a-campfire-song.html,"A lie to say, ""O my mountain has coal veins an...",ENGLISH
4,/10000-maniacs/,Everyday Is Like Sunday,/10000-maniacs/everyday-is-like-sunday.html,Trudging slowly over wet sand. Back to the ben...,ENGLISH


In [3]:
artists = pd.read_csv('./data/artists-data.csv')[['Link', 'Genre']]
artists.head()

Unnamed: 0,Link,Genre
0,/10000-maniacs/,Rock
1,/12-stones/,Rock
2,/311/,Rock
3,/4-non-blondes/,Rock
4,/a-cruz-esta-vazia/,Rock


In [4]:
df = pd.merge(lyrics, artists, on='Link')
df.head()

Unnamed: 0,Link,SName,SLink,Lyric,Idiom,Genre
0,/10000-maniacs/,More Than This,/10000-maniacs/more-than-this.html,I could feel at the time. There was no way of ...,ENGLISH,Rock
1,/10000-maniacs/,More Than This,/10000-maniacs/more-than-this.html,I could feel at the time. There was no way of ...,ENGLISH,Pop
2,/10000-maniacs/,Because The Night,/10000-maniacs/because-the-night.html,"Take me now, baby, here as I am. Hold me close...",ENGLISH,Rock
3,/10000-maniacs/,Because The Night,/10000-maniacs/because-the-night.html,"Take me now, baby, here as I am. Hold me close...",ENGLISH,Pop
4,/10000-maniacs/,These Are Days,/10000-maniacs/these-are-days.html,These are. These are days you'll remember. Nev...,ENGLISH,Rock


In [5]:
df = df[['Lyric', 'Idiom', 'Genre']]

In [6]:
len(df)

227513

In [7]:
df = df[df['Idiom'] == 'ENGLISH']

In [8]:
len(df)

124230

In [9]:
df['Genre'].value_counts()

Rock            60585
Pop             40294
Hip Hop         23108
Funk Carioca      104
Sertanejo          87
Samba              52
Name: Genre, dtype: int64

In [10]:
df = df[df['Genre'].isin(['Rock', 'Pop', 'Hip Hop'])]

In [11]:
stopwords = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't"]

In [12]:
def lyric_preprocessor(lyric):
    lyric = lyric.lower()
    
    lyric = [word for word in lyric.split() if word not in stopwords]
    lyric = ' '.join(map(str, lyric))
    
    lyric = lyric.translate(str.maketrans('', '', string.punctuation))
    
    return lyric

In [13]:
df['Lyric'] = df['Lyric'].apply(lyric_preprocessor)

In [16]:
df = df[['Lyric', 'Genre']]

In [19]:
n_words = 40000
sequence_len = 150
embedding_dim = 100

tokenizer = Tokenizer(num_words=n_words, lower=True)
tokenizer.fit_on_texts(df['Lyric'].values)

In [23]:
word_index = tokenizer.word_index
len(word_index)

158607

In [24]:
features = tokenizer.texts_to_sequences(df['Lyric'].values)

In [30]:
len(features[0]), len(features[100])

(73, 161)

In [32]:
features = pad_sequences(features, maxlen=sequence_len)

In [38]:
Y = pd.get_dummies(df['Genre']).values
X_train, X_test, Y_train, Y_test = train_test_split(features, Y, test_size=0.1, random_state=1)

In [42]:
len(X_train), len(X_test)

(111588, 12399)

In [50]:
len(features[0])

150

In [67]:
dropout = 0.2
mem_units = sequence_len
Y_shape = len(Y[0])

model = Sequential()
model.add(Embedding(n_words, embedding_dim, input_length=sequence_len))
model.add(SpatialDropout1D(dropout))
model.add(LSTM(mem_units, dropout=dropout, recurrent_dropout=dropout))
model.add(Dense(Y_shape, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [68]:
epochs = 5
batch_size = 64

history = model.fit(X_train, Y_train, epochs=epochs, batch_size=batch_size,
                   validation_split=0.1, 
                   callbacks=[EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001)])


Train on 100429 samples, validate on 11159 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
