In [1]:
import nbimporter
from keras.engine.topology import Layer
from keras import initializers as initializers, regularizers, constraints
from keras import backend as K
import pandas as pd
import numpy as np
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from keras.models import Model, Input
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Conv1D , InputSpec ,InputSpec
from keras.layers import Bidirectional, concatenate, SpatialDropout1D, GlobalMaxPooling1D
import re
from gensim.models import KeyedVectors
import pandas as pd
from sklearn.metrics import classification_report
from keras_self_attention import SeqSelfAttention
from keras.models import load_model
from itertools import permutations 
import keras
import ast
from keras.utils import np_utils
to_one_hot = np_utils.to_categorical

Using TensorFlow backend.


# Load data

In [2]:
df_train = pd.read_csv("vagalume.train.csv")
df_dev = pd.read_csv("vagalume.dev.csv")

# Preprocess Data

In [3]:
def clean_text(text):
    text = str(text)
    text = re.sub("\n"," ",text)
    text = re.sub("[,|!|\?|\.]"," ",text)
    text = re.sub(" +"," ",text)
    
    return text.lower()


def get_X_Y(dataframe):
    
    X = []
    Y = []
    
    for row,line in dataframe.iterrows():
        
        clean_title = clean_text(line['music_title'])
        clean_lyric = clean_text(line['music_lyric'])
        X.append((clean_title+" "+clean_lyric).split(" ") )
        Y.append(line['genre'])
    
    return X,Y
    
    
    

In [4]:
X_train,Y_train = get_X_Y(df_train)
X_dev,Y_dev = get_X_Y(df_train)

# Prepare the tokens

In [5]:
EMBEDDING_DIM = 100

words_data_set = set([w.lower() for sentece in X_train for w in sentece ])
model_word2vec = KeyedVectors.load_word2vec_format("word_embeddings/wang2vec/cbow_s100.txt", unicode_errors="ignore")
words_word2vec = set(model_word2vec.vocab.keys())

words = list( words_data_set.union(words_word2vec) )
n_words = len(words)
print("words len:",n_words)

words len: 996773


In [6]:
tags_lyric = list(set( [tag for tag in (Y_dev+Y_train) ]  )  )
n_tags_lyric = len(tags_lyric)
print("tags len:",n_tags_lyric)

tags len: 14


In [7]:
max_len = 290
max_len_char = 20

word2idx = {w: i + 2 for i, w in enumerate(words)}
word2idx["UNK"] = 1
word2idx["PAD"] = 0
idx2word = {i: w for w, i in word2idx.items()}
tag2idx = {t: i + 1 for i, t in enumerate(tags_lyric)}
tag2idx["PAD"] = 0
idx2tag = {i: w for w, i in tag2idx.items()}

# Word pad

In [8]:
def word_pad(senteces_param):
    
    word_pad = []
    for s in senteces_param:
        sentece_pad = []
        for w in s:
            if w.lower() in word2idx:
                sentece_pad.append( word2idx[w.lower()])
            else:
                sentece_pad.append(1)
        word_pad.append(sentece_pad)
    
    word_pad = pad_sequences(maxlen=max_len, sequences=word_pad, value=word2idx["PAD"], padding='post', truncating='post')
    
    return word_pad

In [9]:
X_word_tr = word_pad(X_train)
X_word_dv = word_pad(X_dev)

# Tag pad

In [10]:
def y_pad(y_param):
    return [tag2idx[tag] for tag in y_param]

In [11]:
y_train = y_pad(Y_train)
y_test = y_pad(Y_dev)

# Model 

In [12]:
# Load Embedding Matrix
embedding_matrix = np.random.random((n_words + 2, EMBEDDING_DIM))
for word, i in word2idx.items():
    if(word in model_word2vec):
        embedding_matrix[i] = model_word2vec[word]

In [13]:
hidden_layers=256

#Model
# input and embedding for words
word_in = Input(shape=(max_len,))
emb_word = Embedding(n_words + 2, EMBEDDING_DIM,
                     weights=[embedding_matrix],input_length=max_len, mask_zero=True)(word_in)

#BLSTM
x = SpatialDropout1D(0.3)(emb_word)
lstm = Bidirectional(LSTM(units=hidden_layers, return_sequences=False,
                               recurrent_dropout=0.6))(x)

out = Dense(n_tags_lyric+1, activation="softmax")(lstm)

model = Model(word_in, out)

#Compile
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["acc"])

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 290)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 290, 100)          99677500  
_________________________________________________________________
spatial_dropout1d_1 (Spatial (None, 290, 100)          0         
_________________________________________________________________
bidirectional_1 (Bidirection (None, 512)               731136    
_________________________________________________________________
dense_1 (Dense)              (None, 15)                7695      
Total params: 100,416,331
Trainable params: 100,416,331
Non-trainable params: 0
_________________________________________________________________


In [None]:
history = model.fit(X_word_tr, y_train,validation_data=( X_word_dv, y_test ), epochs=5, verbose=1)

Train on 96857 samples, validate on 96857 samples
Epoch 1/5