<a href="https://colab.research.google.com/github/southjohn64/ex3_dl/blob/main/lstm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [14]:
#https://machinelearningmastery.com/use-word-embedding-layers-deep-learning-keras/
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
from tensorboard import notebook
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, TensorBoard
from tensorflow.keras.layers import Average, Dense, Embedding, Flatten, Input, LSTM,Bidirectional,Dropout,Activation
from tensorflow.keras.losses import sparse_categorical_crossentropy
from tensorflow.keras.models import Model, Sequential
import json
import numpy as np
import os
import pandas as pd
#import pretty_midi
import tensorflow as tf
import time

In [15]:
# params
embd_size = 300
vocab_size = -1
seq_len = 5
batch_size = 8

In [16]:
def read_data(file_name):
    songs_df = pd.read_csv(file_name, sep='\n', header=None)
    #split to 3 cols by the ',' sign
    songs_df_clean = songs_df.iloc[:, 0].str.rstrip(r'&, ').str.extract(r'([^,]+),([^,]+),(.+)')
    songs_df_clean.columns = ['artist', 'title', 'lyrics']
    print('number of songs = {}'.format(songs_df_clean.shape[0]))
    return songs_df_clean
file_name_train = '/content/lyrics_train_set.csv'
df_songs_train = read_data(file_name_train)
file_name_test = '/content/lyrics_test_set.csv'
df_songs_test = read_data(file_name_test)

number of songs = 600
number of songs = 5


In [17]:
def clean_lyrics(df_songs):
    #df_songs['lyrics'] = df_songs.lyrics.str.replace('&', '\n') #in order to learn the break line
    df_songs['lyrics'] = df_songs.lyrics.str.lower()
    df_songs['lyrics_clean'] = df_songs.lyrics.str.replace('[,.?!)()]', '')
    return df_songs
df_songs_train = clean_lyrics(df_songs_train)

In [18]:
# smaple data
df_songs_train.head()

Unnamed: 0,artist,title,lyrics,lyrics_clean
0,elton john,candle in the wind,goodbye norma jean & though i never knew you a...,goodbye norma jean & though i never knew you a...
1,gerry rafferty,baker street,winding your way down on baker street & lite i...,winding your way down on baker street & lite i...
2,gerry rafferty,right down the line,you know i need your love & you've got that ho...,you know i need your love & you've got that ho...
3,2 unlimited,tribal dance,come on check it out ya'll & (come on come on!...,come on check it out ya'll & come on come on &...
4,2 unlimited,let the beat control your body,let the beat control your body & let the beat ...,let the beat control your body & let the beat ...


In [19]:
all_words = df_songs_train.lyrics_clean.str.cat()
corpus = all_words.split()
print('corpus size is {}'.format(len(corpus)))

corpus size is 182298


In [20]:
#all_words = df_songs_train.lyrics_clean.str.cat()
corpus = corpus[:1000]
print('corpus size is {}'.format(len(corpus)))

corpus size is 1000


In [21]:

emb_model = Word2Vec([corpus, ["UNK"]], min_count=1, size=embd_size)  # Handling unknown words
emb_model.save('./emb_model.bin')
print(emb_model)
vocab_size = len(emb_model.wv.vocab)

Word2Vec(vocab=292, size=300, alpha=0.025)


In [22]:
# prepare seq and next word
def prepare_sequences(corpus):
    sequences = []
    next_words = []
    len(corpus)
    for i in range(len(corpus)):
        if seq_len+i < len(corpus):
            seq = corpus[i:seq_len+i]
            sequences.append(seq)
            next_word = corpus[seq_len+i]
            next_words.append(next_word)
    return sequences,next_words
sequences_train,next_words_train = prepare_sequences(corpus)

In [23]:
# get the index of rach word from word2vec
# prepare list of lists with idx
 
emb_model.wv.index2word[0]
word2index = {word:idx for idx,word in enumerate(emb_model.wv.index2word)}
word2index['&']

0

In [24]:
sequences_idx_train = []
for seq in sequences_train:
  seq_idx = []
  for word in seq:
    word_idx = word2index[word]
    seq_idx.append(word_idx)
  sequences_idx_train.append(seq_idx)



In [37]:
next_words_idx_train = []
for next_word in next_words_train:
  next_word_idx = word2index[next_word]
  next_words_idx_train.append(next_word_idx)


In [38]:
print(sequences_train[0])
print(sequences_idx_train[0])

['goodbye', 'norma', 'jean', '&', 'though']
[58, 59, 60, 0, 88]


In [39]:
emb_mat = emb_model.wv.vectors

In [50]:

def get_model(vocab_size, embedding_dim, rnn_units, batch_size,dropout = 0.2):
    model = Sequential()
    model.add(Embedding(vocab_size, embedding_dim, input_length=seq_len, weights=[emb_mat],trainable=False))
    #e = Embedding(vocab_size, 100, weights=[embedding_matrix], input_length=4, trainable=False)
    #make trainable false

    model.add(LSTM(rnn_units, return_sequences=True, stateful=False, recurrent_initializer='glorot_uniform'))
    model.add(Dense(rnn_units,activation='relu'))
    model.add(Flatten())
    model.add(Dense(vocab_size,activation='softmax'))
    return model

In [51]:
lstm_model = get_model(vocab_size,embd_size,10, batch_size)
lstm_model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 5, 300)            87600     
_________________________________________________________________
lstm_2 (LSTM)                (None, 5, 10)             12440     
_________________________________________________________________
dense_4 (Dense)              (None, 5, 10)             110       
_________________________________________________________________
flatten (Flatten)            (None, 50)                0         
_________________________________________________________________
dense_5 (Dense)              (None, 292)               14892     
Total params: 115,042
Trainable params: 27,442
Non-trainable params: 87,600
_________________________________________________________________


In [52]:
#lstm_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
lstm_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [43]:
next_words_train_one_hot = to_categorical(y= next_words_idx_train, num_classes=vocab_size, dtype='int')



In [44]:
sequences_idx_train_np = np.array(sequences_idx_train)

In [54]:
history = lstm_model.fit(x=sequences_idx_train_np,y=next_words_train_one_hot,batch_size=batch_size,epochs=100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
 12/125 [=>............................] - ETA: 0s - loss: 3.9114 - accuracy: 0.1875

KeyboardInterrupt: ignored