In [2]:
import re
import csv
import time
import nltk
import numpy as np 
import requests
import tensorflow as tf
from bs4 import BeautifulSoup
from sklearn.model_selection import train_test_split

# Ask TensorFlow to grow GPU memory on demand instead of reserving it all up front.
# Every visible GPU receives the same setting, and a CPU-only machine (empty list)
# just skips the loop instead of failing on gpus[0].
gpus = tf.config.list_physical_devices('GPU')
for _device in gpus:
    tf.config.experimental.set_memory_growth(_device, True)
# First GPU, or None when no GPU is visible; the name `gpu` stays defined for later cells.
gpu = gpus[0] if gpus else None

#### Importing Repository

In [11]:
# !pip install git+https://github.com/raudez77/The-art-of-state/tree/main
import Crawler_Canserbero
import Pre_process

#### Other Functions

In [20]:
def char_idx(c):
    """Return the vocabulary index of character ``c``.

    The character is looked up in the notebook-level ``char2idx`` mapping
    (built from ``chars``); anything that is not in the vocabulary maps to the
    index of the ``UNK`` token.

    A single dictionary lookup is used instead of scanning the ``chars`` list,
    because this helper is called once per character of the corpus.
    """
    return char2idx.get(c, char2idx[UNK])

def AICanserbero (Vocab_size,EMD,Batch_size,l1_units,l2_units):
    """Assemble the character-level network: Embedding -> GRU -> GRU -> Dense.

    Args:
        Vocab_size: number of tokens; used as the Embedding input size and as
            the width of the final Dense layer.
        EMD: embedding dimension.
        Batch_size: fixed batch dimension of the input (both GRU layers are
            stateful, so the batch size is part of the input shape).
        l1_units: number of units of the first GRU layer.
        l2_units: number of units of the second GRU layer.

    Returns:
        An uncompiled ``tf.keras`` Sequential model. The last Dense layer has
        no activation, so it emits ``Vocab_size`` raw scores per timestep.
    """
    keras_layers = tf.keras.layers

    def _stateful_gru(units):
        # Both recurrent layers share the same configuration apart from their width.
        return keras_layers.GRU(units, return_sequences=True, stateful=True,
                                recurrent_initializer='glorot_uniform')

    # Index 0 is masked by the embedding (mask_zero=True); a Dropout of 0.2
    # follows the embedding and each GRU.
    stack = [
        keras_layers.Embedding(Vocab_size, EMD, mask_zero=True,
                               batch_input_shape=[Batch_size, None]),
        keras_layers.Dropout(0.2),
        _stateful_gru(l1_units),
        keras_layers.Dropout(0.2),
        _stateful_gru(l2_units),
        keras_layers.Dropout(0.2),
        keras_layers.Dense(Vocab_size),
    ]
    return tf.keras.models.Sequential(stack)

def generate_text (model_ , star_string, temperature =0.7, num_generate = 50):
    """Sample ``num_generate`` tokens from ``model_`` after priming it with ``star_string``.

    Args:
        model_: callable model that takes a (1, length) batch of token ids and
            returns per-timestep scores over the vocabulary.
        star_string: prompt text; must not be empty. Characters outside the
            vocabulary are encoded as the UNK token through ``char_idx``.
        temperature: positive divisor applied to the scores before sampling;
            lower values make the sampling more conservative.
        num_generate: number of tokens to sample.

    Returns:
        ``star_string`` followed by the sampled tokens joined into one string.

    Raises:
        ValueError: if ``star_string`` is empty or ``temperature`` is not positive.
    """
    if not star_string:
        raise ValueError("star_string must contain at least one character")
    if temperature <= 0:
        raise ValueError("temperature must be greater than 0")

    # The GRU layers of this notebook are built with stateful=True, so state
    # left over from a previous call would leak into this one. Clear it when
    # the model offers the hook.
    reset_states = getattr(model_, "reset_states", None)
    if callable(reset_states):
        reset_states()

    # Encode the prompt as token ids and add the batch dimension -> shape (1, len)
    input_eval = [char_idx(s) for s in star_string]
    input_eval = tf.expand_dims(input_eval, 0)

    text_generated = []

    for _ in range(num_generate):
        predictions = model_(input_eval)
        predictions = tf.squeeze(predictions, 0)  # drop the batch dimension

        # Scale the scores by the temperature and sample from the resulting
        # categorical distribution; only the sample for the last timestep is kept.
        predictions = predictions / temperature
        predicted_id = tf.random.categorical(predictions, num_samples=1)[-1,0].numpy()

        # The sampled token becomes the next (single-step) input.
        input_eval = tf.expand_dims([predicted_id], 0)
        text_generated.append(idx2char[predicted_id])

    return (star_string+ ''.join(text_generated))

#### Crawler

In [13]:
# Artist page handed to the crawler.
# NOTE(review): `Crawler_Canserbero` is bound by `import Crawler_Canserbero`, i.e. it is a
# module in this notebook; calling it looks like it will raise TypeError unless the
# crawler class itself is imported — confirm against the repository.
url = "https://www.letras.com/canserbero/"
Canser = Crawler_Canserbero(url)
# presumably returns the lyrics collected song by song — verify in the crawler source
song = Canser._song_by_song()

#### Pre-Processing

In [None]:
# Placeholder location of the lyrics file; the same `path` is opened again later
# in the notebook when the data set is loaded.
path = 'yourpath'
# NOTE(review): `Pre_process` is bound by `import Pre_process`, i.e. it is a module here;
# calling it looks like it will raise TypeError unless the class is imported — confirm.
Preprocess = Pre_process(path)
# presumably returns the cleaned lyrics as one comma-separated string (it is split on "," below) — confirm
all_songs = Preprocess.Clean()
print("This is sample including in the canserbero_song.txt")
# Display entries 6 to 10 of the comma-separated result.
all_songs.split(",")[6:11]

#### Text Processing

1.   Create the list of unique characters; Spanish text includes accented vowels and special letters such as "ñ"
2.   Define the special tokens: EOS ("End of Sentence"), PAD ("Padding") and UNK ("Unknown character")
3. Use "utf-8" encoding so the special characters are read correctly
4. Define the maximum length of the sequences

In [None]:
#Tokenization
# Vocabulary: every distinct character of the literal below, in sorted order.
# The backslash is written as "\\" so the literal no longer contains the invalid
# escape sequence "\|"; the resulting set of characters is unchanged.
chars = sorted(set("""áéèíóúüabcdefghijklmnñopqrstuvwxyz0123456789 -,;.[¡¿!?]:“'’’’/\\|_@#$%ˆ&*˜‘+-=()[]{}"'ÁÉÍÓÚABCDEFGHIJKLMNÑOPQRSTUVWXYZ"""))
EOS = '<EOS>' # end of the sentence
UNK = "<UNK>" # unknown character
PAD = "<PAD>" # padding
# Special tokens: UNK and EOS go to the end, PAD goes to the front.
chars.append(UNK)
chars.append(EOS)
chars.insert(0, PAD)  # PAD must be index 0: the Embedding layer of this notebook uses mask_zero=True



1.   Create a dictionary that maps every unique character to a unique integer index
2.   Load the data set, truncating or padding every sentence to MAX_LEN (the number of characters per sentence)



In [None]:
# Mapping from each vocabulary entry to its index, and the reverse lookup array
char2idx = {u:i for i, u in enumerate(chars)}
idx2char = np.array(chars)

data = []
MAX_LEN = 75 #Value that repeat the most is  45  
# newline="" is what the csv module asks for when it is handed an open file
with open (path , "r", encoding="utf-8", newline="") as file:
    for line in csv.reader(file, delimiter = '\t'):
        if not line:
            # A blank line produces an empty row; indexing it would raise IndexError.
            continue
        sln = line[0]
        # NOTE(review): the final character of the field is discarded here —
        # presumably a trailing separator left by the preprocessing step; confirm.
        covrtd = [char_idx(c) for c in sln[:-1]]
        # Keep at most MAX_LEN-1 characters so that EOS always fits ...
        covrtd = covrtd[:MAX_LEN - 1]
        covrtd.append(char2idx[EOS]) #EOS End of the line
        # ... then right-pad with PAD so every row has exactly MAX_LEN entries.
        covrtd.extend([char2idx[PAD]] * (MAX_LEN - len(covrtd)))
        data.append(covrtd)
if not data:
    # Fail with a clear message instead of an IndexError in the sample print below.
    raise ValueError(f"No lines could be read from {path!r}")
print("**** Data file loaded ****")
print(f"This an one sentences,'{''.join([idx2char[i] for i in data[0][:20]])}'")
print("This is the same sentences but already tokenized",[i for i in  data[0][:20]])

In [None]:
# Stack the equally long token rows into one numpy array
np_data = np.array(data)

# Next-character setup: the inputs drop the last token of every row,
# the targets drop the first one (same row shifted by one position)
np_data_in = np_data[:, :-1]
np_data_out = np_data[:, 1:]

# Dataset of (input, target) pairs, one pair per row
X = tf.data.Dataset.from_tensor_slices((np_data_in,np_data_out))


# Settings
Vocab_size = len(chars)
EMD = 64
RRN_unit = 1024
BATCH_SIZE = 128

# Shuffle with a 1000-element buffer (reshuffled on every pass) and cut into
# full batches only; the model in this notebook is built with a fixed batch size
X_train = (
    X.shuffle(1000, reshuffle_each_iteration=True)
    .batch(BATCH_SIZE, drop_remainder=True)
)

In [None]:
# Hyper-parameters of the network
Vocab_size = len(chars)
EMD = 64
l1_units = 1024
l2_units = 560
BATCH_SIZE = 128

# Training model: the batch dimension is fixed to BATCH_SIZE
model = AICanserbero(
    Vocab_size=Vocab_size,
    EMD=EMD,
    Batch_size=BATCH_SIZE,
    l1_units=l1_units,
    l2_units=l2_units,
)
# The final Dense layer has no activation, so the loss is told it receives logits
loss = tf.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(loss=loss, optimizer='adam')
model.summary()

The model was trained with a batch_size of `[128,64]`, so generating text would normally require passing 128 rows at once. To avoid that:
1. I will extract the weights
2. Instantiate a new model with the same parameters, but with an input shape of `[1, None]`, which means one line with N characters
3. Set the new model's weights from the old model's weights
4. Predict a new song in a for loop.

In [None]:
# Copy the weights of the batch-128 model into an identical model whose batch
# dimension is 1, so a single prompt can be fed at generation time.
# NOTE(review): no model.fit(...) call is visible in this notebook, so `model` may
# still hold its initial weights at this point — confirm training ran before this cell.
old_model_Weigths = model.get_weights()
AI_CANSERBERO = AICanserbero(Vocab_size,EMD,1,l1_units,l2_units)
AI_CANSERBERO.set_weights(old_model_Weigths)
AI_CANSERBERO.build(tf.TensorShape([1,None]))


# Sample 10 tokens after the prompt, with temperature 0.8, and display the result.
new_lyrics = generate_text(AI_CANSERBERO , star_string = 'con un pueblo culto',temperature =0.8,
                   num_generate =10)
new_lyrics