<a href="https://colab.research.google.com/github/soroushmirzaei/projects-notebook-templates/blob/main/text-processing-templates/text-generation-template-notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#import requirement libraries
import os
import shutil

#import dataset query libraries
import csv
import json

#import mathematics statics libraries
import random as rnd
import numpy as np
import pandas as pd

#import visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns

#import machine learning deep learning libraries
import tensorflow as tf
from tensorflow import keras


In [None]:
#download the english and persian filter-characters text files into the working directory
#(wget -q suppresses the progress output)
!wget -q https://raw.githubusercontent.com/soroushmirzaei/text-processing-projects/main/english-language-filter-characters.txt
!wget -q https://raw.githubusercontent.com/soroushmirzaei/text-processing-projects/main/persian-language-filter-characters.txt

#download the persian similar-characters json file into the working directory
!wget -q https://raw.githubusercontent.com/soroushmirzaei/text-processing-projects/main/persian-language-similar-characters.json


In [None]:
#define filters-list function loader
def filter_chars(file_path):
    """Load a filter-characters list from a text file, one entry per line.

    Args:
        file_path: path of the text file to read.

    Returns:
        list of str: the lines of the file in order, with newline
        characters stripped from both ends of every line.
    """
    #decode explicitly as utf-8 so non-ascii (e.g. persian) entries load the same on every platform
    with open(file_path, 'r', encoding = 'utf-8') as filters_list_file:
        #the with-block closes the file on exit, no explicit close() is needed
        return [line.strip('\n') for line in filters_list_file]

#define similar-characters function loader
def similar_chars(file_path):
    """Load the similar-characters data from a json file.

    Args:
        file_path: path of the json file to read.

    Returns:
        the object parsed from the file by json.load.
    """
    #decode explicitly as utf-8 so non-ascii (e.g. persian) characters load the same on every platform
    with open(file_path, 'r', encoding = 'utf-8') as similar_chars_file:
        return json.load(similar_chars_file)


In [None]:
#load the english and persian filter-characters files as lists of lines
eng_filter_characters = filter_chars('english-language-filter-characters.txt')
per_filter_characters = filter_chars('persian-language-filter-characters.txt')

#load the persian similar-characters json file as the parsed json object
per_similar_characters = similar_chars('persian-language-similar-characters.json')


In [None]:
#define remove filters characters function
def remove_filter(text, filters_list):
    """Return text with every character that appears in filters_list removed.

    Args:
        text: the string to clean.
        filters_list: iterable of characters to drop (a list of
            one-character strings or a plain string). None is treated as
            "nothing to filter", so the text is returned unchanged.

    Returns:
        str: the remaining characters of text, in their original order.
    """
    #build the lookup once: set membership is O(1) per character instead of rescanning the list
    blocked_characters = frozenset(filters_list if filters_list is not None else ())
    return ''.join(character for character in text if character not in blocked_characters)

#define similar characters modification function
def similar_char(text, similar_chars_dict):
    """Replace each character of text by its entry in similar_chars_dict.

    Characters that are not keys of similar_chars_dict are kept as they are.
    The replaced characters are joined back into one string.
    """
    #look every symbol up, falling back to the symbol itself
    replaced_symbols = (similar_chars_dict.get(symbol, symbol) for symbol in text)
    return ''.join(replaced_symbols)


In [None]:
#define tensorflow datasets texts loader
def tfds_text(#define dataset
              data_set,
              #define preprocessing function for texts
              use_filter_remover = False, filters_list = None
              ):
    """Collect the texts of data_set['train'] as a list of python strings.

    Every element of data_set['train'] is converted with .numpy() and
    decoded as utf8. When use_filter_remover is set, the decoded text is
    additionally passed through remove_filter together with filters_list.
    """
    def decode(raw_text):
        #bytes -> str, then the optional filter removal
        decoded = raw_text.numpy().decode('utf8')
        return remove_filter(decoded, filters_list) if use_filter_remover else decoded

    return [decode(raw_text) for raw_text in data_set['train']]


In [None]:
#define texts list loader for csv, txt and json files
def texts_loader(#define file path and type
                 file_path, file_type,
                 #define csv and txt files index for text
                 text_index = None, header_row = True, spliter_delimiter = None,
                 #define json file keys for texts
                 text_key = None,
                 #define preprocessing function for texts
                 use_filter_remover = False, filters_list = None,
                 use_similarchars_modifier = False, similarchars_dict = None
                 ):
    """Load a list of texts from a csv, txt or json file.

    Args:
        file_path: path of the file to read (decoded as utf-8).
        file_type: 'csv', 'txt' or 'json'; any other value yields an empty list.
        text_index: position of the text inside each csv row / split txt line.
        header_row: csv only, skip the first row when true.
        spliter_delimiter: csv delimiter (None keeps the csv module default)
            or the separator handed to str.split for txt lines.
        text_key: json only, key of the text inside each item of the loaded
            json iterable.
        use_filter_remover, filters_list: when enabled every text is passed
            through remove_filter with filters_list.
        use_similarchars_modifier, similarchars_dict: when enabled every text
            is passed through similar_char with similarchars_dict.

    Returns:
        list of str: the loaded (and optionally preprocessed) texts. For txt
        files newline characters are stripped from both ends of each text.
    """
    def preprocess(text):
        #optional modification functions, always applied in this order
        if use_filter_remover:
            text = remove_filter(text, filters_list)
        if use_similarchars_modifier:
            text = similar_char(text, similarchars_dict)
        return text

    #create empty texts list
    texts_list = list()

    #csv file loader
    if file_type in ['csv']:
        #csv.reader rejects delimiter=None, so only forward the delimiter when one was given
        reader_options = dict() if spliter_delimiter is None else {'delimiter' : spliter_delimiter}
        #newline='' is what the csv module expects so it can handle line endings itself
        with open(file_path, 'r', encoding = 'utf-8', newline = '') as csv_file:
            csv_reader = csv.reader(csv_file, **reader_options)
            if header_row:
                #the None default keeps an empty file from raising StopIteration
                next(csv_reader, None)
            texts_list.extend(preprocess(row[text_index]) for row in csv_reader)

    #txt file loader
    elif file_type in ['txt']:
        with open(file_path, 'r', encoding = 'utf-8') as txt_file:
            for line in txt_file:
                text = line.split(spliter_delimiter)[text_index]
                #the last field of a line still carries its newline, drop it after preprocessing
                texts_list.append(preprocess(text).strip('\n'))

    #json file loader
    elif file_type in ['json']:
        with open(file_path, 'r', encoding = 'utf-8') as json_file:
            texts_list.extend(preprocess(item[text_key]) for item in json.load(json_file))

    return texts_list


In [None]:
#define tokenizer and sequences and padding sequences
def texts_labels_generator(#define texts
                           texts_list,
                           #define filter characters list
                           use_modified_filters = False, filters_list = None,
                           #define json tokenizer
                           save_tokenizer_json = False, tokenizer_filepath = None
                           ):
    """Tokenize texts and build next-token training pairs from their prefixes.

    Every tokenized text contributes one phrase per prefix of length two or
    more. The phrases are pre-padded to a common length, then split so that
    the last token of each phrase is the label and the rest is the input.

    Args:
        texts_list: list of texts used to fit the tokenizer and build phrases.
        use_modified_filters: when true the tokenizer filters are the joined
            filters_list, otherwise the fixed punctuation string below.
        filters_list: iterable of strings joined into the tokenizer filters.
        save_tokenizer_json: when true, json.dump(tokenizer.to_json()) is
            written to tokenizer_filepath + '.json'.
        tokenizer_filepath: file path without the '.json' suffix.

    Returns:
        tuple: (texts, labels, maxlen, tokenizer, word_index) where texts is
        padded_sequences[:, :-1], labels is padded_sequences[:, -1] and maxlen
        is the length of the longest phrase.

    Raises:
        ValueError: when no text yields at least two tokens, so no phrase
            (and therefore no training pair) can be built.
    """
    from keras.preprocessing.text import Tokenizer
    from keras.preprocessing.sequence import pad_sequences

    #define tokenizer filters and fit on texts
    if use_modified_filters:
        filters = ''.join(filters_list)
    else:
        filters = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'

    tokenizer = Tokenizer(filters = filters)
    tokenizer.fit_on_texts(texts_list)

    #define word_index
    word_index = tokenizer.word_index

    #save tokenizer json file
    if save_tokenizer_json:
        with open(tokenizer_filepath+'.json','w') as tokenizer_file:
            json.dump(tokenizer.to_json(), tokenizer_file)

    #define texts to sequences
    texts_sequences = tokenizer.texts_to_sequences(texts_list)

    #define phrase based sequences: every prefix holding at least two tokens
    phrases_sequences = [
        text_sequence[:phrase_end]
        for text_sequence in texts_sequences
        for phrase_end in range(2, len(text_sequence) + 1)
    ]

    #fail with a clear message instead of the bare "max() arg is an empty sequence"
    if not phrases_sequences:
        raise ValueError('texts_list must contain at least one text with two or more tokens')

    #define maximum length of the sequences
    maxlen = max(len(sequence) for sequence in phrases_sequences)

    #define pad sequences, padding in front so the label stays in the last column
    padded_sequences = pad_sequences(phrases_sequences, maxlen = maxlen, padding = 'pre')

    #split texts and labels
    texts = padded_sequences[:,:-1]
    labels = padded_sequences[:,-1]

    return texts, labels, maxlen, tokenizer, word_index


In [None]:
#define labels encoder
def label_encoder(#define labels list and method
                  labels_list,
                  #define method binary, ordinal or onehot
                  method, return_categories = True
                  ):
    """Encode labels as integer categories or one-hot rows.

    The unique labels are sorted and numbered from zero in that order.

    Args:
        labels_list: iterable of hashable, mutually sortable labels.
        method: 'binary' or 'ordinal' for a 1-d array of category numbers,
            'onehot' for a 2-d array holding one row per label.
        return_categories: when true the label -> number dict is returned too.

    Returns:
        labels_encoded, or (labels_encoded, labels_dict) when
        return_categories is true.

    Raises:
        ValueError: when method is not one of the supported names.
    """
    #reject unknown methods up front instead of failing later on an unbound local
    if method not in ['binary', 'ordinal', 'onehot']:
        raise ValueError(f"unsupported method {method!r}: expected 'binary', 'ordinal' or 'onehot'")

    #number the sorted unique labels in one pass instead of calling list.index per label
    labels_dict = {label : number for number, label in enumerate(sorted(set(labels_list)))}
    label_numbers = np.array([labels_dict[label] for label in labels_list], dtype = int)

    #one-hot encoder method
    if method in ['onehot']:
        #fill one zero matrix in place rather than building a python list per label
        labels_encoded = np.zeros((len(label_numbers), len(labels_dict)), dtype = int)
        labels_encoded[np.arange(len(label_numbers)), label_numbers] = 1

    #ordinal and binary encoder method
    else:
        labels_encoded = label_numbers

    if return_categories:
        return labels_encoded, labels_dict
    return labels_encoded
        

In [None]:
#define pre-trained words dictionary loader
def word_dict_loader(#define file path and file type
                     file_path, file_type,
                     #define txt and csv file type args
                     word_index = None, vector_index = None, header = True, spliter_delimiter = None,
                     use_word_spliter = False, word_spliter = None, word_split_index = None,
                     #define json file type args
                     word_key = None, vector_key = None,
                     ):
    """Load pre-trained word vectors into a word -> float32 array dictionary.

    Args:
        file_path: path of the vectors file (decoded as utf-8).
        file_type: 'txt' / 'vec' (one split line per word), 'csv' or 'json'.
        word_index: txt/vec/csv, position of the word inside a row.
        vector_index: txt/vec/csv, position where the vector values start;
            every field from there to the end of the row is used.
        header: txt/vec/csv, skip the first row when true.
        spliter_delimiter: separator handed to str.split for txt/vec lines,
            or the csv delimiter (None keeps the csv module default).
        use_word_spliter, word_spliter, word_split_index: txt/vec/csv, when
            enabled the word field is split by word_spliter and the piece at
            word_split_index is used as the word.
        word_key, vector_key: json, keys of the word and of the vector inside
            each item of the loaded json iterable.

    Returns:
        tuple: (word_dict, word_dict_size, word_dict_dim) where
        word_dict_size is the number of words and word_dict_dim is the length
        of the first loaded vector.

    Raises:
        ValueError: when no vector was loaded (empty file or unsupported
            file_type), because the vector dimension is then undefined.
    """
    word_dict = dict()

    def store_row(row):
        #shared by the txt/vec and csv loaders: pick the word, convert the rest to float32
        word = row[word_index]
        if use_word_spliter:
            word = word.split(word_spliter)[word_split_index]
        word_dict[word] = np.array(row[vector_index:], dtype = 'float32')

    #define txt vec loader
    if file_type in ['txt', 'vec']:
        with open(file_path, 'r', encoding = 'utf-8') as word_dict_file:
            rows = (line.split(spliter_delimiter) for line in word_dict_file)
            if header:
                #the None default keeps an empty file from raising StopIteration
                next(rows, None)
            for row in rows:
                store_row(row)

    #define csv loader
    elif file_type in ['csv']:
        #csv.reader rejects delimiter=None, so only forward the delimiter when one was given
        reader_options = dict() if spliter_delimiter is None else {'delimiter' : spliter_delimiter}
        #newline='' is what the csv module expects so it can handle line endings itself
        with open(file_path, 'r', encoding = 'utf-8', newline = '') as word_dict_file:
            rows = csv.reader(word_dict_file, **reader_options)
            if header:
                next(rows, None)
            for row in rows:
                store_row(row)

    #define json loader
    elif file_type in ['json']:
        with open(file_path, 'r', encoding = 'utf-8') as word_dict_file:
            for item in json.load(word_dict_file):
                word_dict[item[word_key]] = np.array(item[vector_key], dtype = 'float32')

    #fail with a clear message instead of an index error on an empty list
    if not word_dict:
        raise ValueError(f'no word vectors were loaded from {file_path!r} with file_type {file_type!r}')

    #word dict params, the dimension is read from the first vector without copying all values
    word_dict_size = len(word_dict)
    word_dict_dim = next(iter(word_dict.values())).shape[0]

    return word_dict, word_dict_size, word_dict_dim


In [None]:
#define pre-trained embedding word vectors
def embd_weights_loader(#define word dictionary and word index
                        word_dict, word_index, dimension
                        ):
    """Build the embedding weight matrix for the words of word_index.

    The matrix has len(word_index) + 1 rows and dimension columns and starts
    out as zeros. Row word_index[word] receives word_dict[word] for every
    word that is present in word_dict, all other rows stay zero.

    Returns:
        tuple: (embed_weights, vocab_size, embed_dim), the last two being the
        number of rows and columns of embed_weights.
    """
    #allocate the zero matrix
    weights_shape = [len(word_index)+1, dimension]
    embed_weights = np.zeros(weights_shape)

    #copy the known vectors into their rows
    known_vectors = ((row, word_dict[token]) for token, row in word_index.items() if token in word_dict)
    for row, vector in known_vectors:
        embed_weights[row] = vector

    #embedding layer params
    vocab_size, embed_dim = embed_weights.shape

    return embed_weights, vocab_size, embed_dim


In [None]:
#define model
def create_model(#define input shape
                 input_shape = None,
                 #define embedding layer parameters
                 use_pretraind_embd = False, vocab_size = None, embd_dim = None,
                 sequence_len = None, embed_weights = None,
                 #define type of layer and parameters
                 use_lstm = False, use_gru = False, use_conv = False,
                 #define lstm layers parameters
                 lstm_layers_num = None, lstm_layers_units = None,
                 #define gru layers parameters
                 gru_layers_num = None, gru_layers_units = None,
                 #define convolution layers parameters
                 conv_layers_num = None, conv_layers_filters = None, conv_layers_kernel = None,
                 #define convolution layers sub layers
                 use_max_pool = False, max_pool_size = None,
                 #define dense layer feeder
                 use_global_max_pool = False, use_global_avg_pool = False, use_flatten = False,
                 use_feeder_dropout = False, feeder_dropout_ratio = None,
                 #define dense head layers
                 use_dense_layers = False, dense_layers_num = None, dense_layers_units = None,
                 #define dense layers dropout parameters
                 use_dense_dropout = False, dense_dropout_ratio = None,
                 #define output layer parameters
                 output_layer_unit = None, output_layer_activation = None,
                 #define model compiler parameters
                 optimizer = None, loss = None, metrics = None
                 ):
    """Build and compile a keras functional model for sequence inputs.

    The layers are stacked in this order: Input -> Embedding -> body ->
    feeder -> optional Dropout -> optional relu Dense stack -> output Dense.
    The body is the first enabled of: bidirectional LSTM stack
    (use_lstm), bidirectional GRU stack (use_gru), Conv1D stack (use_conv);
    the feeder is the first enabled of: GlobalMaxPooling1D, GlobalAveragePooling1D,
    Flatten. Either may be left out entirely.

    Args:
        input_shape: shape handed to keras.Input.
        use_pretraind_embd, embed_weights: when enabled the embedding is
            initialised with embed_weights and frozen (trainable = False).
        vocab_size, embd_dim, sequence_len: input_dim, output_dim and
            input_length of the embedding layer.
        lstm_layers_num, lstm_layers_units: number of LSTM layers and their
            units, indexed per layer; only the last layer drops the sequence axis.
        gru_layers_num, gru_layers_units: same as above for GRU layers.
        conv_layers_num, conv_layers_filters, conv_layers_kernel: number of
            Conv1D layers and their per-layer filters / kernel sizes.
        use_max_pool, max_pool_size: MaxPool1D after each Conv1D layer; either
            one value for all layers or a per-layer sequence.
        use_feeder_dropout, feeder_dropout_ratio: Dropout after the feeder.
        use_dense_layers, dense_layers_num, dense_layers_units: relu Dense
            head, units indexed per layer.
        use_dense_dropout, dense_dropout_ratio: Dropout after each Dense
            layer; either one value for all layers or a per-layer sequence.
        output_layer_unit, output_layer_activation: units and activation of
            the output Dense layer.
        optimizer, loss, metrics: forwarded to model.compile.

    Returns:
        the compiled keras model.
    """
    def layer_setting(setting, layer_num):
        #a single bool/number (or None) applies to every layer, anything else is indexed per layer;
        #this keeps the scalar defaults (e.g. use_max_pool = False) from failing on subscription
        if setting is None or isinstance(setting, (bool, int, float)):
            return setting
        return setting[layer_num]

    #define input layer (named model_input so the builtin input is not shadowed)
    model_input = keras.Input(shape = input_shape)

    #define embedding layer and parameters
    if use_pretraind_embd:
        out = keras.layers.Embedding(input_dim = vocab_size, output_dim = embd_dim, input_length = sequence_len,
                                     weights = [embed_weights], trainable = False)(model_input)
    else:
        out = keras.layers.Embedding(input_dim = vocab_size, output_dim = embd_dim, input_length = sequence_len)(model_input)

    #define type of layer and parameters
    #lstm type layers
    if use_lstm:
        for layer_num in range(lstm_layers_num):
            #every layer but the last one keeps the sequence axis for the next recurrent layer
            keep_sequences = layer_num < lstm_layers_num - 1
            out = keras.layers.Bidirectional(keras.layers.LSTM(lstm_layers_units[layer_num],
                                                               return_sequences = keep_sequences))(out)

    #gru type layers
    elif use_gru:
        for layer_num in range(gru_layers_num):
            keep_sequences = layer_num < gru_layers_num - 1
            out = keras.layers.Bidirectional(keras.layers.GRU(gru_layers_units[layer_num],
                                                              return_sequences = keep_sequences))(out)

    #convolution type layer
    elif use_conv:
        for layer_num in range(conv_layers_num):
            out = keras.layers.Conv1D(filters = conv_layers_filters[layer_num], kernel_size = conv_layers_kernel[layer_num],
                                      activation = 'relu')(out)
            if layer_setting(use_max_pool, layer_num):
                out = keras.layers.MaxPool1D(layer_setting(max_pool_size, layer_num))(out)


    #dense layers feeder layer
    #global max pool type layer
    if use_global_max_pool:
        out = keras.layers.GlobalMaxPooling1D()(out)

    #global average pool type layer
    elif use_global_avg_pool:
        out = keras.layers.GlobalAveragePooling1D()(out)

    #flatten type layer
    elif use_flatten:
        out = keras.layers.Flatten()(out)

    #define feeder dropout layer
    if use_feeder_dropout:
        out = keras.layers.Dropout(feeder_dropout_ratio)(out)


    #define dense head layers
    if use_dense_layers:
        for layer_num in range(dense_layers_num):
            out = keras.layers.Dense(dense_layers_units[layer_num], activation = 'relu')(out)
            if layer_setting(use_dense_dropout, layer_num):
                out = keras.layers.Dropout(layer_setting(dense_dropout_ratio, layer_num))(out)

    #define output layer
    output = keras.layers.Dense(output_layer_unit, activation = output_layer_activation)(out)

    #define model
    model = keras.models.Model(inputs = model_input, outputs = output)


    #compile model
    model.compile(optimizer = optimizer,
                  loss = loss,
                  metrics = metrics)

    return model
    

In [None]:
#plot the 'loss' column of the training history stored in model.history.history
#(needs pandas imported as pd and a model that was already fitted in an earlier cell)
pd.DataFrame(model.history.history)[['loss']].plot(figsize = (9, 6), linewidth = 3)
#dashed grid behind the curve
plt.grid(linestyle = '--', linewidth = 2)
plt.show()


In [None]:
#plot the 'accuracy' column of the training history stored in model.history.history
#(needs pandas imported as pd and a model that was compiled with an 'accuracy' metric and already fitted)
pd.DataFrame(model.history.history)[['accuracy']].plot(figsize = (9, 6), linewidth = 3)
#dashed grid behind the curve
plt.grid(linestyle = '--', linewidth = 2)
#fix the y axis to the 0..1 range
plt.ylim(0,1)
plt.show()


In [None]:
#model evaluation on the training texts and encoded labels
#(indexes the result as [loss, accuracy], so the model is expected to be compiled with one accuracy metric)
train_set_eval = model.evaluate(texts, labels_encoded, verbose = 0)
#scale the accuracy to a percentage before rounding, otherwise float noise such as 56.779999999999994 is printed
print(f'Training Set Evaluation:\n\tLoss: {round(train_set_eval[0],4)}\tAccuracy: {round(100*train_set_eval[1],2)}%')


In [None]:
#define text generator based on model
def text_generator(#define text and number of iteration
                   input_text, iteration,
                   #define tokenizer and maxlen
                   tokenizer, maxlen, word_index,
                   #define labels categories and model
                   labels_dict, model
                   ):
    """Extend input_text by iteration predicted words and return it title-cased.

    On every step the current text is tokenized, pre-padded to maxlen - 1 and
    fed to model.predict. The argmax of the prediction is mapped back through
    the inverted labels_dict and then through the inverted word_index to get
    the word that is appended after a single space.
    """
    generated_text = str(input_text)

    #invert both lookups: predicted class -> label and token id -> word
    class_to_label = {class_id : label for label, class_id in labels_dict.items()}
    token_to_word = {token_id : word for word, token_id in word_index.items()}

    for _ in range(iteration):
        sequences = tokenizer.texts_to_sequences([generated_text])
        padded = keras.preprocessing.sequence.pad_sequences(sequences, padding = 'pre', maxlen = maxlen - 1)
        scores = model.predict(padded)
        predicted_class = np.argmax(scores, axis = -1)[0]
        next_word = token_to_word[class_to_label[predicted_class]]
        generated_text = generated_text + " " + next_word

    return generated_text.title()


In [None]:
#save the model to the fixed path /content/model.h5
#(the .h5 suffix presumably selects the keras hdf5 format - confirm for the installed keras version)
model.save('/content/model.h5')
