In [21]:
import re
import tensorflow as tf
import tensorflow_hub as hub
import os
import random
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

In [22]:
# Constants
# Group identifier; not referenced by any cell visible in this part of the notebook.
group_code = "santiagomartinez_201533279_camilocastaneda_202314092"
# Maximum length, in characters, of each text segment cut from a book.
segment_length = 200

# Seed every random number generator used by the notebook (Python, NumPy and
# TensorFlow) so that a fresh "Restart & Run All" starts from the same random
# state instead of producing different weights and metrics on every run.
random_seed = 42
random.seed(random_seed)
np.random.seed(random_seed)
tf.random.set_seed(random_seed)

In [23]:
# Function to preprocess text
def processing_text(texto):
    """Normalise raw text into lowercase words separated by single spaces.

    Args:
        texto: The text to clean. It is coerced with ``str()``, so any object
            is accepted.

    Returns:
        str: The cleaned text. The result is not stripped, so it may start
        and/or end with a single space (a blank line becomes ``" "``).
    """
    # Step 1: replace every non-word character (punctuation, newlines, ...)
    # with a space.
    processed_feature = re.sub(r'\W', ' ', str(texto))
    # Step 2: remove single-letter tokens, both between whitespace ...
    processed_feature = re.sub(r'\s+[a-zA-Z]\s+', ' ', processed_feature)
    # ... and at the very start of the text. The anchor must be an unescaped
    # '^': an escaped '\^' looks for a literal caret, which step 1 has already
    # replaced, so the rule could never match.
    processed_feature = re.sub(r'^[a-zA-Z]\s+', ' ', processed_feature)
    # Step 3: remove numbers (very sporadic in this dataset).
    processed_feature = re.sub(r'[0-9]+', ' ', processed_feature)
    # Step 4: collapse runs of consecutive spaces into a single space.
    processed_feature = re.sub(' +', ' ', processed_feature)
    # Step 5: lowercase the whole text.
    processed_feature = processed_feature.lower()

    return processed_feature

In [24]:
# Load and preprocess the text data
def load_and_preprocess_text_data(folder_path, segment_length, authors=None):
    """Read every author's files and cut the cleaned text into labelled segments.

    ``folder_path`` is expected to contain one sub-folder per author, each
    holding UTF-8 text files. Every file is cleaned with ``processing_text``
    and sliced into consecutive chunks of ``segment_length`` characters.

    Args:
        folder_path: Root directory of the dataset.
        segment_length: Maximum number of characters per segment. The last
            segment of a file may be shorter.
        authors: Optional iterable of author folder names to load. When
            omitted, every sub-directory of ``folder_path`` is used, in sorted
            order, so the function no longer depends on a notebook global
            defined in a later cell.

    Returns:
        tuple[list[str], list[str]]: ``(texts, labels)`` as parallel lists,
        where ``labels[i]`` is the author folder name of ``texts[i]``.

    Raises:
        ValueError: If ``segment_length`` is not positive.
    """
    # A negative step would make range() empty and silently return no data.
    if segment_length <= 0:
        raise ValueError("segment_length must be a positive integer")

    if authors is None:
        # os.listdir() returns entries in arbitrary order and may include
        # stray files; keep only directories and sort for reproducibility.
        authors = sorted(
            entry for entry in os.listdir(folder_path)
            if os.path.isdir(os.path.join(folder_path, entry))
        )

    texts = []
    labels = []

    for author in authors:
        author_folder = os.path.join(folder_path, author)
        # Sorted so the order of segments (and therefore the seeded
        # train/validation/test split) is the same on every machine.
        for filename in sorted(os.listdir(author_folder)):
            file_path = os.path.join(author_folder, filename)
            if not os.path.isfile(file_path):
                # Nested directories cannot be opened as text files.
                continue
            with open(file_path, "r", encoding="utf-8") as file:
                text = file.read()
            # Process the text
            processed_text = processing_text(text)
            # Split the text into fixed-size character segments
            sequences = [
                processed_text[i:i + segment_length]
                for i in range(0, len(processed_text), segment_length)
            ]
            texts.extend(sequences)
            labels.extend([author] * len(sequences))

    return texts, labels

# Read and preprocess the text data
def get_corpus(folder_path):
    """Collect every non-blank, cleaned line of every file under a directory.

    The directory tree is walked recursively; each file is read as UTF-8 and
    each line is cleaned with ``processing_text``.

    Args:
        folder_path: Root directory to walk.

    Returns:
        list[str]: The cleaned lines, in the order they were read. Lines that
        contain nothing but whitespace after cleaning are left out.
    """
    texts = []

    for root, _, files in os.walk(folder_path):
        for file_ in files:
            with open(os.path.join(root, file_), "r", encoding="utf-8") as f:
                for line in f:
                    pre_processed_text = processing_text(line)
                    # processing_text turns a blank line into " " (the newline
                    # becomes a space), never "", so comparing against an
                    # empty string filters nothing. Test for whitespace-only.
                    if pre_processed_text.strip():
                        texts.append(pre_processed_text)

    return texts

In [25]:
# Get authors: every sub-folder of the dataset directory is one author.
# os.listdir() returns entries in arbitrary order and also lists stray files
# (e.g. hidden OS files), so keep only directories and sort them to get the
# same class list on every machine.
data_dir = "book_datasets"
authors = sorted(
    entry for entry in os.listdir(data_dir)
    if os.path.isdir(os.path.join(data_dir, entry))
)

# Initialize and fit the Keras tokenizer
tokenizer = Tokenizer()
corpus = get_corpus(data_dir)
# Keep only lines with more than three words; this also discards empty and
# whitespace-only lines, since they split into zero words.
corpus = [sentence for sentence in corpus if len(sentence.split()) > 3]
tokenizer.fit_on_texts(corpus)
# +1 because the Keras tokenizer never assigns index 0 to a word.
vocab_size = len(tokenizer.word_index) + 1
print("vocab size: " + str(vocab_size))
# Load and preprocess the text data
texts, labels = load_and_preprocess_text_data(data_dir, segment_length)

vocab size: 20628


In [26]:
# Map each author name to an integer id, then expand the ids into one-hot
# rows with one column per author.
label_encoder = LabelEncoder().fit(authors)
labels_encoded = label_encoder.transform(labels)
labels_one_hot = to_categorical(
    labels_encoded,
    num_classes=len(authors),
)


In [27]:
# First hold out 20 % of the segments, then halve that hold-out set into the
# validation and test sets (10 % of the data each).
train_texts, holdout_texts, train_labels, holdout_labels = train_test_split(
    texts, labels_one_hot, test_size=0.2, random_state=42
)
val_texts, test_texts, val_labels, test_labels = train_test_split(
    holdout_texts, holdout_labels, test_size=0.5, random_state=42
)

In [28]:
# Turn every split into a NumPy array, one (texts, labels) pair per line
train_texts, train_labels = np.array(train_texts), np.array(train_labels)
val_texts, val_labels = np.array(val_texts), np.array(val_labels)
test_texts, test_labels = np.array(test_texts), np.array(test_labels)

In [30]:
# Pre-trained text-embedding layer from TensorFlow Hub. It takes raw strings
# as input and is kept frozen (trainable=False).
embedding_model_url = "https://tfhub.dev/google/Wiki-words-500/2"
hub_layer = hub.KerasLayer(
    embedding_model_url,
    input_shape=[],
    dtype=tf.string,
    trainable=False,
)

# Feed-forward classifier on top of the embedding: two ReLU hidden layers,
# each followed by dropout, and a softmax output with one unit per author.
model = tf.keras.Sequential()
model.add(hub_layer)
for units in (128, 64):
    model.add(tf.keras.layers.Dense(units, activation='relu'))
    model.add(tf.keras.layers.Dropout(0.5))
model.add(tf.keras.layers.Dense(len(authors), activation='softmax'))

model.summary()
# Categorical cross-entropy matches the one-hot encoded labels.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)


Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 keras_layer (KerasLayer)    (None, 500)               504687500 
                                                                 
 dense_3 (Dense)             (None, 128)               64128     
                                                                 
 dropout_2 (Dropout)         (None, 128)               0         
                                                                 
 dense_4 (Dense)             (None, 128)               16512     
                                                                 
 dropout_3 (Dropout)         (None, 128)               0         
                                                                 
 dense_5 (Dense)             (None, 64)                8256      
                                                                 
 dropout_4 (Dropout)         (None, 64)               

In [31]:
# Fit on the training split and monitor the validation split after each epoch
history = model.fit(
    x=train_texts,
    y=train_labels,
    batch_size=64,
    epochs=100,  # You can adjust the number of epochs
    validation_data=(val_texts, val_labels),
)

# Report loss and accuracy on the held-out test split
test_metrics = model.evaluate(test_texts, test_labels)
loss, accuracy = test_metrics
print(f'Test Loss: {loss:.4f}')
print(f'Test Accuracy: {accuracy * 100:.2f}%')

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [36]:
# Pre-trained gnews-swivel text-embedding layer from TensorFlow Hub. It takes
# raw strings as input and is fine-tuned with the classifier (trainable=True).
embedding_model_url = "https://tfhub.dev/google/tf2-preview/gnews-swivel-20dim/1"
hub_layer = hub.KerasLayer(
    embedding_model_url,
    input_shape=[],
    dtype=tf.string,
    trainable=True,
)

# Feed-forward classifier on top of the embedding: two ReLU hidden layers,
# each followed by dropout, and a softmax output with one unit per author.
model = tf.keras.Sequential()
model.add(hub_layer)
for units in (128, 64):
    model.add(tf.keras.layers.Dense(units, activation='relu'))
    model.add(tf.keras.layers.Dropout(0.5))
model.add(tf.keras.layers.Dense(len(authors), activation='softmax'))

model.summary()
# Categorical cross-entropy matches the one-hot encoded labels.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 keras_layer_3 (KerasLayer)  (None, 20)                400020    
                                                                 
 dense_15 (Dense)            (None, 128)               2688      
                                                                 
 dropout_11 (Dropout)        (None, 128)               0         
                                                                 
 dense_16 (Dense)            (None, 64)                8256      
                                                                 
 dropout_12 (Dropout)        (None, 64)                0         
                                                                 
 dense_17 (Dense)            (None, 3)                 195       
                                                                 
Total params: 411159 (1.57 MB)
Trainable params: 41115

In [37]:
# Fit on the training split and monitor the validation split after each epoch
history = model.fit(
    x=train_texts,
    y=train_labels,
    batch_size=64,
    epochs=100,  # You can adjust the number of epochs
    validation_data=(val_texts, val_labels),
)

# Report loss and accuracy on the held-out test split
test_metrics = model.evaluate(test_texts, test_labels)
loss, accuracy = test_metrics
print(f'Test Loss: {loss:.4f}')
print(f'Test Accuracy: {accuracy * 100:.2f}%')

Epoch 1/100


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 7