In [120]:
import gensim
from gensim.utils import simple_preprocess
import os
import random
from sklearn.model_selection import train_test_split

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
import numpy as np
from gensim.models import Word2Vec

In [176]:
# Constants
group_code = "santiagomartinez_201533279_camilocastaneda_202314092"
segment_length = 200

In [177]:
def split_text_into_segments(text, segment_length=150):
    # Tokenize the text using Gensim's simple_preprocess
    tokens = simple_preprocess(text)

    segments = []
    current_segment = []

    for token in tokens:
        current_segment.append(token)

        if len(current_segment) == segment_length:
            segments.append(current_segment)
            current_segment = []

    # Ensure all segments have the same length (discard segments of different lengths)
    segments = [segment for segment in segments if len(segment) == segment_length]
    return segments

[]


In [178]:
def get_segment_embedding(segment, model):
    # Get embeddings for each word in the segment
    word_embeddings = [model.wv[word] for word in segment if word in model.wv]
    
    # Calculate the mean of the word embeddings
    if word_embeddings:
        segment_embedding = np.mean(word_embeddings, axis=0)
    else:
        # If no word in the segment is in the model's vocabulary, return zeros
        segment_embedding = np.zeros(model.vector_size)
    
    return segment_embedding

In [179]:
output_dim = 128
# Load the saved Word2Vec model
gensim_model = Word2Vec.load(f"Books_{output_dim}_{group_code}.model")
# Get authors
authors = os.listdir("book_datasets")
# Initialize lists to store text segments and their corresponding labels
segments = []
labels = []
# Process the text files
for author in authors:
    folder_path = f"book_datasets/{author}"
    for filename in os.listdir(folder_path):
        with open(os.path.join(folder_path, filename), "r", encoding="utf-8") as file:
            text = file.read()
            # Split the text into segments of segment_length words
            doc_segments = split_text_into_segments(text, segment_length=segment_length)
            for doc_segment in doc_segments:
                # Check if the segments are of the correct length before extending the lists
                if len(doc_segment) == segment_length:
                    segment_embedding = get_segment_embedding(doc_segment, gensim_model)
                    segments.append(segment_embedding)
                    labels.append([author])
                else:
                    print(f"Discarding segments for {author} from {filename} due to incorrect length {len(doc_segments)}.")


# Shuffle the data
data = list(zip(segments, labels))
random.shuffle(data)
segments, labels = zip(*data)

# Split the dataset into training, validation, and testing sets
train_data, test_data, train_labels, test_labels = train_test_split(segments, labels, test_size=0.2, random_state=42)
val_data, test_data, val_labels, test_labels = train_test_split(test_data, test_labels, test_size=0.5, random_state=42)

num_classes = len(np.unique(train_labels))
# Summary table with dimensions
print(f"Training Set: {len(train_data)} samples")
print(f"Validation Set: {len(val_data)} samples")
print(f"Testing Set: {len(test_data)} samples")

Training Set: 2145 samples
Validation Set: 268 samples
Testing Set: 269 samples


 se utilizó un modelo Word2Vec de Gensim para obtener representaciones vectoriales de palabras a partir de los textos. Estas representaciones se utilizaron para calcular representaciones vectoriales promedio para cada segmento de palabras. De esta forma se genera el dataset

In [180]:
# Count the number of samples for each class in the training set
train_class_counts = {author: train_labels.count([author]) for author in authors}

# Count the number of samples for each class in the validation set
val_class_counts = {author: val_labels.count([author]) for author in authors}

# Count the number of samples for each class in the test set
test_class_counts = {author: test_labels.count([author]) for author in authors}

# Print the summary table
print("Summary Table - Number of Samples by Class:")
print(f"{'Author': <20} {'Training Set': <15} {'Validation Set': <15} {'Test Set': <15}")
for author in authors:
    print(f"{author: <20} {train_class_counts[author]: <15} {val_class_counts[author]: <15} {test_class_counts[author]: <15}")

Summary Table - Number of Samples by Class:
Author               Training Set    Validation Set  Test Set       
Charles_Dickens      1173            138             145            
Oscar_Wilde          571             71              75             
William_Shakespeare  401             59              49             


In [181]:
idx_explore = 2000
print(train_data[idx_explore][:10])
print()
print("length of chunk")
print(len(train_data[idx_explore]))
print("# of chunks")
print(len(train_data))
print("label")
print(train_labels[idx_explore])

[-0.226665    0.16544892  0.18007249  0.55251825 -0.10265844 -0.460437
  0.3803783  -0.47589242 -0.01169075 -0.14474805]

length of chunk
128
# of chunks
2145
label
['Charles_Dickens']


In [182]:
import numpy as np
np.unique(train_labels)

array(['Charles_Dickens', 'Oscar_Wilde', 'William_Shakespeare'],
      dtype='<U19')

In [183]:
len(train_labels)

2145

In [184]:
from keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder

# Initialize the label encoder
label_encoder = LabelEncoder()

# Fit the label encoder on all labels (train, validation, and test)
all_labels = train_labels + val_labels + test_labels
label_encoder.fit(all_labels)

# Transform the labels to numerical values
train_labels_encoded = label_encoder.transform(train_labels)
val_labels_encoded = label_encoder.transform(val_labels)
test_labels_encoded = label_encoder.transform(test_labels)

# One-hot encode the labels
num_classes = len(label_encoder.classes_)
train_labels_encoded = to_categorical(train_labels_encoded, num_classes=num_classes)
val_labels_encoded = to_categorical(val_labels_encoded, num_classes=num_classes)
test_labels_encoded = to_categorical(test_labels_encoded, num_classes=num_classes)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


In [185]:
from keras.models import Sequential
from keras.layers import Embedding, Dense, Flatten

# Create the Keras model
model1 = Sequential()
model1.add(Dense(50, activation='relu', input_shape=(output_dim,)))
model1.add(Dense(20, activation='relu'))
model1.add(Dense(num_classes, activation='softmax'))

# Compile the model
model1.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model1.summary()

Model: "sequential_44"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_154 (Dense)           (None, 50)                6450      
                                                                 
 dense_155 (Dense)           (None, 20)                1020      
                                                                 
 dense_156 (Dense)           (None, 3)                 63        
                                                                 


Total params: 7533 (29.43 KB)
Trainable params: 7533 (29.43 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [186]:
# Train the model
model1.fit(np.array(train_data), train_labels_encoded, epochs=100, batch_size=64, validation_data=(np.array(val_data), val_labels_encoded))

# Evaluate the model on the test data
loss, accuracy = model1.evaluate(np.array(test_data), test_labels_encoded)
print(f'Test Loss: {loss:.4f}')
print(f'Test Accuracy: {accuracy * 100:.2f}%')

Epoch 1/100


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 7

In [187]:
# Create the Keras model
model2 = Sequential()
model2.add(Dense(128, activation='relu', input_shape=(output_dim,)))
model2.add(Dense(128, activation='relu'))
model2.add(Dense(num_classes, activation='softmax'))

# Compile the model
model2.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

model2.summary()

Model: "sequential_45"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_157 (Dense)           (None, 128)               16512     
                                                                 
 dense_158 (Dense)           (None, 128)               16512     
                                                                 
 dense_159 (Dense)           (None, 3)                 387       
                                                                 
Total params: 33411 (130.51 KB)
Trainable params: 33411 (130.51 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [188]:
# Train the model
model2.fit(np.array(train_data), train_labels_encoded, epochs=100, batch_size=64, validation_data=(np.array(val_data), val_labels_encoded))

# Evaluate the model on the test data
loss, accuracy = model2.evaluate(np.array(test_data), test_labels_encoded)
print(f'Test Loss: {loss:.4f}')
print(f'Test Accuracy: {accuracy * 100:.2f}%')

Epoch 1/100


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 7

In [189]:
# Create the Keras model
model3 = Sequential()
model3.add(Dense(128, activation='relu', input_shape=(output_dim,)))
model3.add(Dense(64, activation='relu'))
model3.add(Dense(32, activation='relu'))
model3.add(Dense(16, activation='relu'))
model3.add(Dense(num_classes, activation='softmax'))

# Compile the model
model3.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

model3.summary()

Model: "sequential_46"
_________________________________________________________________


 Layer (type)                Output Shape              Param #   
 dense_160 (Dense)           (None, 128)               16512     
                                                                 
 dense_161 (Dense)           (None, 64)                8256      
                                                                 
 dense_162 (Dense)           (None, 32)                2080      
                                                                 
 dense_163 (Dense)           (None, 16)                528       
                                                                 
 dense_164 (Dense)           (None, 3)                 51        
                                                                 
Total params: 27427 (107.14 KB)
Trainable params: 27427 (107.14 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [190]:
# Train the model
model3.fit(np.array(train_data), train_labels_encoded, epochs=100, batch_size=64, validation_data=(np.array(val_data), val_labels_encoded))

# Evaluate the model on the test data
loss, accuracy = model3.evaluate(np.array(test_data), test_labels_encoded)
print(f'Test Loss: {loss:.4f}')
print(f'Test Accuracy: {accuracy * 100:.2f}%')

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [191]:
output_dim = 256 ## CAMBIAR PARA OBTENER RESULTADOS CON DISTINTOS EMBEDDINGS [64, 128, 256]

def split_text_into_segments(text, segment_length=150):
    # Tokenize the text using Gensim's simple_preprocess
    tokens = simple_preprocess(text)

    segments = []
    current_segment = []

    for token in tokens:
        current_segment.append(token)

        if len(current_segment) == segment_length:
            segments.append(current_segment)
            current_segment = []

    # Ensure all segments have the same length (discard segments of different lengths)
    segments = [segment for segment in segments if len(segment) == segment_length]
    return segments

# Load the saved Word2Vec model
gensim_model = Word2Vec.load(f"Books_{output_dim}_{group_code}.model")
# Get authors
authors = os.listdir("book_datasets")
# Initialize lists to store text segments and their corresponding labels
segments = []
def get_segment_embedding(segment, model):
    # Get embeddings for each word in the segment
    word_embeddings = [model.wv[word] for word in segment if word in model.wv]
    
    # Calculate the mean of the word embeddings
    if word_embeddings:
        segment_embedding = np.mean(word_embeddings, axis=0)
    else:
        # If no word in the segment is in the model's vocabulary, return zeros
        segment_embedding = np.zeros(model.vector_size)
    
    return segment_embedding

labels = []
# Process the text files
for author in authors:
    folder_path = f"book_datasets/{author}"
    for filename in os.listdir(folder_path):
        with open(os.path.join(folder_path, filename), "r", encoding="utf-8") as file:
            text = file.read()
            # Split the text into segments of 150 to 250 words (you may need a text processing library)
            doc_segments = split_text_into_segments(text, segment_length=segment_length)
            for doc_segment in doc_segments:
                # Check if the segments are of the correct length before extending the lists
                if len(doc_segment) == segment_length:
                    segment_embedding = get_segment_embedding(doc_segment, gensim_model)
                    segments.append(segment_embedding)
                    labels.append([author])
                else:
                    print(f"Discarding segments for {author} from {filename} due to incorrect length {len(doc_segments)}.")


# Shuffle the data
data = list(zip(segments, labels))
random.shuffle(data)
segments, labels = zip(*data)

# Split the dataset into training, validation, and testing sets
train_data, test_data, train_labels, test_labels = train_test_split(segments, labels, test_size=0.2, random_state=42)
val_data, test_data, val_labels, test_labels = train_test_split(test_data, test_labels, test_size=0.5, random_state=42)

num_classes = len(np.unique(train_labels))

# Initialize the label encoder
label_encoder = LabelEncoder()

# Fit the label encoder on all labels (train, validation, and test)
all_labels = train_labels + val_labels + test_labels
label_encoder.fit(all_labels)

# Transform the labels to numerical values
train_labels_encoded = label_encoder.transform(train_labels)
val_labels_encoded = label_encoder.transform(val_labels)
test_labels_encoded = label_encoder.transform(test_labels)

# One-hot encode the labels
num_classes = len(label_encoder.classes_)
train_labels_encoded = to_categorical(train_labels_encoded, num_classes=num_classes)
val_labels_encoded = to_categorical(val_labels_encoded, num_classes=num_classes)
test_labels_encoded = to_categorical(test_labels_encoded, num_classes=num_classes)

# Create the Keras model
model1 = Sequential()
model1.add(Dense(50, activation='relu', input_shape=(output_dim,)))
model1.add(Dense(20, activation='relu'))
model1.add(Dense(num_classes, activation='softmax'))

# Compile the model
model1.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
model1.fit(np.array(train_data), train_labels_encoded, epochs=100, batch_size=64, validation_data=(np.array(val_data), val_labels_encoded), verbose=0)

# Evaluate the model on the test data
loss, accuracy = model1.evaluate(np.array(test_data), test_labels_encoded)
print(f'Test Loss: {loss:.4f}')
print(f'Test Accuracy: {accuracy * 100:.2f}%')

# Create the Keras model
model2 = Sequential()
model2.add(Dense(128, activation='relu', input_shape=(output_dim,)))
model2.add(Dense(128, activation='relu'))
model2.add(Dense(num_classes, activation='softmax'))

# Compile the model
model2.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
model2.fit(np.array(train_data), train_labels_encoded, epochs=100, batch_size=64, validation_data=(np.array(val_data), val_labels_encoded), verbose=0)

# Evaluate the model on the test data
loss, accuracy = model2.evaluate(np.array(test_data), test_labels_encoded)
print(f'Test Loss: {loss:.4f}')
print(f'Test Accuracy: {accuracy * 100:.2f}%')

# Create the Keras model
model3 = Sequential()
model3.add(Dense(128, activation='relu', input_shape=(output_dim,)))
model3.add(Dense(64, activation='relu'))
model3.add(Dense(32, activation='relu'))
model3.add(Dense(16, activation='relu'))
model3.add(Dense(num_classes, activation='softmax'))

# Compile the model
model3.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy']) 

# Train the model
model3.fit(np.array(train_data), train_labels_encoded, epochs=100, batch_size=64, validation_data=(np.array(val_data), val_labels_encoded), verbose=0)

# Evaluate the model on the test data
loss, accuracy = model3.evaluate(np.array(test_data), test_labels_encoded)
print(f'Test Loss: {loss:.4f}')
print(f'Test Accuracy: {accuracy * 100:.2f}%')

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


Test Loss: 0.0915
Test Accuracy: 96.28%
Test Loss: 0.1353
Test Accuracy: 94.05%
Test Loss: 0.1261
Test Accuracy: 95.91%


resultados embedding 256:

model1:
- loss: 0.1520 - accuracy: 0.9517
- Test Loss: 0.1520
- Test Accuracy: 95.17%

model2:
- loss: loss: 0.1822 - accuracy: 0.9554
- Test Loss: 0.1822
- Test Accuracy: 95.54%

model:3
- loss: 0.2002 - accuracy: 0.9368
- Test Loss: 0.1695
- Test Accuracy: 95.91%

______________________

resultados embedding 128:

model1:
- loss: 0.0681 - accuracy: 0.9703
- Test Loss: 0.0681
- Test Accuracy: 97.03%

model2:
- loss: 0.0522 - accuracy: 0.9814
- Test Loss: 0.0522
- Test Accuracy: 98.14%

model:3
- loss: 0.0540 - accuracy: 0.9740
- Test Loss: 0.0540
- Test Accuracy: 97.40%

______________________

resultados embedding 64:

model1:
- loss: 0.1785 - accuracy: 0.9517
- Test Loss: 0.1785
- Test Accuracy: 95.17%

model2:
- loss: 0.2243 - accuracy: 0.9368
- Test Loss: 0.2243
- Test Accuracy: 93.68%

model:3
- loss: 0.2002 - accuracy: 0.9368
- Test Loss: 0.2002
- Test Accuracy: 93.68%

Podemos cocluir que el embedding que más sirvió para la tarea de clasificación fue el de 128, dado que consistentemente para varias arquitecturas tuvo un mejor desempeño en test. Sin embargo al correr varias pruebas, probablemente debido a variables aleatorias, el embedding de 256 puede tener un mejor desempeño, con el costo de algunos parámetros extra
