In [1]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

In [2]:
!pip install -q gradio
import gradio as gr

In [3]:
# Leer el archivo CSV
url = 'https://raw.githubusercontent.com/sevann-radhak/procesamiento_lenguaje_natural/main/clase_2/ejercicios/raw/True.csv'
news_df = pd.read_csv(url)

In [4]:
# Seleccionar una muestra aleatoria de los datos
news_sample = news_df.sample(n=100, random_state=42)

In [5]:
# Combinar títulos y textos
news_texts = news_sample['title'] + ' ' + news_sample['text']

In [6]:
def split_text_into_segments(text, num_segments=4):
    words = text.split()
    segment_length = len(words) // num_segments
    segments = [' '.join(words[i*segment_length:(i+1)*segment_length]) for i in range(num_segments)]
    # If there are remaining words, add them to the last segment
    if len(words) % num_segments != 0:
        segments[-1] += ' ' + ' '.join(words[num_segments*segment_length:])
    return segments

# Dividir cada texto en segmentos
segmented_texts = []
for text in news_texts:
    segments = split_text_into_segments(text)
    segmented_texts.extend(segments)

In [7]:
# Configuración del Tokenizer
tokenizer = Tokenizer()
tokenizer.fit_on_texts(segmented_texts)
total_words = len(tokenizer.word_index) + 1

In [8]:
# Crear secuencias de entrada y salida
input_sequences = []
for line in segmented_texts:
    token_list = tokenizer.texts_to_sequences([line])[0]
    for i in range(1, len(token_list)):
        n_gram_sequence = token_list[:i+1]
        input_sequences.append(n_gram_sequence)

In [None]:
# #TODO:
# # Determinación del tamaño máximo de contexto
# max_context_size = np.percentile([len(seq) for seq in input_sequences], 90)
# max_context_size = int(max_context_size)

In [9]:
# Ajustar el tamaño de las secuencias
max_sequence_len = max([len(x) for x in input_sequences])
input_sequences = pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')

In [10]:
# Crear los conjuntos de datos de entrada y salida
input_sequences = np.array(input_sequences)
X, y = input_sequences[:,:-1], input_sequences[:,-1]

In [11]:
# Convertir las etiquetas a una forma categórica
y = tf.keras.utils.to_categorical(y, num_classes=total_words)

In [12]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [13]:
# Definir el modelo
model = Sequential()
model.add(Embedding(total_words, 64, input_length=max_sequence_len-1))
model.add(LSTM(100))
model.add(Dense(total_words, activation='softmax'))



In [14]:
# Compilar el modelo
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [15]:
# Entrenar el modelo
# history = model.fit(X, y, epochs=10, verbose=1)

In [16]:
# Callback para cálculo de perplejidad y early stopping
class PplCallback(keras.callbacks.Callback):
    def __init__(self, patience=5):
        super(PplCallback, self).__init__()
        self.patience = patience
        self.best_ppl = np.inf
        self.wait = 0

    def on_epoch_end(self, epoch, logs=None):
        val_loss = logs.get('val_loss')
        val_ppl = np.exp(val_loss)
        print(f'Perplejidad en validación: {val_ppl:.4f}')

        if val_ppl < self.best_ppl:
            self.best_ppl = val_ppl
            self.wait = 0
        else:
            self.wait += 1
            if self.wait >= self.patience:
                self.model.stop_training = True

In [17]:
ppl_callback = PplCallback(patience=3)

In [18]:
history = model.fit(X_train, y_train, epochs=10, validation_data=(X_val, y_val), callbacks=[ppl_callback])

Epoch 1/10
[1m965/965[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 225ms/step - accuracy: 0.0512 - loss: 7.5132Perplejidad en validación: 1275.0141
[1m965/965[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m240s[0m 242ms/step - accuracy: 0.0512 - loss: 7.5129 - val_accuracy: 0.0595 - val_loss: 7.1507
Epoch 2/10
[1m965/965[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 219ms/step - accuracy: 0.0625 - loss: 6.8396Perplejidad en validación: 1210.3274
[1m965/965[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m226s[0m 234ms/step - accuracy: 0.0625 - loss: 6.8396 - val_accuracy: 0.0741 - val_loss: 7.0986
Epoch 3/10
[1m965/965[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 216ms/step - accuracy: 0.0816 - loss: 6.5482Perplejidad en validación: 1122.9271
[1m965/965[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m260s[0m 231ms/step - accuracy: 0.0816 - loss: 6.5482 - val_accuracy: 0.0797 - val_loss: 7.0237
Epoch 4/10
[1m965/965[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m

In [24]:
# Función para generar texto
def generate_text(seed_text, next_words, temperature=1.0):
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
        predicted_probs = model.predict(token_list, verbose=0)[0]
        predicted_probs = np.log(predicted_probs + 1e-7) / temperature
        predicted_probs = np.exp(predicted_probs) / np.sum(np.exp(predicted_probs))
        predicted = np.random.choice(range(total_words), p=predicted_probs)
        output_word = tokenizer.index_word[predicted]
        seed_text += " " + output_word
    return seed_text

In [25]:
# Interfaz con Gradio
def gradio_interface(seed_text, next_words, temperature):
    return generate_text(seed_text, next_words, temperature)

In [27]:
# Crear la interfaz de Gradio
iface = gr.Interface(
    fn=generate_text,
    inputs=gr.Textbox(lines=2, placeholder='Introduce el texto inicial aquí...'),
    outputs='text'
)



In [28]:
# Ejecutar la interfaz
iface.launch()

Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://c9a3d3dbd0ab256e46.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)




In [26]:
gr_interface = gr.Interface(
    fn=gradio_interface,
    inputs=[
        gr.inputs.Textbox(lines=2, placeholder='Introduce el texto inicial aquí...'),
        gr.inputs.Slider(1, 100, step=1, label='Número de palabras a generar'),
        gr.inputs.Slider(0.1, 2.0, step=0.1, label='Temperatura')
    ],
    outputs='text',
    title='Generador de Noticias',
    description='Genera noticias a partir de un texto inicial.'
)

AttributeError: module 'gradio' has no attribute 'inputs'

In [None]:
gr_interface.launch()

In [19]:
# # Preprocesamiento de los textos
# news_texts = news_df['text'].astype(str).tolist()

In [20]:
# # Tokenización de los textos
# tokenizer = Tokenizer()
# tokenizer.fit_on_texts(news_texts)
# total_words = len(tokenizer.word_index) + 1

In [21]:
# # Creación de secuencias de entrada y salida
# input_sequences = []
# for line in news_texts:
#     token_list = tokenizer.texts_to_sequences([line])[0]
#     for i in range(1, len(token_list)):
#         n_gram_sequence = token_list[:i+1]
#         input_sequences.append(n_gram_sequence)

In [22]:
# # Padding de las secuencias
# input_sequences = pad_sequences(input_sequences, maxlen=max_context_size, padding='pre')