# RED NEURONAL PARA PREDECIR GENERO BASADO EN LA DESCRIPCION

In [6]:
import pandas as pd
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional, GlobalMaxPooling1D
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical

# Load the training dataframe from the CSV file on disk
data = pd.read_csv('./dataset/train.csv')

# 1. Preprocesamiento de texto
def convert_to_numeric(value):
    """Convert an abbreviated count such as "3.9K" or "1.2M" to a float.

    Strings containing a ``K`` suffix are multiplied by 1,000 and strings
    containing an ``M`` suffix by 1,000,000. The suffix is matched
    case-insensitively, so "3.9k" is treated like "3.9K". Any other value
    (plain numeric strings, ints, floats, NaN) is passed straight to
    ``float``.

    Args:
        value: A string or number holding the count.

    Returns:
        The count as a float.

    Raises:
        ValueError: If the value (after removing the suffix) is not a
            valid float literal.
    """
    if not isinstance(value, str):
        return float(value)

    # Upper-case once so a lower-case suffix is scaled instead of making
    # float() fail on the stray letter.
    text = value.upper()
    if 'K' in text:
        return float(text.replace('K', '')) * 1_000
    if 'M' in text:
        return float(text.replace('M', '')) * 1_000_000
    return float(text)

def preprocess_data(data, num_words=10000, max_length=50,
                    tokenizer=None, label_encoder=None):
    """Clean the dataframe and build padded token sequences and one-hot labels.

    The dataframe is modified in place: the "Number of Reviews" and
    "Wishlist" columns are converted to floats, "Summary" is coerced to
    strings (missing values become ""), and "Genre" is replaced by its
    integer class index.

    Args:
        data: DataFrame with "Number of Reviews", "Wishlist", "Summary"
            and "Genre" columns.
        num_words: Vocabulary size given to the tokenizer when one is
            created here. Ignored if ``tokenizer`` is supplied.
        max_length: Length every sequence is padded/truncated to.
        tokenizer: Optional unfitted ``Tokenizer``. Pass one to keep a
            reference to the fitted vocabulary for later inference.
        label_encoder: Optional unfitted ``LabelEncoder``. Pass one to
            keep the fitted class mapping so predictions can be mapped
            back to genre names.

    Returns:
        A tuple ``(data, padded_sequences, labels, max_length)``.
    """
    data["Number of Reviews"] = data["Number of Reviews"].apply(convert_to_numeric)
    data["Wishlist"] = data["Wishlist"].apply(convert_to_numeric)

    # Make sure every value in the 'Summary' column is a string
    data["Summary"] = data["Summary"].fillna("").astype(str)

    # Create (unless supplied by the caller) and fit the tokenizer
    if tokenizer is None:
        tokenizer = Tokenizer(num_words=num_words, oov_token="<OOV>")
    tokenizer.fit_on_texts(data["Summary"])
    sequences = tokenizer.texts_to_sequences(data["Summary"])

    # Show the first few sequences as a sanity check
    print(sequences[:5])

    # Shorter sequences are zero-padded at the end.
    # NOTE(review): no `truncating` argument is passed, so longer summaries
    # are cut using the library default - confirm that is the intended side.
    padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post')

    # Encode the labels as integers, then one-hot encode them
    if label_encoder is None:
        label_encoder = LabelEncoder()
    data["Genre"] = label_encoder.fit_transform(data["Genre"])
    labels = to_categorical(data["Genre"])
    return data, padded_sequences, labels, max_length

# preprocess_data returns four values; unpack all of them so the padded
# sequences, labels and sequence length are available to the code below.
data, padded_sequences, labels, max_length = preprocess_data(data)

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(
    padded_sequences, labels, test_size=0.2, random_state=42
)

# 2. Define the model
model = Sequential([
    Embedding(input_dim=10000, output_dim=128, input_length=max_length),
    Bidirectional(LSTM(64, return_sequences=True)),
    GlobalMaxPooling1D(),
    Dense(64, activation='relu'),
    Dropout(0.5),
    # One output unit per one-hot encoded genre class
    Dense(labels.shape[1], activation='softmax')
])

# 3. Compile the model
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy']
)

# 4. Train the model
history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=10,
    batch_size=32,
    verbose=2
)

# 5. Evaluate the model on the validation split
loss, accuracy = model.evaluate(X_val, y_val, verbose=2)
print(f"Validation Accuracy: {accuracy * 100:.2f}%")


[[5554, 1132, 4, 2, 5555, 9, 532, 11, 2, 1037, 3001, 21, 253, 2, 332, 260, 41, 46, 4, 2, 3847, 190, 3848, 8, 8, 8, 75, 149, 4, 51, 4, 174, 47, 21, 25, 33, 5556, 54, 193, 37, 270, 49, 2, 533, 5557, 5558, 5559, 2, 950, 1796, 3002, 5560, 2, 375, 199, 3849, 5561, 3, 5562, 63, 25, 175, 43, 5, 3003, 3, 75, 6, 2, 1797, 8, 8, 709, 14, 25, 55, 388, 1038, 2078, 13, 277, 14, 951, 11, 2, 47, 14, 638, 8, 8, 8, 28, 1564, 9, 2, 504, 476, 141, 10, 5563, 12, 5, 3850, 408, 4, 2, 1565, 3004, 357, 16, 5564, 5565, 4, 2, 465, 5566, 4, 2, 3004, 357, 23, 5567, 6, 3005, 2, 3004, 3006, 16, 1798], [1133, 23, 1378, 19, 271, 3, 2, 84, 882, 9, 19, 639, 4, 2079, 883, 11, 17, 389, 5568, 1566, 4, 952, 5569, 2079, 76, 5570, 3, 5571, 2466, 234, 466, 6, 1134, 317, 3, 3007, 884, 6, 429, 3008, 4, 1039, 4, 1133, 231, 7, 35, 953], [430, 37, 5572, 9, 2, 127, 3, 118, 376, 7, 2, 430, 351, 30, 26, 9, 5, 38, 600, 333, 1040, 318, 557, 11, 388, 1038, 21, 1567, 24, 449, 3, 225, 1799], [291, 254, 2, 39, 4, 390, 9, 2, 261, 1568, 226, 



276/276 - 7s - 27ms/step - accuracy: 0.1251 - loss: 2.6816 - val_accuracy: 0.1682 - val_loss: 2.4423
Epoch 2/10
276/276 - 5s - 17ms/step - accuracy: 0.1437 - loss: 2.5026 - val_accuracy: 0.1682 - val_loss: 2.4250
Epoch 3/10
276/276 - 5s - 18ms/step - accuracy: 0.1490 - loss: 2.4604 - val_accuracy: 0.1673 - val_loss: 2.4176
Epoch 4/10
276/276 - 5s - 17ms/step - accuracy: 0.1572 - loss: 2.4328 - val_accuracy: 0.1682 - val_loss: 2.4152
Epoch 5/10
276/276 - 7s - 25ms/step - accuracy: 0.1656 - loss: 2.3991 - val_accuracy: 0.1600 - val_loss: 2.4183
Epoch 6/10
276/276 - 8s - 30ms/step - accuracy: 0.1760 - loss: 2.3724 - val_accuracy: 0.1636 - val_loss: 2.4440
Epoch 7/10
276/276 - 9s - 31ms/step - accuracy: 0.1843 - loss: 2.3377 - val_accuracy: 0.1650 - val_loss: 2.4798
Epoch 8/10
276/276 - 9s - 33ms/step - accuracy: 0.1956 - loss: 2.3149 - val_accuracy: 0.1646 - val_loss: 2.4921
Epoch 9/10
276/276 - 6s - 20ms/step - accuracy: 0.2032 - loss: 2.2900 - val_accuracy: 0.1659 - val_loss: 2.5539
Epo

In [None]:
# Load the test data
test_data = pd.read_csv('./dataset/test.csv')
if 'Summary' not in test_data.columns:
    # Fail with the list of available columns instead of a bare KeyError
    raise KeyError(
        "'Summary' column not found in ./dataset/test.csv; "
        f"available columns: {list(test_data.columns)}"
    )

# Rebuild the vocabulary and label mapping from the training CSV.
# preprocess_data does not hand back its fitted tokenizer or label encoder,
# so they are re-fitted here on the same text and labels with the same
# settings, which is intended to reproduce the same word and class indices.
train_reference = pd.read_csv('./dataset/train.csv')
train_summaries = train_reference['Summary'].fillna("").astype(str)

tokenizer = Tokenizer(num_words=10000, oov_token="<OOV>")
tokenizer.fit_on_texts(train_summaries)

label_encoder = LabelEncoder()
label_encoder.fit(train_reference['Genre'])

# Must match the sequence length used when the model was trained
sequence_length = 50

# Preprocess the 'Summary' column in the test data
test_summaries = test_data['Summary'].fillna("").astype(str)

# Convert the test 'Summary' to sequences and pad them
test_sequences = tokenizer.texts_to_sequences(test_summaries)
test_padded_sequences = pad_sequences(test_sequences, maxlen=sequence_length, padding='post')

# Make predictions
predictions = model.predict(test_padded_sequences, verbose=1)

# Convert predictions to genre labels
predicted_genres = label_encoder.inverse_transform(predictions.argmax(axis=1))

# Save results to a new CSV file
output = pd.DataFrame({
    'id': test_data['id'],
    'Genre': predicted_genres
})
output.to_csv('./dataset/predicted_genres.csv', index=False)

print("Predictions saved to './dataset/predicted_genres.csv'")


KeyError: 'Summary'