# Usando el modelo entrenado para predecir categorías de nuevas preguntas

In [1]:
import pandas as pd
import numpy as np
import pickle
import os
# Load the test data; the code below reads its 'vector', 'category' and 'chapter' columns
df = pd.read_csv('data/augmented_questions_vector_category_chapter.csv')

In [2]:

# Example 3: optimized predictor that loads the models only once
class HierarchicalPredictor:
    """Two-stage predictor: main category first, then a per-category theme.

    Every pickled artifact is read once, at construction time, so repeated
    calls to :meth:`predict` never touch the disk again.

    SECURITY NOTE: the artifacts are deserialized with ``pickle``, which can
    execute arbitrary code while loading. Only point ``models_path`` at files
    you trust.
    """

    # Categories for which a dedicated theme model is looked up on disk.
    THEME_CATEGORIES = (1, 2, 3)

    def __init__(self, models_path='models_hierarchical/'):
        """Load all model artifacts located under *models_path*.

        Args:
            models_path: Directory holding the pickled artifacts (a trailing
                separator is optional). A value that is not an existing
                directory is used as a plain filename prefix.

        Raises:
            FileNotFoundError: If one of the four mandatory artifacts
                (main model, PCA, category encoder, theme encoders) is missing.
        """
        self.models_path = models_path
        self._load_models()

    def _artifact_path(self, filename):
        """Return the on-disk path of the artifact called *filename*."""
        base = os.fspath(self.models_path)
        # An existing directory is joined properly even without a trailing
        # separator; anything else keeps the plain prefix concatenation.
        if os.path.isdir(base):
            return os.path.join(base, filename)
        return f'{base}{filename}'

    def _load_pickle(self, filename):
        """Unpickle and return the artifact called *filename*."""
        with open(self._artifact_path(filename), 'rb') as f:
            return pickle.load(f)

    def _load_models(self):
        """Load every artifact once and store it on the instance."""
        self.main_model = self._load_pickle('main_category_model.pkl')
        self.pca = self._load_pickle('pca_transformer.pkl')
        self.cat_encoder = self._load_pickle('category_encoder.pkl')
        # Container indexed by category; predict() looks encoders up by key.
        self.theme_encoders = self._load_pickle('theme_encoders.pkl')

        # Theme models are optional: a category without a file on disk simply
        # gets no theme prediction.
        self.theme_models = {}
        for category in self.THEME_CATEGORIES:
            filename = f'theme_model_categoria_{category}.pkl'
            if os.path.exists(self._artifact_path(filename)):
                self.theme_models[category] = self._load_pickle(filename)

    @staticmethod
    def _to_vector(raw):
        """Convert *raw* (bracketed comma-separated text or a numeric sequence) to a 1-D float array."""
        if isinstance(raw, str):
            tokens = (tok.strip() for tok in raw.strip().strip('[]').split(','))
            return np.array([float(tok) for tok in tokens if tok])
        return np.asarray(raw, dtype=float).ravel()

    def predict(self, embedding_string):
        """Predict the category and, when possible, the theme of one embedding.

        Args:
            embedding_string: The embedding, either as comma-separated text
                optionally wrapped in square brackets (e.g. ``"[0.1, 0.2]"``)
                or as an already-numeric sequence/array.

        Returns:
            A dict with the keys ``'category'`` (int) and ``'theme'`` (int, or
            ``None`` when the predicted category has no theme model or no
            theme encoder).

        Raises:
            ValueError: If a component of the text cannot be parsed as a float.
        """
        embedding = self._to_vector(embedding_string)

        # Force the vector to the width the PCA expects: zero-pad short
        # inputs, truncate long ones.
        expected_size = self.pca.n_features_in_
        if len(embedding) < expected_size:
            embedding = np.pad(embedding, (0, expected_size - len(embedding)), mode='constant')
        elif len(embedding) > expected_size:
            embedding = embedding[:expected_size]

        # PCA projection followed by the main category prediction
        embedding_pca = self.pca.transform(embedding.reshape(1, -1))
        category_pred = self.main_model.predict(embedding_pca)[0]
        category = self.cat_encoder.inverse_transform([category_pred])[0]

        # Theme prediction, only when both the model and its encoder exist
        theme = None
        if category in self.theme_models and category in self.theme_encoders:
            theme_pred = self.theme_models[category].predict(embedding_pca)[0]
            theme = self.theme_encoders[category].inverse_transform([theme_pred])[0]

        # Compare against None explicitly: a theme label of 0 is valid and
        # must not be reported as "no theme".
        return {'category': int(category), 'theme': int(theme) if theme is not None else None}

# Using the optimized predictor
predictor = HierarchicalPredictor()

# Quickly predict several samples: the first five rows, or fewer when the
# dataset is smaller (avoids an IndexError on short files)
for i in range(min(5, len(df))):
    row = df.iloc[i]  # fetch the row once instead of once per column
    embedding = row['vector']
    pred = predictor.predict(embedding)
    real_cat = row['category']
    real_theme = row['chapter']

    cat_ok = pred['category'] == real_cat
    theme_ok = pred['theme'] == real_theme

    print(f"Muestra {i}: Cat({real_cat}→{pred['category']}) {'✓' if cat_ok else '✗'} | "
          f"Tema({real_theme}→{pred['theme']}) {'✓' if theme_ok else '✗'}")

Muestra 0: Cat(1→1) ✓ | Tema(26→26) ✓
Muestra 1: Cat(1→1) ✓ | Tema(26→26) ✓
Muestra 2: Cat(1→1) ✓ | Tema(26→26) ✓
Muestra 3: Cat(1→1) ✓ | Tema(26→26) ✓
Muestra 4: Cat(1→1) ✓ | Tema(26→26) ✓
