Ejemplo de cómo usar el modelo LDA. Se necesitan las siguientes librerías:

In [1]:
!pip install nltk gensim joblib spacy



Los imports necesarios:


In [2]:
import re
import string
import json
import pickle
from gensim import corpora
from gensim.models import LdaMulticore
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer, SnowballStemmer
import spacy
from spacy.lang.en.stop_words import STOP_WORDS

# Load the small English spaCy pipeline once at module level. It is the
# global `nlp` object that LDAModelWrapper.preprocess_text calls when the
# configured preprocess_library is "spacy".
# NOTE(review): the model package 'en_core_web_sm' is not installed by the
# pip command above — presumably it needs `python -m spacy download
# en_core_web_sm` first; confirm for the target environment.
nlp = spacy.load('en_core_web_sm')

# Download the NLTK data used by the "nltk" preprocessing path:
# tokenizer data for word_tokenize, the stop-word lists, and the WordNet
# corpora used by WordNetLemmatizer.
# NOTE(review): recent NLTK releases may require 'punkt_tab' in addition to
# 'punkt' — confirm against the installed NLTK version.
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

Y se tiene que crear la siguiente clase:

In [3]:
class LDAModelWrapper:
    """Inference wrapper around a trained Gensim LDA model.

    Bundles the model, its dictionary and the preprocessing configuration so
    that raw text can be turned into labelled topic scores, and so that the
    three pieces can be saved to / restored from a single pickle file.
    """

    # Built once at class creation instead of on every preprocess_text call.
    _DIGIT_RE = re.compile(r'[\d]')
    _PUNCT_TABLE = str.maketrans('', '', string.punctuation)

    # Lazily-created NLTK helpers (see _nltk_tools / _get_stemmer). Declared
    # at class level so instances always have the attributes, even when
    # __init__ is bypassed.
    _stop_words = None
    _lemmatizer = None
    _stemmer = None

    def __init__(self, lda_model, dictionary, config):
        """
        Initializes the LDAModelWrapper with a trained LDA model, dictionary, and configuration.

        Parameters:
        - lda_model: A trained Gensim LdaMulticore model.
        - dictionary: A Gensim Dictionary object used for the model.
        - config: Configuration dictionary or JSON loaded parameters. Recognised
          keys: 'stemming' (default False), 'preprocess_library' (default
          'nltk'), 'no_above' (default 0.5), 'no_below' (default 5) and
          'topics_dict' (default {}), which maps topic index -> label.
        """
        self.lda_model = lda_model
        self.dictionary = dictionary
        self.config = config
        self.stemming = config.get('stemming', False)
        self.preprocess_library = config.get('preprocess_library', 'nltk')
        self.no_above = config.get('no_above', 0.5)
        self.no_below = config.get('no_below', 5)
        self.topics_dict = config.get('topics_dict', {})  # Load the topic labels dictionary from config

    def _nltk_tools(self):
        """Return the cached (stop-word set, lemmatizer) pair, creating it on first use."""
        if self._stop_words is None:
            self._stop_words = set(stopwords.words('english'))
        if self._lemmatizer is None:
            self._lemmatizer = WordNetLemmatizer()
        return self._stop_words, self._lemmatizer

    def _get_stemmer(self):
        """Return the cached English Snowball stemmer, creating it on first use."""
        if self._stemmer is None:
            self._stemmer = SnowballStemmer("english")
        return self._stemmer

    def _label_for(self, index):
        """
        Return the human-readable label for a topic index.

        The index is looked up as given first and then as a string, because a
        topics_dict that went through JSON has string keys ("0", "1", ...)
        while the model reports integer topic ids. Falls back to "Topic <index>".
        """
        labels = self.topics_dict
        if index in labels:
            return labels[index]
        return labels.get(str(index), f"Topic {index}")

    def preprocess_text(self, text):
        """
        Preprocesses the input text based on configuration settings.

        The text is lowercased, digits and ASCII punctuation are removed, and
        the result is tokenized, stop-word filtered and lemmatized with the
        configured library. Stemming is applied afterwards when enabled.

        Parameters:
        - text: A string representing the text to preprocess.

        Returns:
        - A list of processed tokens.

        Raises:
        - ValueError: If the configured preprocess library is neither
          'nltk' nor 'spacy'.
        """
        # Convert text to lowercase
        text = text.lower()

        # Remove numbers and punctuation
        text = self._DIGIT_RE.sub('', text)
        text = text.translate(self._PUNCT_TABLE)

        if self.preprocess_library == "nltk":
            stop_words_nltk, lemmatizer = self._nltk_tools()

            # Tokenize, drop stopwords, then lemmatize what is left
            tokens = [lemmatizer.lemmatize(word)
                      for word in word_tokenize(text)
                      if word not in stop_words_nltk]

        elif self.preprocess_library == "spacy":
            # Process the text with spaCy (module-level `nlp` pipeline)
            doc = nlp(text)

            # Filter and lemmatize tokens
            tokens = [token.lemma_ for token in doc
                      if token.text.lower() not in STOP_WORDS
                      and not token.is_punct
                      and not token.is_space]
        else:
            # If invalid preprocess library, raise an error
            raise ValueError("Invalid preprocess library, must be -> ['nltk', 'spacy']")

        # Apply stemming if required
        if self.stemming:
            stemmer = self._get_stemmer()
            tokens = [stemmer.stem(word) for word in tokens]

        return tokens

    def get_topic_scores(self, text, threshold=0.1, verbose=False):
        """
        Gets the topic distribution and scores for an unseen document.

        Parameters:
        - text: The unseen document as a string.
        - threshold: Minimum score a topic needs to be included in the result.
        - verbose: When True, also prints every returned label and score.

        Returns:
        - A list of (label, score) tuples sorted by descending score. The label
          comes from the configured topics_dict, or is "Topic <index>" when the
          index has no entry there.
        """
        # Preprocess the unseen document
        tokens = self.preprocess_text(text)
        bow_vector = self.dictionary.doc2bow(tokens)

        # Get topic distribution for the document, highest score first
        topic_scores = sorted(self.lda_model[bow_vector], key=lambda tup: -tup[1])

        # Filter topics based on the threshold and map to human-readable labels
        filtered_topics = [(self._label_for(index), score)
                           for index, score in topic_scores if score >= threshold]

        # Print the topics and scores IF verbose flag added
        if verbose:
            for label, score in filtered_topics:
                print(f"Score: {score}\nTopic: {label}\n")

        return filtered_topics

    def save_model(self, filepath):
        """
        Saves the LDA model, dictionary and configuration to a pickle file.

        Parameters:
        - filepath: The path to save the model file.
        """
        payload = {
            'lda_model': self.lda_model,
            'dictionary': self.dictionary,
            'config': self.config,
        }
        with open(filepath, 'wb') as file:
            pickle.dump(payload, file)
        print(f"Model saved to {filepath}")

    @classmethod
    def load_model(cls, filepath):
        """
        Loads the LDA model, dictionary and configuration from a pickle file.

        Parameters:
        - filepath: The path to the model file to load.

        Returns:
        - An instance of LDAModelWrapper.
        """
        # SECURITY: pickle.load can execute arbitrary code embedded in the
        # file. Only load model files that come from a trusted source.
        with open(filepath, 'rb') as file:
            data = pickle.load(file)
        print(f"Model loaded from {filepath}")
        return cls(data['lda_model'], data['dictionary'], data['config'])

Ahora se carga el modelo en la clase creada, a partir del archivo *lda_model.pkl*.

In [4]:
file_path = "lda_model.pkl" # Change this to the path of the saved model file

# Build a wrapper instance from the file using the class's load_model
# classmethod. The file is read with pickle, so it must come from a trusted
# source.
lda_wraper = LDAModelWrapper.load_model(filepath=file_path)

Model loaded from lda_model.pkl


Ahora, ya se puede hacer inferencia con ese modelo de la siguiente forma:

In [5]:
# Sample English text whose topics will be inferred.
texto_a_inferir_tema = "This is an example of text. Let's say I want to talk about university, college, research or investigation. For example, in my university class there are teachers who impart lectures about machine learning."

# Returns (label, score) pairs for the topics scoring at least the default
# threshold (0.1), highest score first. As the last expression of the cell,
# the result is also displayed by the notebook.
lda_wraper.get_topic_scores(text=texto_a_inferir_tema)

[('Education', 0.7231515), ('Justice', 0.12359578), ('Economy', 0.11954031)]