# Loading the models for categorizing

In [1]:
from gensim.models import LdaModel, Phrases
from gensim.corpora import Dictionary

from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer

### Loading the previously saved models

In [2]:
# File names of the previously saved artifacts, resolved relative to the
# current working directory.
DICTIONARY_PATH = 'dicionario_1'
PHRASER_PATH = 'frases_1'
MODEL_PATH = 'modelo_1'

# Restore the token dictionary, the phrase detector and the LDA topic model.
# NOTE(review): gensim's load() presumably unpickles these files -- only load
# artifacts from a trusted source.
dictionary = Dictionary.load(DICTIONARY_PATH)
phraser = Phrases.load(PHRASER_PATH)
model = LdaModel.load(MODEL_PATH)

### Apply the same preprocessing to incoming texts

In [3]:
text = "I am trying to learn Machine Learning"
docs = [text]
print("Text before preprocessing\n", docs)

# Lowercase each document, then break it into runs of word characters.
tokenizer = RegexpTokenizer(r'\w+')
docs = [tokenizer.tokenize(raw_doc.lower()) for raw_doc in docs]

# Discard purely numeric tokens (tokens that merely contain digits are kept)
# as well as tokens that are a single character long.
docs = [
    [word for word in words if len(word) > 1 and not word.isnumeric()]
    for words in docs
]

# Reduce every remaining token to its WordNet lemma.
lemmatizer = WordNetLemmatizer()
docs = [list(map(lemmatizer.lemmatize, words)) for words in docs]
print("Text after preprocessing\n", docs)

Text before preprocessing
 ['I am trying to learn Machine Learning']
Text after preprocessing
 [['am', 'trying', 'to', 'learn', 'machine', 'learning']]


In [4]:
# Run the loaded phrase model over the preprocessed tokens and keep only the
# tokens containing '_', i.e. the phrases it joined (e.g. 'machine_learning').
# NOTE: a raw token that already contains '_' matches as well, because the
# r'\w+' tokenizer pattern accepts underscores.
detected_phrases = [token for token in phraser[docs[0]] if '_' in token]

# Build a new list instead of appending to docs[0] in place, so re-running
# this cell does not keep accumulating duplicate phrases.
doc = docs[0] + detected_phrases
print("Document after adding n-grams\n", doc)

Document after adding n-grams
 ['am', 'trying', 'to', 'learn', 'machine', 'learning', 'machine_learning']




### Use the loaded model to infer the document's topics

In [5]:
def normalize_word_topic(word_from_topic):
    """Turn a topic token into a human-readable word or phrase.

    Tokens without an underscore are returned unchanged. Tokens with
    underscores are split on '_':

    * if every part is the same word (e.g. 'robot_robot'), that single word
      is returned;
    * otherwise the parts are joined with spaces
      (e.g. 'machine_learning' -> 'machine learning').

    Empty fragments produced by leading, trailing or doubled underscores are
    ignored, so '__init__' becomes 'init' rather than an empty string.

    Args:
        word_from_topic: Token string taken from a topic.

    Returns:
        The normalized string.
    """
    if '_' not in word_from_topic:
        return word_from_topic

    # Drop empty fragments so stray underscores cannot yield '' as a result.
    parts = [part for part in word_from_topic.split('_') if part]
    if not parts:
        # The token consists solely of underscores; nothing to normalize.
        return word_from_topic

    # Collapse only when *all* parts repeat the same word. Comparing just the
    # first two parts would silently drop the rest of a longer n-gram.
    if all(part == parts[0] for part in parts):
        return parts[0]
    return ' '.join(parts)

In [6]:
# Convert the preprocessed document into the bag-of-words form the model takes.
bow = dictionary.doc2bow(doc)
# The topics returned here are tuples of (topic, score of the document in that topic).
topics_scores = model[bow]
print("The document's learned topics\n", topics_scores)

# Pick the (topic, score) pair with the highest score and look up that
# topic's words.
best_topic = max(topics_scores, key=lambda pair: pair[1])
topic = model.show_topic(best_topic[0])
print(
    "The document's category's topic words\n",
    [normalize_word_topic(word) for word, _probability in topic],
)

The document's learned topics
 [(1, 0.93008476448617583), (3, 0.0108284278054121)]
The document's category's topic words
 ['rule', 'robot', 'node', 'architecture', 'learn', 'position', 'action', 'environment', 'learned', 'control']
