In [65]:
import spacy
from pprint import pprint
from nltk.tokenize import RegexpTokenizer
from nltk.stem.wordnet import WordNetLemmatizer
from gensim.models import Phrases
from gensim.corpora import Dictionary
from gensim.models import LdaModel

from dbConnect import singleQuery

In [66]:
nlp = spacy.load("es_core_news_sm")

In [73]:
def tokenize(docs: list[str]) -> list[list[str]]:
    """
    Tokenize news, removing no wished chars (numers, for example).

    Pre: body of news as list of strings
    Post: return a list of lists that represents each body of news 
    """
    #tokenizer = RegexpTokenizer(r'\w+')
    tokenizer = RegexpTokenizer(r'\w+|[áéíóúÁÉÍÓÚñÑüÜ]+')
    for idx in range(len(docs)):
        docs[idx] = docs[idx].lower()  # Convert to lowercase.
        docs[idx] = tokenizer.tokenize(docs[idx])  # Split into words.

    # Remove numbers, but not words that contain numbers.
    docs = [[token for token in doc if not token.isnumeric()] for doc in docs]

    # Remove words that are only one character.
    docs = [[token for token in doc if len(token) > 1] for doc in docs]

    return docs


def lemmatize(docs):
    lemmatizer = WordNetLemmatizer()
    docs = [[lemmatizer.lemmatize(token) for token in doc] for doc in docs]
    return docs


def lemmatize_es(docs):
    # https://spacy.io/usage/models
    nlp = spacy.load("es_core_news_sm")
    lemmatized_docs = []
    for doc in docs:
        lemmatized_doc = [token.lemma_ for token in nlp(" ".join(doc))]
        lemmatized_docs.append(lemmatized_doc)
    return lemmatized_docs


def ngrams(docs):
    bigram = Phrases(docs, min_count=20)
    for idx in range(len(docs)):
        for token in bigram[docs[idx]]:
            if '_' in token:
                # Token is a bigram, add to document.
                docs[idx].append(token)
    return docs


def bag_of_words(docs):
    # Create a dictionary representation of the documents.
    dictionary = Dictionary(docs)

    # Filter out words that occur less than 20 documents, or more than 50% of the documents.
    dictionary.filter_extremes(no_below=20, no_above=0.8)

    # Bag-of-words representation of the documents.
    corpus = [dictionary.doc2bow(doc) for doc in docs]

    return dictionary, corpus


def train_lda(dictionary, corpus):
    # Set training parameters.
    num_topics = 10
    chunksize = 2000
    passes = 20
    iterations = 20
    eval_every = None  # Don't evaluate model perplexity, takes too much time.

    # Make an index to word dictionary.
    temp = dictionary[0]  # This is only to "load" the dictionary.
    id2word = dictionary.id2token

    model = LdaModel(
        corpus=corpus,
        id2word=id2word,
        chunksize=chunksize,
        alpha='auto',
        eta='auto',
        iterations=iterations,
        num_topics=num_topics,
        passes=passes,
        eval_every=eval_every
    )
    top_topics = model.top_topics(corpus)

    # Average topic coherence is the sum of topic coherences of all topics, divided by the number of topics.
    avg_topic_coherence = sum([t[1] for t in top_topics]) / num_topics
    print('Average topic coherence: %.4f.' % avg_topic_coherence)

    pprint(top_topics)

In [71]:
spa = None

with open('cnn.txt', 'rt', encoding='utf8') as file:
    spa = [x.strip() for x in file]

In [72]:
bodies = tokenize(spa)
bodies = lemmatize(bodies)
bodies = ngrams(bodies)
dictionary, corpus = bag_of_words(bodies)
print('Number of unique tokens: %d' % len(dictionary))
print('Number of documents: %d' % len(corpus))

train_lda(dictionary, corpus)

Number of unique tokens: 0
Number of documents: 38


KeyError: 0

In [69]:
noticias = singleQuery('SELECT titulo,cuerpo,fuente FROM noticias LIMIT 5000;')

In [70]:
bodies = [body for _, body, _ in noticias]
bodies = tokenize(bodies)
bodies = lemmatize_es(bodies)
bodies = ngrams(bodies)
dictionary, corpus = bag_of_words(bodies)
print('Number of unique tokens: %d' % len(dictionary))
print('Number of documents: %d' % len(corpus))
train_lda(dictionary, corpus)

Number of unique tokens: 8304
Number of documents: 5000
Average topic coherence: -0.9590.
[([(0.039896023, 'yo'),
   (0.015275833, 'hacer'),
   (0.01390909, 'todo'),
   (0.013580222, 'mucho'),
   (0.013564318, 'ese'),
   (0.012565902, 'decir'),
   (0.01188011, 'ir'),
   (0.011872036, 'mi'),
   (0.01164261, 'pero'),
   (0.008752707, 'porque'),
   (0.008735408, 'tú'),
   (0.00850746, 'más'),
   (0.007437887, 'cuando'),
   (0.007169367, 'poder'),
   (0.0067767184, 'ver'),
   (0.006546261, 'querer'),
   (0.006243128, 'si'),
   (0.006062277, 'año'),
   (0.0059497813, 'saber'),
   (0.0059002656, 'pasar')],
  -0.5531276223687694),
 ([(0.014508255, 'más'),
   (0.0130036045, 'poder'),
   (0.008436715, 'año'),
   (0.007897847, 'mucho'),
   (0.0075886957, 'otro'),
   (0.007163707, 'todo'),
   (0.0071431547, 'pero'),
   (0.006974974, 'ese'),
   (0.006793707, 'hacer'),
   (0.0065933596, 'también'),
   (0.004400888, 'entre'),
   (0.0043065627, 'alguno'),
   (0.004202656, 'sobre'),
   (0.0040117605, 