In [1]:
# One-time environment setup, left commented out. Uncomment and run these if the
# NLTK / spaCy resources are missing (note: nltk is not imported in this cell).
#nltk.download('stopwords')
#nltk.download('punkt')
#!python -m spacy download es_core_news_md
#!python -m spacy validate

import warnings
import pandas as pd
# Hide DeprecationWarning messages so they do not clutter cell output.
warnings.filterwarnings("ignore", category=DeprecationWarning)
# Turn off pandas' chained-assignment warning for the whole notebook.
pd.options.mode.chained_assignment = None  # default='warn'

from tqdm import tqdm
from tqdm.notebook import tqdm_notebook
# Register tqdm's progress_* methods (e.g. progress_apply) on pandas objects.
tqdm.pandas()
import numpy as np

# Project-local helpers; their behaviour is defined in utils, not in this notebook.
from utils import clean_dataset_basedOn_media
from utils import cluster_by_month

In [2]:
# Read the raw CSV, hand it to the media-based cleaning helper, then pass the
# cleaned frame to cluster_by_month; each step receives the previous result.
df = (
    pd.read_csv("../data/loslagos-comunas.csv")
    .pipe(clean_dataset_basedOn_media)
    .pipe(cluster_by_month)
)
# Per-column flag: True where at least one value is missing.
df.isna().any()

date               False
media_outlet       False
url                False
title              False
text               False
content            False
comuna              True
date_clustering    False
dtype: bool

In [3]:
# Number of rows per month label, most frequent first.
df["date_clustering"].value_counts()

2022-03    4617
2021-12    4492
2021-10    4370
2021-11    4326
2022-01    4182
2022-02    3950
2022-04    3384
Name: date_clustering, dtype: int64

In [4]:
import datetime

# Take the distinct month labels from the index of value_counts().
months = df["date_clustering"].value_counts().index.tolist()

# The analysis will be run on the first month, so order the labels
# chronologically: parse "%Y-%m" into datetimes, sort, then format back.
dates = sorted(datetime.datetime.strptime(label, "%Y-%m") for label in months)
sorteddates = [stamp.strftime("%Y-%m") for stamp in dates]

In [5]:
# Keep only the rows whose month label equals the earliest month, then collect
# their `content` values as the list of documents to model.
selected = df.loc[df["date_clustering"] == sorteddates[0]]
docs = selected["content"].tolist()

# [BERTopic](https://maartengr.github.io/BERTopic/index.html)
BERTopic is a topic modeling technique that leverages transformers and c-TF-IDF to create dense clusters allowing for easily interpretable topics whilst keeping important words in the topic descriptions.

In [6]:
from bertopic import BERTopic
import gensim.corpora as corpora
from gensim.models.coherencemodel import CoherenceModel

# Fit BERTopic on the selected month's documents.
# NOTE(review): no random seed is set; BERTopic's default dimensionality
# reduction is presumably stochastic, so topics and the coherence score may
# differ between runs -- consider supplying a seeded umap_model.
topic_model = BERTopic(verbose=True,
                       calculate_probabilities=True,
                       n_gram_range=(1, 3),
                       language="spanish")
topics, _ = topic_model.fit_transform(docs)

# Concatenate all documents assigned to the same topic into one long text,
# then run it through BERTopic's own (private) text preprocessing.
documents = pd.DataFrame({"Document": docs,
                          "ID": range(len(docs)),
                          "Topic": topics})
documents_per_topic = documents.groupby(['Topic'], as_index=False).agg({'Document': ' '.join})
cleaned_docs = topic_model._preprocess_text(documents_per_topic.Document.values)

# Reuse the vectorizer BERTopic used, so tokenisation here matches the
# n-grams that appear in the topic representations.
vectorizer = topic_model.vectorizer_model
analyzer = vectorizer.build_analyzer()

# Vocabulary of the vectorizer. get_feature_names() was removed in
# scikit-learn 1.2, so prefer its replacement and only fall back to the old
# method on versions that do not have it yet.
if hasattr(vectorizer, "get_feature_names_out"):
    words = vectorizer.get_feature_names_out().tolist()
else:
    words = vectorizer.get_feature_names()

# Build the gensim inputs: one token list (and bag-of-words) per topic text.
tokens = [analyzer(doc) for doc in cleaned_docs]
dictionary = corpora.Dictionary(tokens)
corpus = [dictionary.doc2bow(token) for token in tokens]

# Collect the top words of every real topic. Topic -1 is the outlier bucket and
# is skipped explicitly; deriving the ids from a count (len(set(topics)) - 1)
# would silently drop the last topic whenever no document was marked as outlier.
topic_ids = sorted(set(topics) - {-1})
topic_words = [[word for word, _ in topic_model.get_topic(topic_id)]
               for topic_id in topic_ids]

# Evaluate topic coherence with the c_v measure.
coherence_model = CoherenceModel(topics=topic_words,
                                 texts=tokens,
                                 corpus=corpus,
                                 dictionary=dictionary,
                                 coherence='c_v')
coherence = coherence_model.get_coherence()

Batches:   0%|          | 0/137 [00:00<?, ?it/s]

2022-09-26 18:25:39,050 - BERTopic - Transformed documents to Embeddings
2022-09-26 18:26:04,486 - BERTopic - Reduced dimensionality
2022-09-26 18:26:06,208 - BERTopic - Clustered reduced embeddings


In [7]:
# Display the c_v coherence score computed in the previous cell.
coherence

0.633956568177132

See also: [OCTIS](https://github.com/MIND-Lab/OCTIS), a framework for training, comparing and evaluating topic models.