In [1]:
# One-time environment setup, kept commented out on purpose; run these
# manually if the NLTK resources or the spaCy Spanish model are missing:
#nltk.download('stopwords')
#nltk.download('punkt')
#!python -m spacy download es_core_news_md
#!python -m spacy validate

import warnings
import pandas as pd
# Silence DeprecationWarning messages so they do not clutter cell output.
warnings.filterwarnings("ignore", category=DeprecationWarning)
# Turn off pandas' chained-assignment warning for this notebook.
pd.options.mode.chained_assignment = None  # default='warn'

from tqdm import tqdm
from tqdm.notebook import tqdm_notebook
# Enable tqdm's pandas integration (progress bars for pandas operations).
tqdm.pandas()
import numpy as np

# Project-local helpers from the `utils` module, applied to the raw
# DataFrame right after it is loaded.
from utils import clean_dataset_basedOn_media
from utils import date_clustering

In [2]:
# Load the raw news dataset, then run it through the two project helpers in
# sequence. Each stage gets its own name so the lineage stays visible.
# NOTE(review): presumably `date_clustering` is what adds the
# `date_clustering` column shown in the output below — confirm in utils.
raw_df = pd.read_csv("data/loslagos-comunas.csv")
cleaned_df = clean_dataset_basedOn_media(raw_df)
df = date_clustering(cleaned_df)

# Per-column check: which columns still contain missing values?
df.isna().any()

date               False
media_outlet       False
url                False
title              False
text               False
content            False
comuna              True
date_clustering    False
dtype: bool

In [3]:
# Pull the two parallel lists used by the topic model: the document bodies
# and the date attached to each document (same row order in both).
docs = df["content"].tolist()
timestamps = df["date"].tolist()

# Show both lengths side by side as a quick sanity check.
print(len(docs), len(timestamps))

29321 29321


# [BERTopic](https://maartengr.github.io/BERTopic/index.html)
BERTopic is a topic modeling technique that leverages transformers and c-TF-IDF to create dense clusters allowing for easily interpretable topics whilst keeping important words in the topic descriptions.

In [4]:
#!pip install bertopic

In [5]:
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer

# Configure a BERTopic model for Spanish text; verbose=True makes the
# library log each stage of fitting.
topic_model = BERTopic(
    language="spanish",
    verbose=True,
)

# Fit on the full corpus. `topics` and `probs` are the two values returned
# by fit_transform; `topics` is reused by the hierarchy and
# topics-over-time cells further down.
topics, probs = topic_model.fit_transform(docs)

Batches:   0%|          | 0/917 [00:00<?, ?it/s]

2022-09-11 19:15:23,886 - BERTopic - Transformed documents to Embeddings
2022-09-11 19:15:59,177 - BERTopic - Reduced dimensionality
2022-09-11 19:16:17,378 - BERTopic - Clustered reduced embeddings


In [6]:
# Build the topic hierarchy using the library's default settings (no custom
# linkage function is passed here).
# NOTE(review): `hierarchical_topics` is reassigned by the custom-linkage
# cell that follows, so this result is only used if that cell is skipped.
hierarchical_topics = topic_model.hierarchical_topics(docs, topics)

100%|███████████████████████████████████████████████████████████████████████████████| 448/448 [00:03<00:00, 115.44it/s]


### [Linkage functions](https://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.hierarchy.linkage.html)

In [7]:
from scipy.cluster import hierarchy as sch


def linkage_function(x):
    """Return SciPy's single-linkage clustering of `x` with optimal leaf ordering."""
    return sch.linkage(x, method='single', optimal_ordering=True)


# Hierarchical topics, recomputed with the custom linkage defined above.
hierarchical_topics = topic_model.hierarchical_topics(
    docs,
    topics,
    linkage_function=linkage_function,
)

100%|████████████████████████████████████████████████████████████████████████████████| 448/448 [00:04<00:00, 91.07it/s]


### Visualizations

In [8]:
import os

# Render the topic hierarchy computed above.
fig = topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics)

# Make sure the output folder exists before exporting: on a fresh checkout
# without an `img/` directory the export would otherwise fail with
# FileNotFoundError. exist_ok=True keeps re-runs harmless.
os.makedirs("img", exist_ok=True)

# Static PNG export; this is the file embedded by the markdown cell below.
fig.write_image("img/hierarchical.png")

<img src="img/hierarchical.png" />


## Topics over time 

In [11]:
# Compute how each topic evolves over time, with the timestamps grouped
# into 20 bins (nr_bins=20). The result is filtered on its `Frequency`
# column and plotted in the next cell.
topics_over_time = topic_model.topics_over_time(docs, topics, timestamps, nr_bins=20)

20it [01:10,  3.54s/it]


In [21]:
# Keep only the (topic, time-bin) rows whose Frequency exceeds 10.
# The name `asdf` is kept as-is because later cells may reference it.
is_frequent = topics_over_time["Frequency"] > 10
asdf = topics_over_time[is_frequent]

# Interactive plot of the filtered rows, limited to the top 50 topics.
topic_model.visualize_topics_over_time(asdf, top_n_topics=50)

## Evaluations