In [1]:
# prevent huge warning messages of bertmodel 
import warnings
warnings.filterwarnings("ignore") 

from tqdm import tqdm
from tqdm.notebook import tqdm_notebook
tqdm.pandas()

import pandas as pd

import sys
sys.path.append('scripts/')

# Read the OpenAI credential from a local text file.
# API_KEY stays a list because later cells index it as API_KEY[0], but every
# entry is stripped and blank lines are skipped: readlines() would keep the
# trailing "\n", which would then become part of the key string.
with open('API_KEY.txt') as f:
    API_KEY = [line.strip() for line in f if line.strip()]

About [preprocessing](https://github.com/MaartenGr/BERTopic/issues/40), in the words of Maarten Grootendorst, author of BERTopic:


_"In general, no, you do not need to preprocess your data. Like you said, keeping the original structure of the text is especially important for transformer-based models to understand the context._

_However, there are exceptions to this. For example, if you were to have scraped documents with a lot of html tags, then it might be beneficial to remove those as they do not provide any interesting context."_

In [2]:
from preprocess import filter_by_media
from preprocess import cluster_by_month
from preprocess import find_cities

# Take the first 1000 rows of the CSV, pass them through the project's
# filter_by_media and cluster_by_month helpers, then keep only the first
# occurrence of each article body and discard the 'comuna' column.
raw_news = pd.read_csv("data/loslagos-comunas.csv")[:1000]
df = (
    cluster_by_month(filter_by_media(raw_news))
    .drop_duplicates(subset='content', keep="first")
    .drop(columns=['comuna'])
)

# find_cities is always given str(content), so non-string cells are coerced first.
df['cities'] = df['content'].progress_apply(lambda text: find_cities(str(text)))

# Plain list of article bodies, in the same row order as df.
docs = df['content'].tolist()

print("number of news:", len(df))

100%|██████████████████████████████████████████████████████████████████████████████| 878/878 [00:00<00:00, 1003.50it/s]

number of news: 878





### "Document Clustering" with [BERTopic](https://github.com/MaartenGr/BERTopic) (+[SentenceTransformer](https://huggingface.co/sentence-transformers/all-mpnet-base-v2) +[Word Embeddings](https://github.com/dccuchile/spanish-word-embeddings)) & Topic Representation with [ChatGPT (gpt-3.5-turbo)](https://platform.openai.com/docs/models/gpt-3) + [Maximal Marginal Relevance](https://medium.com/tech-that-works/maximal-marginal-relevance-to-rerank-results-in-unsupervised-keyphrase-extraction-22d95015c7c5)

In [3]:
from gensim.models import KeyedVectors
from bertopic.backend import WordDocEmbedder
from sentence_transformers import SentenceTransformer

# Word vectors stored in binary word2vec format, plus a sentence-transformers
# model; both are handed to BERTopic's WordDocEmbedder backend.
# NOTE(review): the section header links to all-mpnet-base-v2, but the model
# loaded here is paraphrase-multilingual-MiniLM-L12-v2 - confirm which is intended.
WORD_VECTORS_PATH = "data/SBW-vectors-300-min5.bin.gz"
SENTENCE_MODEL_NAME = "paraphrase-multilingual-MiniLM-L12-v2"

ft = KeyedVectors.load_word2vec_format(WORD_VECTORS_PATH, binary=True)
embedding_model = SentenceTransformer(SENTENCE_MODEL_NAME)
word_doc_embedder = WordDocEmbedder(
    embedding_model=embedding_model,
    word_embedding_model=ft,
)

In [14]:
import openai
from bertopic import BERTopic
from bertopic.representation import OpenAI, MaximalMarginalRelevance
from sklearn.feature_extraction.text import CountVectorizer
from umap import UMAP
from hdbscan import HDBSCAN


# Token counting for the topic representation: 1- to 3-grams, no stop-word list.
vectorizer_model = CountVectorizer(
    ngram_range=(1, 3),
    stop_words=None,
)

# Dimensionality reduction; random_state is fixed so repeated runs use the same seed.
umap_model = UMAP(
    n_neighbors=15,
    n_components=5,
    min_dist=0.0,
    metric='cosine',
    random_state=42,
)

# Clustering of the reduced embeddings.
hdbscan_model = HDBSCAN(
    min_cluster_size=10,
    min_samples=5,
    metric='euclidean',
    cluster_selection_method='eom',
    prediction_data=True,
)

# API_KEY holds the lines of API_KEY.txt. Strip the first one so that a
# trailing newline from the file never becomes part of the key.
openai.api_key = API_KEY[0].strip()

# Prompt sent to the chat model for each topic. [DOCUMENTS] and [KEYWORDS] are
# placeholder tags inside the template; the model is asked to answer in Spanish
# using the "topic: <topic label>" format.
prompt = """I have a topic that contains the following documents: 
[DOCUMENTS]
The topic is described by the following keywords: [KEYWORDS]

Based on the information above, extract in spanish a short topic label in the following format:
topic: <topic label>
"""

# Chat-based labeller (gpt-3.5-turbo) with a 15 second delay setting.
# Non-chat alternative that was tried: OpenAI(prompt=prompt, delay_in_seconds=10)
openai_generator = OpenAI(
    model="gpt-3.5-turbo",
    prompt=prompt,
    delay_in_seconds=15,
    chat=True,
)

# Keyword diversification step, listed before the OpenAI labeller.
mmr = MaximalMarginalRelevance(diversity=0.3)
representation_models = [mmr, openai_generator]

topic_model = BERTopic(
    # Pipeline components configured in the earlier cells.
    embedding_model=word_doc_embedder,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    representation_model=representation_models,
    # Topic representation settings.
    n_gram_range=(1, 3),
    top_n_words=15,
    # nr_topics=50,  # prevent RateLimitErrors
    # NOTE(review): the saved get_params() output reports 'language': None, so
    # this value does not appear to be retained when embedding_model is given.
    language="multilingual",
    calculate_probabilities=True,
    verbose=True,
)

In [15]:
topic_model.get_params()

{'calculate_probabilities': True,
 'ctfidf_model': ClassTfidfTransformer(),
 'embedding_model': <bertopic.backend._word_doc.WordDocEmbedder at 0x1aa9dec90c8>,
 'hdbscan_model': HDBSCAN(min_cluster_size=10, min_samples=5, prediction_data=True),
 'language': None,
 'low_memory': False,
 'min_topic_size': 10,
 'n_gram_range': (1, 3),
 'nr_topics': None,
 'representation_model': [MaximalMarginalRelevance(diversity=0.3),
  OpenAI(chat=True, delay_in_seconds=15, model='gpt-3.5-turbo',
         prompt='I have a topic that contains the following documents: \n'
                '[DOCUMENTS]\n'
                'The topic is described by the following keywords: [KEYWORDS]\n'
                '\n'
                'Based on the information above, extract in spanish a short '
                'topic label in the following format:\n'
                'topic: <topic label>\n')],
 'seed_topic_list': None,
 'top_n_words': 15,
 'umap_model': UMAP(metric='cosine', min_dist=0.0, n_components=5, random_state=42

In [17]:
# Fit the topic model on the article bodies; `topics` holds one topic id per
# entry of `docs`, in the same order.
# NOTE: in the saved run the log shows embedding, dimensionality reduction and
# clustering finishing, after which this call stopped with an OpenAI APIError
# (HTTP 500), so the line below was not reached in that session.
topics, probs = topic_model.fit_transform(docs)

# Per-topic summary table; later cells read its `Topic` and `Name` columns.
clusters = topic_model.get_topic_info()

Batches:   0%|          | 0/28 [00:00<?, ?it/s]

2023-03-07 17:35:47,075 - BERTopic - Transformed documents to Embeddings
2023-03-07 17:35:49,524 - BERTopic - Reduced dimensionality
2023-03-07 17:35:49,580 - BERTopic - Clustered reduced embeddings


APIError: The server had an error processing your request. Sorry about that! You can retry your request, or contact us through our help center at help.openai.com if you keep seeing this error. (Please include the request ID b1e47ec718812e62653e4b32126d5453 in your email.) {
  "error": {
    "message": "The server had an error processing your request. Sorry about that! You can retry your request, or contact us through our help center at help.openai.com if you keep seeing this error. (Please include the request ID b1e47ec718812e62653e4b32126d5453 in your email.)",
    "type": "server_error",
    "param": null,
    "code": null
  }
}
 500 {'error': {'message': 'The server had an error processing your request. Sorry about that! You can retry your request, or contact us through our help center at help.openai.com if you keep seeing this error. (Please include the request ID b1e47ec718812e62653e4b32126d5453 in your email.)', 'type': 'server_error', 'param': None, 'code': None}} {'Date': 'Tue, 07 Mar 2023 20:36:10 GMT', 'Content-Type': 'application/json', 'Content-Length': '366', 'Connection': 'keep-alive', 'Access-Control-Allow-Origin': '*', 'Openai-Model': 'gpt-3.5-turbo-0301', 'Openai-Organization': 'user-s2u1conyucfknmjysvf1mezm', 'Openai-Processing-Ms': '844', 'Openai-Version': '2020-10-01', 'Strict-Transport-Security': 'max-age=15724800; includeSubDomains', 'X-Request-Id': 'b1e47ec718812e62653e4b32126d5453'}

We generate a dataframe with the obtained clusters and extract their most significant tokens.

In [None]:
# Store, for every row of the summary table, whatever get_topic returns for
# that row's topic id.
clusters['most_freq_tokens'] = clusters['Topic'].progress_apply(topic_model.get_topic)
clusters

In [None]:
topic_model.visualize_topics()

In [None]:
#clusters.to_csv('data/clusters.csv', index=False)

We label each news article with the cluster it was assigned to.

In [None]:
# Build a topic id -> label lookup from the generated labels.
# Each label is split at its first "_": the part before it is read as the topic
# id and the part after it is kept as the name. The original loop computed
# item.partition("_")[2] but discarded the result, so the prefix was never
# removed; it also indexed labels as labels[topic + 1], which is only correct
# when topic -1 is present.
# NOTE(review): assumes the default label format "<topic id>_<words>" - confirm
# if generate_topic_labels is ever called with other arguments.
topic_names = {}
for label in topic_model.generate_topic_labels():
    topic_id, _, name = label.partition("_")
    topic_names[int(topic_id)] = name

# `docs` was built as df.content.tolist() and `topics` has one entry per doc,
# so both columns can be assigned positionally in one step instead of
# searching the frame once per document.
assert len(topics) == len(df), "topics and df are out of sync; re-run the cells in order"
df['topic_name'] = [topic_names[topic] for topic in topics]
df['topic_number'] = list(topics)

df.head(4)

In [None]:
#df.to_csv('data/labeled_news.csv', index=False)

#### Evaluation: Coherence Score

There is no single way to determine whether a coherence score is good or bad. The score and its value depend on the data it is calculated from. For instance, a score of 0.5 might be good enough in one case but not acceptable in another. The only rule is that we want to **maximize** the score.

Usually, the coherence score will increase with the number of topics. This increase becomes smaller as the number of topics gets higher. The trade-off between the number of topics and the coherence score can be handled with the so-called elbow technique. The method implies plotting the coherence score as a function of the number of topics. We use the elbow of the curve to select the number of topics.

The idea behind this method is that we want to choose a point after which the diminishing increase of coherence score is no longer worth the additional increase of number of topics.

In [None]:
from coherence_eval import umass_npmi

# Project helper returning two values, unpacked here as the UMass and C_NPMI
# coherence scores (per the variable names - confirm against coherence_eval).
umass_coherence, c_npmi_coherence = umass_npmi(docs, topics, topic_model)

In [None]:
umass_coherence, c_npmi_coherence

#### Hierarchical clustering

In [None]:
from scipy.cluster import hierarchy as sch

# Hierarchical topics
def linkage_function(x):
    """Return the Ward linkage of `x`, computed with optimal leaf ordering."""
    return sch.linkage(x, 'ward', optimal_ordering=True)
# Build the topic hierarchy for the fitted model, using the custom linkage
# callable instead of the library default.
hierarchical_topics = topic_model.hierarchical_topics(docs, linkage_function=linkage_function)

In [None]:
# Limit how many columns, and how many characters per cell, pandas displays.
pd.set_option("display.max_columns", 20)
pd.set_option("display.max_colwidth", 50)
hierarchical_topics.head(4)

In [None]:
topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics)

#### Topics over time

In [None]:
# One timestamp per document, taken from df.date, then the per-topic
# representation over 20 time bins with both tuning options switched off.
timestamps = df["date"].tolist()
topics_over_time = topic_model.topics_over_time(
    docs=docs,
    timestamps=timestamps,
    nr_bins=20,
    global_tuning=False,
    evolution_tuning=False,
)

In [None]:
topics_over_time.head(4)

In [None]:
topic_model.visualize_topics_over_time(topics_over_time)

### 5 _most important_ keywords of documents using [KeyBERT](https://github.com/MaartenGr/KeyBERT) (+[Word Embeddings](https://github.com/dccuchile/spanish-word-embeddings))

In [None]:
from keyword_extraction import extract_ngram_keywords

# One keyword column per n-gram size: '2gram_keywords', then '3gram_keywords'.
for ngram_size in (2, 3):
    df[f'{ngram_size}gram_keywords'] = extract_ngram_keywords((ngram_size, ngram_size), ft, docs)

### Sentiment Analysis using [BETO](https://huggingface.co/finiteautomata/beto-sentiment-analysis?text=Te+quiero.+Te+amo.) + Sentiment Analysis/Emotion Analysis using [roBERTuito](https://huggingface.co/pysentimiento/robertuito-sentiment-analysis?text=Te+quiero.+Te+amo.)

In [None]:
# BETO
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
# Model id of the BETO sentiment classifier linked in the section header.
model_name = "finiteautomata/beto-sentiment-analysis"
# Load the sequence-classification model and the tokenizer for the same model id.
model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Wrap both in a "sentiment-analysis" pipeline; `nlp` is later passed to
# sentiment_analysis().
nlp = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)

In [None]:
# roBERTuito
from pysentimiento import create_analyzer
# pysentimiento analyzer for the "sentiment" task in Spanish (lang="es").
sentiment_analyzer = create_analyzer(task="sentiment", lang="es")

In [None]:
from sentiment_analysis import sentiment_analysis
# Run the project helper with both sentiment models. `df` is overwritten with
# its return value, so re-running this cell feeds the previous result back in.
df = sentiment_analysis(df, sentiment_analyzer, nlp)

### Manual classification of clusters according to areas of interest.

In [None]:
clusters

In [None]:
import ast
def tokens_to_list(text):
    """Return the tokens found in a wrapped string of ``(token, score)`` pairs.

    Args:
        text: String such as ``"{('salud', 0.4), ('hospital', 0.3)}"``. Its
            first and last characters are discarded before parsing.

    Returns:
        list: The first element of every pair, in order of first appearance
        and without duplicates. An empty list when nothing remains after the
        wrapper characters are removed.

    Raises:
        ValueError, SyntaxError: If the remaining content is not a valid
            Python literal.
    """
    inner = text[1:-1].strip()
    if not inner:
        # Nothing between the wrapper characters: no tokens.
        return []
    parsed = ast.literal_eval(inner)
    # A lone "(token, score)" parses as the pair itself rather than as a
    # sequence of pairs; wrap it so dict() receives exactly one key/value pair.
    if isinstance(parsed, tuple) and len(parsed) == 2 and isinstance(parsed[0], str):
        parsed = (parsed,)
    return list(dict(parsed).keys())


# Remove the outlier topic by id rather than by position: slicing off the first
# row would drop a real topic whenever no -1 row exists, and would drop one
# more row each time the cell is re-run.
# NOTE(review): assumes topic id -1 is the outlier bucket, as in BERTopic.
clusters = clusters[clusters['Topic'] != -1]

# Guarded so the cell can be executed again after 'Name' has become the index.
if 'Name' in clusters.columns:
    clusters = clusters.set_index('Name')

# Keep only the token of every (token, score) pair, de-duplicated in order, and
# store the list as a string so it can be searched with .str.contains below.
# Reading the pairs directly avoids the str() -> literal_eval round trip.
clusters['tokens'] = clusters['most_freq_tokens'].apply(
    lambda pairs: str(list(dict(pairs)))
)

In [None]:
# Topics whose token string matches the regex 'salud|cáncer' (case-sensitive).
health = clusters[clusters['tokens'].str.contains('salud|cáncer')]

In [None]:
# Gather the articles of every selected topic, topic by topic, and concatenate
# them once instead of re-copying the accumulated frame on each iteration.
health_frames = [df[df.topic_number == topic_id] for topic_id in health['Topic']]
# pd.concat rejects an empty list, so fall back to an empty frame in that case.
health_news = pd.concat(health_frames) if health_frames else pd.DataFrame()
health_news

In [None]:
# NOTE(review): leftover scratch cell. Its original body was the bare name `tex`
# followed by an expression reading `row['title']`. Neither `tex` nor `row` is
# defined in the visible notebook, so the cell raised NameError on a fresh
# "Restart & Run All". The expression is kept here, disabled, until it is wired
# to a real row (or the cell is deleted).
# TODO: bind `row` to an actual record before re-enabling:
# list(sentiment_analyzer.predict(row['title']).probas.keys())[0]