In [1]:
from tqdm import tqdm
from tqdm.notebook import tqdm_notebook
# Register tqdm's progress-aware pandas methods (e.g. `progress_apply`) before pandas is used.
tqdm.pandas()

import pandas as pd

In [2]:
from utils import cluster_by_month, filter_by_media
# from utils import preprocess

# Read the news CSV, run it through the project helpers `filter_by_media` and
# `cluster_by_month` (presumably outlet filtering and month bucketing -- confirm in utils),
# then keep only the first article for each distinct `content` value.
df = (
    cluster_by_month(filter_by_media(pd.read_csv("data/loslagos-comunas.csv")))
    .drop_duplicates(subset="content", keep="first")
)

# Tokenisation is left disabled because it is computationally expensive:
# df['tokens'] = df.content.progress_apply(lambda x: preprocess(str(x)))

# Per-column flag: does the column contain at least one missing value?
df.isna().any()

date               False
media_outlet       False
url                False
title              False
text               False
content            False
comuna              True
date_clustering    False
dtype: bool

In [4]:
# Number of articles left after deduplication, then a preview of the first five rows.
print(df.shape[0])
df.head()

26127


Unnamed: 0,date,media_outlet,url,title,text,content,comuna,date_clustering
0,2021-10-01,elheraldoaustral,https://www.eha.cl/noticia/local/reconocen-a-g...,Reconocen a guardaparques de la Región de Los ...,Distintos protagonistas de los parques naciona...,reconocen guardaparques región lagos actores c...,"['puyehue', 'chaiten']",2021-10
1,2021-10-01,elheraldoaustral,https://www.eha.cl/noticia/local/con-nuevos-ma...,Con nuevos materiales comienza plan piloto en ...,Centro de negocios Sercotec coordina acuerdos ...,nuevos materiales comienza plan piloto saltos ...,['puerto varas'],2021-10
2,2021-10-01,elheraldoaustral,https://www.eha.cl/noticia/local/centro-de-sal...,Centro de Salud Familiar CESFAM Puerto Varas i...,Las horas se solicitan en el SOME o bien a tra...,centro salud familiar cesfam puerto varas invi...,['puerto varas'],2021-10
3,2021-10-01,elheraldoaustral,https://www.eha.cl/noticia/local/alcalde-tomas...,Alcalde Tomás Gárate presidió por primera vez ...,Los y las consejeras destacaron el hecho de vo...,alcalde tomás gárate presidió primera vez octa...,"['castro', 'puerto varas']",2021-10
4,2021-10-01,elheraldoaustral,https://www.eha.cl/noticia/local/galeria-de-ar...,Galería de Arte Machacoya realizará remate de ...,"Hoy viernes a las 18:30 horas, en Machacoya At...",galería arte machacoya realizará remate obras ...,,2021-10


## 1. Modelado de tópicos con BERTopic

In [5]:
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
from gensim.models import KeyedVectors
from umap import UMAP
from hdbscan import HDBSCAN

# Pretrained word2vec vectors in binary format, later passed to BERTopic as its embedding model.
# NOTE(review): the file name suggests 300-dimensional vectors with min-count 5 -- confirm.
word2vec_path = "data/SBW-vectors-300-min5.bin.gz"
nlp2 = KeyedVectors.load_word2vec_format(word2vec_path, binary=True)

**Modelo utilizado (UChile):** https://github.com/dccuchile/spanish-word-embeddings

Información sobre el preprocesamiento en BERTopic: https://github.com/MaartenGr/BERTopic/issues/40

In [6]:
# Documents fed to the topic model: one string per row, in the same order as `df`.
docs = df["content"].tolist()

# Placeholder column; it is filled with topic labels once the model has been fitted.
df["topic"] = ""

In [7]:
# Topic representation: count unigrams up to trigrams, with no stop-word list applied here.
vectorizer_model = CountVectorizer(
    stop_words=None,
    ngram_range=(1, 3),
)

# Clustering of the reduced embeddings; `prediction_data=True` is kept because the
# topic model below is asked to calculate per-topic probabilities.
hdbscan_model = HDBSCAN(
    min_cluster_size=10,
    min_samples=5,
    metric='euclidean',
    cluster_selection_method='eom',
    prediction_data=True,
)

# Dimensionality reduction to 5 components; the fixed `random_state` pins UMAP's seed.
umap_model = UMAP(
    n_neighbors=15,
    n_components=5,
    min_dist=0.0,
    metric='cosine',
    random_state=42,
)

# NOTE(review): with an explicit `embedding_model`, BERTopic's `language` argument is
# presumably ignored -- confirm against the installed BERTopic version.
topic_model = BERTopic(
    language="multilingual",
    embedding_model=nlp2,
    vectorizer_model=vectorizer_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    calculate_probabilities=True,
    diversity=0.6,
    verbose=True,
)

# Fit on every document; `topics` holds one topic id per entry of `docs`.
topics, probs = topic_model.fit_transform(docs)

# Summary table (topic id, size, name) shown as the cell output.
topic_model.get_topic_info()

100%|██████████████████████████████████████████████████████████████████████████| 26127/26127 [00:11<00:00, 2349.86it/s]
2022-10-27 01:50:44,304 - BERTopic - Transformed documents to Embeddings
2022-10-27 01:51:26,187 - BERTopic - Reduced dimensionality
2022-10-27 01:57:05,435 - BERTopic - Clustered reduced embeddings


Unnamed: 0,Topic,Count,Name
0,-1,8831,-1_chile_personas_parte_país
1,0,1258,0_carabineros_robo_fiscal_años
2,1,319,1_samu_accidente tránsito_lugar_carabineros
3,2,226,2_cocaína_tráfico_detectives_sativa
4,3,203,3_blanco 324 of_bomberos132 carabineros133_bom...
...,...,...,...
507,506,10,506_horas_locales votación_servel_lentitud
508,507,10,507_sernapesca_caleta bay_calbuco_ejemplares
509,508,10,508_biodiversidad_pumalín_labor_conaf
510,509,10,509_quehui_sabingo_programa sabingo_chilevisión


In [8]:
#topic_model.save("out/save1", save_embedding_model=False)

In [10]:
from scipy.cluster import hierarchy as sch


def linkage_function(x):
    """Return a single-linkage hierarchy of `x` with optimally ordered leaves."""
    return sch.linkage(x, method='single', optimal_ordering=True)


# Hierarchical topics: merge the fitted topics with the linkage defined above.
hierarchical_topics = topic_model.hierarchical_topics(docs, linkage_function=linkage_function)

# Build the hierarchy figure and export it as a static image.
fig = topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics)
fig.write_image("img/htopics.png")

100%|████████████████████████████████████████████████████████████████████████████████| 510/510 [02:17<00:00,  3.70it/s]


In [11]:
# Display the topic-hierarchy figure created in the previous cell as this cell's output.
fig

<img src="https://github.com/rickiwasho/proyecto-titulo/blob/main/img/htopics.png?raw=true">

In [None]:
#topic_model = BERTopic.load("out/save1", embedding_model=nlp2)

In [None]:
# Attach a human-readable topic label to every article.
#
# `docs` was built from `df.content` in row order and `topics` holds one topic id per
# entry of `docs`, so the i-th topic belongs to the i-th row of `df`. The labels can
# therefore be written positionally in one pass. The previous loop searched the whole
# `content` column for every document (quadratic in the number of rows) and passed an
# Index to `df.at`, which is documented for scalar labels only.
labels = topic_model.generate_topic_labels()

if len(topics) != len(df):
    raise ValueError(
        f"expected one topic per row of df, got {len(topics)} topics for {len(df)} rows"
    )

# NOTE(review): the `+ 1` offset assumes `labels` is ordered by topic id and starts with
# the outlier topic -1 (the fitted model above does contain topic -1) -- confirm if the
# model is refitted with different settings.
df['topic'] = [labels[topic_id + 1] for topic_id in topics]

In [None]:
# Articles whose assigned label is the one stored at position 75 of `labels`.
df[df['topic'].eq(labels[75])]

## 2. Análisis de sentimiento 

In [None]:
#!pip install pysentimiento

In [None]:
# Work on a copy so the sentiment columns do not touch `df`.
sub = df.copy()

# Empty placeholders, filled row by row by the sentiment loop further down.
for column in (
    'title_sentiment_roBERTuito',
    'title_emotion_roBERTuito',
    'title_sentiment_BETO',
    'text_sentiment_BETO',
):
    sub[column] = ""

In [None]:
# roBERTuito
from pysentimiento import create_analyzer

# roBERTuito: one Spanish analyzer per task, created in the order sentiment -> emotion.
sentiment_analyzer, emotion_analyzer = (
    create_analyzer(task=task_name, lang="es") for task_name in ("sentiment", "emotion")
)

In [None]:
# BETO
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline

# BETO: load the tokenizer and classification head from the same checkpoint and wrap
# them in a sentiment-analysis pipeline (bound to `nlp`, used by the sentiment loop).
model_name = "finiteautomata/beto-sentiment-analysis"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
nlp = pipeline(task="sentiment-analysis", model=model, tokenizer=tokenizer)

In [None]:
from collections import Counter


def _count_sentence_labels(text, classify):
    """Tally the sentiment label `classify` assigns to each sentence of `text`.

    The text is split on "." and every non-blank fragment is classified on its own.
    Blank fragments are skipped: splitting always yields one after a trailing period
    (and between consecutive periods), and classifying them would add spurious
    votes to the tally.

    Args:
        text: Article body as a single string.
        classify: Callable returning a list whose first item is a mapping with a
            'label' key (the shape the `nlp` pipeline is indexed with in this notebook).

    Returns:
        dict with the keys "NEU", "NEG" and "POS" mapped to the number of fragments
        that received each label (0 when a label never occurs).
    """
    label_counts = Counter(
        classify(fragment)[0].get('label')
        for fragment in text.split(".")
        if fragment.strip()
    )
    return {"NEU": label_counts["NEU"], "NEG": label_counts["NEG"], "POS": label_counts["POS"]}


for index, row in tqdm(sub.iterrows(), desc='sub rows - sentiment', total=sub.shape[0]):
    # Headline analysis: raw outputs of both roBERTuito analyzers and of the BETO pipeline.
    title = row['title']
    sub.at[index, "title_sentiment_roBERTuito"] = sentiment_analyzer.predict(title)
    sub.at[index, "title_emotion_roBERTuito"] = emotion_analyzer.predict(title)
    sub.at[index, 'title_sentiment_BETO'] = nlp(title)

    # Body analysis: per-sentence BETO labels aggregated into NEU/NEG/POS counts.
    sub.at[index, "text_sentiment_BETO"] = _count_sentence_labels(row['text'], nlp)

In [None]:
# Lift pandas' display limits so no column is hidden and no cell text is truncated.
pd.set_option("display.max_columns", None)
pd.set_option('display.max_colwidth', None)

# Headline next to every sentiment/emotion result computed for it.
sub.loc[
    :,
    [
        'title',
        'title_sentiment_roBERTuito',
        'title_emotion_roBERTuito',
        'title_sentiment_BETO',
        "text_sentiment_BETO",
    ],
]