In [None]:
from tqdm import tqdm
from tqdm.notebook import tqdm_notebook
# Enable tqdm's pandas integration (the `progress_apply` family of methods).
tqdm.pandas()

import pandas as pd

In [None]:
from utils import cluster_by_month, filter_by_media
# from utils import preprocess

# Read the CSV and pass it through the two local `utils` helpers, in order:
# first `filter_by_media`, then `cluster_by_month`.
df = (
    pd.read_csv("data/loslagos-comunas.csv")
    .pipe(filter_by_media)
    .pipe(cluster_by_month)
)

# Disabled tokenisation step, kept for reference:
# df['tokens'] = df.content.progress_apply(lambda x: preprocess(str(x)))

# Per-column flag: True where the column holds at least one missing value.
df.isna().any()

In [None]:
# Number of rows in the prepared frame, followed by a preview of the first five.
print(df.shape[0])
df.head()

## 1. Modelado de tópicos con BERTopic

In [None]:
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
from gensim.models import KeyedVectors

# Load the word vectors stored in binary word2vec format; `nlp2` is later
# handed to BERTopic as its embedding model.
nlp2 = KeyedVectors.load_word2vec_format(
    fname="data/SBW-vectors-300-min5.bin.gz",
    binary=True,
)

#### Modelo de embeddings elaborado en la Universidad de Chile (UCH): https://github.com/dccuchile/spanish-word-embeddings

Información sobre el preprocesamiento en BERTopic: https://github.com/MaartenGr/BERTopic/issues/40

In [None]:
# Documents to model: the `content` column as a plain list, in row order.
docs = df["content"].tolist()

# Empty placeholder for the topic label of each row.
df["topic"] = ""

In [None]:
# Term counts over unigrams, bigrams and trigrams, with no stop-word list.
vectorizer_model = CountVectorizer(stop_words=None, ngram_range=(1, 3))

# NOTE(review): with an explicit `embedding_model`, the `language` setting may
# have no effect — confirm in the BERTopic documentation.
bertopic_options = dict(
    language="multilingual",
    embedding_model=nlp2,
    vectorizer_model=vectorizer_model,
    calculate_probabilities=True,
    verbose=True,
)
topic_model = BERTopic(**bertopic_options)

# Fit on the documents and keep the per-document topic ids and probabilities.
topics, probs = topic_model.fit_transform(docs)

# Summary table of the fitted topics (displayed as the cell output).
topic_model.get_topic_info()

In [None]:
# Attach a human-readable topic label to every row of `df`.
#
# `topics` comes from `fit_transform(docs)` and `docs` was built from
# `df.content`, so `topics[i]` belongs to the i-th row of `df`. Assigning by
# position avoids re-scanning the whole `content` column once per document,
# avoids passing a non-scalar row key to the scalar accessor `.at`, and keeps
# rows with identical text from overwriting each other's topic.
labels = topic_model.generate_topic_labels()

# Pair every label with its topic id instead of assuming a fixed `+1` offset,
# which is only correct when the outlier topic -1 is present.
# NOTE(review): relies on generate_topic_labels() returning one label per
# fitted topic in ascending topic-id order — confirm for the installed
# BERTopic version.
topic_ids = sorted(set(topics))
if len(labels) != len(topic_ids):
    raise ValueError(
        f"Expected one label per topic, got {len(labels)} labels "
        f"for {len(topic_ids)} topics."
    )
label_by_topic = dict(zip(topic_ids, labels))

# Guard against stale state: `df` must still be the frame `docs` was built from.
if len(topics) != len(df):
    raise ValueError(
        f"`topics` has {len(topics)} entries but `df` has {len(df)} rows; "
        "re-run the cells that build `docs` and fit the model."
    )

df['topic'] = [label_by_topic[topic_id] for topic_id in topics]

In [None]:
# Rows whose topic label equals the entry at position 75 of `labels`.
df[df['topic'].eq(labels[75])]

## 2. Análisis de sentimiento 

In [None]:
#!pip install pysentimiento

In [None]:
# Work on a copy of `df` and add the empty result columns that the sentiment
# loop fills in row by row.
sub = df.copy().assign(
    **dict.fromkeys(
        (
            'title_sentiment_roBERTuito',
            'title_emotion_roBERTuito',
            'title_sentiment_BETO',
            'text_sentiment_BETO',
        ),
        "",
    )
)

In [None]:
# roBERTuito
from pysentimiento import create_analyzer

# One Spanish analyzer per task, created in the order sentiment -> emotion.
sentiment_analyzer, emotion_analyzer = (
    create_analyzer(task=task_name, lang="es")
    for task_name in ("sentiment", "emotion")
)

In [None]:
# BETO
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline

# The same checkpoint supplies both the tokenizer and the classification head.
model_name = "finiteautomata/beto-sentiment-analysis"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# `nlp` is the callable used by the sentiment loop for titles and body fragments.
nlp = pipeline(task="sentiment-analysis", tokenizer=tokenizer, model=model)

In [None]:
def _count_sentence_sentiments(text, classifier):
    """Tally the classifier's labels over the period-separated fragments of `text`.

    Parameters
    ----------
    text : the body to analyse. Anything that is not a `str` (for example a
        missing value) is treated as having no fragments.
    classifier : callable that takes one string and returns a sequence whose
        first element is a mapping with a 'label' key (how `nlp` is used here).

    Returns
    -------
    dict with the keys "NEU", "NEG" and "POS" mapping to fragment counts.
    Labels other than those three are ignored.
    """
    tally = {"NEU": 0, "NEG": 0, "POS": 0}
    if not isinstance(text, str):
        return tally

    for fragment in text.split("."):
        # Splitting on "." leaves empty or whitespace-only pieces (after the
        # final period, inside "..."); classifying them would add counts for
        # text that does not exist.
        if not fragment.strip():
            continue
        label = classifier(fragment)[0].get('label')
        if label in tally:
            tally[label] += 1
    return tally


for index, row in tqdm(sub.iterrows(), desc='sub rows - sentiment', total=sub.shape[0]):
    # Title-level analysis of the news item.
    title = row['title']
    sub.at[index, "title_sentiment_roBERTuito"] = sentiment_analyzer.predict(title)
    sub.at[index, "title_emotion_roBERTuito"] = emotion_analyzer.predict(title)
    sub.at[index, 'title_sentiment_BETO'] = nlp(title)

    # Body-level analysis: label counts over the fragments of the article text.
    sub.at[index, "text_sentiment_BETO"] = _count_sentence_sentiments(row['text'], nlp)

In [None]:
# Show every column and never truncate cell contents when rendering frames.
pd.set_option("display.max_columns", None)
pd.set_option("display.max_colwidth", None)

# Titles side by side with the four sentiment/emotion result columns.
sub[
    [
        'title',
        'title_sentiment_roBERTuito',
        'title_emotion_roBERTuito',
        'title_sentiment_BETO',
        "text_sentiment_BETO",
    ]
]