In [1]:
from tqdm import tqdm
from tqdm.notebook import tqdm_notebook
tqdm.pandas()

import pandas as pd


In [None]:
from utils import filter_by_media
from utils import cluster_by_month
from utils import preprocess


df = pd.read_csv("data/loslagos-comunas.csv")[:100]
df = cluster_by_month(filter_by_media(df))
df['tokens'] =  df.content.progress_apply(lambda x: preprocess(str(x)))
df.isna().any()

## 1. Modelado de tópicos con BERTopic

In [16]:
docs = df.content.tolist()

In [None]:
from sentence_transformers import SentenceTransformer
from umap import UMAP
from hdbscan import HDBSCAN
from sklearn.feature_extraction.text import CountVectorizer
from bertopic import BERTopic
from bertopic.vectorizers import ClassTfidfTransformer

In [20]:
# Step 1 - Extract embeddings
embedding_model = SentenceTransformer("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")

# Step 2 - Reduce dimensionality
umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine')

# Step 3 - Cluster reduced embeddings
hdbscan_model = HDBSCAN(min_cluster_size=15, metric='euclidean', cluster_selection_method='eom', prediction_data=True)

# Step 4 - Tokenize topics
vectorizer_model = CountVectorizer(analyzer = 'word',
                                   tokenizer = tokenize,
                                   lowercase = True,
                                   stop_words = spanish_stopwords)

# Step 5 - Create topic representation
ctfidf_model = ClassTfidfTransformer()

# All steps together
topic_model = BERTopic(
  embedding_model=embedding_model,    # Step 1 - Extract embeddings
  umap_model=umap_model,              # Step 2 - Reduce dimensionality
  hdbscan_model=hdbscan_model,        # Step 3 - Cluster reduced embeddings
  vectorizer_model=vectorizer_model,  # Step 4 - Tokenize topics
  ctfidf_model=ctfidf_model,          # Step 5 - Extract topic words
  diversity=0.5                       # Step 6 - Diversify topic words
)

topics, probs = topic_model.fit_transform(docs)

ModuleNotFoundError: No module named 'bertopic.vectorizers'

## 2. Análisis de sentimiento 

In [None]:
#!pip install pysentimiento

In [5]:
sub = df.copy()
sub['title_sentiment_roBERTuito'] = ""
sub['title_emotion_roBERTuito'] = ""
sub['title_sentiment_BETO'] = ""
sub['text_sentiment_BETO'] = ""

In [None]:
# roBERTuito
from pysentimiento import create_analyzer
sentiment_analyzer = create_analyzer(task="sentiment", lang="es")
emotion_analyzer = create_analyzer(task="emotion", lang="es")

In [None]:
# BETO
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
model_name = "finiteautomata/beto-sentiment-analysis"
model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
nlp = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)

In [14]:
for index, row in tqdm(sub.iterrows(), desc='sub rows - sentiment', total=sub.shape[0]):
    # análisis del título de la noticia
    sub.at[index, "title_sentiment_roBERTuito"] = sentiment_analyzer.predict(row['title'])
    sub.at[index, "title_emotion_roBERTuito"] = emotion_analyzer.predict(row['title'])
    sub.at[index, 'title_sentiment_BETO'] = nlp(row['title'])
    
    # análisis del cuerpo de la noticia
    count_neutral = 0
    count_negative = 0
    count_positive = 0
    partition = row['text'].split(".")
    for text in partition:
        # Analizamos su sentimiento
        sentiment_value = nlp(text)
        if sentiment_value[0].get('label') == "NEU": count_neutral=count_neutral+1
        if sentiment_value[0].get('label') == "NEG": count_negative=count_negative+1
        if sentiment_value[0].get('label') == "POS": count_positive=count_positive+1
            
    sub.at[index, "text_sentiment_BETO"] = {"NEU": count_neutral, "NEG": count_negative, "POS": count_positive}

sub rows - sentiment: 100%|██████████████████████████████████████████████████████████| 100/100 [01:55<00:00,  1.15s/it]


In [15]:
pd.set_option("display.max_rows", None, "display.max_columns", None, 'display.max_colwidth', None)
sub[['title','title_sentiment_roBERTuito', 'title_emotion_roBERTuito','title_sentiment_BETO',"text_sentiment_BETO"]]

Unnamed: 0,title,title_sentiment_roBERTuito,title_emotion_roBERTuito,title_sentiment_BETO,text_sentiment_BETO
0,Reconocen a guardaparques de la Región de Los Lagos como actores claves en la conservación,"AnalyzerOutput(output=NEU, probas={NEU: 0.627, POS: 0.355, NEG: 0.018})","AnalyzerOutput(output=others, probas={others: 0.855, joy: 0.103, surprise: 0.014, sadness: 0.009, anger: 0.009, disgust: 0.005, fear: 0.005})","[{'label': 'POS', 'score': 0.9539938569068909}]","{'NEU': 13, 'NEG': 0, 'POS': 14}"
1,Con nuevos materiales comienza plan piloto en Saltos del Petrohué,"AnalyzerOutput(output=NEU, probas={NEU: 0.853, POS: 0.136, NEG: 0.010})","AnalyzerOutput(output=others, probas={others: 0.951, joy: 0.021, surprise: 0.013, fear: 0.005, sadness: 0.004, anger: 0.003, disgust: 0.002})","[{'label': 'NEU', 'score': 0.9277083277702332}]","{'NEU': 3, 'NEG': 0, 'POS': 6}"
2,Centro de Salud Familiar CESFAM Puerto Varas invita a prevenir el Cáncer Cervicouterino con extensión horaria para exámenes PAP,"AnalyzerOutput(output=NEU, probas={NEU: 0.954, POS: 0.026, NEG: 0.020})","AnalyzerOutput(output=others, probas={others: 0.940, joy: 0.030, anger: 0.008, surprise: 0.008, sadness: 0.007, fear: 0.004, disgust: 0.003})","[{'label': 'NEU', 'score': 0.9948523640632629}]","{'NEU': 14, 'NEG': 0, 'POS': 0}"
3,Alcalde Tomás Gárate presidió por primera vez la octava sesión del Consejo Comunal de la Sociedad Civil COSOC Puerto Varas,"AnalyzerOutput(output=NEU, probas={NEU: 0.916, POS: 0.052, NEG: 0.032})","AnalyzerOutput(output=others, probas={others: 0.933, joy: 0.048, surprise: 0.008, anger: 0.004, sadness: 0.003, fear: 0.002, disgust: 0.002})","[{'label': 'NEU', 'score': 0.9440039396286011}]","{'NEU': 8, 'NEG': 0, 'POS': 0}"
4,Galería de Arte Machacoya realizará remate de obras de artistas de la zona,"AnalyzerOutput(output=NEU, probas={NEU: 0.831, POS: 0.158, NEG: 0.011})","AnalyzerOutput(output=others, probas={others: 0.940, joy: 0.034, sadness: 0.009, surprise: 0.008, fear: 0.004, anger: 0.004, disgust: 0.002})","[{'label': 'NEU', 'score': 0.9955322742462158}]","{'NEU': 3, 'NEG': 0, 'POS': 4}"
5,Municipio llamará a licitación construcción de pasarela peatonal en Los Notros,"AnalyzerOutput(output=NEU, probas={NEU: 0.947, POS: 0.037, NEG: 0.016})","AnalyzerOutput(output=others, probas={others: 0.948, joy: 0.017, surprise: 0.016, sadness: 0.006, fear: 0.005, anger: 0.005, disgust: 0.003})","[{'label': 'NEU', 'score': 0.9952103495597839}]","{'NEU': 5, 'NEG': 0, 'POS': 1}"
6,3era Mesa de Reactivación Económica circunscribe su radio de acción,"AnalyzerOutput(output=NEU, probas={NEU: 0.978, NEG: 0.012, POS: 0.010})","AnalyzerOutput(output=others, probas={others: 0.979, surprise: 0.008, joy: 0.007, fear: 0.002, sadness: 0.001, anger: 0.001, disgust: 0.001})","[{'label': 'NEU', 'score': 0.9724546670913696}]","{'NEU': 12, 'NEG': 0, 'POS': 1}"
7,Gremio médico rechaza cierre de camas críticas implementadas en Hospital Ancud,"AnalyzerOutput(output=NEU, probas={NEU: 0.878, NEG: 0.113, POS: 0.009})","AnalyzerOutput(output=others, probas={others: 0.949, sadness: 0.016, surprise: 0.015, anger: 0.007, fear: 0.005, disgust: 0.004, joy: 0.004})","[{'label': 'NEU', 'score': 0.9739839434623718}]","{'NEU': 2, 'NEG': 2, 'POS': 1}"
8,Asistentes de educación municipal en Osorno verán incrementadas sus remuneraciones,"AnalyzerOutput(output=NEU, probas={NEU: 0.929, NEG: 0.062, POS: 0.009})","AnalyzerOutput(output=others, probas={others: 0.954, surprise: 0.014, sadness: 0.013, anger: 0.007, fear: 0.005, joy: 0.005, disgust: 0.003})","[{'label': 'NEU', 'score': 0.9927809834480286}]","{'NEU': 4, 'NEG': 0, 'POS': 4}"
9,CONADI llama a renovar directivas de comunidades indígenas,"AnalyzerOutput(output=NEU, probas={NEU: 0.916, NEG: 0.060, POS: 0.024})","AnalyzerOutput(output=others, probas={others: 0.971, surprise: 0.007, joy: 0.006, anger: 0.005, sadness: 0.005, fear: 0.003, disgust: 0.002})","[{'label': 'NEU', 'score': 0.9907516241073608}]","{'NEU': 11, 'NEG': 0, 'POS': 0}"
