# Código para la lematización, modelado de tópicos y visualización de datos

### Importes Necesarios

In [1]:
#Numpy and Pandas (necesarios para Spacy)
import numpy as np
import pandas as pd

#gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
from gensim.models import LdaModel
from gensim.corpora import Dictionary

#NLTK
from nltk.corpus import stopwords

#vis
import pyLDAvis
import pyLDAvis.gensim

#spacy y stanza
import stanza
import spacy_stanza
nlp = spacy_stanza.load_pipeline("es", disable=['parser','ner'])

#mysql
import pymysql as sql
import pymysql.cursors

#extras
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

  from ._conv import register_converters as _register_converters
2024-02-10 11:48:08 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


HBox(children=(HTML(value='Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/res…




2024-02-10 11:48:11 INFO: Loading these models for language: es (Spanish):
| Processor    | Package         |
----------------------------------
| tokenize     | ancora          |
| mwt          | ancora          |
| pos          | ancora_charlm   |
| lemma        | ancora_nocharlm |
| constituency | combined_charlm |
| depparse     | ancora_charlm   |
| sentiment    | tass2020        |
| ner          | conll02         |

2024-02-10 11:48:11 INFO: Using device: cpu
2024-02-10 11:48:11 INFO: Loading: tokenize
  if V(fastcache.__version__) < '0.4.0':
  other = LooseVersion(other)
2024-02-10 11:48:14 INFO: Loading: mwt
2024-02-10 11:48:14 INFO: Loading: pos
2024-02-10 11:48:14 INFO: Loading: lemma
2024-02-10 11:48:14 INFO: Loading: constituency
2024-02-10 11:48:15 INFO: Loading: depparse
2024-02-10 11:48:16 INFO: Loading: sentiment
2024-02-10 11:48:16 INFO: Loading: ner
2024-02-10 11:48:17 INFO: Done loading processors!


## Conexión MySQL

In [2]:
#Conectar a la base de datos en MySQL local
conn = sql.connect(
    host="localhost",
    port=3306,
    user="root",
    passwd="Password1234!",
    db="siglodb_2002-2022")
cursor = conn.cursor()
cursorclass = pymysql.cursors.DictCursor
query = "SELECT titular FROM `siglodb_2002-2022`.siglodb2022 WHERE seccion = 'Internacional';"
cursor.execute(query)
data = cursor.fetchall()
cursor.close()
conn.close()

titulares = [row[0] for row in data]


## Preparación de los datos

In [3]:
#Limpia de manera general los textos (puntuación, minúsculas, etc.)
def gen_words(texts):
    final = []
    for text in texts:
        new = gensim.utils.simple_preprocess(text, deacc=True)
        final.append(new)
    return (final)

data_words = gen_words(titulares)

In [4]:
# Remueve las palabras de uso común
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]


stop_words = stopwords.words('spanish')
stop_words.extend(['cada', 'mil', 'tras'])
data_words_nostops = remove_stopwords(data_words)

In [None]:
# # Reconocimiento de entidades nombradas
# def ner(texts):
#     lista = []
#     for text in texts:
#         ner_doc = nlp(" ".join(text)) 
#         lista.append([{ent.text} for token in ner_doc])
#     return lista

# named_entities = ner(data_words)

# print (named_entities)

In [None]:
#Lematización de los textos limpios
def lemmatization(texts):
    texts_out = []
    i = 0
    for text in texts:
        doc = nlp(" ".join(text)) 
        texts_out.append([token.lemma_ for token in doc])
        print (i)
        i = i+1
    return texts_out

data_lemmatized = lemmatization(data_words_nostops)

In [6]:
# Genera el modelo y ejecuta la función para la creación de bigramas y trigramas
bigram = gensim.models.Phrases(data_lemmatized, min_count=5, threshold=50)
trigram = gensim.models.Phrases(bigram[data_lemmatized], threshold=50)  
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

data_words_bigrams = make_bigrams(data_lemmatized)
data_words_trigrams = make_trigrams(data_words_bigrams)

In [7]:
#Remueve términos frecuentes y crea BoW
from gensim.models import TfidfModel

id2word = corpora.Dictionary(data_words_trigrams)

texts = data_words_trigrams

corpus = [id2word.doc2bow(text) for text in texts]

tfidf = TfidfModel(corpus, id2word=id2word)

low_value = 0.03
words =[]
words_missing_in_tfidf =[]
for i in range(0, len(corpus)):
    bow = corpus[i]
    low_value_words = []
    tfidf_ids = [id for id, value in tfidf[bow]]
    bow_ids = [id for id, value in bow]
    low_value_words = [id for id, value in tfidf[bow] if value < low_value]
    drops = low_value_words+words_missing_in_tfidf
    for item in drops:
        words.append(id2word[item])
    words_missing_in_tfidf = [id for id in bow_ids if id not in tfidf_ids]

    new_bow = [b for b in bow if b[0] not in low_value_words and b[0] not in words_missing_in_tfidf]
    corpus[i] = new_bow


## Modelo LDA

In [8]:
#Genera el modelo de modelado de tópicos
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=20,
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha="auto")

## Visualización

In [9]:
#Visualización de los datos
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word, mds="mmds", R=30)
vis


  default_term_info = default_term_info.sort_values(
