# Procesamiento del Lenguaje Natural

Rodrigo S. Cortez Madrigal

<img src="https://pcic.posgrado.unam.mx/wp-content/uploads/Ciencia-e-Ingenieria-de-la-Computacion_color.png" alt="Logo PCIC" width="128" />  

T15 - Actividad Sumativa 1

In [80]:
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from collections import defaultdict
from string import punctuation
from heapq import nlargest
import markdown
from IPython.display import display, HTML
import nltk

In [81]:
# Make sure the NLTK data packages used by this notebook are available
# locally: the stopword corpus and the 'punkt_tab' package (presumably the
# data behind sent_tokenize / word_tokenize -- confirm against NLTK docs).
for resource in ('stopwords', 'punkt_tab'):
    nltk.download(resource)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/roicort/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/roicort/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [82]:

class FrequencySummarizer:
    """Extractive summarizer that ranks sentences by word frequency.

    Every sentence is scored with the sum of the normalized frequencies of
    the words it contains; the best-scoring sentences form the summary.
    """

    def __init__(self, min_cut=0.1, max_cut=0.9):
        """Initialize the text summarizer.

        Words whose normalized frequency is lower than or equal to
        ``min_cut``, or higher than or equal to ``max_cut``, are ignored.
        """
        self._min_cut = min_cut
        self._max_cut = max_cut
        # Spanish stopwords plus every single punctuation character.
        self._stopwords = set(stopwords.words('spanish') + list(punctuation))

    def _compute_frequencies(self, word_sent):
        """Compute the normalized frequency of each word.

        Args:
            word_sent: a list of sentences, each already tokenized into a
                list of words.

        Returns:
            A dictionary where ``freq[w]`` is the count of ``w`` divided by
            the count of the most frequent word. Stopwords and words outside
            the open interval ``(min_cut, max_cut)`` are left out. The
            dictionary is empty when no countable word is present.
        """
        freq = defaultdict(int)
        for sentence in word_sent:
            for word in sentence:
                if word not in self._stopwords:
                    freq[word] += 1

        # Nothing to normalize: max() would raise ValueError on an empty dict.
        if not freq:
            return freq

        # Normalize against the most frequent word, then drop the words that
        # are too common or too rare to be informative.
        highest = float(max(freq.values()))
        for word in list(freq):
            freq[word] = freq[word] / highest
            if freq[word] >= self._max_cut or freq[word] <= self._min_cut:
                del freq[word]
        return freq

    def summarize(self, text, n):
        """Return a list of ``n`` sentences which represent the summary of text.

        Sentences are returned in ranking order (best first). Sentences that
        contain no scored word keep a score of zero and are only picked, in
        document order, when fewer than ``n`` sentences scored above zero.

        Raises:
            ValueError: if ``n`` is larger than the number of sentences
                found in ``text``.
        """
        sents = sent_tokenize(text)
        # Explicit check instead of ``assert``, which is removed under -O.
        if n > len(sents):
            raise ValueError(
                'cannot extract %d sentences from a text with only %d'
                % (n, len(sents))
            )
        word_sent = [word_tokenize(s.lower()) for s in sents]
        self._freq = self._compute_frequencies(word_sent)

        # Seed every sentence with a zero score so that exactly n sentences
        # can always be selected, even if few of them contain scored words.
        ranking = dict.fromkeys(range(len(sents)), 0)
        for i, sent in enumerate(word_sent):
            for w in sent:
                if w in self._freq:
                    ranking[i] += self._freq[w]

        sents_idx = self._rank(ranking, n)
        return [sents[j] for j in sents_idx]

    def _rank(self, ranking, n):
        """Return the keys of the ``n`` highest-scoring entries of ``ranking``."""
        return nlargest(n, ranking, key=ranking.get)

In [83]:
import urllib
from bs4 import BeautifulSoup

def get_only_text(url):
    """Return the title and the text of the article at the specified url.

    The page is downloaded, decoded as UTF-8 and parsed as HTML. The text
    is the content of every ``<p>`` element joined with single spaces.

    Returns:
        A ``(title, text)`` tuple. ``title`` is the empty string when the
        page has no ``<title>`` element.
    """
    # Import the submodule explicitly: a bare ``import urllib`` does not
    # guarantee that ``urllib.request`` is available as an attribute.
    import urllib.request

    # The context manager closes the HTTP response once the body is read.
    with urllib.request.urlopen(url) as response:
        page = response.read().decode('utf8')

    # Name the parser explicitly so the result does not depend on whichever
    # parser bs4 happens to find; 'lxml' is the one bs4 prefers when it is
    # installed, and this file already relies on lxml ('lxml-xml').
    soup = BeautifulSoup(page, 'lxml')
    text = ' '.join(p.text for p in soup.find_all('p'))
    # soup.title is None for pages without a <title> element.
    title = soup.title.text if soup.title is not None else ''
    return title, text

In [84]:
import urllib.request
from bs4 import BeautifulSoup

# Download the El País feed; the context manager closes the HTTP response
# as soon as the body has been read.
with urllib.request.urlopen('https://feeds.elpais.com/mrss-s/pages/ep/site/elpais.com/section/mexico/portada') as response:
    feed_xml = response.read()
feed = BeautifulSoup(feed_xml.decode('utf8'), 'lxml-xml')

# Collect the text of every <guid> element. A list (rather than a lazy
# ``map`` object) can be consumed more than once, so cells that read
# ``to_summarize`` stay re-runnable.
to_summarize = [guid.text for guid in feed.find_all('guid')]

In [85]:
# Snapshot the feed entries into a list so they can be sliced and reused.
articles = [*to_summarize]
# Preview the first five entries as the cell output.
articles[:5]

['https://elpais.com/internacional/2025-05-19/trump-y-putin-hablan-durante-dos-horas-para-revitalizar-las-negociaciones-de-paz-sobre-ucrania.html',
 'https://elpais.com/us/2025-05-19/estados-unidos-abre-una-nueva-y-contradictoria-etapa-en-su-lucha-contra-el-narcotrafico-mexicano.html',
 'https://elpais.com/mexico/2025-05-19/los-maestros-de-la-cnte-aumentan-la-presion-contra-el-gobierno-y-toman-las-principales-casetas-de-entrada-a-la-capital.html',
 'https://elpais.com/mexico/2025-05-19/sheinbaum-afirma-que-su-gobierno-no-impulsa-por-el-momento-la-reforma-sobre-las-fiscalias.html',
 'https://elpais.com/mexico/2025-05-19/al-menos-ocho-muertos-y-mas-de-una-decena-de-heridos-la-huella-del-caos-tras-el-acamoto-2025-el-festival-motoquero-en-acapulco.html']

In [88]:
# Build a Markdown report with a two-sentence summary of the first five
# articles, then save it to disk.
fs = FrequencySummarizer()
output_lines = ['# Resumenes de El País\n']

for article_url in articles[:5]:
    title, text = get_only_text(article_url)
    # The article title becomes a second-level heading.
    output_lines.append('## ' + title.strip())
    # Each summary sentence becomes one bullet point.
    summary = fs.summarize(text, 2)
    output_lines.extend('- ' + s.strip() for s in summary)
    # Separator line between consecutive articles.
    output_lines.append('\n')

# Write the report to a Markdown file.
with open('resumenes.md', 'w', encoding='utf-8') as f:
    report_body = '\n'.join(output_lines)
    f.write(report_body)

In [89]:
# Read the generated Markdown report back from disk.
with open('resumenes.md', 'r', encoding='utf-8') as f:
    markdown_text = f.read()

# Convert the Markdown to HTML and render it inline in the notebook.
html = markdown.markdown(markdown_text)
rendered = HTML(html)
display(rendered)