**6) Przetestuj dowolny algorytm streszczania** (działający dla języka polskiego)

Autor: Slawomir Gorawski

Ref.:
Rada Mihalcea and Paul Tarau. (2004). _TextRank: Bringing Order into Texts._
Retrieved from https://web.eecs.umich.edu/~mihalcea/papers/mihalcea.emnlp04.pdf

In [15]:
import re
import csv
import logging
from pathlib import Path

import nltk
nltk.download('punkt')
from nltk.cluster.util import cosine_distance
import numpy as np
import networkx as nx
tokenizer = nltk.data.load('tokenizers/punkt/polish.pickle')

[nltk_data] Downloading package punkt to /Users/sgorawski/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
logging.basicConfig(level=logging.INFO)

In [3]:
DATA_DIR = Path('data')
STOPWORDS_PL_FILE_PATH = DATA_DIR / 'stopwords.pl.txt'
BASE_FORMS_FILE_PATH = DATA_DIR / 'polimorfologik-2.1.txt'
WORD_EMBEDDINGS_FILE_PATH = DATA_DIR / 'nkjp+wiki-forms-all-100-cbow-hs.txt'

In [4]:
with open(STOPWORDS_PL_FILE_PATH) as f:
    STOPWORDS_PL = set(f.read().split())

In [5]:
_BASE_FORMS = {}

with open(BASE_FORMS_FILE_PATH) as f:
    for base_form, word, *_ in csv.reader(f, delimiter=';'):
        _BASE_FORMS[word.lower()] = base_form

def base(word):
    word = word.lower()
    return _BASE_FORMS.get(word, word)

In [131]:
WORD_EMBEDDINGS = {}

with open(WORD_EMBEDDINGS_FILE_PATH) as f:
    lines = iter(f)
    next(lines)
    for i, line in enumerate(lines):
        if i > 0 and i % 100_000 == 0:
            logging.info(f'Saved {i} embeddings')
        word, *args = line.split()
        WORD_EMBEDDINGS[word.lower()] = np.fromiter(args, float, 100)

INFO:root:Saved 100000 embeddings
INFO:root:Saved 200000 embeddings
INFO:root:Saved 300000 embeddings
INFO:root:Saved 400000 embeddings
INFO:root:Saved 500000 embeddings
INFO:root:Saved 600000 embeddings
INFO:root:Saved 700000 embeddings
INFO:root:Saved 800000 embeddings
INFO:root:Saved 900000 embeddings
INFO:root:Saved 1000000 embeddings
INFO:root:Saved 1100000 embeddings
INFO:root:Saved 1200000 embeddings
INFO:root:Saved 1300000 embeddings
INFO:root:Saved 1400000 embeddings
INFO:root:Saved 1500000 embeddings
INFO:root:Saved 1600000 embeddings
INFO:root:Saved 1700000 embeddings
INFO:root:Saved 1800000 embeddings
INFO:root:Saved 1900000 embeddings
INFO:root:Saved 2000000 embeddings
INFO:root:Saved 2100000 embeddings


In [141]:
nonword = re.compile(r'(\.|,|\(|\)|-|;|–|"|„|”)')

def read_article(file, strip=True):
    filedata = file.readlines()
    title = filedata[1]
    article = filedata[3:-1]
    article2 = []
    for par in article:
        article2.extend(tokenizer.tokenize(par))
    sentences = []

    for sentence in article2:
        if strip:
            sentence = nonword.sub(' ', sentence)
        if words := sentence.split():
            sentences.append(words)
    
    return title, sentences

## Sentence similarity calculations

In [89]:
def one_hot_word_vectors_cosine_similarity(sentence1, sentence2):
    base_sentence1 = [base(w) for w in sentence1]
    base_sentence2 = [base(w) for w in sentence2]
 
    all_words = list({*base_sentence1, *base_sentence2})
 
    vector1 = np.zeros(len(all_words))
    vector2 = np.zeros(len(all_words))
 
    for word in base_sentence1:
        if word in STOPWORDS_PL:
            continue
        vector1[all_words.index(word)] = 1
 
    for word in base_sentence2:
        if word in STOPWORDS_PL:
            continue
        vector2[all_words.index(word)] = 1
 
    return 1 - cosine_distance(vector1, vector2)

In [150]:
def normalized_words_overlap_similarity(sentence1, sentence2):
    base_sentence1 = {base(w) for w in sentence1} - STOPWORDS_PL
    base_sentence2 = {base(w) for w in sentence2} - STOPWORDS_PL
    return len(base_sentence1 & base_sentence2) / (np.log(len(sentence1)) + np.log(len(sentence2)) + 1)

In [125]:
def embeddings_mean_cosine_similarity(sentence1, sentence2):
    embeddings1 = [WORD_EMBEDDINGS.get(word) for word in sentence1]
    embeddings2 = [WORD_EMBEDDINGS.get(word) for word in sentence2]
    
    embeddings1 = np.array([v for v in embeddings1 if v is not None])
    embeddings2 = np.array([v for v in embeddings2 if v is not None])
    
    if len(embeddings1) == 0 or len(embeddings2) == 0:
        return 0
    
    mean1 = np.sum(embeddings1, axis=0) / len(embeddings1)
    mean2 = np.sum(embeddings2, axis=0) / len(embeddings2)
    
    return 1 - cosine_distance(mean1, mean2)

In [168]:
sentence_similarity = embeddings_mean_cosine_similarity

In [169]:
def build_similarity_matrix(sentences):
    similarity_matrix = np.zeros((len(sentences), len(sentences)))
 
    for i, sentence1 in enumerate(sentences):
        for j, sentence2 in enumerate(sentences):
            if i == j:
                continue 
            similarity_matrix[i][j] = sentence_similarity(sentence1, sentence2)

    return similarity_matrix

In [170]:
def generate_summary(sentences, top_n=5):
    sentence_similarity_martix = build_similarity_matrix(sentences)
    scores = nx.pagerank(nx.from_numpy_array(sentence_similarity_martix), max_iter=500)
    ranked_sentences = sorted(sorted(range(len(sentences)), key=lambda i: scores[i], reverse=True)[:top_n])
    return [sentences[i] for i in ranked_sentences], ranked_sentences

In [171]:
def highlight(text):
    return f'\033[1m\033[36m{text}\033[m'

def display_text(sentences, to_highlight=None):
    to_highlight = to_highlight or ()
    sentences_str = []
    for i, sentence in enumerate(sentences):
        sentence_str = ' '.join(sentence)
        if i in to_highlight:
            sentence_str = highlight(sentence_str)
        sentences_str.append(sentence_str)
    return ' '.join(sentences_str)

In [172]:
with open('selected-articles.txt') as f:
    filenames = f.read().split()
for index in range(20):
    with open(DATA_DIR / filenames[index]) as f:
        title, article = read_article(f, strip=False)
    try:
        summary, ranked = generate_summary(article, top_n=10)
    except nx.PowerIterationFailedConvergence as e:
        logging.error(title)
        continue
    print(
        title,
        display_text(article, ranked),
        '=====',
        title,
        display_text(summary),
        '==========',
        '==========',
        sep='\n\n',
        end='\n\n',
    )

Ruch Rodzin Nazaretańskich


[1m[36mRuch Rodzin Nazaretańskich (RRN) – maryjny ruch w Kościele katolickim, założony w 1985 r. w Warszawie przez ks. Tadeusza Dajczera.[m Obecnie istnieje w ok. 40 krajach świata. Należy do niego 45 tysięcy osób (2007), a w archidiecezji warszawskiej, skąd się wywodzi, ok. 3 tysięcy. Posiada sekcje dla dzieci, młodzieży, studentów i dorosłych. Działa przy parafiach. Historia. Powstanie RRN. [1m[36mGenezy powstania RRN-u można dopatrywać się w trzydziestoletnim doświadczeniu duchowym ks. Tadeusza Dajczera, jego nieustannym poszukiwaniu kierownictwa duchowego (spowiedź u mistyka i późniejszego świętego katolickiego Pio z Pietrelciny), zestawieniu sekularyzacji społeczeństw zachodnich i istnienia większego otwarcia na "sacrum" w cywilizacjach Wschodu ("żyznej gleby" pomimo nie w pełni dobrego "ziarna"), a także w jego studiach religioznawczych nad buddyzmem, czy twórczością M. Eliade).[m W wywiadzie z Dorotą Narewską ks. Dajczer wspomina o spotkaniu z 