In [206]:
from Sum_module.file_reader import FileReader
from Sum_module.parse_doc import ParseDoc
from Sum_module.preprocess import Preprocessor
from Sum_module.connections import ConnectionMatrix
from Sum_module.pagerank import PageRankCalculator
from Sum_module.summarizer import Summarizer
from Sum_module.output_writer import OutputWriter
from Sum_module.evaluation import Evaluator

import json
import os

In [207]:
filename = "d116i"
input_file_path = f"data/DUC_TEXT/test/{filename}"
preferences_file_path = f"data/DUC_SUM/{filename}"

print(f"Processing file: {input_file_path}")

Processing file: data/DUC_TEXT/test/d116i


In [208]:
doc_file = FileReader(input_file_path).read_file()
doc_file

'<s docid="AP900422-0032" num="9" wdcount="26"> Discovery\'s five astronauts returned Sunday for a second attempt to launch the shuttle with NASA\'s most valuable and celebrated payload, the $1.5 billion Hubble Space Telescope.</s>\n<s docid="AP900422-0032" num="10" wdcount="11"> Discovery is scheduled to lift off at 8:31 a.m. EDT Tuesday.</s>\n<s docid="AP900422-0032" num="11" wdcount="22"> ``We feel very confident that things are going to go well this time,\'\' said Discovery\'s commander, Air Force Col. Loren J. Shriver.</s>\n<s docid="AP900422-0032" num="12" wdcount="19"> ``We\'re going to come out OK on Tuesday morning and, if not, we\'ll just keep trying until we do.</s>\n<s docid="AP900422-0032" num="13" wdcount="11"> That\'s kind of the name of the game here,\'\' Shriver said.</s>\n<s docid="AP900422-0032" num="14" wdcount="22"> NASA test director Mike Leinbach said Sunday that the countdown was proceeding smoothly and the shuttle appeared to be in perfect condition.</s>\n<s do

In [209]:
sentences_dict = ParseDoc.parse_doc(doc_file)
sentences_dict

{0: {'doc_id': 'AP900422-0032',
  'num': '9',
  'wdcount': 26,
  'sentence_text': "Discovery's five astronauts returned Sunday for a second attempt to launch the shuttle with NASA's most valuable and celebrated payload, the $1.5 billion Hubble Space Telescope."},
 1: {'doc_id': 'AP900422-0032',
  'num': '10',
  'wdcount': 11,
  'sentence_text': 'Discovery is scheduled to lift off at 8:31 a.m. EDT Tuesday.'},
 2: {'doc_id': 'AP900422-0032',
  'num': '11',
  'wdcount': 22,
  'sentence_text': "``We feel very confident that things are going to go well this time,'' said Discovery's commander, Air Force Col. Loren J. Shriver."},
 3: {'doc_id': 'AP900422-0032',
  'num': '12',
  'wdcount': 19,
  'sentence_text': "``We're going to come out OK on Tuesday morning and, if not, we'll just keep trying until we do."},
 4: {'doc_id': 'AP900422-0032',
  'num': '13',
  'wdcount': 11,
  'sentence_text': "That's kind of the name of the game here,'' Shriver said."},
 5: {'doc_id': 'AP900422-0032',
  'num':

In [210]:
preprocessor = Preprocessor(use_lemmatizer=True, language="english")
preprocessed_sentences_dict = preprocessor.preprocess_dict(sentences_dict)
preprocessed_sentences_dict

{0: 'discovery five astronaut returned sunday second attempt launch shuttle nasa valuable celebrated payload 15 billion hubble space telescope',
 1: 'discovery scheduled lift 831 edt tuesday',
 2: 'feel confident thing going go well time said discovery commander air force col loren j shriver',
 3: 'going come ok tuesday morning well keep trying',
 4: 'thats kind name game shriver said',
 5: 'nasa test director mike leinbach said sunday countdown proceeding smoothly shuttle appeared perfect condition',
 6: 'countdown got way saturday afternoon',
 7: 'faulty power unit forced first launch attempt scrubbed four minute liftoff april 10',
 8: 'unit replaced new one test showed fine',
 9: 'hopefully well get shuttle pad time leinbach said',
 10: '70 percent chance favorable weather expected launch time low cloud main concern said air force ed priselac shuttle weather officer',
 11: 'weak cold front north expected pas area monday night',
 12: 'outlook considerably better wednesday thursday pr

In [211]:
connection_matrix = ConnectionMatrix(
    sentences = list(preprocessed_sentences_dict.values()),
    min_common_words = 6,
    max_common_words= 1000,
).create_matrix()
connection_matrix

array([[False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       ...,
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False]], shape=(337, 337))

In [212]:
connection_matrix_test = [
                      [0, 1, 1, 1, 1, 0],
                      [1, 0, 1, 1, 1, 0],
                      [1, 1, 0, 1, 0, 0],
                      [1, 1, 1, 0, 0, 1],
                      [1, 1, 0, 0, 0, 1],
                      [0, 0, 0, 1, 1, 0]
                    ]
pagerank_sores = PageRankCalculator(connection_matrix_test, max_iterations=1).calculator()
pagerank_sores

array([1.01666667, 1.01666667, 0.6625    , 1.15833333, 0.875     ,
       0.52083333])

In [213]:
pagerank_calculator = PageRankCalculator(
    connection_matrix=connection_matrix,
    max_iterations=100,
    tolerance=1e-6
)
pagerank_scores = pagerank_calculator.calculator()
# print length of pagerank_scores
print(f"Length of pagerank_scores: {len(pagerank_scores)}")

Length of pagerank_scores: 337


In [214]:
summarizer = Summarizer(
    sentences_dict=sentences_dict,
    pagerank_scores=pagerank_scores,
    top_percent=0.1
)

summary_sentences_dict = summarizer.get_summary_dict()

# print summary sentences and its id metadata from sentences_dict
summary_sentences_dict





{100: 'After years of delay and a last-minute snag, the Hubble Space Telescope was freed from the shuttle Discovery on Wednesday and, glinting in the sunlight, drifted into orbit on its 15-year search for new worlds.',
 249: 'The telescope, named for the late astronomer Edwin P. Hubble, will enable astronomers over its 15-year working lifetime to look back 14 billion years and possibly determine the age of the universe.',
 39: 'Columbia, which journeys into space next month with the Astro observatory, sat on a launch pad 1.6 miles from Discovery, only the second time both pads have been occupied; the first was in 1986 just before the Challenger explosion.',
 223: "Shortly after 6 a.m. EDT, controllers at the Goddard Space Flight Center in Greenbelt, Md., completed a series of commands that took the telescope out of its safe mode and then began to run a 12-hour computer program to monitor Hubble's health.",
 91: 'Should the solar panels not deploy, mission specialists Bruce McCandless a

In [215]:
for sentence_id in summary_sentences_dict.keys():
                data = sentences_dict[sentence_id]
                doc_id = data.get('doc_id', 'unknown')
                wdcount = data.get('wdcount', '0')
                num = data.get('num', '0')
                sentence_text = data.get('sentence_text', '')
                # Write sentence in original tag format
                print(f'<s doc_id="{doc_id}" num="{num}" wdcount="{wdcount}"> {sentence_text}</s>\n')

<s doc_id="AP900425-0198" num="11" wdcount="35"> After years of delay and a last-minute snag, the Hubble Space Telescope was freed from the shuttle Discovery on Wednesday and, glinting in the sunlight, drifted into orbit on its 15-year search for new worlds.</s>

<s doc_id="AP900428-0050" num="51" wdcount="32"> The telescope, named for the late astronomer Edwin P. Hubble, will enable astronomers over its 15-year working lifetime to look back 14 billion years and possibly determine the age of the universe.</s>

<s doc_id="AP900424-0048" num="21" wdcount="39"> Columbia, which journeys into space next month with the Astro observatory, sat on a launch pad 1.6 miles from Discovery, only the second time both pads have been occupied; the first was in 1986 just before the Challenger explosion.</s>

<s doc_id="AP900428-0050" num="25" wdcount="42"> Shortly after 6 a.m. EDT, controllers at the Goddard Space Flight Center in Greenbelt, Md., completed a series of commands that took the telescope ou

In [216]:
# evaluation the summary
#count matching sentences in the summary with the preferences file
preference_file = FileReader(preferences_file_path).read_file()
preference_sentences_dict = ParseDoc.parse_doc(preference_file)
# preferences_sentences   
evaluator = Evaluator(
        sentences_dict=sentences_dict,
        summary_sentence_ids=summarizer.get_top_sentence_ids(),
        preference_sum_dict=preference_sentences_dict
    )
evaluation_results = evaluator.evaluate()
evaluation_results


{'labeled': 16,
 'extracted': 33,
 'matched': 1,
 'recall': 6.25,
 'precision': 3.03,
 'f1': 4.08}