In [2]:
from Sum_module.file_reader import FileReader
from Sum_module.parse_doc import ParseDoc
from Sum_module.preprocess import Preprocessor
from Sum_module.connections import ConnectionMatrix
from Sum_module.pagerank import PageRankCalculator
from Sum_module.summarizer import Summarizer
from Sum_module.output_writer import OutputWriter
from Sum_module.evaluation import Evaluator

import json
import os

In [3]:
# Configuration: the DUC document cluster to process and where its files live.
DUC_TEXT_DIR = "data/DUC_TEXT/test"  # sentence-tagged source documents
DUC_SUM_DIR = "data/DUC_SUM"         # presumably the reference summaries -- TODO confirm

filename = "d112h"
input_file_path = f"{DUC_TEXT_DIR}/{filename}"
preferences_file_path = f"{DUC_SUM_DIR}/{filename}"

print(f"Processing file: {input_file_path}")

Processing file: data/DUC_TEXT/test/d112h


In [4]:
# Load the raw document text from disk. The displayed result is one string in
# which every sentence is wrapped in an <s docid=... num=... wdcount=...> tag.
doc_reader = FileReader(input_file_path)
doc_file = doc_reader.read_file()
# Bare last expression: echo the loaded text as the cell output.
doc_file

'<s docid="FT922-1115" num="6" wdcount="50"> MR MICHAEL STONEY, a senior executive in a number of Maxwell companies, is named as one of three people primarily responsible for more than Pounds 180m of \'unusual\' payments from Mirror Group Newspapers bank accounts during the last financial year, according to the chairman\'s statement released with MGN accounts yesterday.</s>\n<s docid="FT922-1115" num="7" wdcount="26"> Mr Kevin Maxwell and Mr Ian Maxwell, two sons of the late Robert Maxwell, are named as the other two people most involved in these transactions.</s>\n<s docid="FT922-1115" num="8" wdcount="15"> Mr Kevin Maxwell and Mr Stoney refused to provide MGN with information, the statement says.</s>\n<s docid="FT922-1115" num="9" wdcount="31"> Sir Robert Clark, chairman of MGN, lists a series of transactions - some of which he stresses may have been perfectly legitimate - that took place in the past few months.</s>\n<s docid="FT922-1115" num="10" wdcount="40"> He says legal action m

In [5]:
# Parse the tagged text into one record per sentence. The displayed result is a
# dict keyed by an integer index (0, 1, 2, ...); each value holds the keys
# 'doc_id', 'num', 'wdcount' and 'sentence_text'. Later cells look sentences up
# by this integer key.
sentences_dict = ParseDoc.parse_doc(doc_file)
sentences_dict

{0: {'doc_id': 'FT922-1115',
  'num': '6',
  'wdcount': 50,
  'sentence_text': "MR MICHAEL STONEY, a senior executive in a number of Maxwell companies, is named as one of three people primarily responsible for more than Pounds 180m of 'unusual' payments from Mirror Group Newspapers bank accounts during the last financial year, according to the chairman's statement released with MGN accounts yesterday."},
 1: {'doc_id': 'FT922-1115',
  'num': '7',
  'wdcount': 26,
  'sentence_text': 'Mr Kevin Maxwell and Mr Ian Maxwell, two sons of the late Robert Maxwell, are named as the other two people most involved in these transactions.'},
 2: {'doc_id': 'FT922-1115',
  'num': '8',
  'wdcount': 15,
  'sentence_text': 'Mr Kevin Maxwell and Mr Stoney refused to provide MGN with information, the statement says.'},
 3: {'doc_id': 'FT922-1115',
  'num': '9',
  'wdcount': 31,
  'sentence_text': 'Sir Robert Clark, chairman of MGN, lists a series of transactions - some of which he stresses may have been p

In [6]:
# Normalise each sentence before measuring word overlap. Judging by the
# displayed result, the text is lower-cased, punctuation and common stop words
# are dropped, and words are lemmatised (e.g. "companies" -> "company").
# The result keeps the same integer keys as sentences_dict.
preprocessor = Preprocessor(language="english", use_lemmatizer=True)
preprocessed_sentences_dict = preprocessor.preprocess_dict(sentences_dict)
preprocessed_sentences_dict

{0: 'mr michael stoney senior executive number maxwell company named one three people primarily responsible pound 180m unusual payment mirror group newspaper bank account last financial year according chairman statement released mgn account yesterday',
 1: 'mr kevin maxwell mr ian maxwell two son late robert maxwell named two people involved transaction',
 2: 'mr kevin maxwell mr stoney refused provide mgn information statement say',
 3: 'sir robert clark chairman mgn list series transaction stress may perfectly legitimate took place past month',
 4: 'say legal action may taken number organisation including goldman sachs u investment bank pound 40m transfer mgn bank aware effected improper purpose',
 5: 'note account show mgn made extraordinary provision pound 4215m year december 29 1991 including pound 2086m pension deficiency pound 1224m transaction maxwellcontrolled company',
 6: 'sir robert say year least 28 unusual payment pound 1m group bank account making total pound 180m',
 7: 

In [7]:
# Build the sentence graph as a square boolean matrix (one row/column per
# sentence). Sentences are passed in dict order, so row i is assumed to line up
# with key i of sentences_dict -- TODO confirm if keys are ever non-contiguous.
preprocessed_sentences = list(preprocessed_sentences_dict.values())

# Presumably two sentences are linked when they share between 4 and 10 words;
# verify the exact (inclusive/exclusive) rule in ConnectionMatrix.
connection_builder = ConnectionMatrix(
    sentences=preprocessed_sentences,
    min_common_words=4,
    max_common_words=10,
)
connection_matrix = connection_builder.create_matrix()
connection_matrix

array([[False,  True,  True, ..., False, False, False],
       [ True, False, False, ..., False, False, False],
       [ True, False, False, ..., False, False, False],
       ...,
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False]], shape=(238, 238))

In [8]:
# Sanity check: run a single PageRank iteration on a small hand-built graph
# (6 nodes, symmetric adjacency, empty diagonal) so the scores can be compared
# against a by-hand calculation.
connection_matrix_test = [
    [0, 1, 1, 1, 1, 0],
    [1, 0, 1, 1, 1, 0],
    [1, 1, 0, 1, 0, 0],
    [1, 1, 1, 0, 0, 1],
    [1, 1, 0, 0, 0, 1],
    [0, 0, 0, 1, 1, 0],
]
toy_calculator = PageRankCalculator(connection_matrix_test, max_iterations=1)
# NOTE(review): "pagerank_sores" looks like a typo for "scores"; the name is
# kept as-is because other cells may reference it.
pagerank_sores = toy_calculator.calculator()
pagerank_sores

array([1.01666667, 1.01666667, 0.6625    , 1.15833333, 0.875     ,
       0.52083333])

In [9]:
# Score every sentence of the real document graph with PageRank.
# Presumably iteration stops after 100 rounds or once the change between
# rounds falls below the tolerance -- confirm in PageRankCalculator.
pagerank_calculator = PageRankCalculator(
    connection_matrix=connection_matrix,
    tolerance=1e-6,
    max_iterations=100,
)
pagerank_scores = pagerank_calculator.calculator()

# Report how many scores came back; it should equal the number of sentences
# (the side length of connection_matrix).
score_count = len(pagerank_scores)
print(f"Length of pagerank_scores: {score_count}")

Length of pagerank_scores: 238


In [10]:
# Select the summary sentences from the PageRank scores.
# top_percent=0.1 presumably keeps the highest-scoring 10% of sentences --
# confirm the rounding rule in Summarizer.
summarizer = Summarizer(
    pagerank_scores=pagerank_scores,
    sentences_dict=sentences_dict,
    top_percent=0.1,
)
summary_sentences_dict = summarizer.get_summary_dict()

# The displayed result maps sentence id -> sentence text only; the remaining
# metadata (doc_id, num, wdcount) has to be looked up in sentences_dict.
summary_sentences_dict





{63: 'On May 29 last year, Mr Kevin Maxwell sent a fax to Mr Larry Wood, an executive director of Goldman Sachs, telling him that two parcels of 12.5m MCC shares each would be bought by the Swiss trusts with Pounds 55.33m provided by BIT, a Maxwell private company.',
 115: "The writs also allege that on May 28 1991 - the same date Mr Kevin Maxwell arranged for the pension fund share transaction to be paid for - Goldman was due to receive Dollars 58.2m for an unrelated transaction selling MCC stock to an unnamed American lawyer 'with close connections to Robert Maxwell'.",
 16: "In his report Sir Robert points to 'certain weaknesses' accepted by the MGN board: Internal controls and operating procedures which failed to identify related party transactions and bring them to the attention of independent directors for approval; Bank mandates authorised by Robert Maxwell and Mr Stoney which permitted the movement of group funds on the authority of Maxwell or directors who were also directors 

In [11]:
# Print every selected sentence re-wrapped in the same <s ...> tag format the
# input document uses.
for sentence_id in summary_sentences_dict:
    # The summary dict only carries the text, so fetch the full record
    # (doc id, sentence number, word count) from the parsed document.
    data = sentences_dict[sentence_id]
    doc_id = data.get('doc_id', 'unknown')
    wdcount = data.get('wdcount', '0')
    num = data.get('num', '0')
    sentence_text = data.get('sentence_text', '')
    # The source files spell the attribute "docid" (no underscore); emit the
    # same spelling so the printed tags really match the original format.
    print(f'<s docid="{doc_id}" num="{num}" wdcount="{wdcount}"> {sentence_text}</s>\n')

<s doc_id="FT922-3446" num="23" wdcount="48"> On May 29 last year, Mr Kevin Maxwell sent a fax to Mr Larry Wood, an executive director of Goldman Sachs, telling him that two parcels of 12.5m MCC shares each would be bought by the Swiss trusts with Pounds 55.33m provided by BIT, a Maxwell private company.</s>

<s doc_id="FT942-12054" num="11" wdcount="53"> The writs also allege that on May 28 1991 - the same date Mr Kevin Maxwell arranged for the pension fund share transaction to be paid for - Goldman was due to receive Dollars 58.2m for an unrelated transaction selling MCC stock to an unnamed American lawyer 'with close connections to Robert Maxwell'.</s>

<s doc_id="FT922-1115" num="22" wdcount="97"> In his report Sir Robert points to 'certain weaknesses' accepted by the MGN board: Internal controls and operating procedures which failed to identify related party transactions and bring them to the attention of independent directors for approval; Bank mandates authorised by Robert Maxwe