In [12]:
import os
import re
import numpy as np

Read file from DUC_TEXT/test
Create a dictionary save all the metadata of sentences
Create unique sentence_id

In [13]:
# Read file from DUC_TEXT/test

file_path = "DUC_TEXT/test/d112h"  

with open(file_path, "r", encoding="utf-8") as file:
    doc_file = file.read()

# print("Read file from DUC_TEXT/test")
# print(doc_file)

In [14]:
# Create a dictionary save all the metadata of sentences
def parse_doc (doc_file):
    """
    Parse the document file and extract sentences metadata.
    
    Args:
        doc_file (str): The content of the document file.
        
    Returns:
        dict: A dictionary with sentence_id as keys and metadata as values.
    """
    sentences_dict = {}
    # Create unique sentence_id
    # Initialize sentence_id to 0
    sentence_id = 0

    # Find all sentences tags and their content in the document
    sentence_matches = re.findall(r'<s\s+docid="([^"]+)"\s+num="([^"]+)"\s+wdcount="([^"]+)">\s*(.*?)\s*</s>', doc_file, re.DOTALL)

    for doc_id, num, wdcount, sentence_text in sentence_matches:
        sentences_dict[sentence_id] = {
            "doc_id": doc_id,
            "num": num,
            "wdcount": int(wdcount),
            "sentence_text": sentence_text.strip()
        }
        sentence_id += 1    
    return sentences_dict

sentences_dict = parse_doc(doc_file)
# Print the sentences dictionary
print(sentences_dict)
# for sid, metadata in sentences_dict.items():
#     print(f"Sentence ID: {sid}, Metadata: {metadata}")
# print(f"Total sentences processed: {len(sentences_dict)}")

{0: {'doc_id': 'FT922-1115', 'num': '6', 'wdcount': 50, 'sentence_text': "MR MICHAEL STONEY, a senior executive in a number of Maxwell companies, is named as one of three people primarily responsible for more than Pounds 180m of 'unusual' payments from Mirror Group Newspapers bank accounts during the last financial year, according to the chairman's statement released with MGN accounts yesterday."}, 1: {'doc_id': 'FT922-1115', 'num': '7', 'wdcount': 26, 'sentence_text': 'Mr Kevin Maxwell and Mr Ian Maxwell, two sons of the late Robert Maxwell, are named as the other two people most involved in these transactions.'}, 2: {'doc_id': 'FT922-1115', 'num': '8', 'wdcount': 15, 'sentence_text': 'Mr Kevin Maxwell and Mr Stoney refused to provide MGN with information, the statement says.'}, 3: {'doc_id': 'FT922-1115', 'num': '9', 'wdcount': 31, 'sentence_text': 'Sir Robert Clark, chairman of MGN, lists a series of transactions - some of which he stresses may have been perfectly legitimate - that 

In [15]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
import string
import json

# nltk.download('stopwords')
# nltk.download('punkt')
# nltk.download('wordnet')

# Create a new dictionary with only sentence_id and fulltext
sentence_text_dict = {
    sentence_id: data['sentence_text']
    for sentence_id, data in sentences_dict.items()
}

# Initialize the stopword list and Lemmatizer
stop_words = set(stopwords.words('english'))
# stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Convert to lowercase
    text = text.lower()
    # Tokenize the text into words
    words = text.split()
    # Remove stopwords and stem words
    # processed_words = [stemmer.stem(word) for word in words if word not in stop_words]
    # Lemmatize words instead of stemming
    processed_words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
   
    # Join the processed words back into a string
    return " ".join(processed_words)

# Process each sentence in the new dictionary
processed_sentence_text_dict = {
    sentence_id: preprocess_text(text)
    for sentence_id, text in sentence_text_dict.items()
}

# Optional: Print the processed dictionary to verify
print(json.dumps(processed_sentence_text_dict, indent=2))

{
  "0": "mr michael stoney senior executive number maxwell company named one three people primarily responsible pound 180m unusual payment mirror group newspaper bank account last financial year according chairman statement released mgn account yesterday",
  "1": "mr kevin maxwell mr ian maxwell two son late robert maxwell named two people involved transaction",
  "2": "mr kevin maxwell mr stoney refused provide mgn information statement say",
  "3": "sir robert clark chairman mgn list series transaction stress may perfectly legitimate took place past month",
  "4": "say legal action may taken number organisation including goldman sachs u investment bank pound 40m transfer mgn bank aware effected improper purpose",
  "5": "note account show mgn made extraordinary provision pound 4215m year december 29 1991 including pound 2086m pension deficiency pound 1224m transaction maxwellcontrolled company",
  "6": "sir robert say year least 28 unusual payment pound 1m group bank account making 

Calculate connection by n common word

In [16]:
def has_connection(sentence1, sentence2, min_common_words=4,max_common_words=10):
    """
    Check if two sentences have a connection based on common words.
    
    Args:
        sentence1 (str): The first sentence.
        sentence2 (str): The second sentence.
        mincommon_words (int): Minimum number of common words to consider a connection.
        
    Returns:
        bool: True if there is a connection, False otherwise.
    """
    words1 = set(re.findall(r'\b\w+\b', sentence1.lower()))
    words2 = set(re.findall(r'\b\w+\b', sentence2.lower()))
    common_words = words1.intersection(words2)
    return len(common_words) >= min_common_words and len(common_words) <= max_common_words

Create connection matrix

In [17]:
def create_connection_matrix(sentences_dict):
    """
    Create a connection matrix based on the sentences metadata.
    
    Args:
        sentences_dict (dict): A dictionary with sentence_id as keys and metadata as values.
        
    Returns:
        np.ndarray: A balance two-dimentional matrix where matrix[i][j] is True
                    if sentence i and sentence j have connection, and 0 otherwise.
    """
    num_sentences = len(sentences_dict)
    connection_matrix = np.full((num_sentences, num_sentences), False, dtype=bool)

    # Get all sentence texts
    sentence_texts = [sentences_dict[i] for i in range(num_sentences)]
    # print("Sentence texts:")
    # print(sentence_texts)
    # Compare each pair of sentences
    for i in range(num_sentences):
        for j in range(i + 1, num_sentences):
            if has_connection(sentence_texts[i], sentence_texts[j]):
                connection_matrix[i][j] = True
                connection_matrix[j][i] = True


    return connection_matrix
connection_matrix = create_connection_matrix(processed_sentence_text_dict)
# Print the connection matrix
print("Connection Matrix:")
# for row in connection_matrix:
#     print(row)
#     print()  # Add a newline for better readability

# print(connection_matrix)

Connection Matrix:


Calculate pageRank score
The PageRank formula is defined as:

$$
PR(i) = \frac{1 - d}{N} + d \sum_{j \in M(i)} \frac{PR(j)}{deg(j)}
$$

Where:

- $PR(i)$ is the PageRank of node $i$
- $d$ is the damping factor (usually 0.85)
- $N$ is the total number of nodes
- $M(i)$ are nodes linking to $i$
- $L(j)$ is the number of links from node $j$

In [18]:
# Calculate pagerank score
def calculate_pagerank(connection_matrix, d=0.85, max_iterations=100, tolerance=1e-3):    
  num_nodes = len(connection_matrix)
  
  pagerank_scores = np.ones(num_nodes)  # Initialize all PR scores to 1

  connection_matrix = np.array(connection_matrix) # Convert list to numpy array
  transition_matrix = np.zeros_like(connection_matrix, dtype=float) # Initialize transition matrix as numpy array
  row_sum = np.sum(connection_matrix, axis=1)

  for i in range  (num_nodes):
    if row_sum[i] > 0:
      transition_matrix[i,:] = connection_matrix[i,:] / row_sum[i]
    # if row_sum is 0, the row in transition_matrix remains 0

  # print(connection_matrix)
  # print(row_sum)
  # print(pagerank)

  for iteration in range(max_iterations):
    # PageRank formula: PR(A) = (1-d)/N + d * sum(PR(B)/L(B)) for all pages B pointing to A
    # In matrix form: new_pagerank = (1-d)/N + d * transition_matrix.T @ pagerank
    # new_pagerank = np.full(num_nodes, (1 - damping_factor) / num_nodes) + \
    #                    damping_factor * np.dot(pagerank, transition_matrix) # Corrected matrix multiplication order
    new_pagerank_scores = (1 - d) / num_nodes + d * transition_matrix.T @ pagerank_scores
    # Check for convergence
    if np.linalg.norm(new_pagerank_scores - pagerank_scores, ord=1) < tolerance:
            print(f"PageRank converged after {iteration + 1} iterations.")
            break
    # print(f'Interation {iteration+1}')
    # print(new_pagerank)
    pagerank_scores = new_pagerank_scores

  return pagerank_scores
pagerank_scores = calculate_pagerank(connection_matrix)
# Print the PageRank scores
# print("PageRank Scores:")
# for i, score in enumerate(pagerank_scores):
#     print(f"Sentence ID {i}: {score:.4f} - {sentences_dict[i]['sentence_text']}")    
    
                                    
    

PageRank converged after 61 iterations.


Get the 10% highest score sentences to create a summary

In [19]:
# Get the 10% highest score sentences to create a summary
summary_sentences = sorted(range(len(pagerank_scores)), key=lambda i: pagerank_scores[i], reverse=True)[:int(0.1 * len(pagerank_scores))]
print("Summary Sentences:")
for i in summary_sentences:
    print(f"Sentence ID {i}: {pagerank_scores[i]:.8f} - {sentences_dict[i]['sentence_text']}")
# summary = [sentence_dict[i]['sentence_text'] for i in summary_sentences]
# print("\nSummary Sentences:")
# for sentence in summary:
#     print(sentence)


Summary Sentences:
Sentence ID 63: 0.01495515 - On May 29 last year, Mr Kevin Maxwell sent a fax to Mr Larry Wood, an executive director of Goldman Sachs, telling him that two parcels of 12.5m MCC shares each would be bought by the Swiss trusts with Pounds 55.33m provided by BIT, a Maxwell private company.
Sentence ID 115: 0.01448591 - The writs also allege that on May 28 1991 - the same date Mr Kevin Maxwell arranged for the pension fund share transaction to be paid for - Goldman was due to receive Dollars 58.2m for an unrelated transaction selling MCC stock to an unnamed American lawyer 'with close connections to Robert Maxwell'.
Sentence ID 16: 0.01228350 - In his report Sir Robert points to 'certain weaknesses' accepted by the MGN board: Internal controls and operating procedures which failed to identify related party transactions and bring them to the attention of independent directors for approval; Bank mandates authorised by Robert Maxwell and Mr Stoney which permitted the movem

In [20]:
# Create output directory if it doesn't exist
output_dir = 'output'
os.makedirs(output_dir, exist_ok=True)

# Get the input file name without the extension
input_filename = os.path.splitext(os.path.basename(file_path))[0]

# Define the output file path
output_file_path = os.path.join(output_dir, f'{input_filename}_commonword')

# Write the top sentences to the output file in the desired format
with open(output_file_path, 'w') as outfile:
    for sentence_id in summary_sentences:
        doc_id = sentences_dict[sentence_id]['doc_id']
        wdcount = sentences_dict[sentence_id]['wdcount']
        num = sentences_dict[sentence_id]['num']
        sentence_text = sentences_dict[sentence_id]['sentence_text']
        # Reconstruct the original sentence tag format
        outfile.write(f'<s doc_id="{doc_id}" num="{num}" wdcount="{wdcount}"> {sentence_text}</s>\n')

print(f"\nTop 10% sentences written to {output_file_path}")


Top 10% sentences written to output/d112h_commonword


In [21]:
# Evaluate the result with the summay file has given in DUC_SUM folder

preference_sum_path = "DUC_SUM/d112h"  
with open(preference_sum_path, "r", encoding="utf-8") as file:
    preference_sum_file = file.read()

preference_sum_dict = parse_doc(preference_sum_file)
print("Preference Summary Sentences:")
for sid, metadata in preference_sum_dict.items():
    print(f"Sentence ID: {sid}, Metadata: {metadata}")  




Preference Summary Sentences:
Sentence ID: 0, Metadata: {'doc_id': 'SJMN91-06311092', 'num': '5', 'wdcount': 29, 'sentence_text': 'Robert Maxwell, the flamboyant billionaire who built a global publishing empire, was found dead Tuesday in waters off the Canary Islands, where he had been vacationing on his yacht.'}
Sentence ID: 1, Metadata: {'doc_id': 'SJMN91-06311092', 'num': '10', 'wdcount': 30, 'sentence_text': "Maxwell's death ended the reign of a contentious media baron who battled unions, barked out orders in eight languages and bullied editors as he built a $2 billion media conglomerate."}
Sentence ID: 2, Metadata: {'doc_id': 'SJMN91-06311092', 'num': '16', 'wdcount': 34, 'sentence_text': "His death prompted immediate concern in British financial circles about the future of his debt-laden empire and in New York about the future of the Daily News, one of the nation's largest metropolitan papers."}
Sentence ID: 3, Metadata: {'doc_id': 'FT922-2972', 'num': '6', 'wdcount': 26, 'senten

In [22]:
# Get the set of (doc_id, num) for sentences in preference_sum_dict
preference_keys = set((v['doc_id'], v['num']) for v in preference_sum_dict.values())

print(f"Preference keys: {preference_keys}")  

# Count how many summary_sentences are present in preference_sum_dict by (doc_id, num)
matched = 0
for sid in summary_sentences:
    sent = sentences_dict[sid]
    # print(f"Checking sentence ID {sid}: {sent['doc_id']}, {sent['num']}, {sent['sentence_text']}")
    if (sent['doc_id'], sent['num']) in preference_keys:
        print(f"Matched: {sent['doc_id']}, {sent['num']}, {sent['sentence_text']}")
        matched += 1
# Print the number of matched sentences
print (f"Number of matched sentences: {matched}")
# Calculate recall and precision
recall_percentage = (matched / len(preference_sum_dict)) * 100 if preference_sum_dict else 0
precision_percentage = (matched / len(summary_sentences)) * 100 if summary_sentences else 0
# Print recall and precision
print(f"Recall: {recall_percentage:.2f}%")
print(f"Precision: {precision_percentage:.2f}%")
# Calculate F1 score
f1_percentage = (2 * recall_percentage * precision_percentage) / (recall_percentage + precision_percentage) if (recall_percentage + precision_percentage) > 0 else 0
print(f"F1 Score: {f1_percentage:.2f}%")
# print(f"{matched} out of {len(preference_sum_dict)} summary sentences are in the preference summary ({percentage:.2f}%)")

Preference keys: {('FT922-3446', '6'), ('SJMN91-06311092', '16'), ('SJMN91-06311092', '5'), ('FT922-2972', '8'), ('FT941-11518', '7'), ('SJMN91-06311092', '10'), ('FT941-11518', '6'), ('FT942-12054', '6'), ('SJMN91-06346032', '25'), ('FT922-2972', '6'), ('SJMN91-06346032', '24')}
Matched: FT922-3446, 6, DOCUMENTS which show for the first time that both Mr Robert Maxwell and his son Kevin were intimately involved in substantial purchases of shares in Maxwell Communication Corporation, one of their own public companies, have been obtained by the Financial Times.
Matched: FT942-12054, 6, Two writs have been filed alleging that Goldman Sachs, the US-based investment bank, assisted in diverting Pounds 55m from two pension schemes controlled by Robert Maxwell to ensure its own debts from Maxwell interests would be repaid.
Number of matched sentences: 2
Recall: 18.18%
Precision: 8.70%
F1 Score: 11.76%
