In [54]:
import os
import re
import numpy as np

Read the input file from DUC_TEXT/test
Create a dictionary that stores the metadata of every sentence
Assign each sentence a unique sentence_id

In [55]:
# Read the input document from DUC_TEXT/test

# Relative path, resolved against the current working directory.
# A later cell also uses its base name to name the output file.
file_path = "DUC_TEXT/test/d112h"  

# Load the whole file into memory as a single string, decoded as UTF-8
with open(file_path, "r", encoding="utf-8") as file:
    doc_file = file.read()

# Optional debugging output:
# print(doc_file)

In [56]:
# Create a dictionary that stores the metadata of every sentence
def parse_doc(doc_file):
    """
    Extract every ``<s ...>...</s>`` sentence element from a document string.

    Args:
        doc_file (str): Raw document text containing tags of the form
            ``<s docid="..." num="..." wdcount="...">text</s>``.

    Returns:
        dict: Maps a running integer sentence_id (0, 1, 2, ... in order of
            appearance) to a dict with the keys "doc_id", "num" (kept as a
            string), "wdcount" (converted to int) and "sentence_text"
            (surrounding whitespace removed).

    Raises:
        ValueError: If a matched ``wdcount`` attribute is not an integer.
    """
    # The attributes must appear in the order docid, num, wdcount.
    # DOTALL lets a sentence body span several lines.
    sentence_pattern = re.compile(
        r'<s\s+docid="([^"]+)"\s+num="([^"]+)"\s+wdcount="([^"]+)">\s*(.*?)\s*</s>',
        re.DOTALL,
    )

    # enumerate() supplies the unique, zero-based sentence_id
    return {
        sentence_id: {
            "doc_id": match.group(1),
            "num": match.group(2),
            "wdcount": int(match.group(3)),
            "sentence_text": match.group(4).strip(),
        }
        for sentence_id, match in enumerate(sentence_pattern.finditer(doc_file))
    }

# Parse the loaded document into {sentence_id: metadata}
sentences_dict = parse_doc(doc_file)
# Optional debugging output:
# for sid, metadata in sentences_dict.items():
#     print(f"Sentence ID: {sid}, Metadata: {metadata}")
# print(f"Total sentences processed: {len(sentences_dict)}")

Decide whether two sentences are connected: they are connected when they share at least n common words

In [57]:
def has_connection(sentence1, sentence2, min_common_words=8):
    """
    Decide whether two sentences are connected through shared vocabulary.

    Both sentences are lower-cased and split into word tokens. Each distinct
    word is counted once, however often it occurs.

    Args:
        sentence1 (str): The first sentence.
        sentence2 (str): The second sentence.
        min_common_words (int): Minimum number of distinct shared words
            required for the sentences to count as connected.

    Returns:
        bool: True if the sentences share at least ``min_common_words``
            distinct words, False otherwise.
    """
    def distinct_words(text):
        # Lower-case first so the comparison is case-insensitive
        return set(re.findall(r'\b\w+\b', text.lower()))

    shared = distinct_words(sentence1) & distinct_words(sentence2)
    return len(shared) >= min_common_words

Create connection matrix

In [58]:
def create_connection_matrix(sentences_dict):
    """
    Build the symmetric sentence-connection (adjacency) matrix.

    Args:
        sentences_dict (dict): Maps sentence_id to sentence metadata. The ids
            must be the consecutive integers 0..n-1, and each metadata dict
            must contain a 'sentence_text' entry.

    Returns:
        np.ndarray: A square boolean matrix of shape (n, n) where
            matrix[i][j] is True if sentences i and j are connected according
            to has_connection(), and False otherwise. The diagonal is always
            False because a sentence is never compared with itself.
    """
    # Collect the texts in sentence_id order so list positions equal the ids
    texts = [sentences_dict[idx]['sentence_text'] for idx in range(len(sentences_dict))]
    total = len(texts)
    adjacency = np.zeros((total, total), dtype=bool)

    # Visit each unordered pair once and mirror the result across the diagonal
    for first, first_text in enumerate(texts):
        for second in range(first + 1, total):
            if has_connection(first_text, texts[second]):
                adjacency[first, second] = adjacency[second, first] = True

    return adjacency
# Build the sentence-connection matrix for the parsed document
connection_matrix = create_connection_matrix(sentences_dict)
# Optional debugging output:
# print("Connection Matrix:")
# print(connection_matrix)

Calculate the PageRank score of each sentence
The PageRank formula is defined as:

$$
PR(i) = \frac{1 - d}{N} + d \sum_{j \in M(i)} \frac{PR(j)}{L(j)}
$$

Where:

- $PR(i)$ is the PageRank of node $i$
- $d$ is the damping factor (usually 0.85)
- $N$ is the total number of nodes
- $M(i)$ are nodes linking to $i$
- $L(j)$ is the number of outgoing links from node $j$

In [59]:
def calculate_pagerank(connection_matrix, d=0.85, max_iter=100, tol=1e-6):
    """
    Calculate PageRank scores from a connection (adjacency) matrix by power iteration.

    Args:
        connection_matrix (np.ndarray): Square matrix where entry [i][j] is
            truthy if node i links to node j. It is not modified.
        d (float): Damping factor.
        max_iter (int): Maximum number of iterations; bounds the run time
            if the scores do not converge.
        tol (float): Convergence tolerance. Iteration stops early once the
            L1 norm of the change in scores between two consecutive
            iterations is below this value.

    Returns:
        np.ndarray: One PageRank score per node, from the most recent
            iteration. An empty array is returned for an empty matrix.
    """
    num_nodes = connection_matrix.shape[0]
    if num_nodes == 0:
        # Nothing to rank; this also avoids dividing by zero below
        return np.zeros(0)

    # astype() returns a copy, so the caller's matrix is left untouched
    transition_matrix = connection_matrix.astype(float)
    # Number of outgoing links per node
    row_sums = transition_matrix.sum(axis=1)

    # Row-normalise so every row is a probability distribution.
    # Dangling nodes (no outgoing links) spread their weight over all nodes.
    has_links = row_sums > 0
    transition_matrix[has_links] /= row_sums[has_links, np.newaxis]
    transition_matrix[~has_links] = 1.0 / num_nodes

    teleport = (1 - d) / num_nodes
    pagerank_scores = np.full(num_nodes, 1.0 / num_nodes)
    for _ in range(max_iter):
        # PR(i) = (1 - d) / N + d * sum_j PR(j) * P[j, i], for all i at once
        new_pagerank_scores = teleport + d * (transition_matrix.T @ pagerank_scores)
        converged = np.linalg.norm(new_pagerank_scores - pagerank_scores, ord=1) < tol
        # Keep the newest iterate even when stopping, so the most refined
        # scores are the ones returned
        pagerank_scores = new_pagerank_scores
        if converged:
            break
    return pagerank_scores
# Score every sentence with the default damping factor, iteration limit and tolerance
pagerank_scores = calculate_pagerank(connection_matrix)
# Optional debugging output:
# print("PageRank Scores:")
# for i, score in enumerate(pagerank_scores):
#     print(f"Sentence ID {i}: {score:.4f} - {sentences_dict[i]['sentence_text']}")    
    
                                    
    

Select the top 10% highest-scoring sentences to form the summary

In [60]:
# Select the top 10% highest-scoring sentences to create a summary.
# Sentence ids are ranked by descending PageRank score. sorted() is stable,
# so sentences with equal scores keep their document order.
ranked_sentence_ids = sorted(range(len(pagerank_scores)), key=lambda i: pagerank_scores[i], reverse=True)

# int() truncates, so a document with fewer than 10 sentences would otherwise
# produce an empty summary. Keep at least one sentence when there is any.
summary_size = max(1, int(0.1 * len(pagerank_scores))) if len(pagerank_scores) else 0
summary_sentences = ranked_sentence_ids[:summary_size]

print("Summary Sentences:")
for i in summary_sentences:
    print(f"Sentence ID {i}: {pagerank_scores[i]:.4f} - {sentences_dict[i]['sentence_text']}")


Summary Sentences:
Sentence ID 16: 0.0463 - In his report Sir Robert points to 'certain weaknesses' accepted by the MGN board: Internal controls and operating procedures which failed to identify related party transactions and bring them to the attention of independent directors for approval; Bank mandates authorised by Robert Maxwell and Mr Stoney which permitted the movement of group funds on the authority of Maxwell or directors who were also directors of Maxwell-controlled companies; The audit committee of non-executive directors, which might have reviewed systems and mandates, but was not convened; The finance department's inadequate authority to verify and record the treasury department's activities.
Sentence ID 99: 0.0367 - Press coverage of the late publisher since his death has already caused deep concern to the Maxwell brothers' lawyers . Mr Alun Jones QC, Kevin Maxwell's barrister, led calls for reporting restrictions to be placed on a recent High Court ruling that Robert Max

In [61]:
# Create output directory if it doesn't exist
output_dir = 'output'
os.makedirs(output_dir, exist_ok=True)

# Get the input file name without the extension
input_filename = os.path.splitext(os.path.basename(file_path))[0]

# Define the output file path
output_file_path = os.path.join(output_dir, f'{input_filename}_commonword')

# Write the top sentences to the output file in the desired format.
# The input is read as UTF-8, so write UTF-8 explicitly instead of relying on
# the platform's default encoding.
with open(output_file_path, 'w', encoding='utf-8') as outfile:
    for sentence_id in summary_sentences:
        doc_id = sentences_dict[sentence_id]['doc_id']
        wdcount = sentences_dict[sentence_id]['wdcount']
        num = sentences_dict[sentence_id]['num']
        sentence_text = sentences_dict[sentence_id]['sentence_text']
        # Reconstruct the original sentence tag format. The attribute is
        # spelled "docid" (no underscore) in the input files, which is what
        # parse_doc() expects, so the output can be parsed back.
        outfile.write(f'<s docid="{doc_id}" num="{num}" wdcount="{wdcount}"> {sentence_text}</s>\n')

print(f"\nTop 10% sentences written to {output_file_path}")


Top 10% sentences written to output/d112h_commonword


In [62]:
# Evaluate the result against the reference summary file provided in the DUC_SUM folder

# Relative path, resolved against the current working directory
preference_sum_path = "DUC_SUM/d112h"  
with open(preference_sum_path, "r", encoding="utf-8") as file:
    preference_sum_file = file.read()

# Parse the reference summary with the same parser as the input document,
# so its sentences carry the same doc_id / num metadata
preference_sum_dict = parse_doc(preference_sum_file)
# The comparison of summary_sentences with the reference summary follows in the next cell



In [63]:
# Identify every reference-summary sentence by its (doc_id, num) pair
preference_keys = {(meta['doc_id'], meta['num']) for meta in preference_sum_dict.values()}

# Count the selected sentences whose (doc_id, num) pair also occurs in the reference summary
matched = sum(
    (sentences_dict[sid]['doc_id'], sentences_dict[sid]['num']) in preference_keys
    for sid in summary_sentences
)

# The percentage is relative to the number of reference-summary sentences;
# an empty reference summary yields 0 instead of a division by zero
if preference_sum_dict:
    percentage = (matched / len(preference_sum_dict)) * 100
else:
    percentage = 0
print(f"{matched} out of {len(preference_sum_dict)} summary sentences are in the preference summary ({percentage:.2f}%)")

2 out of 11 summary sentences are in the preference summary (18.18%)
