In [32]:
import os
import re
import numpy as np

Read file from DUC_TEXT/test
Create a dictionary save all the metadata of sentences
Create unique sentence_id

In [33]:
# Read file from DUC_TEXT/test

file_path = "DUC_TEXT/test/d114h"  

with open(file_path, "r", encoding="utf-8") as file:
    doc_file = file.read()

# print("Read file from DUC_TEXT/test")
# print(doc_file)

In [34]:
# Create a dictionary save all the metadata of sentences
def parse_doc (doc_file):
    """
    Parse the document file and extract sentences metadata.
    
    Args:
        doc_file (str): The content of the document file.
        
    Returns:
        dict: A dictionary with sentence_id as keys and metadata as values.
    """
    sentences_dict = {}
    # Create unique sentence_id
    # Initialize sentence_id to 0
    sentence_id = 0

    # Find all sentences tags and their content in the document
    sentence_matches = re.findall(r'<s\s+docid="([^"]+)"\s+num="([^"]+)"\s+wdcount="([^"]+)">\s*(.*?)\s*</s>', doc_file, re.DOTALL)

    for doc_id, num, wdcount, sentence_text in sentence_matches:
        sentences_dict[sentence_id] = {
            "doc_id": doc_id,
            "num": num,
            "wdcount": int(wdcount),
            "sentence_text": sentence_text.strip()
        }
        sentence_id += 1    
    return sentences_dict

sentences_dict = parse_doc(doc_file)
# Print the sentences dictionary
# print(sentences_dict)
# for sid, metadata in sentences_dict.items():
#     print(f"Sentence ID: {sid}, Metadata: {metadata}")
# print(f"Total sentences processed: {len(sentences_dict)}")

Calculate connection by n common word

In [35]:
def has_connection(sentence1, sentence2, min_common_words=7):
    """
    Check if two sentences have a connection based on common words.
    
    Args:
        sentence1 (str): The first sentence.
        sentence2 (str): The second sentence.
        mincommon_words (int): Minimum number of common words to consider a connection.
        
    Returns:
        bool: True if there is a connection, False otherwise.
    """
    words1 = set(re.findall(r'\b\w+\b', sentence1.lower()))
    words2 = set(re.findall(r'\b\w+\b', sentence2.lower()))
    common_words = words1.intersection(words2)
    return len(common_words) >= min_common_words

Create connection matrix

In [36]:
def create_connection_matrix(sentences_dict):
    """
    Create a connection matrix based on the sentences metadata.
    
    Args:
        sentences_dict (dict): A dictionary with sentence_id as keys and metadata as values.
        
    Returns:
        np.ndarray: A balance two-dimentional matrix where matrix[i][j] is True
                    if sentence i and sentence j have connection, and 0 otherwise.
    """
    num_sentences = len(sentences_dict)
    connection_matrix = np.full((num_sentences, num_sentences), False, dtype=bool)

    # Get all sentence texts
    sentence_texts = [sentences_dict[i]['sentence_text'] for i in range(num_sentences)]

    # Compare each pair of sentences
    for i in range(num_sentences):
        for j in range(i + 1, num_sentences):
            if has_connection(sentence_texts[i], sentence_texts[j]):
                connection_matrix[i][j] = True
                connection_matrix[j][i] = True


    return connection_matrix
connection_matrix = create_connection_matrix(sentences_dict)
# Print the connection matrix
# print("Connection Matrix:")
# print(connection_matrix)

Calculate pageRank score
The PageRank formula is defined as:

$$
PR(i) = \frac{1 - d}{N} + d \sum_{j \in M(i)} \frac{PR(j)}{deg(j)}
$$

Where:

- $PR(i)$ is the PageRank of node $i$
- $d$ is the damping factor (usually 0.85)
- $N$ is the total number of nodes
- $M(i)$ are nodes linking to $i$
- $L(j)$ is the number of links from node $j$

In [37]:
def calculate_pagerank(connection_matrix, d=0.85, max_iter=100, tol=1e-6):
    """
    Calculate PageRank scores based on the connection matrix.
    
    Args:
        connection_matrix (np.ndarray): The connection matrix.
        d (float): Damping factor.
        max_iter (int): Maximum number of iterations.
                    This stops the algorithm if it doesnt converge quickly to avoid infinite loops.
        tol (float): Tolerance for convergence. 
                    If the change in PageRank scores between iterations is less than this value, 
                    the algorithm stops early (it's considered converged)
        
    Returns:
        np.ndarray: PageRank scores for each sentence.
    """
    # Find the number of nodes in the connection matrix by getting the number of rows of adjacency matrix
    num_nodes = connection_matrix.shape[0]
    # Convert matrix elements to float
    transition_matrix = connection_matrix.astype(float) 
    # Sum up each row to get the total number of connections for each sentence
    
    row_sums = transition_matrix.sum(axis=1)

    
    # Initialize PageRank scores to 1 for each node
    pagerank_scores = np.ones(num_nodes)
    # Iterate to update PageRank scores
    for _ in range(max_iter):
        new_pagerank_scores = np.full(num_nodes, (1 - d) / num_nodes)
        for i in range(num_nodes):
            for j in range(num_nodes): 
                if connection_matrix[j][i] == 1:
                    new_pagerank_scores[i] += d*(pagerank_scores[j] / row_sums[j])
        # Check for convergence
        if np.linalg.norm(new_pagerank_scores - pagerank_scores, ord=1) < tol:
            break
        pagerank_scores = new_pagerank_scores
    return pagerank_scores
pagerank_scores = calculate_pagerank(connection_matrix)
# Print the PageRank scores
# print("PageRank Scores:")
# for i, score in enumerate(pagerank_scores):
#     print(f"Sentence ID {i}: {score:.4f} - {sentences_dict[i]['sentence_text']}")    
    
                                    
    

Get the 10% highest score sentences to create a summary

In [38]:
# summary_sentences = sorted(range(len(pagerank_scores)), key=lambda i: pagerank_scores[i], reverse=True)[:int(0.1 * len(pagerank_scores))]
# summary = [sentence_dict[i]['sentence_text'] for i in summary_sentences]
# print("\nSummary Sentences:")
# for sentence in summary:
#     print(sentence)
# Get the 10% highest score sentences to create a summary
summary_sentences = sorted(range(len(pagerank_scores)), key=lambda i: pagerank_scores[i], reverse=True)[:int(0.1 * len(pagerank_scores))]
print("Summary Sentences:")
for i in summary_sentences:
    print(f"Sentence ID {i}: {pagerank_scores[i]:.8f} - {sentences_dict[i]['sentence_text']}")
# summary = [sentence_dict[i]['sentence_text'] for i in summary_sentences]
# print("\nSummary Sentences:")
# for sentence in summary:
#     print(sentence)


Summary Sentences:
Sentence ID 81: 0.01083495 - The deaths brought to 81 the number of people killed since Jan. 20 in clashes between security forces and Moslem militants fighting for secession of the Moslem-dominated Kashmir region from Hindu-majority India.
Sentence ID 47: 0.01069815 - ``We are neither fighting a war nor trying to suppress any popular movement,'' said Jagmohan, governor of Jammu-Kashmir, the only state in predominantly Hindu India with a Moslem majority.
Sentence ID 164: 0.01069223 - President Abdul Qayyum, speaking to a reporter who traveled to Pakistan's Kashmir, said the people of his state also had been sheltering insurgents since India launched the military crackdown in its state of Jammu-Kashmir last month.
Sentence ID 313: 0.00931257 - On Tuesday, at least 29 militants were killed by Indian security forces in Kashmir, the Moslem-dominated northern part of Jammu-Kashmir state and the center of the secessionist campaign.
Sentence ID 163: 0.00908943 - Meanwhile, 

In [39]:
# Create output directory if it doesn't exist
output_dir = 'output'
os.makedirs(output_dir, exist_ok=True)

# Get the input file name without the extension
input_filename = os.path.splitext(os.path.basename(file_path))[0]

# Define the output file path
output_file_path = os.path.join(output_dir, f'{input_filename}_commonword')

# Write the top sentences to the output file in the desired format
with open(output_file_path, 'w') as outfile:
    for sentence_id in summary_sentences:
        doc_id = sentences_dict[sentence_id]['doc_id']
        wdcount = sentences_dict[sentence_id]['wdcount']
        num = sentences_dict[sentence_id]['num']
        sentence_text = sentences_dict[sentence_id]['sentence_text']
        # Reconstruct the original sentence tag format
        outfile.write(f'<s doc_id="{doc_id}" num="{num}" wdcount="{wdcount}"> {sentence_text}</s>\n')

print(f"\nTop 10% sentences written to {output_file_path}")


Top 10% sentences written to output/d114h_commonword


In [40]:
# Evaluate the result with the summay file has given in DUC_SUM folder

preference_sum_path = "DUC_SUM/d112h"  
with open(preference_sum_path, "r", encoding="utf-8") as file:
    preference_sum_file = file.read()

preference_sum_dict = parse_doc(preference_sum_file)
# compare the summary_sentences with the preference summary 



In [41]:
# Get the set of (doc_id, num) for sentences in preference_sum_dict
preference_keys = set((v['doc_id'], v['num']) for v in preference_sum_dict.values())

# Count how many summary_sentences are present in preference_sum_dict by (doc_id, num)
matched = 0
for sid in summary_sentences:
    sent = sentences_dict[sid]
    if (sent['doc_id'], sent['num']) in preference_keys:
        matched += 1

percentage = (matched / len(preference_sum_dict)) * 100 if preference_sum_dict else 0
print(f"{matched} out of {len(preference_sum_dict)} summary sentences are in the preference summary ({percentage:.2f}%)")

0 out of 11 summary sentences are in the preference summary (0.00%)
