In [57]:
import os
import re
import numpy as np

Read file from DUC_TEXT/test
Create a dictionary save all the metadata of sentences
Create unique sentence_id

In [58]:
# Read file from DUC_TEXT/test

file_path = "DUC_TEXT/test/d118i"  

with open(file_path, "r", encoding="utf-8") as file:
    doc_file = file.read()

# print("Read file from DUC_TEXT/test")
# print(doc_file)

In [59]:
# Create a dictionary save all the metadata of sentences
def parse_doc (doc_file):
    """
    Parse the document file and extract sentences metadata.
    
    Args:
        doc_file (str): The content of the document file.
        
    Returns:
        dict: A dictionary with sentence_id as keys and metadata as values.
    """
    sentences_dict = {}
    # Create unique sentence_id
    # Initialize sentence_id to 0
    sentence_id = 0

    # Find all sentences tags and their content in the document
    sentence_matches = re.findall(r'<s\s+docid="([^"]+)"\s+num="([^"]+)"\s+wdcount="([^"]+)">\s*(.*?)\s*</s>', doc_file, re.DOTALL)

    for doc_id, num, wdcount, sentence_text in sentence_matches:
        sentences_dict[sentence_id] = {
            "doc_id": doc_id,
            "num": num,
            "wdcount": int(wdcount),
            "sentence_text": sentence_text.strip()
        }
        sentence_id += 1    
    return sentences_dict

sentences_dict = parse_doc(doc_file)
# Print the sentences dictionary
print(sentences_dict)
# for sid, metadata in sentences_dict.items():
#     print(f"Sentence ID: {sid}, Metadata: {metadata}")
# print(f"Total sentences processed: {len(sentences_dict)}")

{0: {'doc_id': 'AP890105-0224', 'num': '7', 'wdcount': 19, 'sentence_text': 'Retired Sen. John Tower, the defense secretary-designate, underwent surgery Thursday to remove a colon polyp, a hospital spokesman said.'}, 1: {'doc_id': 'AP890105-0224', 'num': '8', 'wdcount': 18, 'sentence_text': "``Senator Tower is resting comfortably and is listed in fair condition,'' hospital spokesman Steve Habgood said Thursday night."}, 2: {'doc_id': 'AP890105-0224', 'num': '9', 'wdcount': 9, 'sentence_text': "``The senator is expected to make a full recovery''."}, 3: {'doc_id': 'AP890105-0224', 'num': '10', 'wdcount': 32, 'sentence_text': 'Tower, 63, a U.S. senator from Texas for 24 years, was admitted to Baylor University Medical Center on Wednesday after undergoing a colonscopy examination last week that revealed the polyp, Habgood said.'}, 4: {'doc_id': 'AP890105-0224', 'num': '11', 'wdcount': 23, 'sentence_text': "A preliminary biopsy of the polyp showed it was benign, Habgood said, but further re

In [60]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
import string
import json

# nltk.download('stopwords')
# nltk.download('punkt')
# nltk.download('wordnet')

# Create a new dictionary with only sentence_id and fulltext
sentence_text_dict = {
    sentence_id: data['sentence_text']
    for sentence_id, data in sentences_dict.items()
}

# Initialize the stopword list and Lemmatizer
stop_words = set(stopwords.words('english'))
# stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Convert to lowercase
    text = text.lower()
    # Tokenize the text into words
    words = text.split()
    # Remove stopwords and stem words
    # processed_words = [stemmer.stem(word) for word in words if word not in stop_words]
    # Lemmatize words instead of stemming
    processed_words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
   
    # Join the processed words back into a string
    return " ".join(processed_words)

# Process each sentence in the new dictionary
processed_sentence_text_dict = {
    sentence_id: preprocess_text(text)
    for sentence_id, text in sentence_text_dict.items()
}

# Optional: Print the processed dictionary to verify
print(json.dumps(processed_sentence_text_dict, indent=2))

{
  "0": "retired sen john tower defense secretarydesignate underwent surgery thursday remove colon polyp hospital spokesman said",
  "1": "senator tower resting comfortably listed fair condition hospital spokesman steve habgood said thursday night",
  "2": "senator expected make full recovery",
  "3": "tower 63 u senator texas 24 year admitted baylor university medical center wednesday undergoing colonscopy examination last week revealed polyp habgood said",
  "4": "preliminary biopsy polyp showed benign habgood said result would announced friday tower surgeon dr",
  "5": "rd dignan dallas",
  "6": "thursday surgery took approximately three hour habgood said",
  "7": "tower underwent colonscopy examination rectal polyp also discovered removed",
  "8": "test growth showed contain malignant well differentiated cell habgood said",
  "9": "explanation first polyp removal would come tower surgeon unavailable comment thursday night spokesman said",
  "10": "leaving senate 1985 tower went bu

In [61]:


# Calculate TF-IDF for each sentence without using scikit-learn

# Step 1: Calculate term frequency (TF) for each sentence
tf_dict = {}
for sentence_id, text in processed_sentence_text_dict.items():
    words = text.split()
    tf = {}
    for word in words:
        tf[word] = tf.get(word, 0) + 1
    # Normalize by total words in the sentence
    total_words = len(words)
    if total_words > 0:
        for word in tf:
            tf[word] /= total_words
    tf_dict[sentence_id] = tf
# print("Term Frequency (TF) for each sentence:")
# print(json.dumps(tf_dict, indent=2))

# Step 2: Calculate document frequency (DF) for each word
df = {}
for tf in tf_dict.values():
    for word in tf:
        df[word] = df.get(word, 0) + 1
# print("Document Frequency (DF) for each word:")
# print(json.dumps(df, indent=2))

# Step 3: Calculate inverse document frequency (IDF)
N = len(processed_sentence_text_dict)
idf = {}
for word, freq in df.items():
    idf[word] = np.log(N / (freq))

# Step 4: Calculate TF-IDF for each sentence
tf_idf_sentence_dict = {}
for sentence_id, tf in tf_dict.items():
    tf_idf = {}
    for word, tf_value in tf.items():
        tf_idf[word] = tf_value * idf[word]
    tf_idf_sentence_dict[sentence_id] = tf_idf

# Optional: print a sample
# print(json.dumps(tf_idf_sentence_dict, indent=2))

In [62]:
# Create TF-IDF vector space (matrix) for all sentences
# 1. Get all unique words across all sentences
all_words = set()
for tfidf in tf_idf_sentence_dict.values():
    all_words.update(tfidf.keys())
all_words = sorted(all_words)  # consistent order

# 2. Build a matrix: rows=sentences, columns=words, values=tf-idf
word_index = {word: idx for idx, word in enumerate(all_words)}
num_sentences = len(tf_idf_sentence_dict)
num_words = len(all_words)
tf_idf_matrix = np.zeros((num_sentences, num_words))

for sent_id, tfidf in tf_idf_sentence_dict.items():
    for word, value in tfidf.items():
        idx = word_index[word]
        tf_idf_matrix[sent_id, idx] = float(value)

# tf_idf_matrix is now ready for cosine similarity calculation
# print("TF-IDF matrix shape:", tf_idf_matrix.shape)
# print the first 5 rows of the TF-IDF matrix
# print("First 5 rows of the TF-IDF matrix:")
# print(tf_idf_matrix[:5])

In [63]:
# Calculate cosine similarity between all pairs of sentences using the TF-IDF matrix

def cosine_similarity_matrix(matrix):
    # Normalize each row (sentence vector) to unit length
    norm = np.linalg.norm(matrix, axis=1, keepdims=True)
    # Avoid division by zero
    norm[norm == 0] = 1
    normalized_matrix = matrix / norm
    # Cosine similarity is the dot product of normalized vectors
    similarity = np.dot(normalized_matrix, normalized_matrix.T)
    return similarity

cosine_sim_matrix = cosine_similarity_matrix(tf_idf_matrix)
# Optional: print a sample of the similarity matrix
# print("Cosine similarity matrix shape:", cosine_sim_matrix.shape)
# print("First 5x5 block of the cosine similarity matrix:\n", cosine_sim_matrix[:5, :5])

In [64]:
# Define a threshold for cosine similarity to create connections
# threshold = 0.2  # You can adjust this value as needed

# Create a boolean connection matrix: 1 if similarity > threshold and not self, else 0
# connection_matrix = (cosine_sim_matrix > threshold).astype(int)
# np.fill_diagonal(connection_matrix, 0)  # Remove self-connections

# print("Connection matrix shape:", connection_matrix.shape)
# print("First 5x5 block of the connection matrix:\n", connection_matrix[:5, :5])

In [65]:
connection_matrix = cosine_sim_matrix
np.fill_diagonal(connection_matrix, 0)  # Remove self-connections
print(connection_matrix)

[[0.         0.19727112 0.         ... 0.         0.         0.        ]
 [0.19727112 0.         0.03951242 ... 0.         0.         0.        ]
 [0.         0.03951242 0.         ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]]


Calculate pageRank score
The PageRank formula is defined as:

$$
PR(i) = \frac{1 - d}{N} + d \sum_{j \in M(i)} \frac{PR(j)}{deg(j)}
$$

Where:

- $PR(i)$ is the PageRank of node $i$
- $d$ is the damping factor (usually 0.85)
- $N$ is the total number of nodes
- $M(i)$ are nodes linking to $i$
- $L(j)$ is the number of links from node $j$

In [66]:
# Calculate pagerank score
def calculate_pagerank(connection_matrix, d=0.85, max_iterations=100, tolerance=1e-6):    
  num_nodes = len(connection_matrix)
  
  pagerank_scores = np.ones(num_nodes)  # Initialize all PR scores to 1

  connection_matrix = np.array(connection_matrix) # Convert list to numpy array
  transition_matrix = np.zeros_like(connection_matrix, dtype=float) # Initialize transition matrix as numpy array
  row_sum = np.sum(connection_matrix, axis=1)

  for i in range  (num_nodes):
    if row_sum[i] > 0:
      transition_matrix[i,:] = connection_matrix[i,:] / row_sum[i]
    # if row_sum is 0, the row in transition_matrix remains 0

  # print(connection_matrix)
  # print(row_sum)
  # print(pagerank)

  for iteration in range(max_iterations):
    # PageRank formula: PR(A) = (1-d)/N + d * sum(PR(B)/L(B)) for all pages B pointing to A
    # In matrix form: new_pagerank = (1-d)/N + d * transition_matrix.T @ pagerank
    # new_pagerank = np.full(num_nodes, (1 - damping_factor) / num_nodes) + \
    #                    damping_factor * np.dot(pagerank, transition_matrix) # Corrected matrix multiplication order
    new_pagerank_scores = (1 - d) / num_nodes + d * transition_matrix.T @ pagerank_scores
    # Check for convergence
    if np.linalg.norm(new_pagerank_scores - pagerank_scores, ord=1) < tolerance:
            print(f"PageRank converged after {iteration + 1} iterations.")
            break
    # print(f'Interation {iteration+1}')
    # print(new_pagerank)
    pagerank_scores = new_pagerank_scores

  return pagerank_scores
pagerank_scores = calculate_pagerank(connection_matrix)
# Print the PageRank scores
# print("PageRank Scores:")
# for i, score in enumerate(pagerank_scores):
#     print(f"Sentence ID {i}: {score:.4f} - {sentences_dict[i]['sentence_text']}")    
    
                                    
    

Get the 10% highest score sentences to create a summary

In [67]:
# Get the 10% highest score sentences to create a summary
summary_sentences = sorted(range(len(pagerank_scores)), key=lambda i: pagerank_scores[i], reverse=True)[:int(0.1 * len(pagerank_scores))]
# print length of summary sentences
print(f"Number of summary sentences: {len(summary_sentences)}")

# print("Summary Sentences:")
# for i in summary_sentences:
#     print(f"Sentence ID {i}: {pagerank_scores[i]:.8f} - {sentences_dict[i]['sentence_text']}")
# summary = [sentence_dict[i]['sentence_text'] for i in summary_sentences]
# print("\nSummary Sentences:")
# for sentence in summary:
#     print(sentence)


Number of summary sentences: 37


In [68]:
# Create output directory if it doesn't exist
output_dir = 'output'
os.makedirs(output_dir, exist_ok=True)

# Get the input file name without the extension
input_filename = os.path.splitext(os.path.basename(file_path))[0]

# Define the output file path
output_file_path = os.path.join(output_dir, f'{input_filename}_cosine')

# Write the top sentences to the output file in the desired format
with open(output_file_path, 'w') as outfile:
    for sentence_id in summary_sentences:
        doc_id = sentences_dict[sentence_id]['doc_id']
        wdcount = sentences_dict[sentence_id]['wdcount']
        num = sentences_dict[sentence_id]['num']
        sentence_text = sentences_dict[sentence_id]['sentence_text']
        # Reconstruct the original sentence tag format
        outfile.write(f'<s doc_id="{doc_id}" num="{num}" wdcount="{wdcount}"> {sentence_text}</s>\n')

print(f"\nTop 10% sentences written to {output_file_path}")


Top 10% sentences written to output/d118i_cosine


In [69]:
# Evaluate the result with the summay file has given in DUC_SUM folder

preference_sum_path = "DUC_SUM/d118i"  
with open(preference_sum_path, "r", encoding="utf-8") as file:
    preference_sum_file = file.read()

preference_sum_dict = parse_doc(preference_sum_file)
print("Preference Summary Sentences:")
for sid, metadata in preference_sum_dict.items():
    print(f"Sentence ID: {sid}, Metadata: {metadata}")  




Preference Summary Sentences:
Sentence ID: 0, Metadata: {'doc_id': 'SJMN91-06097121', 'num': '7', 'wdcount': 28, 'sentence_text': 'Former Sen. John Tower of Texas and his daughter were among 23 people killed Friday when the twin-engine commuter plane they were aboard crashed while trying to land.'}
Sentence ID: 1, Metadata: {'doc_id': 'LA020189-0050', 'num': '19', 'wdcount': 26, 'sentence_text': 'Tower, 63 and twice-divorced, served 24 years in the Senate from Texas, including a tenure as chairman of the Armed Services Committee from 1981 to 1984.'}
Sentence ID: 2, Metadata: {'doc_id': 'AP890309-0143', 'num': '11', 'wdcount': 29, 'sentence_text': 'The Democratic-controlled Senate today rejected the nomination of former Texas Sen. John Tower as defense secretary, delivering a major rebuke to President Bush just 49 days into his term.'}
Sentence ID: 3, Metadata: {'doc_id': 'AP890309-0143', 'num': '13', 'wdcount': 33, 'sentence_text': "It was only the ninth time in 200 years that the Sena

In [70]:
# Get the set of (doc_id, num) for sentences in preference_sum_dict
preference_keys = set((v['doc_id'], v['num']) for v in preference_sum_dict.values())

print(f"Preference keys: {preference_keys}")  

# Count how many summary_sentences are present in preference_sum_dict by (doc_id, num)
matched = 0
for sid in summary_sentences:
    sent = sentences_dict[sid]
    # print(f"Checking sentence ID {sid}: {sent['doc_id']}, {sent['num']}, {sent['sentence_text']}")
    if (sent['doc_id'], sent['num']) in preference_keys:
        print(f"Matched: {sent['doc_id']}, {sent['num']}, {sent['sentence_text']}")
        matched += 1
# Print the number of matched sentences
print (f"Number of matched sentences: {matched}")
# Calculate recall and precision
recall_percentage = (matched / len(preference_sum_dict)) * 100 if preference_sum_dict else 0
precision_percentage = (matched / len(summary_sentences)) * 100 if summary_sentences else 0
# Print recall and precision
print(f"Recall: {recall_percentage:.2f}%")
print(f"Precision: {precision_percentage:.2f}%")
# Calculate F1 score
f1_percentage = (2 * recall_percentage * precision_percentage) / (recall_percentage + precision_percentage) if (recall_percentage + precision_percentage) > 0 else 0
print(f"F1 Score: {f1_percentage:.2f}%")
# print(f"{matched} out of {len(preference_sum_dict)} summary sentences are in the preference summary ({percentage:.2f}%)")

Preference keys: {('SJMN91-06097121', '7'), ('LA020489-0035', '42'), ('AP890211-0110', '12'), ('LA020189-0050', '20'), ('LA020189-0050', '19'), ('AP890316-0012', '37'), ('AP890309-0143', '11'), ('LA021189-0003', '58'), ('AP890316-0012', '36'), ('AP890309-0143', '13'), ('LA021189-0003', '19'), ('LA021189-0003', '20')}
Matched: AP890211-0110, 12, The Senate Armed Services Committee has put the nomination of Tower, a former Texas senator and onetime chairman of the committee, on hold while it takes a second look at Tower's personal habits, including his use of alcohol, and his links to defense contractors.
Matched: AP890309-0143, 11, The Democratic-controlled Senate today rejected the nomination of former Texas Sen. John Tower as defense secretary, delivering a major rebuke to President Bush just 49 days into his term.
Matched: LA020189-0050, 19, Tower, 63 and twice-divorced, served 24 years in the Senate from Texas, including a tenure as chairman of the Armed Services Committee from 1981