In [43]:
import os
import re
import numpy as np

Read the input file from DUC_TEXT/test
Create a dictionary that stores the metadata of every sentence
Assign a unique sentence_id to each sentence

In [44]:
# Load the raw DUC document whose sentences will be ranked.
# `file_path` is reused further down to derive the name of the output file.
file_path = "DUC_TEXT/test/d113h"

with open(file_path, encoding="utf-8") as source_handle:
    doc_file = source_handle.read()

In [45]:
# Create a dictionary save all the metadata of sentences
def parse_doc(doc_file):
    """
    Extract every ``<s ...>`` sentence element from a DUC-style document.

    Args:
        doc_file (str): Full text of the document.

    Returns:
        dict: Maps a running integer id (0, 1, 2, ... in order of appearance)
        to a dict with the keys ``doc_id``, ``num``, ``wdcount`` (int) and
        ``sentence_text`` (whitespace-trimmed).
    """
    # A tag only matches when its attributes appear exactly in the order
    # docid, num, wdcount; DOTALL lets a sentence span several lines.
    sentence_pattern = re.compile(
        r'<s\s+docid="([^"]+)"\s+num="([^"]+)"\s+wdcount="([^"]+)">\s*(.*?)\s*</s>',
        re.DOTALL,
    )

    return {
        position: {
            "doc_id": match.group(1),
            "num": match.group(2),
            "wdcount": int(match.group(3)),
            "sentence_text": match.group(4).strip(),
        }
        for position, match in enumerate(sentence_pattern.finditer(doc_file))
    }

# Parse the loaded document; the keys are the running sentence ids assigned by parse_doc.
sentences_dict = parse_doc(doc_file)
# Dump the whole dictionary for a quick visual check (the output can be very long).
print(sentences_dict)
# Alternative, more readable inspection (disabled):
# for sid, metadata in sentences_dict.items():
#     print(f"Sentence ID: {sid}, Metadata: {metadata}")
# print(f"Total sentences processed: {len(sentences_dict)}")

{0: {'doc_id': 'FT931-2858', 'num': '6', 'wdcount': 24, 'sentence_text': 'The 29-storey Jijobhoy Towers, home of the Bombay Stock Exchange, was rocked by two powerful bombs last Friday, leaving 60 dead and 200 injured.'}, 1: {'doc_id': 'FT931-2858', 'num': '7', 'wdcount': 12, 'sentence_text': 'The market reopened on Monday in an attempt to return to normal.'}, 2: {'doc_id': 'FT931-2858', 'num': '8', 'wdcount': 30, 'sentence_text': 'It has had to compromise - trading has been restricted to an hour a day, and dealing has moved back to the traditional ring, where it stopped a year ago.'}, 3: {'doc_id': 'FT931-2858', 'num': '9', 'wdcount': 17, 'sentence_text': 'Dealers seemed to like this, the BSE index rising by 5.6 per cent on Monday and Tuesday.'}, 4: {'doc_id': 'FT931-2858', 'num': '10', 'wdcount': 23, 'sentence_text': 'But another bomb blast in Calcutta triggered nervous selling yesterday, and the index came back by 50.62, or 2.1 per cent, to 2,409.23.'}, 5: {'doc_id': 'FT931-2858', '

In [46]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
import string
import json

# nltk.download('stopwords')
# nltk.download('punkt')
# nltk.download('wordnet')

# Keep only the raw text of each sentence, keyed by its sentence id.
sentence_text_dict = {sid: meta['sentence_text'] for sid, meta in sentences_dict.items()}

# Shared resources for preprocess_text: a WordNet lemmatizer (used instead of a
# stemmer) and the English stop-word list as a set for fast membership tests.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """
    Normalise one sentence for TF-IDF: strip punctuation, lowercase,
    drop English stop words and lemmatize what remains.

    Relies on the module-level ``stop_words`` set and ``lemmatizer``.

    Args:
        text (str): Raw sentence text.

    Returns:
        str: The surviving lemmas joined by single spaces.
    """
    # Punctuation is deleted outright, which glues the pieces of tokens such as
    # "29-storey" or "5.6" together ("29storey", "56").
    punctuation_table = str.maketrans('', '', string.punctuation)
    tokens = text.translate(punctuation_table).lower().split()

    lemmas = []
    for token in tokens:
        if token in stop_words:
            continue
        lemmas.append(lemmatizer.lemmatize(token))
    return " ".join(lemmas)

# Run the normalisation over every sentence, keeping the sentence ids as keys.
processed_sentence_text_dict = {
    sid: preprocess_text(raw_text) for sid, raw_text in sentence_text_dict.items()
}

# Pretty-print for inspection; json.dumps renders the integer ids as strings.
print(json.dumps(processed_sentence_text_dict, indent=2))

{
  "0": "29storey jijobhoy tower home bombay stock exchange rocked two powerful bomb last friday leaving 60 dead 200 injured",
  "1": "market reopened monday attempt return normal",
  "2": "compromise trading restricted hour day dealing moved back traditional ring stopped year ago",
  "3": "dealer seemed like bse index rising 56 per cent monday tuesday",
  "4": "another bomb blast calcutta triggered nervous selling yesterday index came back 5062 21 per cent 240923",
  "5": "taken place volatile background bombay equity market past six week contend first unusually bullish speculation national budget february 27 subsequently savage expression disappointment",
  "6": "dr manmohan singh indian finance minister disappointed trader budget end last month",
  "7": "postponed corporate tax reform year denying hope tax cut recommended expert group",
  "8": "reduce excise import duty broad front left cement steel sector share price boosted prebudget speculation",
  "9": "top dr singh refused equ

In [47]:


# Hand-rolled TF-IDF (no scikit-learn); every sentence is treated as one "document".

# Step 1: term frequency = occurrences of a word / number of words in the sentence.
tf_dict = {}
for sentence_id, text in processed_sentence_text_dict.items():
    words = text.split()
    counts = {}
    for word in words:
        counts.setdefault(word, 0)
        counts[word] += 1
    # An empty sentence has no counts, so the division below is never reached for it.
    tf_dict[sentence_id] = {word: count / len(words) for word, count in counts.items()}

# Step 2: document frequency = number of sentences that contain the word.
df = {}
for sentence_tf in tf_dict.values():
    for word in sentence_tf:
        df[word] = 1 + df.get(word, 0)

# Step 3: inverse document frequency, log(N / df); a word found in every sentence gets 0.
N = len(processed_sentence_text_dict)
idf = {word: np.log(N / freq) for word, freq in df.items()}

# Step 4: weight each sentence's term frequencies by the word's IDF.
tf_idf_sentence_dict = {
    sentence_id: {word: tf_value * idf[word] for word, tf_value in sentence_tf.items()}
    for sentence_id, sentence_tf in tf_dict.items()
}

In [48]:
# Turn the per-sentence TF-IDF dictionaries into a dense matrix
# (one row per sentence, one column per vocabulary word).

# 1. Vocabulary: every word seen in any sentence, sorted so the column order is reproducible.
all_words = sorted({word for tfidf in tf_idf_sentence_dict.values() for word in tfidf})

# 2. Column lookup and an all-zero matrix of the right shape.
word_index = dict(zip(all_words, range(len(all_words))))
num_sentences = len(tf_idf_sentence_dict)
num_words = len(all_words)
tf_idf_matrix = np.zeros((num_sentences, num_words))

# 3. Fill in the non-zero weights. The sentence id doubles as the row index,
#    which works because parse_doc numbers the sentences 0, 1, 2, ... without gaps.
for sent_id, tfidf in tf_idf_sentence_dict.items():
    for word, value in tfidf.items():
        tf_idf_matrix[sent_id, word_index[word]] = float(value)

In [49]:
# Calculate cosine similarity between all pairs of sentences using the TF-IDF matrix

def cosine_similarity_matrix(matrix):
    """
    Pairwise cosine similarity between the rows of a 2-D array.

    Args:
        matrix (np.ndarray): One vector per row.

    Returns:
        np.ndarray: Square array whose entry [i, j] is the cosine similarity
        of rows i and j. All-zero rows are left as zero vectors, so their
        similarity to every row (themselves included) is 0.
    """
    row_lengths = np.linalg.norm(matrix, axis=1, keepdims=True)
    # Substitute 1 for zero lengths so the division is safe; a zero row stays zero.
    safe_lengths = np.where(row_lengths == 0, 1, row_lengths)
    unit_rows = matrix / safe_lengths
    return unit_rows @ unit_rows.T

# Sentence-by-sentence similarity: entry [i, j] compares rows i and j of tf_idf_matrix.
cosine_sim_matrix = cosine_similarity_matrix(tf_idf_matrix)
# Optional: print a sample of the similarity matrix
# print("Cosine similarity matrix shape:", cosine_sim_matrix.shape)
# print("First 5x5 block of the cosine similarity matrix:\n", cosine_sim_matrix[:5, :5])

In [50]:
# Two sentences are linked when their cosine similarity is strictly above this cut-off.
threshold = 0.2  # tunable

# Unweighted adjacency matrix of the sentence graph: 1 = linked, 0 = not linked.
connection_matrix = np.greater(cosine_sim_matrix, threshold).astype(int)
# Clear the diagonal so that no sentence is linked to itself.
np.fill_diagonal(connection_matrix, 0)

Calculate PageRank scores
The PageRank formula is defined as:

$$
PR(i) = \frac{1 - d}{N} + d \sum_{j \in M(i)} \frac{PR(j)}{L(j)}
$$

Where:

- $PR(i)$ is the PageRank of node $i$
- $d$ is the damping factor (usually 0.85)
- $N$ is the total number of nodes
- $M(i)$ are nodes linking to $i$
- $L(j)$ is the number of links from node $j$

In [51]:
# Calculate pagerank score
def calculate_pagerank(connection_matrix, d=0.85, max_iterations=100, tolerance=1e-6):
    """
    Score the nodes of a graph with PageRank using power iteration.

    Each update applies PR(i) = (1 - d) / N + d * sum_j PR(j) / L(j), where the
    sum runs over the nodes j that link to i and L(j) is the total weight of
    the links leaving j.

    Args:
        connection_matrix (array-like): Square adjacency matrix; entry [j, i]
            is non-zero when node j links to node i.
        d (float): Damping factor.
        max_iterations (int): Upper bound on the number of update steps.
        tolerance (float): Iteration stops once the L1 distance between two
            consecutive score vectors falls below this value.

    Returns:
        np.ndarray: One score per node, in row order. Empty for an empty graph.
    """
    adjacency = np.asarray(connection_matrix, dtype=float)
    num_nodes = adjacency.shape[0]
    if num_nodes == 0:
        # Nothing to rank; also avoids dividing by zero in (1 - d) / N below.
        return np.zeros(0)

    # Row-normalise so that row j spreads node j's score evenly over its links.
    # Nodes without outgoing links keep an all-zero row, i.e. they pass no score on.
    out_degree = adjacency.sum(axis=1, keepdims=True)
    transition_matrix = np.divide(
        adjacency,
        out_degree,
        out=np.zeros_like(adjacency),
        where=out_degree > 0,
    )

    # Start from the uniform distribution. With the (1 - d) / N teleport term the
    # final scores sum to at most 1, so this starts far closer to the fixed point
    # than an all-ones vector and large graphs converge within max_iterations.
    pagerank_scores = np.full(num_nodes, 1.0 / num_nodes)
    teleport = (1 - d) / num_nodes

    for iteration in range(max_iterations):
        new_pagerank_scores = teleport + d * (transition_matrix.T @ pagerank_scores)
        change = np.linalg.norm(new_pagerank_scores - pagerank_scores, ord=1)
        # Store the newest iterate before testing for convergence so that the
        # most accurate vector is the one returned.
        pagerank_scores = new_pagerank_scores
        if change < tolerance:
            print(f"PageRank converged after {iteration + 1} iterations.")
            break

    return pagerank_scores
# Rank every sentence; element i of the result belongs to row i of connection_matrix.
pagerank_scores = calculate_pagerank(connection_matrix)
# Print the PageRank scores (disabled)
# print("PageRank Scores:")
# for i, score in enumerate(pagerank_scores):
#     print(f"Sentence ID {i}: {score:.4f} - {sentences_dict[i]['sentence_text']}")
    
                                    
    

Select the 10% highest-scoring sentences to create a summary

In [52]:
# Pick the highest-scoring ~10% of the sentences for the summary.
summary_ratio = 0.1
num_ranked = len(pagerank_scores)
# int() truncates, so a document with fewer than 10 sentences would otherwise get
# an empty summary; keep at least one sentence whenever there is anything to rank.
summary_size = max(1, int(summary_ratio * num_ranked)) if num_ranked else 0

# Sentence indices ordered by descending score; sorted() is stable, so tied
# sentences keep their original document order.
ranked_sentence_ids = sorted(range(num_ranked), key=lambda i: pagerank_scores[i], reverse=True)
summary_sentences = ranked_sentence_ids[:summary_size]

print(f"Number of summary sentences: {len(summary_sentences)}")


Number of summary sentences: 15


In [53]:
# Make sure the destination folder exists.
output_dir = 'output'
os.makedirs(output_dir, exist_ok=True)

# Name the summary after the input document, e.g. "d113h" -> "output/d113h_cosine".
input_filename = os.path.splitext(os.path.basename(file_path))[0]
output_file_path = os.path.join(output_dir, f'{input_filename}_cosine')

# Write one <s> element per selected sentence, in the order of summary_sentences.
# - The encoding is pinned to UTF-8 to match how the input was read, instead of
#   depending on the platform default.
# - The attribute is spelled "docid" (not "doc_id") because that is the spelling
#   parse_doc's pattern matches in the source documents, so the written file can
#   be parsed again with the same function.
with open(output_file_path, 'w', encoding='utf-8') as outfile:
    for sentence_id in summary_sentences:
        meta = sentences_dict[sentence_id]
        doc_id, num, wdcount = meta['doc_id'], meta['num'], meta['wdcount']
        sentence_text = meta['sentence_text']
        outfile.write(f'<s docid="{doc_id}" num="{num}" wdcount="{wdcount}"> {sentence_text}</s>\n')

print(f"\nTop 10% sentences written to {output_file_path}")


Top 10% sentences written to output/d113h_cosine


In [54]:
# Load the reference summary provided in the DUC_SUM folder for the same
# document and parse it with the routine used for the source text.
preference_sum_path = "DUC_SUM/d113h"
with open(preference_sum_path, encoding="utf-8") as summary_handle:
    preference_sum_file = summary_handle.read()
preference_sum_dict = parse_doc(preference_sum_file)

# List the reference sentences, one per line.
print("Preference Summary Sentences:")
for ref_id in preference_sum_dict:
    print(f"Sentence ID: {ref_id}, Metadata: {preference_sum_dict[ref_id]}")




Preference Summary Sentences:
Sentence ID: 0, Metadata: {'doc_id': 'FT931-3664', 'num': '4', 'wdcount': 23, 'sentence_text': "FT 15 MAR 93 / Bombay stays calm in the face of horror: The city seems determined to put Friday's outrage behind it"}
Sentence ID: 1, Metadata: {'doc_id': 'FT931-3764', 'num': '6', 'wdcount': 31, 'sentence_text': 'THE INDIAN government flew federal paramilitary troops into Bombay last night after more than a dozen bombs exploded in the city, killing about 200 people and injuring more than a thousand.'}
Sentence ID: 2, Metadata: {'doc_id': 'FT931-2858', 'num': '6', 'wdcount': 24, 'sentence_text': 'The 29-storey Jijobhoy Towers, home of the Bombay Stock Exchange, was rocked by two powerful bombs last Friday, leaving 60 dead and 200 injured.'}
Sentence ID: 3, Metadata: {'doc_id': 'FT931-2858', 'num': '7', 'wdcount': 12, 'sentence_text': 'The market reopened on Monday in an attempt to return to normal.'}
Sentence ID: 4, Metadata: {'doc_id': 'FT931-2858', 'num': '9',

In [55]:
# Identify each reference sentence by its (doc_id, num) pair.
preference_keys = {(ref['doc_id'], ref['num']) for ref in preference_sum_dict.values()}

print(f"Preference keys: {preference_keys}")

# Walk the generated summary in rank order and report every sentence that also
# occurs in the reference summary.
matched = 0
for sid in summary_sentences:
    sent = sentences_dict[sid]
    if (sent['doc_id'], sent['num']) not in preference_keys:
        continue
    print(f"Matched: {sent['doc_id']}, {sent['num']}, {sent['sentence_text']}")
    matched += 1
print (f"Number of matched sentences: {matched}")

# Recall = matched / reference size, precision = matched / summary size, both as
# percentages; each falls back to 0 when its denominator collection is empty.
if preference_sum_dict:
    recall_percentage = (matched / len(preference_sum_dict)) * 100
else:
    recall_percentage = 0
if summary_sentences:
    precision_percentage = (matched / len(summary_sentences)) * 100
else:
    precision_percentage = 0
print(f"Recall: {recall_percentage:.2f}%")
print(f"Precision: {precision_percentage:.2f}%")

# F1 is the harmonic mean of precision and recall (0 when both are 0).
if (recall_percentage + precision_percentage) > 0:
    f1_percentage = (2 * recall_percentage * precision_percentage) / (recall_percentage + precision_percentage)
else:
    f1_percentage = 0
print(f"F1 Score: {f1_percentage:.2f}%")

Preference keys: {('FT931-4176', '8'), ('FT931-2858', '28'), ('FT931-3664', '4'), ('FT931-3764', '12'), ('FT931-2981', '16'), ('FT931-2858', '6'), ('FT931-2858', '16'), ('FT931-2858', '20'), ('FT931-3764', '6'), ('FT931-2858', '10'), ('FT931-2858', '9'), ('FT931-2858', '7'), ('FT931-2858', '12'), ('FT931-2858', '11')}
Matched: FT931-2858, 16, He lowered the tax to 30 per cent from 65 per cent for expatriate Indians and foreign investors but for local investors it remained at 44.5 per cent.
Matched: FT931-2858, 12, Dr Manmohan Singh, the Indian finance minister, disappointed traders in the budget at the end of last month.
Number of matched sentences: 2
Recall: 14.29%
Precision: 13.33%
F1 Score: 13.79%
