In [9]:
from google.colab import drive
drive.mount('/content/mydrive')

Mounted at /content/mydrive


In [10]:
import os

repo_path = '/content/optimized-summarization'
if not os.path.exists(repo_path):
    !git clone https://github.com/srinisvas/optimized-summarization.git
else:
    print("Repo already exists, skipping clone.")

# Check files inside
os.listdir(repo_path)


Repo already exists, skipping clone.


['.git', '.idea', 'optimized-summarization', 'README.md']

In [23]:
import os
  # Adjust your folder
INPUT_DIR = '/content/optimized-summarization/optimized-summarization/Normalized-papers/'
TF_IDF_OUTPUT_DIR = '/content/drive/MyDrive/TF-ID_similarity_matrix/'
TEXTRANK_OUTPUT_DIR = '/content/drive/MyDrive/Textrank_scores/'
BERTSUM_OUTPUT_DIR = '/content/drive/MyDrive/BERTSUM_final_selection/'

os.makedirs(TF_IDF_OUTPUT_DIR, exist_ok=True)
os.makedirs(TEXTRANK_OUTPUT_DIR, exist_ok=True)

In [12]:
import json
import spacy

nlp = spacy.load("en_core_web_sm")

def extract_sentences(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        paper = json.load(f)
    sentences = []
    for sec in paper['sections']:
        doc = nlp(sec['content'])
        sentences.extend([sent for sent in doc.sents])
    return sentences


In [13]:
def clean_sentences(sentences):
    sentences_cleaned = []
    for sent in sentences:
        words = [word.text for word in sent if not word.is_punct]
        sentences_cleaned.append(" ".join(words))
    return sentences_cleaned


In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def compute_tfidf_similarity(sentences_cleaned):
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(sentences_cleaned)
    similarity_matrix = cosine_similarity(tfidf_matrix)
    return tfidf_matrix, similarity_matrix


In [15]:
import networkx as nx

def compute_textrank(similarity_matrix, damping=0.85, max_iter=100):
    nx_graph = nx.from_numpy_array(similarity_matrix)
    scores = nx.pagerank(nx_graph, alpha=damping, max_iter=max_iter)
    # Convert to list sorted by sentence index
    textrank_scores = [scores[i] for i in range(len(scores))]
    return textrank_scores


In [29]:
def mmr_selection(sentences, textrank_scores, similarity_matrix, K=10, lam=0.7):
    """
    Selects K sentences using Maximal Marginal Relevance (MMR) to balance relevance
    (TextRank) and diversity (similarity to already selected sentences).
    """
    if not sentences or len(sentences) == 0:
        return []

    # Ensure K does not exceed the number of available sentences
    K = min(K, len(sentences))

    selected = []
    candidate_idx = list(range(len(sentences)))

    for _ in range(K):
        mmr_score = []
        for i in candidate_idx:
            if not selected:
                # First iteration: use pure TextRank score
                mmr_score.append((i, textrank_scores[i]))
            else:
                # Subsequent iterations: calculate MMR
                max_sim = max(similarity_matrix[i][j] for j in selected)
                mmr_val = lam * textrank_scores[i] - (1-lam) * max_sim
                mmr_score.append((i, mmr_val))

        # Select the sentence with the highest MMR score
        mmr_score.sort(key=lambda x: x[1], reverse=True)
        best_idx = mmr_score[0][0]

        selected.append(best_idx)
        candidate_idx.remove(best_idx)

    # Return the *original* sentences (as strings) based on the selected indices
    selected_sentences = [str(sentences[i]) for i in selected]
    return selected_sentences

# --- NEW BERTSUM FUNCTION FOR FINAL RERANKING ---

# Constant for the final number of sentences (e.g., summary size)
FINAL_SUMMARY_SENTENCES = 20


In [17]:
!pip install bert-extractive-summarizer transformers



In [30]:
# Process all files
for file_name in os.listdir(INPUT_DIR):
    if file_name.endswith(".json"):
        file_path = os.path.join(INPUT_DIR, file_name)

        # Extract and clean sentences

        sentences = extract_sentences(file_path)
        sentences_cleaned = clean_sentences(sentences)

        # TF-IDF and similarity
        tfidf_matrix, similarity_matrix = compute_tfidf_similarity(sentences_cleaned)

        # Save TF-IDF similarity matrix
        tfidf_file = os.path.join(TF_IDF_OUTPUT_DIR, file_name.replace('.json', '_similarity.npy'))
        np.save(tfidf_file, similarity_matrix)



        # Compute TextRank scores
        textrank_scores = compute_textrank(similarity_matrix)

        # Save TextRank scores
        tr_file = os.path.join(TEXTRANK_OUTPUT_DIR, file_name.replace('.json', '_TextRank.txt'))
        with open(tr_file, 'w', encoding='utf-8') as f:
            for score in textrank_scores:
                f.write(f"{score:.6f}\n")

        # 4. MMR Selection (Sparse Input Activation - Diversity Stage)
        # K_mmr sets the size of the candidate pool (e.g., 30% of total sentences)
        K_mmr = max(FINAL_SUMMARY_SENTENCES, int(len(sentences) * 0.30))

        mmr_candidate_sentences = mmr_selection(
            sentences,
            textrank_scores,
            similarity_matrix,
            K=K_mmr,
            lam=0.7 # Balance: 70% relevance, 30% diversity
        )

        print(f"-> TextRank/TFIDF done. MMR selected {len(mmr_candidate_sentences)} candidates.")

        # 5. BERTSum Final Selection (Semantic Validation Stage)
        # Assuming paper_id is defined within the loop
        paper_id = file_name.replace('.json', '')
        bertsum_file = os.path.join(BERTSUM_OUTPUT_DIR, f'{paper_id}_final_extraction.txt')


        final_extracted_sentences = select_final_sentences_bertsum(
            mmr_candidate_sentences,
            final_num_sentences=FINAL_SUMMARY_SENTENCES
        )

        # 6. Save final output
        with open(bertsum_file, 'w', encoding='utf-8') as f:
            f.write("\n".join(final_extracted_sentences))

        print(f"-> BERTSum complete. Final {len(final_extracted_sentences)} sentences saved to {bertsum_file}")

print("\nPipeline execution finished for all documents.")


print(f"Processed {file_name} -> TF-IDF and TextRank saved")

-> TextRank/TFIDF done. MMR selected 25 candidates.
-> BERTSum complete. Final 20 sentences saved to /content/drive/MyDrive/BERTSUM_final_selection/Computer Science Ethics Education in Australia - A Work in Progress_final_extraction.txt
-> TextRank/TFIDF done. MMR selected 72 candidates.
-> BERTSum complete. Final 19 sentences saved to /content/drive/MyDrive/BERTSUM_final_selection/Are science- technology- and engineering now the most important subjects for ethics- Our need to respond_final_extraction.txt
-> TextRank/TFIDF done. MMR selected 21 candidates.
-> BERTSum complete. Final 19 sentences saved to /content/drive/MyDrive/BERTSUM_final_selection/Efficient Framework Approach to Extract Privacy Issues in Cloud Computing_final_extraction.txt
-> TextRank/TFIDF done. MMR selected 30 candidates.
-> BERTSum complete. Final 20 sentences saved to /content/drive/MyDrive/BERTSUM_final_selection/The Ethics of AI in Literature Reflections On Representation and Responsibility_final_extraction.t

In [31]:
# --- Define the paths based on your pipeline file ---
# Path to your BERTSum output folder (copied from pipeline_with_bertsum.py)
BERTSUM_OUTPUT_DIR = '/content/drive/MyDrive/BERTSUM_final_selection/'
output_folder = BERTSUM_OUTPUT_DIR

# Path for the zip file
output_zip = '/content/drive/MyDrive/BERTSUM_final_selection.zip'

# Check if the folder exists before attempting to zip
if not os.path.exists(output_folder):
    print(f"Error: Output folder not found at {output_folder}")
else:
    # Zip the folder. shutil.make_archive takes (base_name, format, root_dir)
    # The base_name is the full path/filename without extension.
    print(f"Zipping folder: {output_folder}...")
    shutil.make_archive(output_zip.replace('.zip',''), 'zip', output_folder)
    print(f"Zip created successfully at {output_zip}")

    # Download the zip
    files.download(output_zip)
    print("Download initiated.")


Zipping folder: /content/drive/MyDrive/BERTSUM_final_selection/...
Zip created successfully at /content/drive/MyDrive/BERTSUM_final_selection.zip


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Download initiated.
