In [25]:
# Mount Google Drive at /content/drive; later cells write their outputs under /content/drive/MyDrive/.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [26]:
import os

repo_path = '/content/optimized-summarization'
if not os.path.exists(repo_path):
    !git clone https://github.com/srinisvas/optimized-summarization.git
else:
    print("Repo already exists, skipping clone.")

# Check files inside
os.listdir(repo_path)


Repo already exists, skipping clone.


['.git',
 'README.md',
 'optimized-summarization',
 'TF-ID_similarity_matrix',
 '.idea']

In [35]:
import os
  # Adjust your folder
# Source folder with the paper JSON files, located inside the cloned repo.
INPUT_DIR = '/content/optimized-summarization/optimized-summarization/Normalized-papers/'
# Destination folders on the mounted Drive for the generated artifacts.
TF_IDF_OUTPUT_DIR = '/content/drive/MyDrive/TF-ID_similarity_matrix/'
TEXTRANK_OUTPUT_DIR = '/content/drive/MyDrive/Textrank_scores/'

# Create both output folders up front; exist_ok keeps this cell re-runnable.
for _output_dir in (TF_IDF_OUTPUT_DIR, TEXTRANK_OUTPUT_DIR):
    os.makedirs(_output_dir, exist_ok=True)

In [29]:
import json
import spacy

# Load the spaCy `en_core_web_sm` pipeline once; extract_sentences() reuses it for sentence splitting.
nlp = spacy.load("en_core_web_sm")

def extract_sentences(file_path):
    """Split every section of a paper JSON file into spaCy sentence spans.

    Args:
        file_path: Path to a UTF-8 JSON file holding a ``sections`` list whose
            items each carry a ``content`` entry.

    Returns:
        A flat list of the sentence spans yielded by ``doc.sents`` for each
        section, in section order.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        paper = json.load(handle)

    collected = []
    for section in paper['sections']:
        # The module-level `nlp` pipeline performs the sentence segmentation.
        collected.extend(nlp(section['content']).sents)
    return collected


In [30]:
def clean_sentences(sentences):
    """Turn tokenized sentences into plain strings with punctuation removed.

    Args:
        sentences: Iterable of sentences; each sentence is an iterable of
            tokens exposing ``.text`` and ``.is_punct``.

    Returns:
        A list with one string per input sentence: the texts of its
        non-punctuation tokens joined by single spaces.
    """
    return [
        " ".join(token.text for token in sentence if not token.is_punct)
        for sentence in sentences
    ]


In [31]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def compute_tfidf_similarity(sentences_cleaned):
    """Vectorize sentences with TF-IDF and compare every pair of them.

    Args:
        sentences_cleaned: Collection of sentence strings to vectorize.

    Returns:
        Tuple ``(tfidf_matrix, similarity_matrix)``: the result of
        ``TfidfVectorizer().fit_transform`` on the input, and the pairwise
        cosine similarities between the rows of that matrix.
    """
    tfidf_matrix = TfidfVectorizer().fit_transform(sentences_cleaned)
    return tfidf_matrix, cosine_similarity(tfidf_matrix)


In [32]:
import networkx as nx

def compute_textrank(similarity_matrix, damping=0.85, max_iter=100):
    """Score sentences by running PageRank over the similarity graph.

    Args:
        similarity_matrix: Array handed to ``nx.from_numpy_array`` as the
            graph's adjacency matrix.
        damping: Forwarded to ``nx.pagerank`` as ``alpha``.
        max_iter: Forwarded to ``nx.pagerank`` as ``max_iter``.

    Returns:
        List of scores where position ``i`` holds the score of node ``i``.
    """
    graph = nx.from_numpy_array(similarity_matrix)
    rank_by_node = nx.pagerank(graph, alpha=damping, max_iter=max_iter)
    # Look scores up by node index so the list order follows the matrix rows.
    return [rank_by_node[node] for node in range(len(rank_by_node))]


In [36]:
# Process all files
# For every paper JSON in INPUT_DIR: split it into sentences, save the pairwise
# TF-IDF cosine-similarity matrix (.npy) and the per-sentence TextRank scores
# (.txt, one score per line).
# sorted() keeps the processing order and the log reproducible, because
# os.listdir() returns entries in arbitrary order.
for file_name in sorted(os.listdir(INPUT_DIR)):
    if not file_name.endswith(".json"):
        continue

    file_path = os.path.join(INPUT_DIR, file_name)
    # Strip only the trailing extension; str.replace('.json', ...) would also
    # rewrite a '.json' that occurs in the middle of a file name.
    base_name = os.path.splitext(file_name)[0]

    # Extract and clean sentences
    sentences = extract_sentences(file_path)
    sentences_cleaned = clean_sentences(sentences)

    # A paper without any sentence text cannot be vectorized (TfidfVectorizer
    # raises on an empty vocabulary); skip it so one bad file does not abort
    # the whole batch.
    if not any(text.strip() for text in sentences_cleaned):
        print(f"Skipped {file_name} -> no sentences found")
        continue

    # TF-IDF and similarity
    tfidf_matrix, similarity_matrix = compute_tfidf_similarity(sentences_cleaned)

    # Save TF-IDF similarity matrix
    tfidf_file = os.path.join(TF_IDF_OUTPUT_DIR, base_name + '_similarity.npy')
    np.save(tfidf_file, similarity_matrix)

    # Compute TextRank scores
    textrank_scores = compute_textrank(similarity_matrix)

    # Save TextRank scores
    tr_file = os.path.join(TEXTRANK_OUTPUT_DIR, base_name + '_TextRank.txt')
    with open(tr_file, 'w', encoding='utf-8') as f:
        for score in textrank_scores:
            f.write(f"{score:.6f}\n")

    print(f"Processed {file_name} -> TF-IDF and TextRank saved")

Processed Cloud intelligent track - Risk analysis and privacy data management in the cloud computing.json -> TF-IDF and TextRank saved
Processed Real-Time Air Quality Monitoring with Edge AI and Machine Learning Algorithm.json -> TF-IDF and TextRank saved
Processed A Bibliometric View of AI Ethics Development.json -> TF-IDF and TextRank saved
Processed Software Defined Privacy.json -> TF-IDF and TextRank saved
Processed Privacy-Diffusion Privacy-Preserving Stable Diffusion Without Homomorphic Encryption.json -> TF-IDF and TextRank saved
Processed WIP Using Stories from Traditional Culture to Teach Virtue-Based Engineering Ethics.json -> TF-IDF and TextRank saved
Processed Ethics of Artificial Intelligence in University Education.json -> TF-IDF and TextRank saved
Processed Methodological Reflections on Designing Surveys to Explore Faculty and Administrator Perceptions of AI Ethics Education.json -> TF-IDF and TextRank saved
Processed Applying Communication Privacy Management Theory to Y

In [40]:
from google.colab import files
import shutil

# Folder on Drive that holds the saved similarity matrices
tfidf_folder = '/content/drive/MyDrive/TF-ID_similarity_matrix/'

# Location of the archive that will be produced
tfidf_zip = '/content/drive/MyDrive/TF-ID_similarity_matrix.zip'

# make_archive appends the '.zip' suffix itself, so pass the path without it
shutil.make_archive(
    base_name=tfidf_zip.replace('.zip',''),
    format='zip',
    root_dir=tfidf_folder,
)

# Hand the finished archive to the browser for download
files.download(tfidf_zip)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [41]:
from google.colab import files
import shutil

# Folder on Drive that holds the saved TextRank score files
textrank_folder = '/content/drive/MyDrive/Textrank_scores/'

# Location of the archive that will be produced
textrank_zip = '/content/drive/MyDrive/Textrank_scores.zip'

# make_archive appends the '.zip' suffix itself, so pass the path without it
shutil.make_archive(
    base_name=textrank_zip.replace('.zip',''),
    format='zip',
    root_dir=textrank_folder,
)

# Hand the finished archive to the browser for download
files.download(textrank_zip)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>