<a href="https://colab.research.google.com/github/springboardmentor3847a-cloud/AI-System-to-Automatically-Review-and-Summarize-Research-Papers-/blob/HarshithaNancharla-Branch/Milestone%202.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install pymupdf pymupdf4llm nltk scikit-learn


Collecting pymupdf
  Downloading pymupdf-1.26.7-cp310-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Collecting pymupdf4llm
  Downloading pymupdf4llm-0.2.8-py3-none-any.whl.metadata (7.5 kB)
Downloading pymupdf-1.26.7-cp310-abi3-manylinux_2_28_x86_64.whl (24.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m52.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pymupdf4llm-0.2.8-py3-none-any.whl (68 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m68.6/68.6 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pymupdf, pymupdf4llm
Successfully installed pymupdf-1.26.7 pymupdf4llm-0.2.8


In [2]:
import os

folders = [
    "data/pdfs",
    "data/extracted_text",
    "data/structured_sections",
    "data/comparisons"
]

for folder in folders:
    os.makedirs(folder, exist_ok=True)

print("Folder structure created ✅")


Folder structure created ✅


In [3]:
SEMANTIC_SCHOLAR_API_KEY = "OWZwvFDAAe9QXGhc0m66v2SWJgZ00c3CaZzDSLQq"


In [4]:
import requests

headers = {
    "x-api-key": SEMANTIC_SCHOLAR_API_KEY
}

query = "fake news detection using machine learning"
url = f"https://api.semanticscholar.org/graph/v1/paper/search?query={query}&limit=5&fields=title,openAccessPdf"

response = requests.get(url, headers=headers)
papers = response.json()["data"]

for i, paper in enumerate(papers):
    pdf_info = paper.get("openAccessPdf")
    if pdf_info and pdf_info.get("url"):
        pdf_url = pdf_info["url"]
        pdf_data = requests.get(pdf_url).content

        file_path = f"data/pdfs/paper_{i+1}.pdf"
        with open(file_path, "wb") as f:
            f.write(pdf_data)

        print(f"Downloaded: {file_path}")


Downloaded: data/pdfs/paper_4.pdf
Downloaded: data/pdfs/paper_5.pdf


In [5]:
import fitz  # PyMuPDF

def extract_text_from_pdf(pdf_path):
    doc = fitz.open(pdf_path)
    text = ""
    for page in doc:
        text += page.get_text()
    return text

for pdf_file in os.listdir("data/pdfs"):
    pdf_path = f"data/pdfs/{pdf_file}"
    text = extract_text_from_pdf(pdf_path)

    with open(f"data/extracted_text/{pdf_file}.txt", "w", encoding="utf-8") as f:
        f.write(text)

print("Text extracted from all PDFs ✅")


Text extracted from all PDFs ✅


In [6]:
import re

def clean_text(text):
    text = re.sub(r'\n+', '\n', text)
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'Page \d+', '', text)
    return text.strip()

for file in os.listdir("data/extracted_text"):
    path = f"data/extracted_text/{file}"

    with open(path, "r", encoding="utf-8") as f:
        cleaned = clean_text(f.read())

    with open(path, "w", encoding="utf-8") as f:
        f.write(cleaned)

print("Text cleaned ✅")


Text cleaned ✅


In [7]:
def split_sections(text):
    sections = {
        "abstract": "",
        "introduction": "",
        "methodology": "",
        "results": "",
        "conclusion": ""
    }

    text_lower = text.lower()

    for key in sections.keys():
        if key in text_lower:
            start = text_lower.find(key)
            sections[key] = text[start:start+1500]

    return sections


In [8]:
import json

for file in os.listdir("data/extracted_text"):
    with open(f"data/extracted_text/{file}", "r", encoding="utf-8") as f:
        text = f.read()

    sections = split_sections(text)

    json_path = f"data/structured_sections/{file}.json"
    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(sections, f, indent=4)

print("Structured JSON created ✅")


Structured JSON created ✅


In [9]:
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

def extract_key_sentences(text, num=3):
    sentences = sent_tokenize(text)
    return sentences[:num]


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

texts = []

for file in os.listdir("data/extracted_text"):
    with open(f"data/extracted_text/{file}", "r", encoding="utf-8") as f:
        texts.append(f.read())

vectorizer = TfidfVectorizer()
tfidf = vectorizer.fit_transform(texts)
similarity = cosine_similarity(tfidf)

print("Similarity Matrix:")
print(similarity)


Similarity Matrix:
[[1.         0.85077773]
 [0.85077773 1.        ]]
