In [19]:
import fitz  # PyMuPDF
import re

def preprocess_pdf(pdf_path, clean_output="test/raw/Dsa_clean.txt", start_page=11):
    """
    Extracts text from a PDF starting at `start_page`, cleans it, removes tables/charts/diagrams,
    and saves only the cleaned text.
    """
    # ===== Step 1: Extract text from PDF =====
    doc = fitz.open(pdf_path)
    full_text = []

    for page_num in range(start_page, len(doc)):  # skip first `start_page` pages
        page = doc[page_num]
        text = page.get_text("text")
        full_text.append(text)

    all_text = "\n".join(full_text)
    print(f"✅ Extracted {len(doc) - start_page} pages (skipped first {start_page})")

    # ===== Step 2: Clean the text =====
    text = all_text

    # 1. Remove standalone page numbers (lines with only digits)
    text = re.sub(r"^\s*\d+\s*$", " ", text, flags=re.MULTILINE)

    # 2. Remove lines that are likely tables, charts, or diagrams
    text = re.sub(r"^[\d\W_]{5,}$", "", text, flags=re.MULTILINE)

    # 3. Remove multiple newlines → keep max 2
    text = re.sub(r"\n{2,}", "\n\n", text)

    # 4. Remove extra spaces
    text = re.sub(r"[ \t]+", " ", text)

    # Remove page numbers and figures
    text = re.sub(r'\b\d+\b', '', text)
    # Remove unwanted words
    text = re.sub(r'\b(chapter|figure|table|end)\b', '', text)

    # 5. Normalize weird characters
    text = text.encode("utf-8", "ignore").decode()

    # 6. Strip leading/trailing spaces per line and remove empty lines
    text = "\n".join(line.strip() for line in text.splitlines() if line.strip())

    

    # Save cleaned text
    with open(clean_output, "w", encoding="utf-8") as f:
        f.write(text)

    print(f"✅ Cleaned text saved to {clean_output}")
    return text


# ===== Example usage =====
pdf_path = "test/raw/Dsa.pdf"
cleaned_text = preprocess_pdf(pdf_path, start_page=11)

len(cleaned_text)


✅ Extracted 101 pages (skipped first 11)
✅ Cleaned text saved to test/raw/Dsa_clean.txt


128733

In [46]:
import re

def split_into_chunks(text, chunk_size=20):
    words = text.split()
    chunks = []
    for i in range(0, len(words), chunk_size):
        chunk = " ".join(words[i:i+chunk_size])
        chunks.append(chunk)
    return chunks

texts = split_into_chunks(cleaned_text, 200)


In [20]:
texts = []

with open('test/raw/Dsa_clean.txt', 'r') as f:
    for line in f:
        texts.append(line)

In [27]:
len(texts)

3648

In [24]:
import nltk
import json
from tqdm import tqdm
import pandas as pd
from umap import UMAP
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.vectorizers import ClassTfidfTransformer
from bertopic.representation import KeyBERTInspired
from sentence_transformers import SentenceTransformer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
import os

  from .autonotebook import tqdm as notebook_tqdm


In [38]:
umap_model=UMAP(n_neighbors=20,n_components=50,metric="cosine",min_dist=0.0,random_state=37)
vectorizer_model=CountVectorizer(ngram_range=(1,3),stop_words="english")
ctfidf_model=ClassTfidfTransformer(reduce_frequent_words=False)
sentence_model=SentenceTransformer("paraphrase-mpnet-base-v2")
representation_model = KeyBERTInspired()

topic_model=BERTopic(verbose=True,
                     umap_model=umap_model,
                     ctfidf_model=ctfidf_model,
                     vectorizer_model=vectorizer_model,
                     embedding_model=sentence_model,
                     representation_model=representation_model,
                     nr_topics=50,
                     low_memory=True,
                     calculate_probabilities=False)

In [47]:
topics, _ = topic_model.fit_transform(texts)

2025-09-03 17:00:12,850 - BERTopic - Embedding - Transforming documents to embeddings.
Batches: 100%|██████████| 4/4 [00:23<00:00,  5.79s/it]
2025-09-03 17:00:36,231 - BERTopic - Embedding - Completed ✓
2025-09-03 17:00:36,237 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-09-03 17:00:36,848 - BERTopic - Dimensionality - Completed ✓
2025-09-03 17:00:36,850 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-09-03 17:00:36,911 - BERTopic - Cluster - Completed ✓
2025-09-03 17:00:36,912 - BERTopic - Representation - Extracting topics using c-TF-IDF for topic reduction.
2025-09-03 17:00:37,046 - BERTopic - Representation - Completed ✓
2025-09-03 17:00:37,047 - BERTopic - Topic reduction - Reducing number of topics
2025-09-03 17:00:37,048 - BERTopic - Topic reduction - Number of topics (50) is equal or higher than the clustered topics(5).
2025-09-03 17:00:37,048 - BERTopic - Representation - Fine-tuning topics using representation models.
2

In [40]:
all_topics = topic_model.get_topics()

In [41]:
concepts=[]

for topic_num, keywords in all_topics.items():
    if topic_num != -1:
        topic_keywords = [word for word, value in keywords]
        concepts.extend(topic_keywords)

In [42]:
# remove duplicates
concepts = list(set(keyword.lower() for keyword in concepts))

In [45]:
with open("output/extracted_concepts_with_preprocessing.tsv", "w") as f:
    for id, concept in enumerate(concepts, 1):
        f.write(f"{id}|{concept}\n")