In [2]:
pip install PyMuPDF nltk scikit-learn

Collecting PyMuPDF
  Downloading pymupdf-1.25.5-cp39-abi3-win_amd64.whl (16.6 MB)
     ---------------------------------------- 0.0/16.6 MB ? eta -:--:--
      --------------------------------------- 0.3/16.6 MB 9.9 MB/s eta 0:00:02
     -- ------------------------------------- 0.9/16.6 MB 11.4 MB/s eta 0:00:02
     --- ------------------------------------ 1.5/16.6 MB 10.2 MB/s eta 0:00:02
     ---- ----------------------------------- 2.0/16.6 MB 11.4 MB/s eta 0:00:02
     ------ --------------------------------- 2.5/16.6 MB 11.5 MB/s eta 0:00:02
     ------- -------------------------------- 3.1/16.6 MB 11.5 MB/s eta 0:00:02
     -------- ------------------------------- 3.6/16.6 MB 11.5 MB/s eta 0:00:02
     --------- ------------------------------ 4.1/16.6 MB 11.3 MB/s eta 0:00:02
     ----------- ---------------------------- 4.7/16.6 MB 11.5 MB/s eta 0:00:02
     ------------ --------------------------- 5.3/16.6 MB 11.6 MB/s eta 0:00:01
     ------------- -------------------------- 5


[notice] A new release of pip is available: 23.0.1 -> 25.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [4]:
import fitz  # PyMuPDF
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk import pos_tag
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Download required NLTK data.
# 'punkt' and 'averaged_perceptron_tagger' are the resource names looked up by
# older NLTK releases; newer releases (3.9 and later) look up 'punkt_tab' and
# 'averaged_perceptron_tagger_eng' instead. Both generations are fetched so
# word_tokenize() and pos_tag() work whichever NLTK version pip installed.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('wordnet')

# Step 1: Read PDF file and extract text
def extract_text_from_pdf(pdf_path):
    """Return the concatenated plain text of every page in a PDF.

    Args:
        pdf_path: Path of the PDF file; passed unchanged to ``fitz.open``.

    Returns:
        str: The text of all pages joined in page order with nothing inserted
        between pages; an empty string when the document has no pages.
    """
    # The context manager closes the document (releasing the underlying file)
    # even if text extraction raises part-way through.
    with fitz.open(pdf_path) as doc:
        # join() assembles the result in one pass instead of repeated `+=`.
        return "".join(page.get_text() for page in doc)

# Load the sample PDF -- swap in the path of your own document here
pdf_text = extract_text_from_pdf("Sampledata_7.pdf")

# Heading followed by only the leading 300 characters, to keep the output short
print("\nOriginal PDF Text (first 300 characters):\n", pdf_text[:300], sep="\n")

# Step 2: Text Preprocessing
tokens = word_tokenize(pdf_text)
print("\nTokens:\n", tokens[:20])  # Preview tokens

# POS Tagging
pos_tags = pos_tag(tokens)
print("\nPOS Tags:\n", pos_tags[:10])  # Preview tags

# Stopwords removal
# The (token, tag) pairs are filtered instead of the bare tokens so that every
# surviving word keeps its POS tag for the lemmatization step below.
# filtered_tokens holds the same words, in the same order, as filtering
# `tokens` directly would produce.
stop_words = set(stopwords.words('english'))
filtered_tagged = [
    (word, tag)
    for word, tag in pos_tags
    if word.lower() not in stop_words and word.isalpha()
]
filtered_tokens = [word for word, _ in filtered_tagged]
print("\nFiltered Tokens (no stopwords):\n", filtered_tokens[:20])

# Stemming
stemmer = PorterStemmer()
stemmed_tokens = [stemmer.stem(word) for word in filtered_tokens]
print("\nStemmed Tokens:\n", stemmed_tokens[:20])


# Lemmatization
def _wordnet_pos(treebank_tag):
    """Map a Penn Treebank tag from pos_tag() to a WordNet POS code.

    Adjective (J*), verb (V*) and adverb (R*) tags map to 'a', 'v' and 'r';
    every other tag falls back to 'n' (noun), which is also the default that
    WordNetLemmatizer.lemmatize() uses when no POS is given.
    """
    if treebank_tag.startswith('J'):
        return 'a'
    if treebank_tag.startswith('V'):
        return 'v'
    if treebank_tag.startswith('R'):
        return 'r'
    return 'n'


# Passing the POS matters: without it every word is lemmatized as a noun, so
# verb, adjective and adverb forms would be returned unchanged.
lemmatizer = WordNetLemmatizer()
lemmatized_tokens = [
    lemmatizer.lemmatize(word, pos=_wordnet_pos(tag))
    for word, tag in filtered_tagged
]
print("\nLemmatized Tokens:\n", lemmatized_tokens[:20])

# Step 3: TF-IDF Representation
# The corpus is a one-element list holding the raw PDF text (single-document example)
documents = [pdf_text]
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(documents)

# Vocabulary preview: only the first 20 feature names are printed
feature_names = vectorizer.get_feature_names_out()
print("\nTF-IDF Feature Names:\n", feature_names[:20])

# Convert X to a dense array so the TF-IDF values can be printed
tfidf_dense = X.toarray()
print("\nTF-IDF Matrix:\n", tfidf_dense)


[nltk_data] Downloading package punkt to C:\Users\Rahul
[nltk_data]     Wanjare\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Rahul
[nltk_data]     Wanjare\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Rahul Wanjare\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to C:\Users\Rahul
[nltk_data]     Wanjare\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!



Original PDF Text (first 300 characters):

Welcome to Smallpdf
Digital Documents—All In One Place
Access Files Anytime, Anywhere 
Enhance Documents in One Click 
Collaborate With Others 
With the new Smallpdf experience, you can 
freely upload, organize, and share digital 
documents. When you enable the ‘Storage’ 
option, we’ll also store al

Tokens:
 ['Welcome', 'to', 'Smallpdf', 'Digital', 'Documents—All', 'In', 'One', 'Place', 'Access', 'Files', 'Anytime', ',', 'Anywhere', 'Enhance', 'Documents', 'in', 'One', 'Click', 'Collaborate', 'With']

POS Tags:
 [('Welcome', 'VB'), ('to', 'TO'), ('Smallpdf', 'NNP'), ('Digital', 'NNP'), ('Documents—All', 'NNP'), ('In', 'IN'), ('One', 'CD'), ('Place', 'NNP'), ('Access', 'NNP'), ('Files', 'NNP')]

Filtered Tokens (no stopwords):
 ['Welcome', 'Smallpdf', 'Digital', 'One', 'Place', 'Access', 'Files', 'Anytime', 'Anywhere', 'Enhance', 'Documents', 'One', 'Click', 'Collaborate', 'Others', 'new', 'Smallpdf', 'experience', 'freely', 'upload']

Stemme