In [None]:
import os
import PyPDF2
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Extract the text of a PDF file as one string
def read_pdf(file_path):
    """Return the text of every page in the PDF at ``file_path`` as one string.

    The file is opened in binary mode and handed to ``PyPDF2.PdfReader``.
    ``extract_text()`` is called on each page in order and the per-page
    results are joined with a single space.
    """
    with open(file_path, 'rb') as pdf_stream:
        pdf = PyPDF2.PdfReader(pdf_stream)
        page_texts = []
        for page in pdf.pages:
            # Extraction happens while the stream is still open.
            page_texts.append(page.extract_text())
        combined_text = " ".join(page_texts)
    return combined_text

In [None]:
# Load the text of each named PDF from a folder
def preprocess_specific_docs(folder_path, specific_files):
    """Read the listed PDFs and return their text keyed by document name.

    Args:
        folder_path: Directory that contains the PDF files.
        specific_files: Iterable of document names without the '.pdf'
            extension; the extension is appended before reading.

    Returns:
        dict mapping each name in ``specific_files`` to the string returned
        by ``read_pdf`` for ``<folder_path>/<name>.pdf``.
    """
    return {
        doc_name: read_pdf(os.path.join(folder_path, doc_name + '.pdf'))
        for doc_name in specific_files
    }

In [None]:
# Score the test document vector against every document vector
def calculate_cosine_similarity(vectorizer, docs, test_doc_vector):
    """Return the cosine similarity of ``test_doc_vector`` to each entry of ``docs``.

    Args:
        vectorizer: Accepted for interface compatibility; not used by the body.
        docs: Mapping of document name -> document vector.
        test_doc_vector: Vector of the document being compared.

    Returns:
        dict mapping each document name to element ``[0][0]`` of
        ``cosine_similarity(test_doc_vector, doc_vector)``.
    """
    return {
        name: cosine_similarity(test_doc_vector, vector)[0][0]
        for name, vector in docs.items()
    }

In [None]:
# Names (without the '.pdf' extension) of the training documents: five numbered files
# for each of the three subjects, listed subject by subject.
training_docs = [
    f"{subject}{number}"
    for subject in ('Maths', 'Bio', 'Finance')
    for number in range(1, 6)
]


In [None]:
# Folder that holds the PDF files. NOTE: this is hard-coded to '/content'; it is not derived from the current working directory.
folder_path = '/content'

# Read every PDF named in training_docs from folder_path and keep its extracted text, keyed by document name.
docs = preprocess_specific_docs(folder_path, training_docs)


[0, IndirectObject(9, 0, 133609524344960)]
[0, IndirectObject(15, 0, 133609524344960)]
[0, IndirectObject(21, 0, 133609524344960)]
[0, IndirectObject(27, 0, 133609524344960)]
[0, IndirectObject(33, 0, 133609524344960)]
[0, IndirectObject(39, 0, 133609524344960)]
[0, IndirectObject(45, 0, 133609524344960)]
[0, IndirectObject(51, 0, 133609524344960)]
[0, IndirectObject(57, 0, 133609524344960)]
[0, IndirectObject(63, 0, 133609524344960)]
[0, IndirectObject(69, 0, 133609524344960)]
[0, IndirectObject(75, 0, 133609524344960)]
[0, IndirectObject(81, 0, 133609524344960)]
[0, IndirectObject(87, 0, 133609524344960)]


In [None]:
# Shell escape: print the kernel's current working directory, to compare against folder_path.
!pwd

/content


In [None]:
# Build a TF-IDF vectorizer with default settings (no arguments are passed).
vectorizer = TfidfVectorizer()
# Learn the vocabulary and IDF weights from the training texts and convert them to TF-IDF vectors in one step.
# The rows follow the iteration order of docs; a later cell pairs them with docs.keys() by position.
doc_vectors = vectorizer.fit_transform(docs.values())

In [None]:
# Inspect the fitted TF-IDF matrix. Judging by the output, each line is (document row, vocabulary column) followed by the weight.
print(doc_vectors)

  (0, 16882)	0.0031738991447470746
  (0, 7082)	0.0024594906347017867
  (0, 8492)	0.0020415884461292195
  (0, 6385)	0.0028337205330362636
  (0, 2898)	0.006347798289494149
  (0, 9467)	0.006347798289494149
  (0, 2536)	0.0015150939540463
  (0, 6495)	0.0018827094077568838
  (0, 16625)	0.0031738991447470746
  (0, 1926)	0.0027559969561745078
  (0, 0)	0.0016236862575566525
  (0, 17312)	0.0031738991447470746
  (0, 16109)	0.0031738991447470746
  (0, 14128)	0.0014168602665181318
  (0, 2399)	0.0018827094077568838
  (0, 9047)	0.0015150939540463
  (0, 13708)	0.002229502464091588
  (0, 12966)	0.0031738991447470746
  (0, 5064)	0.0031738991447470746
  (0, 2040)	0.0027559969561745078
  (0, 4413)	0.0027559969561745078
  (0, 1783)	0.0031738991447470746
  (0, 1698)	0.0027559969561745078
  (0, 14874)	0.0027559969561745078
  (0, 1200)	0.0031738991447470746
  :	:
  (14, 2882)	0.007438223072851873
  (14, 15014)	0.0032601231555539067
  (14, 9164)	0.0019254812534818988
  (14, 16048)	0.0052938929323676056
  (14, 

In [None]:
# Load the test document and project it into the TF-IDF space learned from the training set.
# transform() (not fit_transform()) is used, so the test document does not alter the fitted vocabulary or IDF weights.
test_doc_name = 'test_document.pdf'
test_doc_path = os.path.join(folder_path, test_doc_name)
test_doc_content = read_pdf(test_doc_path)
test_doc_vector = vectorizer.transform([test_doc_content])

In [None]:
# Pair each training document name with its TF-IDF row (by position), then score every row against the test document.
# The result maps document name -> cosine similarity with the test document.
training_vectors_by_name = dict(zip(docs.keys(), doc_vectors))
similarities = calculate_cosine_similarity(vectorizer, training_vectors_by_name, test_doc_vector)

In [None]:
# Find the training document with the highest similarity score to the test document.
most_similar_doc = max(similarities, key=similarities.get)
# Training documents are named '<Category><number>' (e.g. 'Maths4'), so the category is the
# name with its trailing digits removed. Splitting on '1' only worked for names ending in 1;
# 'Maths4'.split('1')[0] returns 'Maths4' unchanged.
category = most_similar_doc.rstrip('0123456789')
print(f"The test document is most similar to category: {category}")

The test document is most similar to category: Maths4


In [None]:

# Score the test document against each training document, then average those scores per
# category so that average_similarities really maps category -> mean cosine similarity.
# Previously it held the raw per-document scores, so the "highest average" reported later
# was just the single best-matching document.
per_doc_similarities = calculate_cosine_similarity(vectorizer, dict(zip(docs.keys(), doc_vectors)), test_doc_vector)

scores_by_category = {}
for doc_name, score in per_doc_similarities.items():
    # Training documents are named '<Category><number>'; dropping the trailing digits leaves the category.
    scores_by_category.setdefault(doc_name.rstrip('0123456789'), []).append(score)

average_similarities = {
    category_name: sum(scores) / len(scores)
    for category_name, scores in scores_by_category.items()
}

In [None]:
# This step calculates the average similarity of the test document with each category.
# It's important because it assesses how similar the test document is to each category as a whole, rather than to individual documents.
# This approach can sometimes provide a more accurate categorization, especially when the test document shares similarities with multiple documents in a single category.


In [None]:
# Pick the key of average_similarities with the largest value, then print the full mapping and the winning key.
# NOTE(review): the keys are whatever the cell that builds average_similarities produced; confirm they are
# category names with averaged scores, not individual document names.
most_similar_category = max(average_similarities, key=average_similarities.get)
print(f"Average similarities with each category: {average_similarities}")
print(f"The test document is most similar to the category: {most_similar_category} (highest average similarity)")

Average similarities with each category: {'Maths1': 0.56531073689647, 'Maths2': 0.34087477732528065, 'Maths3': 0.4766718724517198, 'Maths4': 0.7333055496855635, 'Maths5': 0.5642719282123181, 'Bio1': 0.600026119031007, 'Bio2': 0.6081045738301034, 'Bio3': 0.635890712946899, 'Bio4': 0.7051929630629038, 'Bio5': 0.627522008609555, 'Finance1': 0.685658033878281, 'Finance2': 0.5210012093508289, 'Finance3': 0.6629233092468131, 'Finance4': 0.6463871133154039, 'Finance5': 0.5619454895337315}
The test document is most similar to the category: Maths4 (highest average similarity)
