In [1]:
!pip install pdfplumber
!pip install nltk
!pip install scikit-learn
!pip install numpy
!pip install spacy
!pip install bert_score

Collecting pdfplumber
  Downloading pdfplumber-0.11.1-py3-none-any.whl (57 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.9/57.9 kB[0m [31m847.8 kB/s[0m eta [36m0:00:00[0m
[?25hCollecting pdfminer.six==20231228 (from pdfplumber)
  Downloading pdfminer.six-20231228-py3-none-any.whl (5.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m13.4 MB/s[0m eta [36m0:00:00[0m
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.30.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.8/2.8 MB[0m [31m22.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pypdfium2, pdfminer.six, pdfplumber
Successfully installed pdfminer.six-20231228 pdfplumber-0.11.1 pypdfium2-4.30.0
Collecting bert_score
  Downloading bert_score-0.3.13-py3-none-any.whl (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.

In [2]:
import pdfplumber
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import re
import spacy
from difflib import SequenceMatcher
from bert_score import score as bert_score

In [3]:
# Download necessary NLTK data: the Punkt sentence tokenizer (used by
# nltk.sent_tokenize) and the English stop-word list.
nltk.download('punkt')
# Recent NLTK releases load the sentence tokenizer from the 'punkt_tab'
# package instead of the pickled 'punkt' one. Requesting both keeps the
# notebook working on old and new versions; a release that does not know the
# package only reports an error message and carries on.
nltk.download('punkt_tab')
nltk.download('stopwords')

# Load spaCy model for Named Entity Recognition (NER)
nlp = spacy.load('en_core_web_sm')
# Stop words are kept in a set so membership tests are cheap.
stop_words = set(nltk.corpus.stopwords.words('english'))

def extract_text_from_pdf(pdf_path):
    """Read a PDF and return its text as per-page sentence lists.

    Every page that yields text is passed through ``clean_text`` and then
    ``extract_rows_from_table``. The result is a list of
    ``(page_number, sentences)`` tuples with 1-based page numbers; pages
    without extractable text are left out.

    Any exception raised while reading is reported with ``print`` and whatever
    was collected up to that point is returned.
    """
    collected = []
    try:
        with pdfplumber.open(pdf_path) as document:
            for page_number, page in enumerate(document.pages, start=1):
                raw_text = page.extract_text()
                if not raw_text:
                    continue
                sentences = extract_rows_from_table(clean_text(raw_text))
                collected.append((page_number, sentences))
    except Exception as e:
        print(f"Error reading PDF: {e}")
    return collected



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [4]:
def clean_text(text):
    """Normalise raw PDF page text and insert '. ' at likely sentence breaks.

    Steps, in order:
      1. Newlines become spaces.
      2. Runs of two or more whitespace characters become '. '.
      3. A digit immediately followed by a letter is split ("2023Intro" ->
         "2023. Intro").
      4. A period immediately followed by a letter gets a space after it
         ("end.Next" -> "end. Next"). Periods followed by a digit are left
         alone so decimal numbers such as "3.14" stay intact.
      5. A lowercase letter immediately followed by an uppercase letter is
         split ("facultyThe" -> "faculty. The"). Only lower->upper transitions
         are split so acronyms such as "UGC" are not broken apart.
      6. Remaining whitespace runs collapse to one space and the ends are
         stripped.

    Args:
        text: Raw text of one page.

    Returns:
        The cleaned text as a single-line string.
    """
    text = text.replace('\n', ' ')
    text = re.sub(r'\s{2,}', '. ', text)
    text = re.sub(r'(\d)([A-Za-z])', r'\1. \2', text)
    # [^\W\d] == a word character that is not a digit (letter or underscore).
    text = re.sub(r'\.(?=[^\W\d])', '. ', text)
    text = re.sub(r'([a-z])([A-Z])', r'\1. \2', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip()


In [5]:

def extract_rows_from_table(table_text):
    """Split cleaned text on '. ' and return the non-empty pieces.

    Inside each piece, runs of two or more whitespace characters are replaced
    by a single space and the piece is stripped; pieces that end up empty are
    dropped.
    """
    pieces = (
        ' '.join(re.split(r'\s{2,}', chunk)).strip()
        for chunk in table_text.split('. ')
    )
    return [piece for piece in pieces if piece]

In [6]:


def tokenize_sentences(pages_text):
    """Flatten per-page sentence lists into one list plus parallel positions.

    ``pages_text`` is an iterable of ``(page_num, page_sentences)`` pairs.
    Returns ``(sentences, sentence_positions)`` where ``sentence_positions[k]``
    is the ``(page_num, index_within_page)`` of ``sentences[k]``.
    """
    flat_sentences = []
    positions = []
    for page_num, page_sentences in pages_text:
        count_before = len(flat_sentences)
        flat_sentences.extend(page_sentences)
        added = len(flat_sentences) - count_before
        positions.extend((page_num, offset) for offset in range(added))
    return flat_sentences, positions

In [7]:


def filter_important_words(sentence):
    """Return the lower-cased content words of ``sentence`` joined by spaces.

    The sentence is parsed with the module-level spaCy pipeline ``nlp``. A
    token is kept when its lower-cased text is not in ``stop_words`` and it
    either carries an entity type or is tagged NOUN, VERB, PROPN or NUM.
    """
    kept_pos_tags = {'NOUN', 'VERB', 'PROPN', 'NUM'}
    kept = []
    for token in nlp(sentence):
        lowered = token.text.lower()
        if lowered in stop_words:
            continue
        if token.ent_type_ or token.pos_ in kept_pos_tags:
            kept.append(lowered)
    return ' '.join(kept)

In [8]:


def compute_lexical_similarity(reference_sentences, candidate_sentence):
    """Compute TF-IDF cosine similarity of the candidate against each reference.

    All sentences are first reduced with ``filter_important_words``; a TF-IDF
    model is then fitted on the references plus the candidate.

    Args:
        reference_sentences: List of reference sentences.
        candidate_sentence: The sentence to compare against the references.

    Returns:
        A 1-D float array with one similarity per reference sentence, in the
        same order. The array is empty when there are no references, and all
        zeros when the filtered sentences contain no usable terms.
    """
    if len(reference_sentences) == 0:
        return np.zeros(0)

    filtered_candidate = filter_important_words(candidate_sentence)
    filtered_references = [filter_important_words(sentence) for sentence in reference_sentences]

    try:
        tfidf = TfidfVectorizer().fit_transform(filtered_references + [filtered_candidate])
    except ValueError:
        # TfidfVectorizer raises ValueError ("empty vocabulary") when no
        # document contains a usable term; treat that as "no lexical overlap".
        return np.zeros(len(filtered_references))

    # Only the candidate-vs-references row is needed, so compare the last row
    # with the others directly instead of building the full dense
    # (n+1) x (n+1) similarity matrix. Slicing with [-1:] keeps it 2-D.
    return cosine_similarity(tfidf[-1:], tfidf[:-1]).ravel()

In [9]:


# Shared scorer instance, created on first use (see _get_bert_scorer).
_bert_scorer = None


def _get_bert_scorer():
    """Return a lazily created English BERTScorer so the model loads only once."""
    global _bert_scorer
    if _bert_scorer is None:
        from bert_score import BERTScorer
        _bert_scorer = BERTScorer(lang='en')
    return _bert_scorer


def compute_bertscore_similarity(reference_sentences, candidate_sentence):
    """Compute the BERTScore F1 of the candidate against each reference.

    The module-level ``bert_score`` function builds its model on every call,
    which is costly when this function runs once per candidate sentence. A
    single cached ``BERTScorer`` is reused instead.

    Args:
        reference_sentences: Sequence of reference sentences.
        candidate_sentence: The sentence to compare against the references.

    Returns:
        A 1-D NumPy array of F1 scores, one per reference sentence, in the
        same order. The array is empty when there are no references.
    """
    references = list(reference_sentences)
    if not references:
        # Nothing to score; skip loading the model altogether.
        return np.zeros(0, dtype=np.float32)

    candidates = [candidate_sentence] * len(references)
    _, _, f1 = _get_bert_scorer().score(candidates, references, verbose=False)
    return f1.numpy()

In [10]:


def find_most_similar_sentence(reference_sentences, candidate_sentences, sentence_positions):
    """Find, for every candidate, its best lexical and BERTScore reference match.

    Args:
        reference_sentences: Sequence of reference sentences.
        candidate_sentences: Iterable of candidate sentences.
        sentence_positions: Sequence parallel to ``reference_sentences`` giving
            the position of each reference sentence.

    Returns:
        A 4-tuple ``(most_similar_sentences, mean_lexical_score,
        mean_bertscore_score, most_similar_positions)``:
          * ``most_similar_sentences[i]`` is ``(best_lexical_match,
            best_bertscore_match)`` for candidate ``i``;
          * the two means average the best score per candidate (NaN when
            there are no candidates);
          * ``most_similar_positions[i]`` is the position of the best
            *lexical* match for candidate ``i``.

    Raises:
        ValueError: If there are candidates but no reference sentences, or if
            ``sentence_positions`` and ``reference_sentences`` differ in
            length.
    """
    candidate_sentences = list(candidate_sentences)
    if not candidate_sentences:
        # Nothing to match: keep the NaN means of the empty case but avoid
        # NumPy's "mean of empty slice" warning.
        return [], float('nan'), float('nan'), []
    if len(reference_sentences) == 0:
        # Without this guard the failure surfaces later as an opaque argmax
        # error on an empty array.
        raise ValueError(
            "reference_sentences is empty; there is nothing to compare the candidates against"
        )
    if len(sentence_positions) != len(reference_sentences):
        raise ValueError(
            "sentence_positions must have one entry per reference sentence"
        )

    all_lexical_scores = []
    all_bertscore_scores = []
    most_similar_sentences = []
    most_similar_positions = []

    for candidate_sentence in candidate_sentences:
        lexical_similarities = compute_lexical_similarity(reference_sentences, candidate_sentence)
        best_lexical = int(np.argmax(lexical_similarities))

        bertscore_similarities = compute_bertscore_similarity(reference_sentences, candidate_sentence)
        best_bertscore = int(np.argmax(bertscore_similarities))

        all_lexical_scores.append(lexical_similarities[best_lexical])
        all_bertscore_scores.append(bertscore_similarities[best_bertscore])
        most_similar_sentences.append(
            (reference_sentences[best_lexical], reference_sentences[best_bertscore])
        )
        # The reported position always refers to the lexical match.
        most_similar_positions.append(sentence_positions[best_lexical])

    mean_lexical_score = np.mean(all_lexical_scores)
    mean_bertscore_score = np.mean(all_bertscore_scores)
    return most_similar_sentences, mean_lexical_score, mean_bertscore_score, most_similar_positions

In [14]:
def print_matching_sentences(candidate_sentences, most_similar_sentences, positions):
    """Print, per candidate sentence, its two best matches and the PDF position.

    ``most_similar_sentences[i]`` is a ``(lexical_match, bertscore_match)``
    pair and ``positions[i]`` a ``(page_num, sentence_index)`` pair for
    candidate ``i``. Candidates are numbered from 1 in the output.
    """
    for number, candidate in enumerate(candidate_sentences, start=1):
        lexical_match, bertscore_match = most_similar_sentences[number - 1]
        page, index_on_page = positions[number - 1]
        print(f"Candidate Sentence {number}: {candidate}")
        print(f"  Most Similar Sentence (Lexical Similarity): {lexical_match}")
        print(f"  Most Similar Sentence (BERTScore): {bertscore_match}")
        print(f"  Position in PDF: Page {page}, Sentence Index {index_on_page}")
        print("\n")

In [None]:
def main(pdf_path, candidate_text):
    """Match each sentence of ``candidate_text`` against the PDF and print results.

    The PDF at ``pdf_path`` is converted to reference sentences, the candidate
    text is split with ``nltk.sent_tokenize``, the best matches are printed
    per candidate, and finally the mean lexical and BERTScore similarities.
    """
    reference_sentences, sentence_positions = tokenize_sentences(
        extract_text_from_pdf(pdf_path)
    )
    candidate_sentences = nltk.sent_tokenize(candidate_text)

    matches, lexical_mean, bertscore_mean, match_positions = find_most_similar_sentence(
        reference_sentences, candidate_sentences, sentence_positions
    )

    print_matching_sentences(candidate_sentences, matches, match_positions)

    print(f"Mean Lexical Similarity Score: {lexical_mean}")
    print(f"Mean BERTScore Similarity Score: {bertscore_mean}")

# Example usage
# NOTE(review): this is a hard-coded absolute path (looks like a Colab
# upload location) — confirm the PDF exists there before running.
pdf_path = '/content/2023.pdf'
# Short passage whose sentences are matched against the PDF's sentences.
candidate_text = """ When you enter from the main vehicle entrance, turn to your right. The multi -story building is the library."""

# Run the pipeline when this code is executed as the main module.
if __name__ == "__main__":
    main(pdf_path, candidate_text)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:

'''

def main(pdf_path, candidate_text):
    """Main function to execute the PDF text extraction and similarity finding."""
    pages_text = extract_text_from_pdf(pdf_path)
    reference_sentences, sentence_positions = tokenize_sentences(pages_text)
    candidate_sentences = nltk.sent_tokenize(candidate_text)

    most_similar_sentences, mean_lexical_score, mean_bertscore_score, positions = find_most_similar_sentence(reference_sentences, candidate_sentences, sentence_positions)

    print(f"Most similar sentences (Lexical Similarity and BERTScore): {most_similar_sentences}")
    print(f"Mean Lexical Similarity Score: {mean_lexical_score}")
    print(f"Mean BERTScore Similarity Score: {mean_bertscore_score}")
    for i, (page_num, sentence_index) in enumerate(positions):
        print(f"Position in PDF for candidate sentence {i+1} - Page: {page_num}, Sentence Index: {sentence_index}")

        # Highlight differences
        #changes = highlight_differences(candidate_sentences[i], most_similar_sentences[i][0])
        #if changes:
            #print(f"Changes in candidate sentence {i+1} (Lexical): {changes}")

        #changes = highlight_differences(candidate_sentences[i], most_similar_sentences[i][1])
        #if changes:
            #print(f"Changes in candidate sentence {i+1} (BERTScore): {changes}")

# Example usage
pdf_path = '/content/2023.pdf'
candidate_text = """There are three departments in the faculty.

14

Message from the Head, Department of Computational Mathematics
Welcome to the Faculty of Computational Mathematics, University of Moratuwa!
On behalf of the Department of Computational Mathematics, I would like to
warmly welcome you to the faculty. It is a great pleasure to see hundreds of
determined and dedicated young adults entrusting their future with the Faculty
of Computational Mathematics. As  you begin your academic career in this
prestigious institution, we congratulate you on your achievement, and your
insight in choosing a program with high demand in this rapidly evolving
discipline.
Department of Computational Mathematics remains one of the main academic
departments providing the nation with professionally qualified mathematicians,
scientists, and researchers in the fields of Computational Mathematics,
Computer Science, and Information Technology. The curricula encompass a
wide variety of subjects in Computational Mathematics, Computer Science,
and Information Technology disciplines to provide both theoretical knowledge
and practical exposure. Furthermore, the Department sets high emphasis on
research studies and group work.
The Department maintains an unwavering reputation for its contribution in
presenting academically sound, competent, and high -quality graduates to
the workforce in the fields of Computational Mathematics, Computer Science,
and Information Technology. There are quite a considerable number of
graduates securing higher studies opportunities and scholarships in top -ranking
international universities, immediately after graduation. We were also
fortunate to produce several IT entrepreneurs whose startup company has grown
into highly reputed award -winning companies with international recognition.
We, the Department of Computational Mathematics, encourage you to envision
your future today, explore opportunities, embrace diversity, and be
competent individuals with direction.
Wish you all a memorable and inspiring stay at the University of Moratuwa!



Mrs. Wijewardene
Head, Department of Computational Mathematics
Tel - office:  0112 -650894 ext.8200
web"""

if __name__ == "__main__":
    main(pdf_path, candidate_text)
'''

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['ro

Most similar sentences (Lexical Similarity and BERTScore): [('The three Departments are; • Department of Information Technology • Department of Computational Mathematics • Department of Interdisciplinary Studies Presently, the Departments collaboratively offer subjects for the three undergraduate degree programmes conducted by the faculty', 'There are four main canteens and six other sales centres in the University'), ('Department of Computational Mathematics The Department of Computational Mathematics is one of the key pillars of the Faculty of Information Technology', 'Message from the Head, Department of Information Technology Welcome to the Faculty of Information Technology, University of Moratuwa! On behalf of the Department of Information Technology, I would like to warmly welcome you to the faculty'), ('On behalf of the Department of Computational Mathematics, let me take the opportunity to warmly welcome you to the Faculty of Information Technology at the University of Moratuwa