In [4]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from difflib import Differ
import pdfplumber

def pdf_to_text(pdf_path):
    with pdfplumber.open(pdf_path) as pdf:
        text = ""
        for page in pdf.pages:
            text += page.extract_text()
    return text

def compare_text(text1, text2):
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    model = BertForSequenceClassification.from_pretrained("bert-base-uncased")

    text_pair = [(text1, text2)]
    inputs = tokenizer.batch_encode_plus(
        text_pair, add_special_tokens=True, return_tensors="pt", padding=True, truncation=True
    )
    
    with torch.no_grad():
        logits = model(**inputs).logits

    predictions = torch.argmax(logits, dim=1).tolist()
    return predictions[0]  # 0 for similar, 1 for different

def highlight_differences_with_line_numbers(text1, text2):
    differ = Differ()
    diff = list(differ.compare(text1.splitlines(), text2.splitlines()))
    changes = 0
    highlighted_diff = []
    onlyChanges =[]
    for line_number, line in enumerate(diff, start=1):
        if line.startswith('- '):
            highlighted_diff.append(f'Line {line_number}: <span style="background-color: #ffcccc;">{line}</span>')
            onlyChanges.append(f'Line {line_number}: <span style="background-color: #ffcccc;">{line}</span>')

            changes += 1
        elif line.startswith('+ '):
            highlighted_diff.append(f'Line {line_number}: <span style="background-color: #ccffcc;">{line}</span>')
            onlyChanges.append(f'Line {line_number}: <span style="background-color: #ccffcc;">{line}</span>')
            changes += 1
        else:
            highlighted_diff.append(line)
    return '\n'.join(highlighted_diff), changes , '\n'.join(onlyChanges)

def main():
    pdf1_text = pdf_to_text('Email1.pdf')
    pdf2_text = pdf_to_text('Email2.pdf')

    prediction = compare_text(pdf1_text, pdf2_text)
    if prediction == 0:
        print("The texts are similar.")
    else:
        print("The texts are different.")

    diff_highlighted, total_changes , diff_changes = highlight_differences_with_line_numbers(pdf1_text, pdf2_text)
    print(f"Total Possible Changes: {total_changes}\n")
    print(diff_changes)
    print(diff_highlighted)

if __name__ == "__main__":
    main()


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


The texts are similar.
Total Possible Changes: 7

Line 20: <span style="background-color: #ffcccc;">- Dear Mr. Arnold:</span>
Line 21: <span style="background-color: #ccffcc;">+ Dear Mr. Arnold:R ohit Sharma</span>
Line 38: <span style="background-color: #ffcccc;">- Your name</span>
Line 39: <span style="background-color: #ccffcc;">+ Your name Rohit Sharma</span>
Line 40: <span style="background-color: #ffcccc;">- Your address</span>
Line 41: <span style="background-color: #ccffcc;">+ Your address Mahua</span>
Line 44: <span style="background-color: #ccffcc;">+ Bihar ,844122</span>
  SAMPLE INFORMATIONAL INTERVIEW EMAILS
  *Do not copy these emails.
  Email No. 1
  Subject: Greetings from a Washington University alumnus
  Dear Ms. Brown:
  I am a first-year student at John Marshall Law School in Atlanta and am writing to you as a
  fellow Washington University alumnus. I am contacting you because I want to make effective
  use of my first-year summer. I believe I would enjoy a career i