<a href="https://colab.research.google.com/github/rshriroop01/rshriroop01/blob/main/hack24.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import spacy
from PyPDF2 import PdfReader
from transformers import BertTokenizer, BertModel
import torch
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Load the English language model in spaCy
nlp = spacy.load("en_core_web_sm")

# Load BERT tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Load BERT model
model = BertModel.from_pretrained("bert-base-uncased")

def pdf_to_text(pdf_path):
    text = ""
    with open(pdf_path, "rb") as file:
        reader = PdfReader(file)
        num_pages = len(reader.pages)
        for page_num in range(num_pages):
            page = reader.pages[page_num]
            text += page.extract_text()
    return text

def process_text(text):
    # Process the text with spaCy
    doc = nlp(text)
    # Split the text into sentences
    sentences = [sent.text for sent in doc.sents]
    return sentences

def get_bert_embeddings(sentences):
    # Tokenize and encode the sentences
    encoded_sentences = tokenizer(sentences, padding=True, truncation=True, return_tensors="pt")

    # Extract embeddings from BERT model
    with torch.no_grad():
        outputs = model(**encoded_sentences)
        embeddings = outputs.last_hidden_state  # Use last hidden state as embeddings

    # Reshape embeddings to remove extra dimensions
    embeddings = torch.squeeze(embeddings, dim=0)

    return embeddings

def calculate_similarity(embeddings1, embeddings2, sentences1, sentences2):

    embeddings1 = embeddings1.reshape(-1, embeddings1.shape[-1])
    embeddings2 = embeddings2.reshape(-1, embeddings2.shape[-1])
    # Convert embeddings to numpy arrays
    embeddings1 = embeddings1.numpy()
    embeddings2 = embeddings2.numpy()

    # Check if embeddings are empty
    if embeddings1.size == 0 or embeddings2.size == 0:
        print("Error: Empty embeddings")
        return

    # Check the dimensions of embeddings
    print("Embeddings from PDF:", embeddings1.shape)
    print("Embeddings from list of sentences:", embeddings2.shape)

    # Reshape embeddings if necessary
    if embeddings1.shape[1] != embeddings2.shape[1]:
        min_features = min(embeddings1.shape[1], embeddings2.shape[1])
        embeddings1 = embeddings1[:, :min_features]
        embeddings2 = embeddings2[:, :min_features]

    # Calculate cosine similarity between embeddings of two sets of sentences
    similarity_matrix = cosine_similarity(embeddings1, embeddings2)

    # Print similarity matrix
    print("Similarity Matrix:")
    print(similarity_matrix)

    # Find the most similar pairs of sentences
    most_similar_indices = np.unravel_index(np.argmax(similarity_matrix), similarity_matrix.shape)

    # Check if indices are out of range
    if most_similar_indices[0] >= len(sentences1) or most_similar_indices[1] >= len(sentences2):
        print("Error: Most similar indices out of range")
        return

    most_similar_sentence_pair = (sentences1[most_similar_indices[0]], sentences2[most_similar_indices[1]])

    print("\nMost Similar Sentence Pair:")
    print(most_similar_sentence_pair)

def main():
    pdf_path = "/2022_ESCO_ESG_Report.pdf"
    text_from_pdf = pdf_to_text(pdf_path)
    sentences_from_pdf = process_text(text_from_pdf)
    print("Extracted Sentences from PDF:")
    print(sentences_from_pdf)

    # Define the sentences
    sentences = [
        'msci sustainalytics',
        'net zero target',
        'interim emissions reduction target',
        'Renewable elecricity targets',
        'circularity strategy targets',
        'diversity ,equity and inclusion Target',
        'Employee health and safety audits',
        'supply chain audits'
    ]

    # Get BERT embeddings for sentences extracted from the PDF and the provided list of sentences
    embeddings_from_pdf = get_bert_embeddings(sentences_from_pdf)
    embeddings = get_bert_embeddings(sentences)

    # Calculate similarity between embeddings of the two sets of sentences
    calculate_similarity(embeddings_from_pdf, embeddings, sentences_from_pdf, sentences)

if __name__ == "__main__":
    main()


Extracted Sentences from PDF:
['ESG REPORT2022\nESCO TECHNOLOGIES INC.ABOUT CONTENTS ENVIRONMENTAL SOCIAL GOVERNANCE  2022 ESG Report  /  ESCO Technologies Inc.2\nCONTENTS\nContents\nABOUT ESCO TECHNOLOGIES INC.\n', 'A Message from Our CEO  ................................. 4\nAbout ESCO Technologies Inc.  ..........................', '5ENVIRONMENTAL\nEnvironmental  ................................................. 7\nESCO Participates in the Green  \nBusiness Challenge  .......................................... 8\nESCO’s Subsidiary NRG Systems, Inc.  \nMakes Clean Power Possible  ............................', '9\nServing the Wind Industry  ..............................', '10\nServing the Solar Industry  .............................. 11\n2022 NRG Projects  ....................................... 13\nESCO Environmental Footprint  ....................... 14SOCIAL\nSocial Highlights for 2022  .............................', '18\nWorkplace Health & Safety  .............................

In [33]:
pip install pdfplumber transformers




In [34]:
pip install PyPDF2



In [35]:
pip install spacy



In [36]:
pip install nltk



In [37]:
pip install en_core_web_sm




In [23]:
# import spacy
# import PyPDF2
# import nltk
# from nltk.tokenize import sent_tokenize

# # Load the English language model in spaCy
# nlp = spacy.load("en_core_web_sm")

# def pdf_to_text(pdf_path):
#     text = ""
#     with open(pdf_path, "rb") as file:
#         reader = PyPDF2.PdfFileReader(file)
#         num_pages = reader.numPages
#         for page_num in range(num_pages):
#             page = reader.getPage(page_num)
#             text += page.extractText()
#     return text

# def process_text(text):
#     # Split the text into sentences using spaCy
#     doc = nlp(text)
#     sentences = [sent.text for sent in doc.sents]

#     # Tokenize each sentence using NLTK
#     tokenized_sentences = [word_tokenize(sentence) for sentence in sentences]

#     return tokenized_sentences

# def main():
#     pdf_path = "/2022_ESCO_ESG_Report.pdf"
#     text = pdf_to_text(pdf_path)
#     tokenized_sentences = process_text(text)
#     for sentence_tokens in tokenized_sentences:
#         print(sentence_tokens)

# if __name__ == "__main__":
#     main()
import spacy
from PyPDF2 import PdfReader

# Load the English language model in spaCy
nlp = spacy.load("en_core_web_sm")

def pdf_to_text(pdf_path):
    text = ""
    with open(pdf_path, "rb") as file:
        reader = PdfReader(file)
        num_pages = len(reader.pages)
        for page_num in range(num_pages):
            page = reader.pages[page_num]
            text += page.extract_text()
    return text

def process_text(text):
    # Process the text with spaCy
    doc = nlp(text)
    # Split the text into sentences
    sentences = [sent.text for sent in doc.sents]
    return sentences

def main():
    pdf_path = "/2022_ESCO_ESG_Report.pdf"
    text = pdf_to_text(pdf_path)
    sentences = process_text(text)
    print(sentences)

if __name__ == "__main__":
    main()


['ESG REPORT2022\nESCO TECHNOLOGIES INC.ABOUT CONTENTS ENVIRONMENTAL SOCIAL GOVERNANCE  2022 ESG Report  /  ESCO Technologies Inc.2\nCONTENTS\nContents\nABOUT ESCO TECHNOLOGIES INC.\n', 'A Message from Our CEO  ................................. 4\nAbout ESCO Technologies Inc.  ..........................', '5ENVIRONMENTAL\nEnvironmental  ................................................. 7\nESCO Participates in the Green  \nBusiness Challenge  .......................................... 8\nESCO’s Subsidiary NRG Systems, Inc.  \nMakes Clean Power Possible  ............................', '9\nServing the Wind Industry  ..............................', '10\nServing the Solar Industry  .............................. 11\n2022 NRG Projects  ....................................... 13\nESCO Environmental Footprint  ....................... 14SOCIAL\nSocial Highlights for 2022  .............................', '18\nWorkplace Health & Safety  ............................. 19\nCyber Security  .........

In [29]:
import spacy
from PyPDF2 import PdfReader
from transformers import BertTokenizer, BertModel
import torch
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Load the English language model in spaCy
nlp = spacy.load("en_core_web_sm")

# Load BERT tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Load BERT model
model = BertModel.from_pretrained("bert-base-uncased")

def pdf_to_text(pdf_path):
    text = ""
    with open(pdf_path, "rb") as file:
        reader = PdfReader(file)
        num_pages = len(reader.pages)
        for page_num in range(num_pages):
            page = reader.pages[page_num]
            text += page.extract_text()
    return text

def process_text(text):
    # Process the text with spaCy
    doc = nlp(text)
    # Split the text into sentences
    sentences = [sent.text for sent in doc.sents]
    return sentences

def get_bert_embeddings(sentences):
    # Tokenize and encode the sentences
    encoded_sentences = tokenizer(sentences, padding=True, truncation=True, return_tensors="pt")

    # Extract embeddings from BERT model
    with torch.no_grad():
        outputs = model(**encoded_sentences)
        embeddings = outputs.last_hidden_state  # Use last hidden state as embeddings

    # Reshape embeddings to remove extra dimensions
    embeddings = torch.squeeze(embeddings, dim=0)

    return embeddings

def calculate_similarity(embeddings1, embeddings2):
    # Reshape embeddings to 2D arrays
    embeddings1 = embeddings1.reshape(-1, embeddings1.shape[-1])
    embeddings2 = embeddings2.reshape(-1, embeddings2.shape[-1])

    # Convert embeddings to numpy arrays
    embeddings1 = embeddings1.numpy()
    embeddings2 = embeddings2.numpy()

    # Check the dimensions of embeddings
    print("Embeddings from PDF:", embeddings1.shape)
    print("Embeddings from list of sentences:", embeddings2.shape)

    # Calculate cosine similarity between embeddings of two sets of sentences
    similarity_matrix = cosine_similarity(embeddings1, embeddings2)

    # Print similarity matrix
    print("Similarity Matrix:")
    print(similarity_matrix)

    # Find the most similar pairs of sentences
    most_similar_indices = np.unravel_index(np.argmax(similarity_matrix), similarity_matrix.shape)
    most_similar_sentence_pair = (sentences1[most_similar_indices[0]], sentences2[most_similar_indices[1]])

    print("\nMost Similar Sentence Pair:")
    print(most_similar_sentence_pair)

def main():
    pdf_path = "/2022_ESCO_ESG_Report.pdf"
    text_from_pdf = pdf_to_text(pdf_path)
    sentences_from_pdf = process_text(text_from_pdf)
    print("Extracted Sentences from PDF:")
    print(sentences_from_pdf)

    # Define the sentences
    sentences = [
        'msci sustainalytics',
        'net zero target',
        'interim emissions reduction target',
        'Renewable elecricity targets',
        'circularity strategy targets',
        'diversity ,equity and inclusion Target',
        'Employee health and safety audits',
        'supply chain audits'
    ]

    # Get BERT embeddings for sentences extracted from the PDF and the provided list of sentences
    embeddings_from_pdf = get_bert_embeddings(sentences_from_pdf)
    embeddings = get_bert_embeddings(sentences)

    # Calculate similarity between embeddings of the two sets of sentences
    calculate_similarity(embeddings_from_pdf, embeddings)

if __name__ == "__main__":
    main()


Extracted Sentences from PDF:
['ESG REPORT2022\nESCO TECHNOLOGIES INC.ABOUT CONTENTS ENVIRONMENTAL SOCIAL GOVERNANCE  2022 ESG Report  /  ESCO Technologies Inc.2\nCONTENTS\nContents\nABOUT ESCO TECHNOLOGIES INC.\n', 'A Message from Our CEO  ................................. 4\nAbout ESCO Technologies Inc.  ..........................', '5ENVIRONMENTAL\nEnvironmental  ................................................. 7\nESCO Participates in the Green  \nBusiness Challenge  .......................................... 8\nESCO’s Subsidiary NRG Systems, Inc.  \nMakes Clean Power Possible  ............................', '9\nServing the Wind Industry  ..............................', '10\nServing the Solar Industry  .............................. 11\n2022 NRG Projects  ....................................... 13\nESCO Environmental Footprint  ....................... 14SOCIAL\nSocial Highlights for 2022  .............................', '18\nWorkplace Health & Safety  .............................

NameError: name 'sentences1' is not defined

In [31]:
import spacy
from PyPDF2 import PdfReader
from transformers import BertTokenizer, BertModel
import torch
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Load the English language model in spaCy
nlp = spacy.load("en_core_web_sm")

# Load BERT tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Load BERT model
model = BertModel.from_pretrained("bert-base-uncased")

def pdf_to_text(pdf_path):
    text = ""
    with open(pdf_path, "rb") as file:
        reader = PdfReader(file)
        num_pages = len(reader.pages)
        for page_num in range(num_pages):
            page = reader.pages[page_num]
            text += page.extract_text()
    return text

def process_text(text):
    # Process the text with spaCy
    doc = nlp(text)
    # Split the text into sentences
    sentences = [sent.text for sent in doc.sents]
    return sentences

def get_bert_embeddings(sentences):
    # Tokenize and encode the sentences
    encoded_sentences = tokenizer(sentences, padding=True, truncation=True, return_tensors="pt")

    # Extract embeddings from BERT model
    with torch.no_grad():
        outputs = model(**encoded_sentences)
        embeddings = outputs.last_hidden_state  # Use last hidden state as embeddings

    # Reshape embeddings to remove extra dimensions
    embeddings = torch.squeeze(embeddings, dim=0)

    return embeddings

def calculate_similarity(embeddings1, embeddings2, sentences1, sentences2):

    embeddings1 = embeddings1.reshape(-1, embeddings1.shape[-1])
    embeddings2 = embeddings2.reshape(-1, embeddings2.shape[-1])
    # Convert embeddings to numpy arrays
    embeddings1 = embeddings1.numpy()
    embeddings2 = embeddings2.numpy()

    # Check if embeddings are empty
    if embeddings1.size == 0 or embeddings2.size == 0:
        print("Error: Empty embeddings")
        return

    # Check the dimensions of embeddings
    print("Embeddings from PDF:", embeddings1.shape)
    print("Embeddings from list of sentences:", embeddings2.shape)

    # Reshape embeddings if necessary
    if embeddings1.shape[1] != embeddings2.shape[1]:
        min_features = min(embeddings1.shape[1], embeddings2.shape[1])
        embeddings1 = embeddings1[:, :min_features]
        embeddings2 = embeddings2[:, :min_features]

    # Calculate cosine similarity between embeddings of two sets of sentences
    similarity_matrix = cosine_similarity(embeddings1, embeddings2)

    # Print similarity matrix
    print("Similarity Matrix:")
    print(similarity_matrix)

    # Find the most similar pairs of sentences
    most_similar_indices = np.unravel_index(np.argmax(similarity_matrix), similarity_matrix.shape)

    # Check if indices are out of range
    if most_similar_indices[0] >= len(sentences1) or most_similar_indices[1] >= len(sentences2):
        print("Error: Most similar indices out of range")
        return

    most_similar_sentence_pair = (sentences1[most_similar_indices[0]], sentences2[most_similar_indices[1]])

    print("\nMost Similar Sentence Pair:")
    print(most_similar_sentence_pair)

def main():
    pdf_path = "/2022_ESCO_ESG_Report.pdf"
    text_from_pdf = pdf_to_text(pdf_path)
    sentences_from_pdf = process_text(text_from_pdf)
    print("Extracted Sentences from PDF:")
    print(sentences_from_pdf)

    # Define the sentences
    sentences = [
        'msci sustainalytics',
        'net zero target',
        'interim emissions reduction target',
        'Renewable elecricity targets',
        'circularity strategy targets',
        'diversity ,equity and inclusion Target',
        'Employee health and safety audits',
        'supply chain audits'
    ]

    # Get BERT embeddings for sentences extracted from the PDF and the provided list of sentences
    embeddings_from_pdf = get_bert_embeddings(sentences_from_pdf)
    embeddings = get_bert_embeddings(sentences)

    # Calculate similarity between embeddings of the two sets of sentences
    calculate_similarity(embeddings_from_pdf, embeddings, sentences_from_pdf, sentences)

if __name__ == "__main__":
    main()


Extracted Sentences from PDF:
['ESG REPORT2022\nESCO TECHNOLOGIES INC.ABOUT CONTENTS ENVIRONMENTAL SOCIAL GOVERNANCE  2022 ESG Report  /  ESCO Technologies Inc.2\nCONTENTS\nContents\nABOUT ESCO TECHNOLOGIES INC.\n', 'A Message from Our CEO  ................................. 4\nAbout ESCO Technologies Inc.  ..........................', '5ENVIRONMENTAL\nEnvironmental  ................................................. 7\nESCO Participates in the Green  \nBusiness Challenge  .......................................... 8\nESCO’s Subsidiary NRG Systems, Inc.  \nMakes Clean Power Possible  ............................', '9\nServing the Wind Industry  ..............................', '10\nServing the Solar Industry  .............................. 11\n2022 NRG Projects  ....................................... 13\nESCO Environmental Footprint  ....................... 14SOCIAL\nSocial Highlights for 2022  .............................', '18\nWorkplace Health & Safety  .............................

In [40]:
pip install sentence-transformers


Collecting sentence-transformers
  Downloading sentence_transformers-2.5.1-py3-none-any.whl (156 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m156.5/156.5 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.7/23.7 MB[0m [31m24.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m823.6/823.6 kB[0m [31m41.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14

In [41]:
from PyPDF2 import PdfReader
from sentence_transformers import SentenceTransformer
from scipy.spatial.distance import cosine

def pdf_to_text(pdf_path):
    text = ""
    with open(pdf_path, "rb") as file:
        reader = PdfReader(file)
        for page in reader.pages:
            text += page.extract_text()
    return text

def process_text(text):
    sentences = [sent.strip() for sent in text.split('\n') if sent.strip()]
    return sentences

def calculate_similarity(sentences1, sentences2):
    # Load Universal Sentence Encoder
    model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

    # Generate embeddings for sentences
    embeddings_from_pdf = model.encode(sentences1)
    embeddings = model.encode(sentences2)

    # Calculate similarity between embeddings
    similarity_scores = [[1 - cosine(embedding1, embedding2) for embedding2 in embeddings] for embedding1 in embeddings_from_pdf]

    # Find the most similar pair
    max_similarity = 0
    most_similar_pair = ()
    for i, scores in enumerate(similarity_scores):
        for j, score in enumerate(scores):
            if score > max_similarity:
                max_similarity = score
                most_similar_pair = (sentences1[i], sentences2[j])

    print("Most Similar Sentence Pair:", most_similar_pair)

def main():
    pdf_path = "/2022_ESCO_ESG_Report.pdf"
    text_from_pdf = pdf_to_text(pdf_path)
    sentences_from_pdf = process_text(text_from_pdf)
    print("Extracted Sentences from PDF:")
    print(sentences_from_pdf)

    # Define the sentences
    sentences = [
        'msci sustainalytics',
        'net zero target',
        'interim emissions reduction target',
        'Renewable electricity targets',
        'circularity strategy targets',
        'diversity, equity and inclusion Target',
        'Employee health and safety audits',
        'supply chain audits'
    ]

    # Calculate similarity between sentences from PDF and predefined sentences
    calculate_similarity(sentences_from_pdf, sentences)

if __name__ == "__main__":
    main()


Extracted Sentences from PDF:
['ESG REPORT2022', 'ESCO TECHNOLOGIES INC.ABOUT CONTENTS ENVIRONMENTAL SOCIAL GOVERNANCE  2022 ESG Report  /  ESCO Technologies Inc.2', 'CONTENTS', 'Contents', 'ABOUT ESCO TECHNOLOGIES INC.', 'A Message from Our CEO  ................................. 4', 'About ESCO Technologies Inc.  .......................... 5ENVIRONMENTAL', 'Environmental  ................................................. 7', 'ESCO Participates in the Green', 'Business Challenge  .......................................... 8', 'ESCO’s Subsidiary NRG Systems, Inc.', 'Makes Clean Power Possible  ............................ 9', 'Serving the Wind Industry  .............................. 10', 'Serving the Solar Industry  .............................. 11', '2022 NRG Projects  ....................................... 13', 'ESCO Environmental Footprint  ....................... 14SOCIAL', 'Social Highlights for 2022  ............................. 18', 'Workplace Health & Safety  ...............

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.73k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Most Similar Sentence Pair: ('Renewable Energy', 'Renewable electricity targets')


In [None]:
0