In [1]:
import json
from pathlib import Path

from PyPDF2 import PdfReader
from sentence_transformers import SentenceTransformer
from dotenv import load_dotenv,dotenv_values

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Source PDF whose text is chunked and embedded by this notebook.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
FILE_PATH = "C:/Ambarish/NCERT/CHAP04-BIOLOGY-CLASS11.pdf"

In [3]:
# Read settings from the local .env file; MODEL_NAME is the model identifier
# later handed to SentenceTransformer.
load_dotenv()
values_env = dotenv_values(".env")
# Plain indexing: a KeyError here means .env does not define MODEL_NAME.
MODEL_NAME = values_env['MODEL_NAME']

In [4]:
# Read the PDF file and return the text
def get_pdf_data(file_path, num_pages = 1):
    """Extract the text of every page of a PDF and return it as one string.

    Args:
        file_path: Path of the PDF, passed straight to ``PdfReader``.
        num_pages: Currently ignored — every page is read regardless of the
            value. Kept only so existing calls keep working.
            NOTE(review): decide whether to honour or remove this argument.

    Returns:
        The page texts concatenated in page order. If extraction fails part
        way through, the error is printed and the text gathered up to that
        point is returned (best-effort behaviour).

    Raises:
        Whatever ``PdfReader(file_path)`` raises when the file cannot be
        opened or parsed; only per-page extraction errors are handled here.
    """
    reader = PdfReader(file_path)
    pages = reader.pages
    page_texts = []

    try:
        for current_page in pages:
            # Guard against a falsy result (e.g. None or "" for a page without
            # a text layer) so one such page does not abort the whole read.
            page_texts.append(current_page.extract_text() or "")
    except Exception as exc:
        # Catch Exception, not everything: KeyboardInterrupt/SystemExit must
        # still propagate, which a bare except + return-in-finally prevented.
        print(f"Error reading file: {exc}")

    return "".join(page_texts)

In [5]:
# Divide the text into chunks of at most chunk_length characters
# (default 500), cutting at a sentence-ending period where possible.
def get_chunks(fulltext:str,chunk_length =500) -> list:
    """Split ``fulltext`` into consecutive chunks of at most ``chunk_length`` characters.

    Each chunk ends just before the last '.' found inside the current
    ``chunk_length``-character window; that period is consumed as the
    separator and appears in neither chunk. When the window contains no
    period the text is cut hard at ``chunk_length`` and nothing is dropped.

    Args:
        fulltext: The text to split.
        chunk_length: Maximum chunk size in characters; must be >= 1.

    Returns:
        A list of chunks in document order. The remainder (possibly an empty
        string) is always appended last, so the list is never empty.

    Raises:
        ValueError: If ``chunk_length`` is smaller than 1.
    """
    if chunk_length < 1:
        # A non-positive window can never make progress through the text.
        raise ValueError("chunk_length must be a positive integer")

    text = fulltext
    chunks = []
    while len(text) > chunk_length:
        last_period_index = text[:chunk_length].rfind('.')
        if last_period_index == -1:
            # No period in the window: hard cut, and resume exactly at the cut
            # so that no character is skipped.
            chunks.append(text[:chunk_length])
            text = text[chunk_length:]
        else:
            # Cut before the period and skip over it.
            chunks.append(text[:last_period_index])
            text = text[last_period_index + 1:]
    chunks.append(text)

    return chunks

In [6]:
# Alias of FILE_PATH used as the argument to get_pdf_data.
filename = FILE_PATH

In [7]:
# Base name of the PDF: the last '/'-separated component of FILE_PATH.
FILE_PATH.split('/')[-1]

'CHAP04-BIOLOGY-CLASS11.pdf'

In [8]:
# Extract the text of the PDF into a single string.
full_doc_text = get_pdf_data(filename)

In [9]:
# Sanity check: number of characters extracted from the PDF.
print(f'Full doc text length: {len(full_doc_text)}')

Full doc text length: 31802


In [10]:
# Split the document text into chunks of at most 500 characters each.
Lines =get_chunks(full_doc_text,500)

In [11]:
# Number of chunks produced.
len(Lines)

73

In [12]:
# Confirm the chunks are held in a plain list.
type(Lines)

list

In [13]:
# Load the sentence-embedding model named by MODEL_NAME.
model = SentenceTransformer(MODEL_NAME)

In [14]:
# Embed all chunks in one call, showing a progress bar.
embeddings_all = model.encode(Lines,show_progress_bar=True)

Batches: 100%|██████████| 3/3 [00:09<00:00,  3.24s/it]


In [15]:
# Should equal len(Lines): one embedding per chunk.
len(embeddings_all)

73

In [16]:
# Peek at the first five components of the embedding at index 4.
embeddings_all[4].tolist()[:5]

[0.03996901214122772,
 -0.038540229201316833,
 0.016151603311300278,
 -0.028872955590486526,
 -0.0932854413986206]

In [17]:
# Record counter, reset to 0 before the records are built.
counter = 0

In [18]:
# Records (one dict per chunk) that are written to JSON at the end of the notebook.
input_data = []

In [19]:
# Build one JSON-serialisable record per chunk. The list is rebuilt from
# scratch and ids come from enumerate(), so re-running this cell on its own
# is idempotent: no duplicated records and no index running past the end of
# embeddings_all because of a stale counter.
source_filename = FILE_PATH.split('/')[-1]
input_data = [
    {
        'id': str(chunk_index),
        'line': line,
        'embedding': embeddings_all[chunk_index].tolist(),
        'filename': source_filename,
    }
    for chunk_index, line in enumerate(Lines)
]
# Keep the notebook-level counter equal to the number of records built.
counter = len(input_data)

In [20]:
# Inspect the first record (id, chunk text, embedding, source file name).
input_data[0]

{'id': '0',
 'line': 'ANIMAL KINGDOM 3737\nWhen you look around, you will observe different animals with different\nstructures and forms.  As over a million species of animals have been\ndescribed till now, the need for classification becomes all the more\nimportant. The classification also helps in assigning a systematic position\nto newly described species.\n4',
 'embedding': [-0.007113905157893896,
  -0.03274315595626831,
  0.04404618963599205,
  -0.019613116979599,
  0.005531826987862587,
  -0.01970878429710865,
  -0.09384632110595703,
  -0.1018192321062088,
  0.053117722272872925,
  0.03509262204170227,
  -0.010434729978442192,
  -0.13946926593780518,
  -0.05225949361920357,
  0.04996480047702789,
  -0.004872112534940243,
  -0.021825173869729042,
  -0.002666912507265806,
  -0.008894842118024826,
  -0.05013251304626465,
  -0.028873901814222336,
  0.049069397151470184,
  0.031020402908325195,
  -0.010344062931835651,
  0.049582600593566895,
  -0.1187777891755104,
  -0.00626961095258

In [21]:
# Output embeddings to docVectors.json file
output_path = Path("../output/docVectors.json")
# Create the output folder if it is missing, so the dump does not fail with
# FileNotFoundError after the embeddings have already been computed.
output_path.parent.mkdir(parents=True, exist_ok=True)
with open(output_path, "w") as f:
    json.dump(input_data, f)