In [1]:
import re
import os
filenames = ['AR22.pdf', 'AR21.pdf', 'AR20.pdf','AR19.pdf', 'AR18-19.pdf', 'AR17-18.pdf','AR16-17.pdf','AR15-16.pdf','AR14-15.pdf','AR13-14.pdf']

In [1]:
import fitz
import nltk
from nltk.tokenize import word_tokenize

nltk.download('punkt')

def chunk_text_with_sentences(preprocessed_text, max_tokens=105):
    tokens = word_tokenize(preprocessed_text)
    chunks = []
    current_chunk = []

    for token in tokens:
        current_chunk.append(token)
        if len(current_chunk) >= max_tokens:
            chunks.append(" ".join(current_chunk))
            current_chunk = []

    if current_chunk:  # If there are remaining tokens
        chunks.append(" ".join(current_chunk))

    return chunks
    

def extract_and_preprocess_from_pdf(pdf_path):
    try:
        # Open the PDF file
        doc = fitz.open(pdf_path)
        # Extract text from each page
        text = ""
        for page in doc:
            text += page.get_text()
        
        # Remove unwanted characters
        text = re.sub(r'[^\w\s\d]', '', text)
        text = re.sub(r'\s+', ' ', text).strip()

        return text
    
    except Exception as e:
            print(f"Failed to extract text from {pdf_path}: {e}")

ModuleNotFoundError: No module named 'fitz'

In [1]:
import torch
from transformers import GPT2Tokenizer, GPT2Model

def embed(text_chunks, nums, model_name='gpt2', device='cpu'):
    # Load pre-trained GPT-2 tokenizer and model
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)
    model = GPT2Model.from_pretrained(model_name)
    # Move model to specified device
    model.to(device)
    # Set model to evaluation mode
    model.eval()

    # Tokenize and encode text chunks
    encoded_chunks = []
    for chunk in text_chunks:
        # Tokenize the chunk
        tokens = tokenizer.encode(chunk)
        nums.append(len(tokens))
        # Truncate tokens if exceeds max length
        if len(tokens) > tokenizer.model_max_length:
            tokens = tokens[:tokenizer.model_max_length]
        # Convert tokens to PyTorch tensor and add to encoded_chunks list
        encoded_chunks.append(torch.tensor(tokens).unsqueeze(0))

    # Generate embeddings
    embeddings = []
    with torch.no_grad():
        for chunk in encoded_chunks:
            chunk = chunk.to(device)
            outputs = model(chunk)
            # Extract hidden states (last hidden state)
            hidden_states = outputs.last_hidden_state
            # Take mean of hidden states across tokens to get chunk embedding
            chunk_embedding = torch.mean(hidden_states, dim=1)
            embeddings.append(chunk_embedding)

    # Concatenate embeddings along the first dimension to get a single tensor
    embeddings_tensor = torch.cat(embeddings, dim=0)
    
    return embeddings_tensor

In [9]:
file_path = "C:\\Users\\DELL\\OneDrive\\Desktop\\LLM1\\webs\\data"
text_chunks = []
nums = []
allembeddings = []
for i in filenames:
    text = extract_and_preprocess_from_pdf(file_path+"\\"+i)
    chunks = chunk_text_with_sentences(text)
    text_chunks.extend(chunks)
    break
nums = []
allembeddings.extend(embed(text_chunks,nums))
print(len(text_chunks))
print(len(allembeddings))

685
685


In [10]:
print(max(nums))
print(len(allembeddings))
print(len(allembeddings[0]))

666
685
768


In [11]:
# storage

import pickle
# import fitz

# Saving allembeddings and token_chunks
with open('allembeddings.pkl', 'wb') as f:
    pickle.dump(allembeddings, f)

with open('text_chunks.pkl', 'wb') as f:
    pickle.dump(text_chunks, f)


In [12]:
import pickle

with open('allembeddings.pkl','rb') as f:
    loaded_allembeddings = pickle.load(f)
with open('text_chunks.pkl', 'rb') as f:
    loaded_textchunks = pickle.load(f)
print("allembeddings=",len(loaded_allembeddings))
print("loaded_textchunks=",len(loaded_textchunks))

allembeddings= 685
loaded_textchunks= 685


In [13]:
import pickle
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def find_nearest_embeddings(query_embedding, model_name='gpt2', device='cpu', top_k=5):
    
    with open('allembeddings.pkl','rb') as f:
        embeddings = pickle.load(f)

    # Convert lists of tensors to a single tensor
    # query_embedding = torch.stack(query_embedding).to(device)
    embeddings = torch.stack(embeddings).to(device)

    # Move tensors to the specified device
    query_embedding = query_embedding.to(device)
    embeddings = embeddings.to(device)

    # Compute cosine similarities
    similarities = cosine_similarity(query_embedding.cpu().numpy(), embeddings.cpu().numpy())[0]

    # Find the top_k nearest embeddings
    top_k_indices = np.argsort(similarities)[-top_k:][::-1]
    # top_k_similarities = similarities[top_k_indices]

    return top_k_indices.tolist()

query_nums=[]
query = "Who is the current director of ICAR-CRIDA, and who were the members of the editorial committee for the 2022 annual report?"
# print(embed([query],query_nums)[0])
nearest_ids = find_nearest_embeddings(embed([query],query_nums), model_name='gpt2', device='cpu', top_k=5)
print(nearest_ids)

[392, 4, 119, 619, 93]


In [3]:
!pip install nltk

Defaulting to user installation because normal site-packages is not writeable
Collecting nltk
  Using cached nltk-3.8.1-py3-none-any.whl.metadata (2.8 kB)
Using cached nltk-3.8.1-py3-none-any.whl (1.5 MB)
Installing collected packages: nltk
Successfully installed nltk-3.8.1


In [11]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import torch
import nltk
from nltk.tokenize import sent_tokenize

nltk.download('punkt')
# tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
# model = GPT2Model.from_pretrained("gpt2")
  
def generate_answer(query: str, context: str, model_name: str = 'gpt2', max_length: int = 1024) -> str:

    # Load the tokenizer and model
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)
    model = GPT2LMHeadModel.from_pretrained(model_name)

    # Prepare the input text
    input_text = f"Context: {context}\n\nQuery: {query}\n\nAnswer:"
    
    # Encode the input text
    input_ids = tokenizer.encode(input_text, return_tensors='pt')

    # Generate the output
    output_ids = model.generate(input_ids, 
                                max_length=max_length, 
                                num_return_sequences=1,
                                no_repeat_ngram_size=2, 
                                temperature=0.5,
                                top_p=0.9,
                                do_sample=True,
                                pad_token_id=tokenizer.eos_token_id)

    # Decode the output
    answer = tokenizer.decode(output_ids[0], skip_special_tokens=True)

    # Post-process the output to extract the answer part
    # answer = answer.split("Answer:")[1].strip()
    answer = answer.split("Answer:")[1].strip().split("\n")[0]

    # Tokenize the answer into sentences
    sentences = sent_tokenize(answer)

    # Reconstruct the answer without the last incomplete sentence
    complete_answer = ' '.join(sentences[:-1]) if not answer.endswith('.') else answer
    return complete_answer


def get_context(nearest_ids):
    with open('text_chunks.pkl', 'rb') as f:
        loaded_chunks = pickle.load(f)

    # nearest_ids.sort()
    chunks = [loaded_chunks[i] for i in nearest_ids]
    # for i,j in enumerate(chunks):
    #     print(i,j)
    context = " ".join([i for i in chunks])
    # print("context: ", context)
    return context

# print("Query: ",query)
# context = get_context(nearest_ids)
answer = generate_answer("who are you", "context")
print("answer: ", answer)

[nltk_data] Downloading package punkt to /home/sharan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
The attention mask is not set and cannot be inferred from input because pad token is same as eos token.As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


answer:  
