In [6]:
# Imports are grouped first so the cell's dependencies are visible at a glance.
import nltk
import pymupdf
import spacy
from sentence_transformers import SentenceTransformer, util

# Fetch the NLTK tokenizer data packages (no-op when already up to date).
nltk.download('punkt_tab')
nltk.download('punkt')

# Sentence-embedding model used by the retrieval and summary functions below.
model = SentenceTransformer('all-MiniLM-L6-v2')

# spaCy English pipeline.
nlp = spacy.load("en_core_web_sm")

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\romi3\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\romi3\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [7]:
#load pdf
def load_pdf(file_path):
    """Extract the plain text of every page of a PDF.

    Args:
        file_path: Path of the PDF file to open with PyMuPDF.

    Returns:
        One string containing each page's text in page order, with a newline
        appended after every page.
    """
    # The context manager closes the document (and its file handle) even if
    # text extraction raises part-way through; previously it was never closed.
    with pymupdf.open(file_path) as doc:
        return "".join(page.get_text() + "\n" for page in doc)

In [8]:
#split text
def split_text(text, chunk_size):
    """Group the sentences of ``text`` into space-joined chunks.

    Sentences are produced by ``nltk.sent_tokenize`` and accumulated in order.
    A chunk is emitted as soon as the summed character length of its sentences
    exceeds ``chunk_size``, so a chunk can be longer than ``chunk_size``. The
    spaces used to join sentences are not counted towards the length.

    Args:
        text: The text to split.
        chunk_size: Character threshold; crossing it closes the current chunk.

    Returns:
        List of chunk strings. Sentences left over at the end form the final
        chunk.
    """
    pieces = []
    pending = []
    pending_chars = 0

    for sent in nltk.sent_tokenize(text):
        pending.append(sent)
        pending_chars += len(sent)
        if pending_chars > chunk_size:
            # Threshold crossed: emit what has been gathered and start afresh.
            pieces.append(" ".join(pending))
            pending, pending_chars = [], 0

    # Flush the trailing sentences that never crossed the threshold.
    if pending:
        pieces.append(" ".join(pending))

    return pieces


In [9]:
#searching similarities for questions and chunks 
def search_chunks(chunks, query):
    """Return the chunk whose embedding is most similar to the query.

    Args:
        chunks: Sequence of text chunks to search.
        query: The question to match against the chunks.

    Returns:
        The element of ``chunks`` with the highest cosine similarity to
        ``query``.

    Raises:
        ValueError: If ``chunks`` is empty (there is nothing to rank).
    """
    # Fail early with a clear message instead of an opaque error from the
    # embedding / argmax calls when no chunks were produced.
    if len(chunks) == 0:
        raise ValueError("search_chunks() needs at least one chunk to search")

    chunk_embeddings = model.encode(chunks, convert_to_tensor=True)
    query_embedding = model.encode([query], convert_to_tensor=True)
    # Only one query was encoded, so row 0 holds its score against every chunk.
    similarities = util.cos_sim(query_embedding, chunk_embeddings)[0]
    best_idx = similarities.argmax().item()
    return chunks[best_idx]

In [10]:
#generating answers using gemini api
import os
from getpass import getpass

import google.generativeai as genai

# SECURITY: the API key must never be hard-coded in the notebook. A key that
# was previously committed here should be considered leaked and revoked.
# Read it from the GOOGLE_API_KEY environment variable, or prompt for it
# without echoing when the variable is not set.
api_key = os.environ.get("GOOGLE_API_KEY") or getpass("Enter your Gemini API key: ")
genai.configure(api_key=api_key)

def generate_answer(query, retrieved_text, model_name="gemini-1.5-pro-002"):
    """Ask a Gemini model to answer ``query`` using ``retrieved_text`` as context.

    Args:
        query: The question (or instruction) to answer.
        retrieved_text: Context text inserted into the prompt ahead of the
            question.
        model_name: Name of the Gemini model to use. Defaults to the model the
            notebook originally hard-coded.

    Returns:
        The ``text`` attribute of the response returned by
        ``generate_content``.
    """
    # Named ``llm`` (not ``model``) so it does not shadow the module-level
    # sentence-embedding model.
    llm = genai.GenerativeModel(model_name=model_name)

    prompt = f"""
    You are an AI assistant that answers questions based on provided text.
    Here is some relevant information:
    
    {retrieved_text}

    Based on the above, answer the following question:
    {query}
    """
    response = llm.generate_content(prompt)

    return response.text

In [11]:
# summary
def summerize(text, num_sentences=3):
    """Build an extractive summary from the most central sentences of ``text``.

    The text is split on ``'. '``, every piece is embedded, and each piece is
    scored by the sum of its cosine similarities to all pieces. The
    highest-scoring pieces are joined with ``'. '``, ordered from highest to
    lowest score.

    Args:
        text: The text to summarise.
        num_sentences: Maximum number of sentences to keep. Defaults to 3;
            negative values are treated as 0.

    Returns:
        The selected sentences joined by ``'. '``.
    """
    sentences = text.split('. ')
    sentence_embeddings = model.encode(sentences)
    scores = util.cos_sim(sentence_embeddings, sentence_embeddings)

    # One vectorised row-sum replaces Python's sum() walking every row element
    # by element, which was quadratic interpreter-level work.
    centrality = scores.sum(dim=1).tolist()

    central_sentences = sorted(
        range(len(sentences)),
        key=centrality.__getitem__,
        reverse=True,
    )[:max(num_sentences, 0)]
    return '. '.join(sentences[i] for i in central_sentences)


In [12]:
def main():
    """Run the interactive PDF question-answering flow.

    Prompts for a PDF path, prints an LLM-written summary built from the
    extractive summary of the document, then prompts for a question and prints
    the answer generated from the most similar text chunk.
    """
    file_path = input("Enter the path to the PDF file: ")
    text = load_pdf(file_path)

    # A PDF with no extractable text (e.g. scanned images) leaves nothing to
    # summarise or search, so stop before any embedding or LLM calls are made.
    if not text.strip():
        print("No extractable text was found in the PDF.")
        return

    chunk_size = 400
    chunks = split_text(text, chunk_size)

    extractive_summary = summerize(text)
    print("Pdf summery:")
    print(generate_answer("summarize", extractive_summary))

    query = input("Enter your question: ")
    best_chunk = search_chunks(chunks, query)
    gen_answer = generate_answer(query, best_chunk)
    print("The answer to your question is:")
    print(gen_answer)


if __name__ == "__main__":
    main()

Pdf summery:
Artificial intelligence (AI) aims to create machines that mimic human intelligence.  Machine learning (ML), a key branch of AI, allows computers to learn from data and improve without explicit programming.  Explainable AI (XAI) focuses on making AI models transparent and understandable.

The answer to your question is:
The three primary types of machine learning are:

1. Supervised Learning
2. Unsupervised Learning
3. Reinforcement Learning

