In [4]:
from langchain_unstructured import UnstructuredLoader
loader_local = UnstructuredLoader(
    file_path="nlp.pdf",
    strategy="hi_res",
)
docs_local = []
for doc in loader_local.lazy_load():
    docs_local.append(doc)

  from .autonotebook import tqdm as notebook_tqdm
INFO: pikepdf C++ to Python logger bridge initialized
INFO: Reading PDF for file: nlp.pdf ...


In [5]:
len(docs_local)

313

In [6]:
print(docs_local[0].page_content)

1. What is Natural Language Processing (NLP)?


In [7]:
first_page_docs = [doc for doc in docs_local if doc.metadata.get("page_number") == 1]

for doc in first_page_docs:
    print(doc.page_content)

1. What is Natural Language Processing (NLP)?
Answer: Natural Language Processing (NLP) is a field of Artificial Intelligence (AI) concerned with the interactions between computers and human (natural) languages. It focuses on enabling computers to understand, interpret, and generate human language in a way that is both meaningful and useful.
2. Mention any two real-world applications of NLP.
Answer:
• Sentiment Analysis: Determining the emotional tone or attitude expressed in text, used for market research, brand monitoring, etc.
• Chatbots and Conversational AI: Building interactive agents that can engage in conversations with humans, provide customer service, answer questions, and more.
3. Define empirical laws in the context of NLP.
Zipf’s Law: When words are ranked according to their frequencies in a large enough collection of texts and then the frequency is plotted against the rank, the result is a logarithmic curve.
Heap's law states that the number of unique words V in a collect

In [8]:
section_data =[]
section =""
for docs in docs_local:
    if docs.metadata.get("category") == "Title":
        
        section_data.append(section)
        section =""
        section+= docs.page_content + "\n"
        
    else:
        section += docs.page_content + "\n"

       
   

In [9]:
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
from sentence_transformers import SentenceTransformer

# Initialize
model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=0)

# Suppose section_data is a list of section texts
all_chunks = []
raw_chunks =[]

for i, section in enumerate(section_data):
    if len(section) > 0:
        chunks = text_splitter.split_text(section)
        print(f"Section {i} has {len(chunks)} chunks")
        for j, chunk in enumerate(chunks):
            # Optional: Add metadata like section number
            all_chunks.append(Document(
                page_content=chunk,
                metadata={"section_id": i, "chunk_id": j}
            ))
            raw_chunks.append(chunk)

# Create the vector index
embeddings = model.embed_documents(raw_chunks)

""" vectorindex = FAISS.from_documents(all_chunks, embeddings) """




INFO: Use pytorch device_name: cpu
INFO: Load pretrained SentenceTransformer: all-MiniLM-L6-v2


Section 1 has 1 chunks
Section 2 has 1 chunks
Section 3 has 1 chunks
Section 4 has 1 chunks
Section 5 has 1 chunks
Section 6 has 1 chunks
Section 7 has 1 chunks
Section 8 has 6 chunks
Section 9 has 2 chunks
Section 10 has 1 chunks
Section 11 has 1 chunks
Section 12 has 3 chunks
Section 13 has 1 chunks
Section 14 has 1 chunks
Section 15 has 1 chunks
Section 16 has 2 chunks
Section 17 has 3 chunks
Section 18 has 1 chunks
Section 19 has 2 chunks
Section 20 has 1 chunks
Section 21 has 2 chunks
Section 22 has 1 chunks
Section 23 has 2 chunks
Section 24 has 1 chunks
Section 25 has 1 chunks
Section 26 has 5 chunks
Section 27 has 5 chunks
Section 28 has 4 chunks
Section 29 has 2 chunks
Section 30 has 1 chunks
Section 31 has 1 chunks
Section 32 has 1 chunks
Section 33 has 1 chunks
Section 34 has 3 chunks
Section 35 has 1 chunks
Section 36 has 1 chunks
Section 37 has 1 chunks
Section 38 has 1 chunks
Section 39 has 2 chunks
Section 40 has 2 chunks
Section 41 has 2 chunks
Section 42 has 3 chunks
S

' vectorindex = FAISS.from_documents(all_chunks, embeddings) '

In [10]:
from sklearn.cluster import KMeans
num_topics = 10
kmeans = KMeans(n_clusters=num_topics, random_state=42)
labels = kmeans.fit_predict(embeddings)


In [11]:
import os
import dotenv
dotenv.load_dotenv()
from langchain_groq import ChatGroq
llm = ChatGroq(
    model="llama-3.1-8b-instant",
    temperature=0,
    max_tokens=700,
    timeout=None,
    max_retries=2,
) 

In [12]:
import re
cluster_topic_titles = {}
for cluster_id in set(labels):
    rep_idx = list(labels).index(cluster_id)
    rep_chunk = raw_chunks[rep_idx]

    # Ask LLM to name this topic
    # Updated prompt
    prompt = (
        f"Give a very short and clear title for the following topic content.\n"
        f"Just return the title. No explanations, no quotes, no alternatives, no extra text.\n\n"
        f"{rep_chunk}"
    )
    raw_title = llm.invoke(prompt).content.strip()
    clean_title = re.sub(r'^["“”‘’\'*]*|["“”‘’\'*.:]*$', '', raw_title)  # trim quotes, punctuation
    clean_title = re.sub(r'^(Topic Title|Title)\s*[:\-]\s*', '', clean_title, flags=re.IGNORECASE)
    clean_title = clean_title.split("\n")[0].strip()
    cluster_topic_titles[cluster_id] = clean_title
    
labeled_chunks = []
for i, chunk_text in enumerate(raw_chunks):
    chunk_meta = {
        "section_id": all_chunks[i].metadata["section_id"],
        "chunk_id": all_chunks[i].metadata["chunk_id"],
        "cluster_id": int(labels[i]),
        "topic": cluster_topic_titles[labels[i]]
    }
    labeled_chunks.append({
        "text": chunk_text,
        "embedding": embeddings[i],
        "metadata": chunk_meta
    })

INFO: HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
INFO: HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
INFO: HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
INFO: HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
INFO: HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
INFO: HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
INFO: HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
INFO: HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
INFO: HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
INFO: HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"


In [13]:
vectorstore = FAISS.from_texts(
    texts=[chunk["text"] for chunk in labeled_chunks],
    embedding=model,
    metadatas=[chunk["metadata"] for chunk in labeled_chunks]
)

# === Done! You can now use vectorstore.as_retriever() ===
retriever = vectorstore.as_retriever(search_kwargs=dict(k=5))

INFO: Loading faiss with AVX2 support.
INFO: Successfully loaded faiss with AVX2 support.
INFO: Failed to load GPU Faiss: name 'GpuIndexIVFFlat' is not defined. Will not load constructor refs for GPU indexes. This is only an error if you're trying to use GPU Faiss.


In [12]:
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA

prompt_template = """You are a helpful assistant. 
Use ONLY the following context to answer the question. 
Do NOT use any prior knowledge. 
If the answer is not in the context, respond with "The answer is not available in the provided context."

Context:
{context}

Question:
{question}

Answer:"""

prompt = PromptTemplate(
    input_variables=["context", "question"],
    template=prompt_template,
)

qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    chain_type_kwargs={"prompt": prompt},
    return_source_documents=True
)


In [13]:
query = "what is used to  capture the frequency of individul words in a document?"
result = qa_chain.invoke({"query": query})

INFO: HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"


In [14]:
print("Answer:", result['result'])

Answer: Tokenization. 

Tokenization breaks down text into individual words or phrases, known as tokens. This process captures the frequency of individual words in a document.


In [15]:
for doc in result["source_documents"]:
    print("Chunk:", doc.page_content)
    print("Section ID:", doc.metadata.get("section_id"))
    print("Chunk ID:", doc.metadata.get("chunk_id"))


Chunk: Example:
Consider a corpus such as a collection of news articles. In many English texts, the word “the” is the most frequent. Suppose “the” occurs 10,000 times; then Zipf’s Law suggests that the second most common word might occur roughly 5,000 times, the third about 3,300 times, and so on. Although real data rarely follow the law perfectly (especially at the very high and low frequency ends), the overall pattern is striking. This regularity has been observed across languages and types of text .
Section ID: 40
Chunk ID: 0
Chunk: Tokenization: This breaks down text into individual words or phrases, known as tokens. This is often the first step in text processing.
Section ID: 27
Chunk ID: 4
Chunk: 17. Describe the process of text pre-processing with suitable examples.
Text cleansing Remove faumbers. symbols, marks Creatirg Document Stemming = Keyword Matrix ioKm) Creating 3 corpus Tokenization Removing stop words
Text preprocessing typically involves the following steps:
• Lowerca

In [45]:
print(result["source_documents"])

[Document(id='a8d5a770-c976-484a-99a9-20a50b086c15', metadata={'section_id': 40, 'chunk_id': 0, 'cluster_id': 5, 'topic': '"Empirical Laws in NLP"'}, page_content='Example:\nConsider a corpus such as a collection of news articles. In many English texts, the word “the” is the most frequent. Suppose “the” occurs 10,000 times; then Zipf’s Law suggests that the second most common word might occur roughly 5,000 times, the third about 3,300 times, and so on. Although real data rarely follow the law perfectly (especially at the very high and low frequency ends), the overall pattern is striking. This regularity has been observed across languages and types of text .'), Document(id='7fad23d3-7d3d-4511-8eda-a0e7558298d2', metadata={'section_id': 27, 'chunk_id': 4, 'cluster_id': 0, 'topic': '"Introduction to Natural Language Processing (NLP)"'}, page_content='Tokenization: This breaks down text into individual words or phrases, known as tokens. This is often the first step in text processing.'), D

In [14]:
# Get all stored documents from FAISS
all_docs = vectorstore.similarity_search("placeholder", k=len(vectorstore.docstore._dict))

# Extract and print all unique topics
topics = set()
for doc in all_docs:
    topic = doc.metadata.get("topic")
    if topic:
        topics.add(topic)


print("Unique Topics:")
for topic in sorted(topics):
    print("-", topic)


Unique Topics:
- Applications of NLP
- Data Retrieval
- Definition
- Empirical Laws in NLP
- Finite-State Methods in Morphology
- Minimum Character Edits
- Natural Language Processing (NLP)
- Role of Smoothing in Language Models
- Text Preprocessing Techniques
- Types of Ambiguity in NLP


In [15]:
def get_chunks_by_topic(vectorstore, topic_query):
    all_docs = vectorstore.similarity_search("placeholder", k=len(vectorstore.docstore._dict))
    
    topic_chunks = []
    for doc in all_docs:
        if doc.metadata.get("topic", "").lower() == topic_query.lower():
            topic_chunks.append(doc.page_content)
    
    return topic_chunks


In [16]:
topic = "Data Retrieval"  # or input("Enter topic: ")
chunks = get_chunks_by_topic(vectorstore, topic)

print(f"\nFound {len(chunks)} chunks for topic '{topic}':\n")
for i, chunk in enumerate(chunks, 1):
    print(f"Chunk {i}:\n{chunk}\n")



Found 18 chunks for topic 'Data Retrieval':

Chunk 1:
Retrieval

Chunk 2:
iii. N-gram models struggle to capture longer-distance context clues. It has been shown that after 6-grams, the gain in performance is limited. This is unfavorable in tasks where that is a particularly desirable feature and necessity.

Chunk 3:
3.Handling data sparsity: Language models often encounter rare or infrequent n-grams that have limited or no training examples. Smoothing techniques estimate probabilities for these unseen events by redistributing probability mass from observed events. This helps in more accurately modeling the likelihood of unseen or infrequent n-grams.

Chunk 4:
speech recognition, text prediction

Chunk 5:
9. Write the formula for calculating bigram probabilities.
In a bigram model, the probability of a word wi given its preceding word wi−1 is calculated using the maximum likelihood estimate (MLE):
Here:
P(w; | w;1) — count(wj_1, w;) count (w;_1)
• count
(wi−1,wi) is the number of time

In [34]:
from langchain.prompts import PromptTemplate

template = """
You are an expert educational content designer.

Your task is to help retrieve **realistic educational visuals** from the web for the topic **"{topic}"**.

Instructions:
- You will be given text chunks related to the topic.
- Analyze all chunks holistically.
- Identify 1 to 3 key visualizable concepts.
- Based on the concepts, suggest **1 to 3 visual descriptors** that are suitable for web image retrieval.
- These images will be fetched from sources like **DuckDuckGo**
- Later, a separate model (like BLIP2) will describe the retrieved image and generate audio captions — so your job is just to suggest the most **searchable visual ideas**.
-- If only 1 or 2 are needed, output fewer.

Important Notes:
- Your descriptors must be **web-search friendly**, realistic, and likely to return good visuals.
- Do NOT suggest fictional or AI-specific styles like “a digital painting” or “ultra-detailed 4K illustration”.
- You **can suggest things like graphs, real-world scenes, physical experiments**, etc., if they are commonly found online.
- If you mention a physics diagram or formula chart, clarify that it's **just a reference to what's expected to be found** on the web.

Return format strictly as:
{{
  "image1": "<descriptor 1>",
  "image2": "<descriptor 2>",
  "image3": "<descriptor 3>"
}}

Rules:
- If one image is enough, return only "image1".
- Do not include any narration or explanation — only the descriptors.
- Do not use JSON formatting or code — just follow the shown format.

Content Chunks:
{chunks}
"""

prompt = PromptTemplate.from_template(template)


In [35]:
from langchain.prompts import FewShotPromptTemplate, PromptTemplate

# Example descriptor generation shots
examples = [
  {
    "topic": "Photosynthesis",
    "chunks": "Photosynthesis is the process by which green plants use sunlight to make food from carbon dioxide and water. Oxygen is released as a byproduct.",
    "descriptors": [
      "Diagram of photosynthesis in plants",
      "Chloroplast structure and function",
      "Photosynthesis chemical reaction chart"
    ]
  },
  {
    "topic": "Newton's Laws of Motion",
    "chunks": "Newton's three laws describe how objects move and interact with forces. The first law is about inertia, second about force and acceleration, and third about action and reaction.",
    "descriptors": [
      "Illustration of Newton's 3 laws with examples",
      "Force and acceleration graph",
      "Action-reaction force diagram"
    ]
  },
  {
    "topic": "Acids and Bases",
    "chunks": "Acids release H+ ions while bases release OH- ions. They are measured on the pH scale. Neutralization reactions occur when acids and bases combine.",
    "descriptors": [
      "pH scale with common substances",
      "Acid-base titration curve",
      "Neutralization reaction diagram"
    ]
  },
  {
    "topic": "Mitosis",
    "chunks": "Mitosis is the process of cell division in which a single cell divides into two identical daughter cells. It includes stages like prophase, metaphase, anaphase, and telophase.",
    "descriptors": [
      "Mitosis stages under microscope",
      "Cell cycle diagram with mitosis",
      "Mitosis vs meiosis comparison chart"
    ]
  },
  {
    "topic": "Ohm's Law",
    "chunks": "Ohm's Law states that the current through a conductor is directly proportional to voltage and inversely proportional to resistance.",
    "descriptors": [
      "Ohm's law triangle diagram",
      "Current-voltage-resistance graph",
      "Simple circuit showing Ohm's Law"
    ]
  },
  {
    "topic": "Periodic Table",
    "chunks": "The periodic table organizes elements based on atomic number and properties. Groups and periods reveal patterns in reactivity and structure.",
    "descriptors": [
      "Modern periodic table labeled",
      "Group trends in periodic table",
      "Periodic table block diagram"
    ]
  },
  {
    "topic": "DNA Structure",
    "chunks": "DNA is composed of nucleotides forming a double helix. It carries genetic instructions using base pairs A-T and G-C.",
    "descriptors": [
      "DNA double helix 3D model",
      "Base pairing in DNA strands",
      "Nucleotide structure diagram"
    ]
  },
  {
    "topic": "Chemical Bonding",
    "chunks": "Atoms bond to achieve stable electron configurations. Common types include ionic, covalent, and metallic bonding.",
    "descriptors": [
      "Ionic vs covalent bonding diagram",
      "Lewis structure examples",
      "Molecular structure of water"
    ]
  },
  {
    "topic": "Thermodynamics",
    "chunks": "Thermodynamics studies energy transfer. Laws of thermodynamics describe conservation of energy and entropy changes.",
    "descriptors": [
      "Laws of thermodynamics flowchart",
      "Heat engine efficiency diagram",
      "Entropy change vs temperature graph"
    ]
  },
  {
    "topic": "Human Digestive System",
    "chunks": "The digestive system breaks down food into nutrients. Key organs include mouth, stomach, intestines, liver, and pancreas.",
    "descriptors": [
      "Human digestive system labeled diagram",
      "Process of digestion infographic",
      "Enzyme function in digestion chart"
    ]
  }
]


In [36]:

topic = "Data Retrieval"  # or input("Enter topic: ")
chunks = get_chunks_by_topic(vectorstore, topic)
# Create individual prompt template for each example
example_prompt = PromptTemplate.from_template(
    "Topic: {topic}\nChunks: {chunks}\nDescriptors: {descriptors}"
)


descriptor_prompt = FewShotPromptTemplate(
    examples=examples,
    example_prompt=example_prompt,
    
    suffix=template,
    input_variables=["topic", "chunks"]
)


In [None]:
final_prompt =descriptor_prompt.format(topic=topic, chunks=chunks)
response = llm.predict(final_prompt)

  response = llm.predict(final_prompt)
INFO: HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"


In [38]:
print(response)

{
  "image1": "N-gram language model diagram with smoothing techniques",
  "image2": "Perplexity formula with language model prediction graph",
  "image3": "N-gram model vs long-range dependency comparison chart"
}


In [24]:
import json
import re
import requests
import pyttsx3
import base64
import os
from dotenv import load_dotenv
import logging
logging.getLogger("comtypes").setLevel(logging.CRITICAL)

# Load environment variables
load_dotenv()

# Example response string (replace with your real response)


# Initialize TTS engine
engine = pyttsx3.init()
voices = engine.getProperty('voices')
for voice in voices:
    if 'male' in voice.name.lower():
        engine.setProperty('voice', voice.id)
        break

# Extract JSON block
match = re.search(r"\{\s*\"image1\".*\}", response, re.DOTALL)

if match:
    json_str = match.group(0)
    print("🔍 Extracted JSON:\n", json_str)

    
    parsed = json.loads(json_str)

    for i, (key, value) in enumerate(parsed.items(), 1):
        descriptor = value["descriptor"]
        speech_text = value["speech"]

        # Image generation request to local Stable Diffusion
        payload = {
            "prompt": f"Educational diagram: {descriptor}",
            "steps": 40,  # Balanced quality vs speed
            "sampler_name": "Euler a",  # Fast and light sampler
            "cfg_scale": 6,
            "seed": 1958844762,
            "width": 512,
            "height": 512
        }

        try:
            response = requests.post("http://127.0.0.1:7860/sdapi/v1/txt2img", json=payload)
            r = response.json()
        except Exception as e:
            print(f"❌ Error during request: {e}")
            continue

        if "images" not in r:
            print(f"❌ Failed to generate image{i}. Response:\n{r}")
            continue

        image_data = r["images"][0]
        if "," in image_data:
            b64_part = image_data.split(",", 1)[1]
        else:
            b64_part = image_data  # Assume it's already base64

        with open(f"image{i}.png", "wb") as f:
            f.write(base64.b64decode(b64_part))

        # Save speech
        engine.save_to_file(speech_text, f"image{i}.mp3")
        engine.runAndWait()

        print(f"✅ Saved image{i}.png and image{i}.mp3")
else:
    print("❌ JSON block not found.")


🔍 Extracted JSON:
 {
  "image1": {
    "descriptor": "A simple diagram showing a series of interconnected circles, each representing a word in a sentence. The circles are arranged in a sequence, with each circle connected to the previous one by a line. The lines are thicker and more prominent as they move forward in the sequence, indicating the increasing importance of the preceding words in predicting the next word.",
    "speech": "In this image, you can see how N-gram language models work. They predict the likelihood of a word sequence by looking at the preceding words. The thicker lines show how the model relies more heavily on the previous words as it moves forward in the sequence. This is the core idea behind N-gram models: capturing statistical patterns in language to estimate the probability of sentences."
  },
  "image2": {
    "descriptor": "A simple graph showing a line that starts high and then drops down, representing the perplexity of a language model. The line is labeled

In [92]:

from gtts import gTTS

speech_text = """N-gram language models are probabilistic models that predict the likelihood of a word sequence. 
They work by analyzing the statistical patterns in language and estimating the probability of a word based on the preceding N-1 words. 
This helps in tasks like speech recognition, translation, and more."""

tts = gTTS(text=speech_text, lang='en')
tts.save("speech1.mp3")

print("✅ Audio saved as speech1.mp3")


✅ Audio saved as speech1.mp3


In [None]:
import pyttsx3

engine = pyttsx3.init()
voices = engine.getProperty('voices')

# Select male voice (usually index 0 or try looping to find one)
for voice in voices:
    if 'male' in voice.name.lower():
        engine.setProperty('voice', voice.id)
        break

engine.save_to_file("This is a male voice example.", "male_voice.mp3")
engine.runAndWait()


INFO: Could not import comtypes.gen, trying to create it.
INFO: Created comtypes.gen directory: 'c:\Users\hp\AppData\Local\Programs\Python\Python312\Lib\site-packages\comtypes\gen'
INFO: Writing __init__.py file: 'c:\Users\hp\AppData\Local\Programs\Python\Python312\Lib\site-packages\comtypes\gen\__init__.py'
INFO: Using writeable comtypes cache directory: 'c:\Users\hp\AppData\Local\Programs\Python\Python312\Lib\site-packages\comtypes\gen'
INFO: Could not import comtypes.gen._C866CA3A_32F7_11D2_9602_00C04F8EE628_0_5_4: No module named 'comtypes.gen._C866CA3A_32F7_11D2_9602_00C04F8EE628_0_5_4'
INFO: # Generating comtypes.gen._C866CA3A_32F7_11D2_9602_00C04F8EE628_0_5_4
INFO: # Generating comtypes.gen.SpeechLib
INFO: Could not import comtypes.gen._00020430_0000_0000_C000_000000000046_0_2_0: No module named 'comtypes.gen._00020430_0000_0000_C000_000000000046_0_2_0'
INFO: # Generating comtypes.gen._00020430_0000_0000_C000_000000000046_0_2_0
INFO: # Generating comtypes.gen.stdole
