In [6]:
from langchain_community.document_loaders import PyPDFLoader

FILE_PATH = r'/Users/sahiljain/Data Science/Course GenAI/Projects/Forecasting Models.pdf'

loader = PyPDFLoader(FILE_PATH)

# load once
pages = loader.load()
print(f"Total pages: {len(pages)}")



Total pages: 138


In [7]:
from langchain_experimental.text_splitter import SemanticChunker
from langchain_openai import OpenAIEmbeddings

# create embeddings model (used by SemanticChunker internally)
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

# initialize semantic chunker
splitter = SemanticChunker(embeddings)

# apply to PDF pages
docs = splitter.split_documents(pages)

print(f"After semantic chunking: {len(docs)} chunks")


After semantic chunking: 345 chunks


In [8]:
len(docs)

345

In [9]:
import os
pinecone_api_key=os.getenv("PINECONE_API_KEY")

from pinecone import Pinecone, ServerlessSpec
pc=Pinecone(api_key=pinecone_api_key)

In [10]:
index_name="personal"

pc.has_index("personal")

True

In [11]:
# delete index
pc.delete_index(index_name)

In [12]:
# Generate embedding for a dummy string to find index size
vec = embeddings.embed_query("hello world")

print(f"Embedding length (dimension): {len(vec)}")

Embedding length (dimension): 1536


In [13]:
#creating a index
if not pc.has_index(index_name):
    pc.create_index(
    name=index_name,
    dimension=1536,
    metric="cosine",
    spec=ServerlessSpec(cloud="aws",region="us-east-1")    
)

In [14]:
#loading the index
index=pc.Index(index_name)

from langchain_pinecone import PineconeVectorStore

vector_store=PineconeVectorStore(index=index,embedding=embeddings)

  from .autonotebook import tqdm as notebook_tqdm


In [15]:
vector_store.add_documents(docs)

['6c4beff8-3cc2-463a-a85e-4c01ad8e8190',
 'ed120190-4c40-4d5b-9a51-9b92b1bcccb3',
 'fbe285b9-5917-4076-9779-e43015267ec8',
 '7f84b716-d264-4cde-b836-8d65e5ca32f8',
 '95b6efd0-2ac9-4614-849c-041a9b7b8e35',
 'e4bab71f-62a1-40c3-b08d-639ff4c2a2e9',
 '36e54145-7fbe-46a2-8196-33f5be0dacd0',
 '63846fbd-d7b9-45f4-8dc8-c24b27b31a3e',
 'e7bc8fdf-96ae-433a-97f2-cd26face5d87',
 '900625c7-9b4e-4ca9-bb05-97d4130f0cf5',
 '4722848e-ced6-49f3-b8a2-5732d70c78c7',
 '4fb83dad-ed3a-4462-8484-fea4a57deb1e',
 'e9298ed1-f401-4cc0-a23e-9faaf2d20e56',
 '536ab626-545a-415b-be7c-59458ff9eee8',
 '5d08ef1f-9989-4cdc-a913-b5e7e8bea294',
 '538a2b31-a529-473a-a495-544a785a1258',
 '13530464-e221-4643-a5fd-5cf3fbc1abda',
 'ec1a413f-d6b4-4fcb-860b-1f32e757c8e2',
 'f9dce1da-fd00-472d-bee8-b31be9f4f6bc',
 'e825fee9-b069-4579-ad3f-6c7127630eec',
 '2d9f1623-a4ab-4ddd-952b-ad1060650e60',
 '099cdcb4-ac48-489e-b480-8f5d096b5daf',
 '58b053ff-67f9-4bd6-974f-83d657205b59',
 'fa41d00c-ef29-49f6-99a5-1beeeb96f3d0',
 '590202f1-fac1-

In [16]:
# Check how many vectors are in Pinecone
index_stats = vector_store._index.describe_index_stats()
print(f"Vectors in Pinecone: {index_stats['total_vector_count']}")

Vectors in Pinecone: 0


In [17]:
vector_store

<langchain_pinecone.vectorstores.PineconeVectorStore at 0x1361b9400>

In [18]:
retriever=vector_store.as_retriever()

In [19]:
retriever.invoke("what is arima model")

[]

In [20]:
# 1) Prompt template that forces readable output
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA
from langchain_google_genai import ChatGoogleGenerativeAI
import textwrap

prompt_template = PromptTemplate(
    input_variables=["context", "question"],
    template=(
        "You are an assistant. Use the following CONTEXT to answer the QUESTION.\n\n"
        "CONTEXT:\n{context}\n\n"
        "QUESTION:\n{question}\n\n"
        "INSTRUCTIONS:\n"
        "- Answer concisely and clearly in short paragraphs (2-6 lines each).\n"
        "- If there are examples or steps, use bullet points.\n"
        "- At the end, add a 'SOURCES' section listing source page numbers (if available).\n\n"
        "ANSWER:\n"
    )
)

# 2) LLM (Google model)
model = ChatGoogleGenerativeAI(model="gemini-1.5-flash")

# 3) Retriver 
# (use the one already built from vector store)

# 4) Build RetrievalQA chain with the custom prompt
qa_chain = RetrievalQA.from_chain_type(
    llm=model,
    chain_type="stuff",                      # simple chain that stuffs context into prompt
    retriever=retriever,
    return_source_documents=True,
    chain_type_kwargs={"prompt": prompt_template}
)

# 5) Run a query and pretty-print result + sources
query = "What is Arima model"
result = qa_chain(query)   

# result is usually a dict when return_source_documents=True
answer_text = result["result"] if isinstance(result, dict) else result
source_docs = result.get("source_documents", []) if isinstance(result, dict) else []

# Pretty format the answer (wrap lines)
print("\n📌 Answer:\n")
print(textwrap.fill(answer_text.strip(), width=100))

# Show sources neatly
print("\n📚 Sources:")
for i, doc in enumerate(source_docs, start=1):
    page = doc.metadata.get("page", "Unknown")
    preview = doc.page_content.strip().replace("\n", " ")[:300]
    print(f"{i}. Page: {page} — {preview}...")


  result = qa_chain(query)



📌 Answer:

An ARIMA model (Autoregressive Integrated Moving Average) is a statistical model used for analyzing
and forecasting time series data.  It combines autoregressive (AR), integrated (I), and moving
average (MA) components to capture different patterns in the data.  The 'I' component handles
differencing to make the data stationary, meaning its statistical properties don't change over time.
The AR component models the relationship between the current value and past values of the series.
The MA component models the relationship between the current value and past forecast errors.  The
order of each component (p, d, q) specifies the number of past values and errors used in the model,
denoted as ARIMA(p,d,q).  Choosing the appropriate order is crucial for accurate modeling.  Choosing
the correct ARIMA model involves identifying the appropriate (p,d,q) values. This often involves
analyzing autocorrelation and partial autocorrelation functions (ACF and PACF) of the data, and
potentia

In [21]:
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA
from langchain_google_genai import ChatGoogleGenerativeAI
import textwrap

# 1) Prompt template
prompt_template = PromptTemplate(
    input_variables=["context", "question"],
    template=(
        "You are an assistant. Use the following CONTEXT to answer the QUESTION.\n\n"
        "CONTEXT:\n{context}\n\n"
        "QUESTION:\n{question}\n\n"
        "INSTRUCTIONS:\n"
        "- Answer concisely and clearly in short paragraphs (2-6 lines each).\n"
        "- If there are examples or steps, use bullet points.\n"
        "- At the end, add a 'SOURCES' section listing source page numbers (if available).\n\n"
        "ANSWER:\n"
    )
)

# 2) LLM
model = ChatGoogleGenerativeAI(model="gemini-1.5-flash")

# 3) Retriever with MMR
retriever = vector_store.as_retriever(
    search_type="mmr",             # use Maximal Marginal Relevance
    search_kwargs={
        "k": 5,                   # total number of candidates to fetch
        "fetch_k": 30,             # how many to initially fetch before reranking
        "lambda_mult": 0.5         # tradeoff: relevance (0) vs diversity (1)
    }
)

# 4) Build RetrievalQA chain
qa_chain = RetrievalQA.from_chain_type(
    llm=model,
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True,
    chain_type_kwargs={"prompt": prompt_template}
)

# 5) Run a query
query = "What is Arima model"
result = qa_chain(query)

answer_text = result["result"] if isinstance(result, dict) else result
source_docs = result.get("source_documents", []) if isinstance(result, dict) else []

# Pretty format the answer
print("\n📌 Answer:\n")
print(textwrap.fill(answer_text.strip(), width=100))

# Show sources neatly
print("\n📚 Sources:")
for i, doc in enumerate(source_docs, start=1):
    page = doc.metadata.get("page", "Unknown")
    preview = doc.page_content.strip().replace("\n", " ")[:300]
    print(f"{i}. Page: {page} — {preview}...")



📌 Answer:

An ARIMA (Autoregressive Integrated Moving Average) model is a statistical model used for time
series analysis.  It's defined by three parameters: p, d, and q.  'p' represents the order of the
autoregressive (AR) part, 'd' the degree of differencing (I), and 'q' the order of the moving
average (MA) part.  The model combines autoregressive, integrated, and moving average components to
capture patterns in time-dependent data.   The model is fitted to data using techniques such as
minimizing the AICc (Corrected Akaike Information Criterion) and stepwise search to find optimal
values for p, d, and q.  This involves iteratively adjusting the parameters and evaluating the
model's fit until the best model is found.   The `Arima` function in R uses a specific formulation
for implementing this model.  Kalman smoothing can be applied to improve estimation and handle
missing data or seasonal adjustments.   SOURCES: pages 73, 74, 79, 177

📚 Sources:
1. Page: 2.0 — . . . . . . . . . . .

In [4]:
query = "What is Arima model"

In [22]:
from docx import Document

# Create new Word doc
doc = Document()

# Add Title
doc.add_heading("LLM Answer Report", level=1)

# Add Question
doc.add_heading("Question:", level=2)
doc.add_paragraph(query)

# Add Answer
doc.add_heading("Answer:", level=2)
doc.add_paragraph(answer_text)

# Add Sources
doc.add_heading("Sources:", level=2)
for i, doc_source in enumerate(source_docs, start=1):
    page = doc_source.metadata.get("page", "Unknown")
    preview = doc_source.page_content.strip().replace("\n", " ")[:200]
    doc.add_paragraph(f"{i}. Page {page} — {preview}...")

# Save the Word file
doc.save("LLM_Answer_Report.docx")
print("✅ Saved output to LLM_Answer_Report.docx")


✅ Saved output to LLM_Answer_Report.docx
