In [None]:
# Welcome to LangChain

In [5]:
# Example 1 - Simple Embeddings
# Install
!pip install -q langchain-openai

# Imports
from langchain_openai import OpenAIEmbeddings
from google.colab import userdata
import os

# 1) Store your key once in Colab's Secrets panel (🔑 icon in the left sidebar):
#    add a secret named OPENAI_API_KEY and enable "Notebook access" for it.
#    (google.colab.userdata only reads secrets; it has no `set` function.)

# 2) Load key and fail early with a readable message if it is empty.
#    Assigning None to os.environ would otherwise raise an opaque TypeError,
#    and an empty key would only fail later inside the API call.
api_key = userdata.get("OPENAI_API_KEY")
if not api_key:
    raise RuntimeError(
        "❌ OPENAI_API_KEY is empty. Add it in the Colab Secrets panel\n"
        "(🔑 icon in the left sidebar) as a secret named OPENAI_API_KEY\n"
        "and enable 'Notebook access'."
    )
os.environ["OPENAI_API_KEY"] = api_key

# 3) Create the embeddings model
#   - "text-embedding-3-small": fast, cheap, 1536-dim
#   - "text-embedding-3-large": higher quality, 3072-dim
emb = OpenAIEmbeddings(model="text-embedding-3-small")

# 4) Take input and embed
text = input("Enter text to embed: ").strip()
if not text:
    # Nothing was typed: fall back to a sample so the cell still demonstrates
    # an embedding instead of sending an empty string to the API.
    text = "who is Sachin"
    print(f"(Using default) {text}")
vec = emb.embed_query(text)  # list[float]

# 5) Show results: the text, the vector length, and a short numeric preview
print(f"\nText: {text!r}")
print(f"Vector length (dimension): {len(vec)}")
print("First 8 numbers:", [round(x, 6) for x in vec[:8]])


Enter text to embed:  who is Sachin

Text: 'who is Sachin'
Vector length (dimension): 1536
First 8 numbers: [0.057726, -0.000847, 0.011904, -0.00289, 0.005939, -0.045332, 0.051492, 0.049921]


In [6]:
# Example 2 - create / build and save embeddings

In [10]:
# ============================================
# Basic: Create + STORE embeddings (no search)
# ============================================


# Example 2 - Create + STORE embeddings (no search)

# 0) Install deps
!pip install -q langchain-openai faiss-cpu numpy pandas

# 1) Imports & API key
from langchain_openai import OpenAIEmbeddings
from google.colab import userdata
import os, json, numpy as np, pandas as pd
import faiss  # direct FAISS usage

# Load key from Colab Secrets and stop early with a readable message if it is empty.
# Secrets are created in the Colab Secrets panel (🔑 icon in the left sidebar);
# google.colab.userdata only reads them and exposes no `set` function.
api_key = userdata.get("OPENAI_API_KEY")
if not api_key:
    raise RuntimeError(
        "❌ OPENAI_API_KEY not found in Colab userdata.\n"
        "Add it in the Colab Secrets panel (🔑 icon in the left sidebar)\n"
        "as a secret named OPENAI_API_KEY and enable 'Notebook access'."
    )
os.environ["OPENAI_API_KEY"] = api_key

# 2) Gather the texts to embed: a '||'-separated line from the user,
#    or the built-in samples when the line is left blank
raw = input(
    "Enter texts separated by '||' (or press Enter to use defaults):\n"
).strip()

if not raw:
    texts = [
        "MSD is Mahendra Singh Dhoni, a legendary Indian cricket captain.",
        "SRT refers to Sachin Ramesh Tendulkar, the 'God of Cricket'.",
        "Amitabh Bachchan is nicknamed Big B in Indian cinema.",
        "Shah Rukh Khan is popularly called King Khan.",
    ]
else:
    # Trim every piece and drop the ones that end up empty
    texts = list(filter(None, map(str.strip, raw.split("||"))))

print(f"\n📝 {len(texts)} texts to embed.\n")

# 3) Turn every text into an embedding vector
emb_model = "text-embedding-3-small"  # 1536-dim; switch to -large for 3072-dim
emb = OpenAIEmbeddings(model=emb_model)

# embed_documents returns one vector per input text
vectors = emb.embed_documents(texts)  # list[list[float]]
dim = 0 if not vectors else len(vectors[0])

print(f"✅ Created {len(vectors)} embeddings with dimension = {dim} (model: {emb_model}).\n")

# 4) Preview what will be stored: id, text, vector length and its first 8 components
rows = [
    {
        "id": f"doc-{pos}",
        "text": doc_text,
        "dim": len(doc_vec),
        "preview(first_8_dims)": [round(component, 6) for component in doc_vec[:8]],
    }
    for pos, (doc_text, doc_vec) in enumerate(zip(texts, vectors), start=1)
]
df = pd.DataFrame(rows)
print(df.to_string(index=False))

# 5) Persist the embeddings as plain files that can be inspected later
#    - vectors.npy: the numeric vectors as a float32 array
#    - texts.jsonl: one JSON object per line (id + original text)
vec_arr = np.asarray(vectors, dtype=np.float32)
np.save("vectors.npy", vec_arr)

meta_lines = (
    json.dumps({"id": f"doc-{pos}", "text": doc_text}, ensure_ascii=False) + "\n"
    for pos, doc_text in enumerate(texts, start=1)
)
with open("texts.jsonl", "w", encoding="utf-8") as meta_file:
    meta_file.writelines(meta_lines)

print("\n💾 Saved raw vectors to ./vectors.npy and text metadata to ./texts.jsonl")

# 6) Put the same vectors into a FAISS index (nothing is searched in this example)
if len(vec_arr) == 0:
    print("No vectors created; FAISS index not built.")
else:
    index = faiss.IndexFlatL2(dim)  # exact L2 index
    index.add(vec_arr)              # every stored vector goes in
    print(f"📦 FAISS index created. ntotal = {index.ntotal} vectors.")

    # Write the index to disk ...
    faiss.write_index(index, "index.faiss")
    print("💾 Saved FAISS index to ./index.faiss")

    # ... and read it straight back to show how a later session would load it
    reloaded = faiss.read_index("index.faiss")
    print(f"🔁 Reloaded FAISS index. ntotal = {reloaded.ntotal} vectors.")


Enter texts separated by '||' (or press Enter to use defaults):


📝 4 texts to embed.

✅ Created 4 embeddings with dimension = 1536 (model: text-embedding-3-small).

   id                                                             text  dim                                                               preview(first_8_dims)
doc-1 MSD is Mahendra Singh Dhoni, a legendary Indian cricket captain. 1536    [0.108245, -0.016881, 0.041814, 0.037209, -0.008451, 0.011741, 0.0371, 0.007595]
doc-2     SRT refers to Sachin Ramesh Tendulkar, the 'God of Cricket'. 1536    [0.047046, 0.011721, 0.035615, 0.005706, -0.037799, -0.0189, 0.030132, 0.020781]
doc-3            Amitabh Bachchan is nicknamed Big B in Indian cinema. 1536   [0.0534, -0.022114, 0.005836, 0.058191, -0.063351, -0.003442, 0.025553, 0.047135]
doc-4                    Shah Rukh Khan is popularly called King Khan. 1536 [0.016193, -0.093645, -0.018306, 0.03755, 0.015275, -0.009726, -0.006596, 0.054999]

💾 Saved raw vectors to ./vectors.

In [11]:
# Example 3

In [15]:
# Example 3 - Tiny Search

# ============================================
# Tiny similarity search with FAISS (Colab)
# ============================================

# 0) Install deps (if not already)
!pip install -q langchain-openai faiss-cpu numpy

# 1) Imports & API key
from langchain_openai import OpenAIEmbeddings
from google.colab import userdata
import os, json, numpy as np, faiss, sys

# Read the key from Colab Secrets; abort the cell with a clear error if it is empty.
# Raising (instead of print + sys.exit) matches the store and RAG examples in this
# notebook. Secrets are created in the Colab Secrets panel; google.colab.userdata
# only reads them and exposes no `set` function.
api_key = userdata.get("OPENAI_API_KEY")
if not api_key:
    raise RuntimeError(
        "❌ OPENAI_API_KEY not found. Add it in the Colab Secrets panel\n"
        "(🔑 icon in the left sidebar) as a secret named OPENAI_API_KEY\n"
        "and enable 'Notebook access'."
    )
os.environ["OPENAI_API_KEY"] = api_key

# 2) Locations of previously saved artifacts (written by the "store" step)
INDEX_PATH = "index.faiss"
TEXTS_PATH = "texts.jsonl"
EMB_MODEL = "text-embedding-3-small"    # must match the model used to build the index

def load_texts_jsonl(path):
    """Read a JSON-Lines metadata file into parallel lists of ids and texts.

    Args:
        path: Path of a UTF-8 file holding one JSON object per line.

    Returns:
        A tuple ``(ids, items)`` where ``ids[i]`` is the ``"id"`` value and
        ``items[i]`` the ``"text"`` value of the i-th record (``None`` when
        the key is absent from that record).

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    items, ids = [], []
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline added by an editor) are
                # not records; skipping them avoids a JSONDecodeError.
                continue
            obj = json.loads(line)
            ids.append(obj.get("id"))
            items.append(obj.get("text"))
    return ids, items

# Reuse the saved artifacts only when both files exist, load cleanly, and agree
# with each other; otherwise fall back to rebuilding a small demo index.
need_rebuild = False
if os.path.exists(INDEX_PATH) and os.path.exists(TEXTS_PATH):
    try:
        index = faiss.read_index(INDEX_PATH)
        doc_ids, texts = load_texts_jsonl(TEXTS_PATH)
    except Exception as e:
        print("⚠️ Could not load previous index/metadata. Will rebuild a small demo index.\nReason:", e)
        need_rebuild = True
    else:
        if index.ntotal != len(texts):
            # Search hits are mapped back to texts by position, so an index and a
            # metadata file of different sizes (e.g. texts.jsonl rewritten by
            # another cell) would show the wrong text or raise IndexError.
            print(f"⚠️ Index holds {index.ntotal} vectors but metadata holds {len(texts)} texts. "
                  "Will rebuild a small demo index.")
            need_rebuild = True
        else:
            print(f"✅ Loaded FAISS index (ntotal={index.ntotal}) and {len(texts)} texts.")
else:
    need_rebuild = True

# 3) If needed, rebuild a small index (self-contained)
if need_rebuild:
    print("🔧 Rebuilding a small demo index locally...")
    texts = [
        "MSD is Mahendra Singh Dhoni, a legendary Indian cricket captain.",
        "SRT refers to Sachin Ramesh Tendulkar, the 'God of Cricket'.",
        "Amitabh Bachchan is nicknamed Big B in Indian cinema.",
        "Shah Rukh Khan is popularly called King Khan.",
        "Virat Kohli is an Indian cricketer known for batting consistency.",
        "Rohit Sharma is known for explosive batting in limited-overs."
    ]
    emb = OpenAIEmbeddings(model=EMB_MODEL)
    vecs = emb.embed_documents(texts)           # list[list[float]]
    vec_arr = np.array(vecs, dtype="float32")   # (N, dim)
    dim = vec_arr.shape[1]

    index = faiss.IndexFlatL2(dim)              # exact L2 index
    index.add(vec_arr)

    # Save both artifacts together so index positions and text lines stay aligned
    faiss.write_index(index, INDEX_PATH)
    with open(TEXTS_PATH, "w", encoding="utf-8") as f:
        for i, t in enumerate(texts, start=1):
            f.write(json.dumps({"id": f"doc-{i}", "text": t}, ensure_ascii=False) + "\n")

    print(f"✅ Built and saved demo index. ntotal={index.ntotal}, dim={dim}")

# 4) Run a tiny search
emb = OpenAIEmbeddings(model=EMB_MODEL)  # same model as used for the index
query = input("\nEnter your search query (e.g., 'Who is King Khan?'): ").strip()
if not query:
    query = "Who is King Khan?"
    print(f"(Using default) {query}")

q_vec = np.array([emb.embed_query(query)], dtype="float32")  # shape (1, dim)

k = 3  # top-k results
distances, indices = index.search(q_vec, k)   # shapes: (1, k), (1, k)
dist_list = distances[0].tolist()
idx_list = indices[0].tolist()

# `texts` is always bound by the load/rebuild step earlier in this cell, so the
# former "reload if missing" branch could never run and has been removed.

print(f"\n🔎 Top-{k} results for: {query!r}  (Lower distance = closer match)\n")
for rank, (idx, dist) in enumerate(zip(idx_list, dist_list), start=1):
    if idx == -1:
        continue  # FAISS returns -1 if not enough results
    if idx >= len(texts):
        # The index holds more vectors than the metadata has lines; report it
        # instead of crashing with IndexError.
        print(f"{rank}. [distance={dist:.4f}]  <no text stored for vector #{idx}>")
        continue
    # idx corresponds to the position of the text used when building the index
    print(f"{rank}. [distance={dist:.4f}]  {texts[idx]}")

print("\n✅ Done.")


✅ Loaded FAISS index (ntotal=4) and 4 texts.

Enter your search query (e.g., 'Who is King Khan?'): who is MSD

🔎 Top-3 results for: 'who is MSD'  (Lower distance = closer match)

1. [distance=0.8382]  MSD is Mahendra Singh Dhoni, a legendary Indian cricket captain.
2. [distance=1.5777]  SRT refers to Sachin Ramesh Tendulkar, the 'God of Cricket'.
3. [distance=1.8358]  Amitabh Bachchan is nicknamed Big B in Indian cinema.

✅ Done.


In [17]:
# Example 4 - Cosine Similarity

In [19]:
# Example 4 - Cosine Similarity
# ============================================================
# Cosine similarity search with FAISS (normalized vectors)
# ============================================================

# 0) Install
!pip install -q langchain-openai faiss-cpu numpy

# 1) Imports & API key
from langchain_openai import OpenAIEmbeddings
from google.colab import userdata
import os, json, numpy as np, faiss, sys

# Read the key from Colab Secrets; abort the cell with a clear error if it is empty.
# Raising (instead of print + sys.exit) matches the store and RAG examples in this
# notebook. Secrets are created in the Colab Secrets panel; google.colab.userdata
# only reads them and exposes no `set` function.
api_key = userdata.get("OPENAI_API_KEY")
if not api_key:
    raise RuntimeError(
        "❌ OPENAI_API_KEY not found. Add it in the Colab Secrets panel\n"
        "(🔑 icon in the left sidebar) as a secret named OPENAI_API_KEY\n"
        "and enable 'Notebook access'."
    )
os.environ["OPENAI_API_KEY"] = api_key

EMB_MODEL = "text-embedding-3-small"  # one model for both documents and queries, so vectors are comparable

# 2) Data to index (replace with your own sentences); list position is the
#    row number FAISS reports back for a match
texts = [
    "MSD is Mahendra Singh Dhoni, a legendary Indian cricket captain.",
    "SRT refers to Sachin Ramesh Tendulkar, the 'God of Cricket'.",
    "Amitabh Bachchan is nicknamed Big B in Indian cinema.",
    "Shah Rukh Khan is popularly called King Khan.",
    "Virat Kohli is an Indian cricketer known for batting consistency.",
    "Rohit Sharma is known for explosive batting in limited-overs."
]

# 3) Embed and build a cosine index (normalize + IP).
#    This client is reused below for the documents and for every query.
emb = OpenAIEmbeddings(model=EMB_MODEL)

def l2_normalize_rows(mat: np.ndarray) -> np.ndarray:
    """Scale every row of a 2-D array to (approximately) unit L2 length.

    A tiny constant (1e-12) is added to each row norm before dividing, so an
    all-zero row stays all-zero instead of turning into NaNs.

    Args:
        mat: Array of shape (rows, dim); the norm is taken along axis 1.

    Returns:
        A new array of the same shape with each row divided by its norm.
    """
    row_lengths = np.linalg.norm(mat, axis=1, keepdims=True)
    safe_lengths = row_lengths + 1e-12
    return np.divide(mat, safe_lengths)

# Embed the documents, then unit-normalize each row so that the inner product
# computed by the index behaves like cosine similarity
doc_vecs = emb.embed_documents(texts)            # list[list[float]]
doc_arr = l2_normalize_rows(
    np.asarray(doc_vecs, dtype=np.float32)       # (N, dim)
)

dim = doc_arr.shape[1]
index = faiss.IndexFlatIP(dim)                   # IP = inner product
index.add(doc_arr)                               # rows are already normalized
print(f"✅ Cosine index ready. ntotal={index.ntotal}, dim={dim}")

# Save a small metadata file so results can be mapped back to text.
# It gets its own file name: "texts.jsonl" is also written and read by the L2
# store/search examples, whose saved index.faiss would no longer line up with
# the texts (different count and id scheme) if this cell overwrote that file.
COSINE_TEXTS_PATH = "texts_cosine.jsonl"
with open(COSINE_TEXTS_PATH, "w", encoding="utf-8") as f:
    for i, t in enumerate(texts):
        f.write(json.dumps({"id": i, "text": t}, ensure_ascii=False) + "\n")

# 4) Query loop
def load_texts(path="texts.jsonl"):
    """Return the ``"text"`` field of every record in a JSON-Lines file.

    Args:
        path: Path of a UTF-8 file with one JSON object per line.

    Returns:
        List of texts in file order; list position matches the row number
        used when the index was built from the same list.

    Raises:
        KeyError: If a record has no ``"text"`` key.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    items = []
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            if line.strip():  # ignore blank lines rather than failing to parse them
                items.append(json.loads(line)["text"])
    return items

# Read back exactly the file written above
stored_texts = load_texts(COSINE_TEXTS_PATH)

print("\n💬 Type your query (e.g., 'Who is King Khan?'). Type 'exit' to quit.")
k = 3  # top-k results per query; constant, so it is set once outside the loop
while True:
    q = input("Query: ").strip()
    if q.lower() in ("exit", "quit"):
        print("👋 Goodbye!")
        break
    if not q:
        q = "Who is King Khan?"
        print(f"(Using default) {q}")

    # Embed and normalize the query the same way the documents were normalized
    q_vec = np.array([emb.embed_query(q)], dtype="float32")
    q_vec = l2_normalize_rows(q_vec)

    # With unit-length vectors the inner product is the cosine similarity,
    # which lies in [-1, 1] (not [0, 1]); higher means more similar.
    sims, idxs = index.search(q_vec, k)
    sims, idxs = sims[0], idxs[0]          # single query -> take the first row

    print(f"\n🔎 Top-{k} cosine matches for: {q!r}")
    for rank, (i, s) in enumerate(zip(idxs, sims), 1):
        if i == -1:
            continue  # FAISS pads with -1 when fewer than k vectors exist
        print(f"{rank}. score={s:.4f}  |  {stored_texts[i]}")
    print("")


✅ Cosine index ready. ntotal=6, dim=1536

💬 Type your query (e.g., 'Who is King Khan?'). Type 'exit' to quit.
Query: who is ABC

🔎 Top-3 cosine matches for: 'who is ABC'
1. score=0.2215  |  Amitabh Bachchan is nicknamed Big B in Indian cinema.
2. score=0.1518  |  MSD is Mahendra Singh Dhoni, a legendary Indian cricket captain.
3. score=0.1147  |  SRT refers to Sachin Ramesh Tendulkar, the 'God of Cricket'.

Query: who is XYZ

🔎 Top-3 cosine matches for: 'who is XYZ'
1. score=0.2052  |  MSD is Mahendra Singh Dhoni, a legendary Indian cricket captain.
2. score=0.1615  |  Amitabh Bachchan is nicknamed Big B in Indian cinema.
3. score=0.1603  |  Virat Kohli is an Indian cricketer known for batting consistency.

Query: exit
👋 Goodbye!


In [20]:
# Embedding Models + LLM => Retrieval

In [21]:
# Plug in LLM => 1. Input (Text/data) 2. Prompts 3. Output (Response) Retrieval

# Input - Text -> Build Index (FAISS) --> Faster Retrieval

# Select (statement/query) on RDBMS - RETRIEVES (Index)

# EMBEDDINGS + FAISS --> Index
# Retrieve - fetch top-k relevant chunks (splits)
# Prompt

In [22]:
# Example - basic - RAG implementation

In [27]:
# Example 5 - GPT Model Plug In

# ============================================================
# Tiny RAG: GPT answers from FAISS-retrieved context (Colab)
# ============================================================

# 0) Install packages
!pip install -q langchain-openai langchain-community faiss-cpu langchain-text-splitters

# 1) Imports & API key
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_community.vectorstores import FAISS
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from google.colab import userdata
import os

# Read the key from Colab Secrets and stop early with a readable message if it is empty.
# Secrets are created in the Colab Secrets panel (🔑 icon in the left sidebar);
# google.colab.userdata only reads them and exposes no `set` function.
api_key = userdata.get("OPENAI_API_KEY")
if not api_key:
    raise RuntimeError(
        "❌ OPENAI_API_KEY not found. Add it in the Colab Secrets panel\n"
        "(🔑 icon in the left sidebar) as a secret named OPENAI_API_KEY\n"
        "and enable 'Notebook access'."
    )
os.environ["OPENAI_API_KEY"] = api_key

# 2) Reference text: whatever the user pastes, or the built-in sample when left blank
raw = input(
    "Paste your reference text (or press Enter to use a default cricket sample):\n"
).strip() or """
Mahendra Singh Dhoni (MSD) captained India and won the 2007 T20 World Cup and the 2011 ODI World Cup.
Sachin Ramesh Tendulkar (SRT) is widely called the 'God of Cricket' for his batting records.
Amitabh Bachchan, nicknamed Big B, is a legendary actor in Indian cinema.
Shah Rukh Khan, often called King Khan, is one of the most popular Bollywood actors worldwide.
Virat Kohli is renowned for batting consistency across formats.
Rohit Sharma is known for explosive batting and captaincy in limited-overs cricket.
"""

# 3) Cut the text into overlapping chunks (300 characters, 60 shared with the neighbour)
splitter = RecursiveCharacterTextSplitter(chunk_overlap=60, chunk_size=300)
docs = splitter.create_documents([raw])

# 4) Index the chunks in a FAISS vector store and expose it as a top-3 retriever
emb = OpenAIEmbeddings(model="text-embedding-3-small")
vs = FAISS.from_documents(documents=docs, embedding=emb)
retriever = vs.as_retriever(search_kwargs=dict(k=3))

# 5) RAG prompt: the model is told to answer from the supplied context only
rag_template = "\n".join([
    "You are a helpful assistant. Use ONLY the context to answer.",
    "",
    "Context:",
    "{context}",
    "",
    "Question: {question}",
    "If the answer is not in the context, say you don't know.",
    "Answer:",
])
prompt = ChatPromptTemplate.from_template(rag_template)

# 6) Helper to format retrieved docs
def format_docs(docs):
    """Join the ``page_content`` of each document, separated by a blank line.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string.

    Returns:
        One string containing every document's content in order.
    """
    contents = [doc.page_content for doc in docs]
    return "\n\n".join(contents)

# 7) LLM + chain
llm = ChatOpenAI(temperature=0.2, model="gpt-4.1-nano")

# The question is passed through unchanged and also sent to the retriever,
# whose documents are flattened into one context string
chain_inputs = {
    "context": retriever | format_docs,
    "question": RunnablePassthrough(),
}
rag_chain = chain_inputs | prompt | llm | StrOutputParser()

# 8) Interactive Q&A: read questions until the user types 'exit' or 'quit'
print("\n💬 Ask questions about your text. Type 'exit' to quit.")
while True:
    q = input("Q: ").strip()
    if q.lower() in ("exit", "quit"):
        print("👋 Goodbye!")
        break
    if not q:
        # A blank line carries no question; skip it rather than spending an
        # embedding call and an LLM call on an empty string.
        continue

    # Optional: peek at retrieved chunks (uncomment to see).
    # `retriever.invoke` replaces the deprecated `get_relevant_documents`.
    # retrieved = retriever.invoke(q)
    # print("\n[DEBUG] Retrieved context:\n", format_docs(retrieved), "\n")

    ans = rag_chain.invoke(q)
    print("A:", ans, "\n")


Paste your reference text (or press Enter to use a default cricket sample):
Data engineering with Databricks Databricks provides Lakeflow, an end-to-end data engineering solution that empowers data engineers, software developers, SQL developers, analysts, and data scientists to deliver high-quality data for downstream analytics, AI, and operational applications. Lakeflow is a unified solution for ingestion, transformation, and orchestration of your data, and includes Lakeflow Connect, Lakeflow Declarative Pipelines, and Lakeflow Jobs.  Lakeflow Connect Lakeflow Connect simplifies data ingestion with connectors to popular enterprise applications, databases, cloud storage, message buses, and local files. See Lakeflow Connect.  Feature  Description  Managed connectors  Managed connectors provide a simple UI and a configuration-based ingestion service with minimum operational overhead, without requiring you to use the underlying Lakeflow Declarative Pipelines APIs and infrastructure.  Stan

In [28]:
# 1. Document Loaders 2. Splitters 3. Embeddings stored in a Vector Store / Vector Database (Chroma DB) 4. LLM

In [24]:
# 1 Text File / PDF / CSV / Web Based Application

In [29]:
# 2. Text Splitter (Huge Text)

  # 1. Length Based 2. Text Structure Based 3. Doc Structure Based 4. Semantic Meaning Based

In [31]:
# End of the Notebook