In [0]:
%pip install langchain langchain-text-splitters sentence-transformers faiss-cpu

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

# getOrCreate() hands back the already-running session when there is one
# (e.g. the `spark` object Databricks provides) and only starts a new
# session otherwise, so this line is safe in both environments.
spark = (
    SparkSession.builder
    .appName("RAG_Project")
    .getOrCreate()
)

# Bronze layer schema: one row per policy document, all columns nullable.
schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in (
        ("doc_id", IntegerType()),
        ("title", StringType()),
        ("full_text", StringType()),
    )
])

# Sample policy documents as (doc_id, title, full_text) tuples.
data = [
    (101, "Commercial Credit Policy", "Section 404: LTV Limits. For Commercial Real Estate (CRE), the maximum Loan-to-Value (LTV) ratio is 75%. Exception: If the property is owner-occupied, LTV can go up to 80% with CRO approval."),
    (102, "Agri-Loan Policy", "Section 202: Seasonal Crops. Loans for seasonal crops must be repaid within 12 months. High-risk crops (e.g., chili, vanilla) require crop insurance mandatory for limits above $50k."),
    (103, "Retail Housing Policy", "Section 105: Income Verification. For salaried employees, Form 16 is mandatory. For self-employed, last 3 years ITR is required. Minimum credit score for unsecured loans is 750."),
]

df_bronze = spark.createDataFrame(data, schema)
display(df_bronze)

In [0]:
from pyspark.sql.functions import udf, explode, col
from pyspark.sql.types import ArrayType, StringType
from langchain_text_splitters import RecursiveCharacterTextSplitter

# 1. Define the splitting logic (Python function)
def splitter_func(text, chunk_size=50, chunk_overlap=10):
    """Split one document's text into overlapping chunks.

    Args:
        text: The document text. ``None`` or an empty string produces no
            chunks.
        chunk_size: Target maximum size of each chunk, passed straight to
            ``RecursiveCharacterTextSplitter``. The default of 50 is
            deliberately small for the demo; in production use 500-1000.
        chunk_overlap: Amount of overlap between consecutive chunks, passed
            straight to ``RecursiveCharacterTextSplitter``.

    Returns:
        A list of chunk strings (empty when ``text`` is falsy).
    """
    if not text:
        return []

    # Separators are tried in order: paragraph, line, sentence, word, and
    # finally single characters as a last resort.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        separators=["\n\n", "\n", ".", " ", ""],
    )
    return text_splitter.split_text(text)

# 2. Wrap the Python splitter as a Spark UDF.
# The declared return type is an array of strings, since a single document
# is turned into several chunks.
chunk_udf = udf(splitter_func, returnType=ArrayType(StringType()))

# 3. Run the UDF over the document text and explode the resulting array so
# each chunk gets its own row, keeping the parent doc_id and title.
df_silver_chunks = (
    df_bronze
    .withColumn("chunk", explode(chunk_udf(col("full_text"))))
    .select("doc_id", "title", "chunk")
)

display(df_silver_chunks)

In [0]:
# Print the chunk rows one field per line, without cutting off long text.
df_silver_chunks.show(truncate=False, vertical=True)

In [0]:
from sentence_transformers import SentenceTransformer
import pandas as pd
import numpy as np

# 1. Convert our Spark Chunked Data to Pandas (for local model processing)
# In production with 1B rows, you wouldn't do this! You'd use mapPartitions.
pdf = df_silver_chunks.toPandas()

# 2. Load the "Brain" (The Embedding Model)
# This downloads a small model (~80MB) that turns text into 384 numbers.
print("Loading model... this might take a minute...")
model = SentenceTransformer('all-MiniLM-L6-v2')

# 3. Generate Vectors
# Encode all chunks in ONE batched call instead of one model call per row:
# encode() accepts a list of sentences and returns a 2-D array with one row
# per sentence. list(...) splits that array back into one 1-D vector per
# chunk, so each cell of the 'vector' column still holds a single embedding.
print("Generating vectors...")
pdf['vector'] = list(model.encode(pdf['chunk'].tolist()))

# 4. Display the "Magic"
# Look at the 'vector' column. It's not text anymore; it's an array of numbers.
print("Done!")
pdf.head(3)


In [0]:
import faiss
import numpy as np

# 1. Build the input matrix for FAISS
# FAISS works on a 2-D float32 matrix, so the per-chunk vectors are stacked
# into one array (one row per chunk) and cast.
embeddings_matrix = np.stack(pdf['vector'].to_numpy()).astype(np.float32)

# 2. Create the index
# The vector dimension "d" is the number of columns of the matrix
# (expected to be 384 for MiniLM).
d = embeddings_matrix.shape[1]

# Flat index compared by L2 (Euclidean) distance -- fine for a simple search.
index = faiss.IndexFlatL2(d)

# 3. Load every chunk vector into the index
index.add(embeddings_matrix)
print(f"Number of documents in index: {index.ntotal}")

# --- THE MOMENT OF TRUTH ---

# 4. Define a User Question
query_text = "What is the max LTV for commercial real estate?"

# 5. Convert Question to Vector (using the same brain/model)
# The query is wrapped in a list so the result is a 2-D (1 x d) array,
# which is the shape index.search() expects.
query_vector = model.encode([query_text]).astype('float32')

# 6. Search the Index
# k=2 means "Give me the top 2 closest matches"
distances, indices = index.search(query_vector, k=2)

# 7. Display Results
# NOTE: the printed "Score" is the L2 distance reported by IndexFlatL2
# (FAISS reports it squared), so a LOWER value means a closer match.
print(f"\nQuery: '{query_text}'\n")
print("--- Top Retrieved Results ---")

for i, (doc_index, distance) in enumerate(zip(indices[0], distances[0])):
    # FAISS fills unused result slots with -1 when the index holds fewer than
    # k vectors. pdf.iloc[-1] would silently return the LAST chunk, so skip
    # those placeholders instead of printing a bogus match.
    if doc_index < 0:
        continue

    # Row order of `pdf` matches the order vectors were added to the index,
    # so the FAISS position can be used directly as a positional lookup.
    result_row = pdf.iloc[doc_index]
    print(f"Result {i+1} (Score: {distance:.4f}):")
    print(f"Doc Title: {result_row['title']}")
    print(f"Text Chunk: {result_row['chunk']}")
    print("-" * 50)