In [None]:
# Clone the RAG repository from GitHub
!git clone https://github.com/sathishkumar67/RAG.git

# Move all files from the cloned RAG folder to the working directory (specific to Kaggle environment)
!mv /kaggle/working/RAG/* /kaggle/working/

# Upgrade pip to the latest version for compatibility
!pip install --upgrade pip

# Install all required Python packages listed in requirements.txt
!pip install -r requirements.txt # --upgrade --upgrade-strategy eager

Cloning into 'RAG'...
remote: Enumerating objects: 9, done.[K
remote: Counting objects: 100% (9/9), done.[K
remote: Compressing objects: 100% (8/8), done.[K
Receiving objects: 100% (9/9), 4.27 KiB | 2.14 MiB/s, done.
remote: Total 9 (delta 1), reused 4 (delta 0), pack-reused 0 (from 0)[K
Resolving deltas: 100% (1/1), done.


In [2]:
# PDF and image processing libraries
import fitz  # PyMuPDF
from PIL import Image
import io
import base64

# Core Python libraries
import os
import numpy as np

# Machine learning and deep learning libraries
import torch
from transformers import CLIPProcessor, CLIPModel

# LangChain for LLM and document handling
from langchain_core.documents import Document
from langchain.chat_models import init_chat_model
from langchain.prompts import PromptTemplate
from langchain.schema.messages import HumanMessage
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Vector store and similarity search
from langchain_community.vectorstores import FAISS
from sklearn.metrics.pairwise import cosine_similarity

2025-08-12 14:01:16.812310: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1755007276.837043     292 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1755007276.844713     292 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1755007276.864880     292 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1755007276.864921     292 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1755007276.864924     292 computation_placer.cc:177] computation placer alr

In [None]:
# Groq API key. It is read from the GROQ_API_KEY environment variable so the
# secret never has to be typed into (and saved with) the notebook. When the
# variable is unset the value falls back to "" exactly as before.
API_KEY = os.environ.get("GROQ_API_KEY", "")

In [11]:
# Load the pretrained CLIP checkpoint together with its matching processor.
# Both are created from the same checkpoint name so they stay in sync.
CLIP_CHECKPOINT = "openai/clip-vit-base-patch32"
clip_model = CLIPModel.from_pretrained(CLIP_CHECKPOINT)
clip_processor = CLIPProcessor.from_pretrained(CLIP_CHECKPOINT)

# The model is only used to compute embeddings here, so put it in evaluation mode.
clip_model.eval()
print("CLIP model initialized")

CLIP model initialized


In [12]:
# functions
def embed_text(text: str):
    """Return the L2-normalised CLIP text embedding of ``text`` as a NumPy array.

    The text is tokenised with the module-level ``clip_processor`` (padded,
    truncated to 77 tokens) and encoded with the module-level ``clip_model``
    under ``torch.no_grad()``. The resulting feature tensor is divided by its
    norm along the last axis, squeezed, and converted with ``.numpy()``.
    """
    encoded = clip_processor(
        text=text,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=77,  # CLIP's max token length
    )
    with torch.no_grad():
        raw_features = clip_model.get_text_features(**encoded)
        # Scale every embedding to unit length
        lengths = raw_features.norm(dim=-1, keepdim=True)
        unit_features = raw_features / lengths
    return unit_features.squeeze().numpy()

In [13]:
# Open the source PDF with PyMuPDF
pdf_path = "/kaggle/input/docs-rag/SATHISH KUMAR RESUME.pdf"
doc = fitz.open(pdf_path)

# Accumulators filled while the PDF is processed:
#   all_docs        - chunk Documents
#   all_embeddings  - one embedding per entry of all_docs
all_docs, all_embeddings = [], []
# Store actual image data for LLM
image_data_store = {}

# Splits page text into chunks of up to 500 characters with 100 characters of overlap
splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)

In [14]:
# Walk over every page, split its text into chunks and embed each chunk with CLIP.
for i, page in enumerate(doc):
    text = page.get_text()
    if not text.strip():
        # Page has no extractable text; nothing to embed
        continue

    # Wrap the page text in a temporary Document so the splitter carries the metadata over
    temp_doc = Document(page_content=text, metadata={"page": i, "type": "text"})
    text_chunks = splitter.split_documents([temp_doc])

    # all_embeddings and all_docs are appended in lockstep so index j of one
    # always corresponds to index j of the other.
    for chunk in text_chunks:
        all_embeddings.append(embed_text(chunk.page_content))
        all_docs.append(chunk)

In [18]:
# Stack the per-chunk CLIP embeddings into a single array
embeddings_array = np.array(all_embeddings)

# Pair every chunk's text with its precomputed embedding, and collect the metadata
# in the same order.
text_embedding_pairs = [
    (chunk_doc.page_content, vector)
    for chunk_doc, vector in zip(all_docs, embeddings_array)
]
chunk_metadatas = [chunk_doc.metadata for chunk_doc in all_docs]

# Build the FAISS store directly from the precomputed embeddings; no embedding
# function is supplied because queries are embedded separately.
vector_store = FAISS.from_embeddings(
    text_embeddings=text_embedding_pairs,
    embedding=None,
    metadatas=chunk_metadatas,
)

In [19]:
def retrieve_text(query, k=5):
    """Return the ``k`` stored chunks most similar to ``query``.

    The query is embedded with ``embed_text`` and the resulting vector is passed
    to ``vector_store.similarity_search_by_vector``; whatever that call returns
    is handed back unchanged.
    """
    return vector_store.similarity_search_by_vector(
        embedding=embed_text(query),
        k=k,
    )

In [29]:
# Build the LLM context from the chunks retrieved for the query "user name".
# Chunks are joined with a blank line: plain concatenation would fuse the last
# word of one chunk with the first word of the next.
context = "\n\n".join(
    result.page_content for result in retrieve_text("user name")
)

In [33]:
from groq import Groq

# Ask the Groq-hosted model a question about the retrieved context and stream the answer.
client = Groq(api_key=API_KEY)
completion = client.chat.completions.create(
    model="openai/gpt-oss-20b",
    messages=[
        {
            # The retrieved context is background information for the model, so it
            # is sent as a system message. Sending it as "assistant" would present
            # it as something the model itself had already said.
            "role": "system",
            "content": f"This is my context. {context}"
        },
        {
            "role": "user",
            "content": "what skills does he have?"
        }
    ],
    temperature=1,
    max_completion_tokens=8192,
    top_p=1,
    reasoning_effort="medium",
    stream=True,
    stop=None
)

# Print the answer incrementally as chunks arrive.
for chunk in completion:
    if not chunk.choices:
        # Skip chunks that carry no choices instead of failing on choices[0]
        continue
    # delta.content may be None; print an empty string in that case
    print(chunk.choices[0].delta.content or "", end="")


**Key Skills & Expertise**

| Domain | Specific Skills / Tools |
|-------|------------------------|
| **Programming Languages** | Python, C/C++, Java, R |
| **Deep Learning / ML Frameworks** | PyTorch, TensorFlow, Keras, JAX, Triton |
| **Computer Vision** | YOLOv11, YOLOv4, SSDLite, YOLOv4, YOLOv11xl, YOLOv11xl detector, pseudo‑labelling pipelines |
| **Natural Language Processing (NLP)** | LLM pre‑training (PyTorch DDP, CUDA), 10 B‑token FineWeb‑Edu dataset, 3‑phase training, checkpointing |
| **Data Science & Analytics** | Scikit‑Learn, Pandas, NumPy, data augmentation, BOHB hyper‑parameter tuning, Bayesian optimisation, HyperBand |
| **Tools & Platforms** | Git, Weights & Biases, Tableau, Triton, GPU cluster orchestration (DDP), CUDA, DDP, Python scripts |
| **Mathematical Foundations** | Linear Algebra, Probability & Statistics, Calculus |
| **Databases / Cloud** | MySQL, MongoDB, AWS (cloud infrastructure) |
| **Model Evaluation & Optimization** | mAP@[0.50:0.95] metrics, mAP imp