In [None]:
from langchain.document_loaders import GitLoader

loader = GitLoader(
    clone_url="https://github.com/prnv007-rgb/CareerKraft",
    repo_path="./CareerKraft",
    branch="main",
    file_filter=lambda file_path: file_path.endswith(".py")  # optional
)
docs = loader.load()


In [None]:
import os

def load_repo_texts(repo_path):
    texts = []
    for root, dirs, files in os.walk(repo_path):
        for file in files:
            if file.endswith(('.py', '.js', '.ts', '.md', '.html', '.json')):  # filter as needed
                filepath = os.path.join(root, file)
                with open(filepath, 'r', encoding='utf-8') as f:
                    texts.append(f.read())
    return texts

repo_texts = load_repo_texts('./CareerKraft')

# Then chunk, embed, and store in FAISS or your RAG vector DB.


In [2]:
import os
from pathlib import Path

from langchain_community.document_loaders import GitLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OllamaEmbeddings   # or replace with your favorite
from langchain.vectorstores import FAISS

def build_rag_index(
    repo_url: str,
    local_path: str = "./repos",
    branch: str = "main",
    index_path: str = "./faiss_index",
    chunk_size: int = 1000,
    chunk_overlap: int = 200,
):
    # 1️⃣ Clone & load all files as Documents (adds file_path metadata)
    loader = GitLoader(
        clone_url=repo_url,
        repo_path=os.path.join(local_path, Path(repo_url).stem),
        branch=branch,
        # optional filter: only code files
        file_filter=lambda f: f.endswith((".py", ".js", ".ts", ".java", ".go", ".html", ".css")),
    )
    docs = loader.load()
    print(f"🗂️  Loaded {len(docs)} source files from {repo_url}")

    # 2️⃣ Chunk long files for better retrieval granularity
    splitter = CharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len
    )
    chunks = splitter.split_documents(docs)
    print(f"✂️  Split into {len(chunks)} chunks (size={chunk_size}, overlap={chunk_overlap})")

    # 3️⃣ Create embeddings
    embedder = OllamaEmbeddings(model="mxbai-embed-large")  
    # — or replace with: 
    # from langchain.embeddings import HuggingFaceEmbeddings
    # embedder = HuggingFaceEmbeddings(model_name="all-mpnet-base-v2")

    # 4️⃣ Build FAISS index (automatically embeds and stores metadata)
    vectorstore = FAISS.from_documents(chunks, embedding=embedder)
    vectorstore.save_local(index_path)
    print(f"✅ FAISS index saved to: {index_path}")

if __name__ == "__main__":
    build_rag_index(
        repo_url="https://github.com/prnv007-rgb/Book-Recommendations-.git",
        local_path="./repos",
        branch="main",
        index_path="./faiss_index/careerkraft",
        chunk_size=800,
        chunk_overlap=100,
    )


🗂️  Loaded 3 source files from https://github.com/prnv007-rgb/Book-Recommendations-.git
✂️  Split into 2 chunks (size=800, overlap=100)
✅ FAISS index saved to: ./faiss_index/careerkraft


In [4]:
from langchain.vectorstores import FAISS
from langchain.embeddings import OllamaEmbeddings

embedder = OllamaEmbeddings(model="mxbai-embed-large")
vectorstore = FAISS.load_local(
    "./faiss_index/careerkraft",
    embeddings=embedder,
    allow_dangerous_deserialization=True
)


In [5]:
retriever = vectorstore.as_retriever(search_type="similarity", k=5)


In [6]:
question = "How are books recommended in this project?"

docs = retriever.get_relevant_documents(question)
context = "\n\n".join([doc.page_content for doc in docs])


  docs = retriever.get_relevant_documents(question)


In [7]:
import requests

def ask_llm_with_context(question, context):
    prompt = f"""
You are a helpful assistant with knowledge of the following GitHub repo.

Context:
{context}

Based on the context above, answer this question:
{question}
"""

    response = requests.post(
        "http://localhost:11434/api/generate",
        json={"model": "llama3", "prompt": prompt, "stream": False}
    )
    return response.json()["response"]

answer = ask_llm_with_context(question, context)
print("🤖 Answer:", answer)


🤖 Answer: According to the code, books are recommended based on a collaborative filtering approach using the `model.kneighbors` method, which takes into account the ratings and preferences of users who have similar tastes.

Here's how it works:

1. The user selects a book as input.
2. The model uses this book's characteristics (e.g., genre, author, etc.) to find similar books by computing the k-nearest neighbors using `model.kneighbors`.
3. The function `rec_book` takes the selected book and returns a list of recommended books and their corresponding poster URLs.

The recommendation is based on the idea that if a user liked a particular book, they may also like other books with similar characteristics. The model uses this principle to generate recommendations by finding the most similar books to the input book.
