In [2]:
from langchain.document_loaders import GitLoader

loader = GitLoader(
    clone_url="https://github.com/rudrajikadra/Movie-Recommendation-System-Using-Python-and-Pandas.git",
    repo_path="./Movie",
    branch="master",
    file_filter=lambda file_path: file_path.endswith(".py")  # optional
)
docs = loader.load()


In [4]:
import os

def load_repo_texts(repo_path):
    texts = []
    for root, dirs, files in os.walk(repo_path):
        for file in files:
            if file.endswith(('.py', '.js', '.ts', '.md', '.html', '.json')):  # filter as needed
                filepath = os.path.join(root, file)
                with open(filepath, 'r', encoding='utf-8') as f:
                    texts.append(f.read())
    return texts

repo_texts = load_repo_texts('./Movie')

# Then chunk, embed, and store in FAISS or your RAG vector DB.


In [8]:
import os
from pathlib import Path

from langchain_community.document_loaders import GitLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OllamaEmbeddings   # or replace with your favorite
from langchain.vectorstores import FAISS

def build_rag_index(
    repo_url: str,
    local_path: str = "./repos",
    branch: str = "master",
    index_path: str = "./faiss_index",
    chunk_size: int = 1000,
    chunk_overlap: int = 200,
):
    # 1️⃣ Clone & load all files as Documents (adds file_path metadata)
    loader = GitLoader(
    clone_url=repo_url,
    repo_path=os.path.join(local_path, Path(repo_url).stem),
    branch=branch,
    file_filter=lambda f: True  # ← TEMP: load all files
)

    docs = loader.load()
    print(f"🗂️  Loaded {len(docs)} source files from {repo_url}")

    # 2️⃣ Chunk long files for better retrieval granularity
    splitter = CharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len
    )
    chunks = splitter.split_documents(docs)
    print(f"✂️  Split into {len(chunks)} chunks (size={chunk_size}, overlap={chunk_overlap})")

    # 3️⃣ Create embeddings
    embedder = OllamaEmbeddings(model="mxbai-embed-large")  
    # — or replace with: 
    # from langchain.embeddings import HuggingFaceEmbeddings
    # embedder = HuggingFaceEmbeddings(model_name="all-mpnet-base-v2")

    # 4️⃣ Build FAISS index (automatically embeds and stores metadata)
    vectorstore = FAISS.from_documents(chunks, embedding=embedder)
    vectorstore.save_local(index_path)
    print(f"✅ FAISS index saved to: {index_path}")

if __name__ == "__main__":
    build_rag_index(
        repo_url="https://github.com/rudrajikadra/Movie-Recommendation-System-Using-Python-and-Pandas.git",
        local_path="./repos",
        branch="master",
        index_path="./faiss_index/movies",
        chunk_size=800,
        chunk_overlap=100,
    )


🗂️  Loaded 6 source files from https://github.com/rudrajikadra/Movie-Recommendation-System-Using-Python-and-Pandas.git
✂️  Split into 6 chunks (size=800, overlap=100)
✅ FAISS index saved to: ./faiss_index/movies


In [10]:
from langchain.vectorstores import FAISS
from langchain.embeddings import OllamaEmbeddings

embedder = OllamaEmbeddings(model="mxbai-embed-large")
vectorstore = FAISS.load_local(
    "./faiss_index/movies",
    embeddings=embedder,
    allow_dangerous_deserialization=True
)


In [11]:
retriever = vectorstore.as_retriever(search_type="similarity", k=5)


In [None]:
question = "How are books recommended in this project?"

docs = retriever.get_relevant_documents(question)
context = "\n\n".join([doc.page_content for doc in docs])


  docs = retriever.get_relevant_documents(question)


In [13]:
import requests

def ask_llm_with_context(question, context):
    prompt = f"""
You are a helpful assistant with knowledge of the following GitHub repo.

Context:
{context}

Based on the context above, answer this question:
{question}
"""

    response = requests.post(
        "http://localhost:11434/api/generate",
        json={"model": "llama3", "prompt": prompt, "stream": False}
    )
    return response.json()["response"]

answer = ask_llm_with_context(question, context)
print("🤖 Answer:", answer)


🤖 Answer: I'm happy to help! Based on the provided GitHub repo context, I can tell you that it appears to be a movie recommendation system.

The system seems to contain a collection of movies with their respective ratings and other relevant information. The data is likely in a format such as CSV or JSON, which allows for easy manipulation and querying.

To answer your question, here's how the system works:

1. **Data Collection**: A large dataset of movies, including their titles, genres, ratings, and other relevant information, was collected.
2. **Data Preprocessing**: The data was preprocessed to ensure consistency and remove any errors or duplicate entries.
3. **Recommendation Algorithm**: An algorithm was developed to analyze the user's preferences and generate personalized movie recommendations based on their viewing history and ratings.
4. **User Interaction**: Users interact with the system by searching for movies, browsing through categories, or getting personalized recommendat