--–Install dependendies

In [None]:
!pip install -qU langchain langchain-community langchain-chroma
!pip install -qU langchain-text-splitters
!pip install -qU sentence-transformers
!pip install -qU transformers accelerate sentencepiece


---Load Git/GitHub docs (Data collection)

In [1]:
from langchain_community.document_loaders import WebBaseLoader

urls = [
    "https://git-scm.com/docs/gittutorial",  # Git official tutorial
]

loader = WebBaseLoader(urls)
raw_docs = loader.load()

print("Number of documents:", len(raw_docs))
print(raw_docs[0].metadata)
print(raw_docs[0].page_content[:500])


USER_AGENT environment variable not set, consider setting it to identify your requests.


Number of documents: 1
{'source': 'https://git-scm.com/docs/gittutorial', 'title': 'Git - gittutorial Documentation', 'language': 'en'}







Git - gittutorial Documentation


























About


Trademark




Learn


Book


Cheat Sheet


Videos


External Links




Tools


Command Line


GUIs


Hosting




Reference


Install


Community





 Table of Contents
        
NAME 
SYNOPSIS 
DESCRIPTION 
Importing a new project 
Making changes 
Git tracks content not files 
Viewing project history 
Managing branches 
Using Git for collaboration 
Exploring history 
Next Steps 
SEE ALSO 
GIT 







 English ▾

Localized v


---Pre-processing

In [2]:
def clean_text(text: str) -> str:
    text = text.replace("\n", " ").strip()
    while "  " in text:
        text = text.replace("  ", " ")
    return text

for doc in raw_docs:
    doc.page_content = clean_text(doc.page_content)

print(raw_docs[0].page_content[:300])


Git - gittutorial Documentation About Trademark Learn Book Cheat Sheet Videos External Links Tools Command Line GUIs Hosting Reference Install Community Table of Contents NAME SYNOPSIS DESCRIPTION Importing a new project Making changes Git tracks content not files Viewing project history Managing br


---Chunk the documents

In [3]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=800,      # size of each chunk in characters
    chunk_overlap=200,   # overlap to keep context
)

chunks = text_splitter.split_documents(raw_docs)

print("Total chunks:", len(chunks))
print("\nSample chunk:\n")
print(chunks[0].page_content[:400])


Total chunks: 32

Sample chunk:

Git - gittutorial Documentation About Trademark Learn Book Cheat Sheet Videos External Links Tools Command Line GUIs Hosting Reference Install Community Table of Contents NAME SYNOPSIS DESCRIPTION Importing a new project Making changes Git tracks content not files Viewing project history Managing branches Using Git for collaboration Exploring history Next Steps SEE ALSO GIT English ▾ Localized ver


---Embeddings (meaning → numbers)

In [4]:
from langchain_community.embeddings import HuggingFaceEmbeddings

embedding_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)

# quick test
test_vec = embedding_model.embed_query("What is git init?")
print("Embedding length:", len(test_vec))
print("First 5 values:", test_vec[:5])


  embedding_model = HuggingFaceEmbeddings(


Embedding length: 384
First 5 values: [-0.07344470918178558, -0.02956530638039112, -0.043813176453113556, 0.020704641938209534, 0.05633087456226349]


---Vector DB (Chroma) + basic search

In [5]:
from langchain_chroma import Chroma

vectorstore = Chroma.from_documents(
    documents=chunks,
    embedding=embedding_model,
    collection_name="git_github_docs"
)

print("✅ Vector store created!")

# quick search test
query = "How do I create a new git repository?"
docs_retrieved = vectorstore.similarity_search(query, k=3)

print("\nRetrieved", len(docs_retrieved), "chunks\n")
print(docs_retrieved[0].page_content[:400])


✅ Vector store created!

Retrieved 3 chunks

Comes Here" $ git config --global user.email you@yourdomain.example.com Importing a new project Assume you have a tarball project.tar.gz with your initial work. You can place it under Git revision control as follows. $ tar xzf project.tar.gz $ cd project $ git init Git will reply Initialized empty Git repository in .git/ You’ve now initialized the working directory—​you may notice a new directory 


---Retriever (nice wrapper over vector DB)

In [6]:
retriever = vectorstore.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 3}
)

test_docs = retriever.invoke("What does git init do?")
print("Retriever returned", len(test_docs), "docs\n")
print(test_docs[0].page_content[:400])


Retriever returned 3 docs

Comes Here" $ git config --global user.email you@yourdomain.example.com Importing a new project Assume you have a tarball project.tar.gz with your initial work. You can place it under Git revision control as follows. $ tar xzf project.tar.gz $ cd project $ git init Git will reply Initialized empty Git repository in .git/ You’ve now initialized the working directory—​you may notice a new directory 


---Load free LLM (TinyLlama)

In [7]:
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(model_name)

print("Loading model (this may take a bit)...")
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto"   # use GPU if Colab has it, else CPU
)

llm_pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=256
)

print("✅ TinyLlama loaded!")


Loading tokenizer...
Loading model (this may take a bit)...


Some parameters are on the meta device because they were offloaded to the cpu and disk.
Device set to use cpu


✅ TinyLlama loaded!


---RAG function: Retrieve + Augment + Generate

In [8]:
def format_docs(docs):
    # Join all chunk texts with separators
    return "\n\n---\n\n".join(d.page_content for d in docs)

system_instruction = (
    "You are an assistant that answers questions about Git basics "
    "using ONLY the context provided. "
    "If the answer is not in the context, say you are not sure. "
    "Give short, clear answers in 3–6 sentences. "
    "Do not ask new questions or start new Q&A blocks."
)

def rag_answer(question: str) -> str:
    # 1) Retrieve relevant chunks
    docs = retriever.invoke(question)
    context = format_docs(docs)

    # 2) Build prompt for TinyLlama
    prompt = (
        f"{system_instruction}\n\n"
        f"Context:\n{context}\n\n"
        f"Question: {question}\n"
        f"Answer:"
    )

    # 3) Generate answer
    outputs = llm_pipeline(prompt)[0]["generated_text"]

    # 4) Keep only part after "Answer:"
    text = outputs.split("Answer:", 1)[-1].strip()

    # If model starts adding another "Question:", cut it off
    if "Question:" in text:
        text = text.split("Question:", 1)[0].strip()

    return text

print("✅ RAG function defined.")


✅ RAG function defined.


---– Test Q&A

In [12]:
#print("Q: What does git init do?")
#print("A:", rag_answer("What does git init do?"))

#print("\n" + "="*80 + "\n")

#print("Q: What is a git commit?")
#print("A:", rag_answer("What is a git commit?"))

#print("\n" + "="*80 + "\n")

print("Q: What is a git hub?")
print("A:", rag_answer("What is a git hub?"))



Q: What is a git hub?


KeyboardInterrupt: 