In [1]:
# Install required libraries
!pip install transformers faiss-cpu sentence-transformers torch beautifulsoup4 requests

Collecting faiss-cpu
  Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.4 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.meta

In [2]:
# Imports
import requests
from bs4 import BeautifulSoup
import re
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

In [3]:
# Data Retrieval and Preprocessing
url = "https://git-scm.com/docs/git#_git_commands"
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')

In [4]:
# Extract Git commands and their descriptions
commands = []
for section in soup.find_all('div', class_='sect2'):
    command = section.find('h3')
    if command:
        command_name = command.text.strip()
        description = section.find('p')
        if description:
            commands.append(f"{command_name}: {description.text.strip()}")

In [5]:
# Clean and preprocess the text
def preprocess_text(text):
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

documents = [preprocess_text(cmd) for cmd in commands]

In [6]:
# Implementing the Retriever
model = SentenceTransformer('all-MiniLM-L6-v2')

# Create embeddings for the documents
embeddings = model.encode(documents)

# Create a FAISS index
dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(embeddings.astype('float32'))

def retrieve(query, k=3):
    query_vector = model.encode([query])
    _, indices = index.search(query_vector.astype('float32'), k)
    return [documents[i] for i in indices[0]]

# Implementing the Generator
tokenizer = AutoTokenizer.from_pretrained("gpt2")
generator = AutoModelForCausalLM.from_pretrained("gpt2")

def generate_response(query, context):
    prompt = f"Context:\n{' '.join(context)}\n\nQuestion: {query}\nAnswer:"
    inputs = tokenizer(prompt, return_tensors="pt")

    output = generator.generate(
        **inputs,
        max_length=150,
        num_return_sequences=1,
        temperature=0.7
    )

    return tokenizer.decode(output[0], skip_special_tokens=True)

# RAG Function
def rag_response(query):
    retrieved_docs = retrieve(query)
    response = generate_response(query, retrieved_docs)
    return response

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling%2Fconfig.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [7]:
# Example usage
query = "How do I create a new Git repository?"
print(rag_response(query))

# Interactive loop for multiple queries
while True:
    user_query = input("Enter your Git-related question (or 'quit' to exit): ")
    if user_query.lower() == 'quit':
        break
    print(rag_response(user_query))
    print("\n")


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Context:
Syncing repositories: A really simple server for Git repositories Reset, restore and revert: There are three commands with similar names: git reset, git restore and git revert. Git Commits: The human-readable name used in the author identity when creating commit or tag objects, or when writing reflogs. Overrides the user.name and author.name configuration settings.

Question: How do I create a new Git repository?
Answer: Create a new Git repository using the following command: git clone git://github.com/jamesj/git-commits.git

Note: The following commands are not supported by Git.

git commit -m "Done"

git push -
Enter your Git-related question (or 'quit' to exit): What does git-revert[1] do?


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Context:
Reset, restore and revert: There are three commands with similar names: git reset, git restore and git revert. other: A number controlling the amount of output shown by the recursive merge strategy. Overrides merge.verbosity. See git-merge[1] Git Diffs: Only valid setting is "--unified=??" or "-u??" to set the number of context lines shown when a unified diff is created. This takes precedence over any "-U" or "--unified" option value passed on the Git diff command line.

Question: What does git-revert[1] do?
Answer: It is a command that is used to revert a commit to the master


Enter your Git-related question (or 'quit' to exit): What does git-rebase[1] do?


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Context:
other: A number controlling the amount of output shown by the recursive merge strategy. Overrides merge.verbosity. See git-merge[1] Reset, restore and revert: There are three commands with similar names: git reset, git restore and git revert. Git Commits: The human-readable name used in the author identity when creating commit or tag objects, or when writing reflogs. Overrides the user.name and author.name configuration settings.

Question: What does git-rebase[1] do?
Answer: git-rebase[1] is a command that creates a new commit object. It is used to create a new commit object.

Question: What


Enter your Git-related question (or 'quit' to exit): quit
