<a href="https://colab.research.google.com/github/saadkhi/Side/blob/main/scripts_ML_Model/Script2_for_fintune_w_LLaMA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Packages & Libraries**

In [None]:
!pip install faiss-cpu
!pip install sentence-transformers

Collecting faiss-cpu
  Downloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.1 kB)
Downloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (31.4 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 31.4/31.4 MB 73.0 MB/s eta 0:00:00
Installing collected packages: faiss-cpu
Successfully installed faiss-cpu-1.12.0


# **Dataset Info and Fine-tuning**

In [None]:
import json
from pathlib import Path

# JSON Lines file: one object per line with "prompt" and "completion" keys.
dataset_path = Path("dataset.jsonl")
if not dataset_path.is_file():
    # Fail early with an actionable message; on a fresh runtime the file has
    # to be uploaded/copied into the working directory first.
    raise FileNotFoundError(
        f"{dataset_path} not found in {Path.cwd()}. "
        "Upload or copy the dataset into the working directory before running this cell."
    )

# Each document is one prompt/completion pair rendered as "Q: ...\nA: ...".
docs = []
# Explicit UTF-8 so decoding does not depend on the platform's default locale.
with dataset_path.open("r", encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        if not line:
            # Tolerate blank lines (e.g. a trailing newline at end of file),
            # which json.loads would otherwise reject.
            continue
        row = json.loads(line)
        docs.append(f"Q: {row['prompt']}\nA: {row['completion']}")

print(len(docs))

FileNotFoundError: [Errno 2] No such file or directory: 'dataset.jsonl'

In [None]:
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

embedder = SentenceTransformer("all-MiniLM-L6-v2")  # lightweight & fast

# An empty corpus would otherwise fail below with an opaque
# "tuple index out of range" when reading shape[1].
if not docs:
    raise ValueError("No documents loaded; cannot build the FAISS index from an empty corpus.")

doc_embeddings = embedder.encode(docs, convert_to_numpy=True, show_progress_bar=True)
# FAISS expects a C-contiguous float32 matrix of shape (n_docs, dimension);
# enforce that explicitly instead of relying on the encoder's defaults.
doc_embeddings = np.ascontiguousarray(doc_embeddings, dtype=np.float32)

# Build FAISS index (exact, brute-force L2 search: smaller distance = closer match)
dimension = doc_embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(doc_embeddings)

print("Index size:", index.ntotal)

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

# Checkpoint used for generation (or llama2-chat if you have GPU).
model_name = "microsoft/phi-3-mini-4k-instruct"

# Tokenizer and weights come from the same checkpoint; device_map="auto"
# leaves device placement of the weights to the library.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
)

# Text-generation pipeline wrapping the already-loaded model and tokenizer.
llm = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
)

In [None]:
def is_database_query(query: str) -> bool:
    """Return True if the query looks database-related (cheap keyword gate).

    Matching is case-insensitive. Most keywords are matched as substrings so
    that compound terms such as "mysql", "subquery" or "reindex" still count.
    "table" is the exception: as a plain substring it also fires on common
    English words ("suitable", "vegetable", "stable"), so it must not be
    preceded by a letter. Forms like "tables", "user_table" or "table1" still
    match.

    Args:
        query: Raw user input.

    Returns:
        True when at least one database keyword is present, else False.
    """
    import re

    text = query.lower()
    # "queries" / "indices" are listed explicitly because the plural forms do
    # not contain the singular keyword as a substring.
    substring_keywords = (
        "sql", "nosql", "database", "query", "queries", "index", "indices", "join",
    )
    if any(keyword in text for keyword in substring_keywords):
        return True
    # Negative lookbehind on letters only, so "_table" and "2tables" still match.
    return re.search(r"(?<![a-z])table", text) is not None

In [None]:
def chatbot(query, max_distance=0.6, top_k=3):
    """Answer a database question, using retrieved Q/A examples as context.

    Steps: reject off-topic input, embed the query, look up the nearest
    documents in the FAISS index, build a prompt, and generate an answer.

    Relies on notebook-level objects defined in earlier cells:
    ``is_database_query``, ``embedder``, ``index``, ``docs`` and ``llm``.

    Args:
        query: User question.
        max_distance: Largest distance at which a retrieved document is
            still used as context. The index is an L2 index, so a SMALLER
            score means a closer match. The original numeric cut-off (0.6)
            is kept as the default, but it is now applied as an upper bound
            rather than a lower one.
        top_k: Number of nearest neighbours to request from the index.

    Returns:
        The generated answer text, or a fixed refusal message when the query
        is not database-related.
    """
    # Step 1: Check domain relevance
    if not is_database_query(query):
        return "I only answer database-related queries."

    # Step 2: Retrieve top matches
    q_embedding = embedder.encode([query], convert_to_numpy=True)
    distances, indices = index.search(q_embedding, k=top_k)

    # Step 3: Keep only sufficiently close matches.
    # - Results are sorted by increasing distance, so if the best hit is too
    #   far away nothing is kept and the context stays empty.
    # - FAISS pads the result with label -1 when the index holds fewer than
    #   top_k vectors; skip those so they don't silently select docs[-1].
    context = "\n".join(
        docs[doc_id]
        for dist, doc_id in zip(distances[0], indices[0])
        if doc_id >= 0 and dist <= max_distance
    )

    # Step 4: Build prompt
    prompt = f"""
    You are a SQL/NoSQL expert.
    Context: {context if context else "No context found."}
    Question: {query}

    Answer:
    """

    # Run through model. return_full_text=False makes the pipeline return only
    # the newly generated continuation, so the answer no longer has to be
    # carved out by searching for "Answer:" (which breaks when the query or
    # the retrieved context itself contains that text).
    generated = llm(
        prompt,
        max_new_tokens=300,
        do_sample=True,
        return_full_text=False,
    )[0]['generated_text']

    return generated.strip()


In [None]:
# Simple console loop: read a line, stop on an exit word, otherwise answer it.
print("💬 Database Chatbot is ready! (type 'exit' to quit)\n")

# The loop condition reads the next input and checks it against the exit
# words in one step; any other input is forwarded to the chatbot.
while (query := input("You: ").strip()).lower() not in ("exit", "quit", "bye"):
    response = chatbot(query)
    print(f"Bot: {response}\n")

# Reached only after the user typed one of the exit words.
print("Bot: Goodbye! 👋")