In [1]:
import os
import json
import re

def extract_fields_from_py(filepath):
    with open(filepath, "r", encoding="utf-8") as f:
        lines = f.readlines()

    title = description = time = space = ""
    tags = []
    code_lines = []

    for line in lines:
        if line.strip().startswith("#"):
            line = line.strip()[1:].strip()
            if line.lower().startswith("title:"):
                title = line.split(":", 1)[1].strip()
            elif line.lower().startswith("description:"):
                description = line.split(":", 1)[1].strip()
            elif line.lower().startswith("time complexity:"):
                time = line.split(":", 1)[1].strip()
            elif line.lower().startswith("space complexity:"):
                space = line.split(":", 1)[1].strip()
            elif line.lower().startswith("tags:"):
                tags = [tag.strip() for tag in line.split(":", 1)[1].split(",")]
        else:
            code_lines.append(line)

    return {
        "title": title,
        "description": description,
        "optimal_code": "".join(code_lines).strip(),
        "time_complexity": time,
        "space_complexity": space,
        "tags": tags
    }

def convert_all_py_to_json(directory, output_path="neetcode_problems.json"):
    problems = []
    for filename in os.listdir(directory):
        if filename.endswith(".py"):
            full_path = os.path.join(directory, filename)
            problem = extract_fields_from_py(full_path)
            problems.append(problem)

    with open('\out/', "w", encoding="utf-8") as out:
        json.dump(problems, out, indent=2)

    print(f"✅ Saved {len(problems)} problems to {output_path}")

# Run this script
if __name__ == "__main__":
    convert_all_py_to_json("npy/leetcode/python") 


  with open('\out/', "w", encoding="utf-8") as out:
  with open('\out/', "w", encoding="utf-8") as out:


OSError: [Errno 22] Invalid argument: '\\out/'

In [3]:
import os
import json

def extract_fields_from_py(filepath):
    title = description = time = space = ""
    tags = []
    code_lines = []

    with open(filepath, 'r', encoding='utf-8') as f:
        for raw in f:
            line = raw.rstrip('\n')
            if line.lstrip().startswith('#'):
                meta = line.lstrip()[1:].strip()
                if meta.lower().startswith('title:'):
                    title = meta.split(':', 1)[1].strip()
                elif meta.lower().startswith('description:'):
                    description = meta.split(':', 1)[1].strip()
                elif meta.lower().startswith('time complexity:'):
                    time = meta.split(':', 1)[1].strip()
                elif meta.lower().startswith('space complexity:'):
                    space = meta.split(':', 1)[1].strip()
                elif meta.lower().startswith('tags:'):
                    tags = [t.strip() for t in meta.split(':', 1)[1].split(',')]
            else:
                code_lines.append(raw)

    return {
        "title": title,
        "description": description,
        "optimal_code": "".join(code_lines).rstrip(),
        "time_complexity": time,
        "space_complexity": space,
        "tags": tags
    }

def convert_all_py_to_json(py_dir, output_path):
    if not os.path.isdir(py_dir):
        print(f"❌ Source directory not found: {py_dir}")
        return

    os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True)

    problems = []
    for fname in sorted(os.listdir(py_dir)):
        if not fname.lower().endswith('.py'):
            continue
        full = os.path.join(py_dir, fname)
        try:
            entry = extract_fields_from_py(full)
            if entry['title'] and entry['optimal_code']:
                problems.append(entry)
            else:
                print(f"⚠️  Skipping {fname}: missing title or code")
        except Exception as e:
            print(f"❌ Error processing {fname}: {e}")

    with open(output_path, 'w', encoding='utf-8') as out:
        json.dump(problems, out, indent=2, ensure_ascii=False)

    print(f"✅ Converted {len(problems)} problems → {output_path}")



In [5]:
convert_all_py_to_json("npy/leetcode/python", "neetcode_problems.json")


⚠️  Skipping 0001-two-sum.py: missing title or code
⚠️  Skipping 0002-add-two-numbers.py: missing title or code
⚠️  Skipping 0003-longest-substring-without-repeating-characters.py: missing title or code
⚠️  Skipping 0004-median-of-two-sorted-arrays.py: missing title or code
⚠️  Skipping 0005-longest-palindromic-substring.py: missing title or code
⚠️  Skipping 0006-zigzag-conversion.py: missing title or code
⚠️  Skipping 0007-reverse-integer.py: missing title or code
⚠️  Skipping 0009-palindrome-number.py: missing title or code
⚠️  Skipping 0010-regular-expression-matching.py: missing title or code
⚠️  Skipping 0011-container-with-most-water.py: missing title or code
⚠️  Skipping 0012-integer-to-roman.py: missing title or code
⚠️  Skipping 0013-roman-to-integer.py: missing title or code
⚠️  Skipping 0014-longest-common-prefix.py: missing title or code
⚠️  Skipping 0015-3sum.py: missing title or code
⚠️  Skipping 0016-3sum-closest.py: missing title or code
⚠️  Skipping 0017-letter-combin

In [6]:
import os
import json
import re
import sys

def extract_fields_from_py(filepath):
    """
    Read a .py file and extract:
      - title, description, time/space complexities, tags from leading comments
      - the rest of the file as optimal_code
    Fallbacks:
      - Title inferred from filename if missing
      - description → "No description available."
      - complexities → "Unknown"
      - tags → []
    """
    title = description = time = space = ""
    tags = []
    code_lines = []

    with open(filepath, 'r', encoding='utf-8') as f:
        for raw in f:
            line = raw.rstrip('\n')
            if line.lstrip().startswith('#'):
                meta = line.lstrip()[1:].strip()
                # metadata keys (case-insensitive)
                if meta.lower().startswith('title:'):
                    title = meta.split(':', 1)[1].strip()
                elif meta.lower().startswith('description:'):
                    description = meta.split(':', 1)[1].strip()
                elif meta.lower().startswith('time complexity:'):
                    time = meta.split(':', 1)[1].strip()
                elif meta.lower().startswith('space complexity:'):
                    space = meta.split(':', 1)[1].strip()
                elif meta.lower().startswith('tags:'):
                    tags = [t.strip() for t in meta.split(':', 1)[1].split(',')]
                # else: ignore other comments
            else:
                code_lines.append(raw)

    # Fallback: infer title from filename if not provided
    if not title:
        fname = os.path.basename(filepath)
        name = os.path.splitext(fname)[0]
        # strip any leading numbers and hyphens
        name = re.sub(r'^\d+[-_]*', '', name)
        title = name.replace('_', ' ').replace('-', ' ').title()

    return {
        "title": title,
        "description": description or "No description available.",
        "optimal_code": "".join(code_lines).rstrip(),
        "time_complexity": time or "Unknown",
        "space_complexity": space or "Unknown",
        "tags": tags
    }

def convert_all_py_to_json(src_dir, output_path):
    """
    Walk src_dir recursively, convert every .py file to a JSON entry,
    and write the list out to output_path.
    """
    if not os.path.isdir(src_dir):
        raise ValueError(f"Source directory not found: {src_dir}")

    # ensure the output directory exists
    out_dir = os.path.dirname(output_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    problems = []
    for root, _, files in os.walk(src_dir):
        for fname in sorted(files):
            if not fname.lower().endswith('.py') or fname == "__init__.py":
                continue
            full_path = os.path.join(root, fname)
            try:
                entry = extract_fields_from_py(full_path)
                # skip if code is empty
                if entry["optimal_code"].strip():
                    problems.append(entry)
                else:
                    print(f"⚠️  Skipping {full_path}: no code detected")
            except Exception as e:
                print(f"❌ Error processing {full_path}: {e}")

    # write JSON
    with open(output_path, 'w', encoding='utf-8') as out_f:
        json.dump(problems, out_f, indent=2, ensure_ascii=False)

    print(f"✅ Converted {len(problems)} problems → {output_path}")


# ----------------------
# If run as a script, accept command-line args
if __name__ == "__main__":
    if len(sys.argv) >= 3:
        src = sys.argv[1]
        dest = sys.argv[2]
    else:
        # defaults: adjust as needed
        src = "npy/leetcode/python"
        dest = "neetcode_problems.json"

    convert_all_py_to_json(src, dest)


✅ Converted 396 problems → neetcode_problems.json


In [9]:
import os
import json
import re
import sys

def infer_complexity_and_tags(code: str):
    """
    Infer time and space complexities + relevant tags from code heuristically.
    """
    tags = set()
    time_complexity = "Unknown"
    space_complexity = "Unknown"

    code_lower = code.lower()

    # Tag inference
    if "dict" in code_lower or "{}" in code_lower:
        tags.add("hash table")
        space_complexity = "O(n)"
    if "set" in code_lower:
        tags.add("hash table")
        space_complexity = "O(n)"
    if "while" in code_lower and "l < r" in code_lower:
        tags.update(["two pointers", "sliding window"])
        time_complexity = "O(n)"
    if "for" in code_lower and "range(len" in code_lower:
        tags.add("array")
    if "if len(" in code_lower and ">" in code_lower and "binary search" not in tags:
        tags.add("binary search")
        time_complexity = "O(log n)"
    if "listnode" in code_lower:
        tags.add("linked list")
    if "treenode" in code_lower:
        tags.add("tree")
    if "dfs" in code_lower or "recurse" in code_lower:
        tags.update(["recursion", "dfs"])
    if "math." in code_lower or "%" in code_lower or "//" in code_lower:
        tags.add("math")
    if "dp" in code_lower or "[i][j]" in code_lower:
        tags.add("dynamic programming")

    # Time complexity estimation
    if time_complexity == "Unknown":
        if "for" in code_lower and "for" in code_lower.split("for", 1)[-1]:
            time_complexity = "O(n^2)"
        elif "for" in code_lower:
            time_complexity = "O(n)"
        elif "while" in code_lower and "l < r" in code_lower:
            time_complexity = "O(n)"
        elif "binary search" in tags:
            time_complexity = "O(log n)"

    if space_complexity == "Unknown":
        if "res =" in code_lower or "result =" in code_lower:
            space_complexity = "O(n)"
        elif "return" in code_lower:
            space_complexity = "O(1)"

    return time_complexity, space_complexity, sorted(tags)

def extract_fields_from_py(filepath):
    """
    Read a .py file and extract metadata + code.
    If complexity/tags are missing from comments, infer them heuristically.
    """
    title = description = time = space = ""
    tags = []
    code_lines = []

    with open(filepath, 'r', encoding='utf-8') as f:
        for raw in f:
            line = raw.rstrip('\n')
            if line.lstrip().startswith('#'):
                meta = line.lstrip()[1:].strip()
                if meta.lower().startswith('title:'):
                    title = meta.split(':', 1)[1].strip()
                elif meta.lower().startswith('description:'):
                    description = meta.split(':', 1)[1].strip()
                elif meta.lower().startswith('time complexity:'):
                    time = meta.split(':', 1)[1].strip()
                elif meta.lower().startswith('space complexity:'):
                    space = meta.split(':', 1)[1].strip()
                elif meta.lower().startswith('tags:'):
                    tags = [t.strip() for t in meta.split(':', 1)[1].split(',')]
            else:
                code_lines.append(raw)

    if not title:
        fname = os.path.basename(filepath)
        name = os.path.splitext(fname)[0]
        name = re.sub(r'^\d+[-_]*', '', name)
        title = name.replace('_', ' ').replace('-', ' ').title()

    code = "".join(code_lines).rstrip()

    # Infer missing values
    inferred_time, inferred_space, inferred_tags = infer_complexity_and_tags(code)
    time = time or inferred_time
    space = space or inferred_space
    tags = tags or inferred_tags

    return {
        "title": title,
        "description": description or "No description available.",
        "optimal_code": code,
        "time_complexity": time,
        "space_complexity": space,
        "tags": tags
    }

def convert_all_py_to_json(src_dir, output_path):
    """
    Walk src_dir recursively, convert every .py file to a JSON entry,
    and write the list out to output_path.
    """
    if not os.path.isdir(src_dir):
        raise ValueError(f"Source directory not found: {src_dir}")

    out_dir = os.path.dirname(output_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    problems = []
    for root, _, files in os.walk(src_dir):
        for fname in sorted(files):
            if not fname.lower().endswith('.py') or fname == "__init__.py":
                continue
            full_path = os.path.join(root, fname)
            try:
                entry = extract_fields_from_py(full_path)
                if entry["optimal_code"].strip():
                    problems.append(entry)
                else:
                    print(f"⚠️  Skipping {full_path}: no code detected")
            except Exception as e:
                print(f"❌ Error processing {full_path}: {e}")

    with open(output_path, 'w', encoding='utf-8') as out_f:
        json.dump(problems, out_f, indent=2, ensure_ascii=False)

    print(f"✅ Converted {len(problems)} problems → {output_path}")

if __name__ == "__main__":
    if len(sys.argv) >= 3:
        src = sys.argv[1]
        dest = sys.argv[2]
    else:
        src = "npy/leetcode/python"
        dest = "neetcode_problems.json"

    convert_all_py_to_json(src, dest)


✅ Converted 396 problems → neetcode_problems.json


In [10]:
import json
import numpy as np
import faiss
import subprocess

# Load your dataset
with open("neetcode_problems.json", "r", encoding="utf-8") as f:
    problems = json.load(f)

# Prepare input texts (title + description + code)
texts = [f"{p['title']}. {p['description']} {p['optimal_code']}" for p in problems]

def get_embedding_ollama(text: str):
    prompt = {
        "model": "nomic-embed-text",
        "prompt": text
    }
    result = subprocess.run(
        ["C:/Users/prana/AppData/Local/Programs/Ollama/ollama.exe", "embeddings", "--model", "nomic-embed-text"],
        input=json.dumps(prompt),
        capture_output=True,
        text=True
    )
    response = json.loads(result.stdout)
    return np.array(response["embedding"], dtype="float32")

# Generate and collect all embeddings
embeddings = np.array([get_embedding_ollama(text) for text in texts], dtype="float32")

# Create and save FAISS index
dim = embeddings.shape[1]
index = faiss.IndexFlatL2(dim)
index.add(embeddings)
faiss.write_index(index, "rag_nomic_index.faiss")

# Save problems alongside index
with open("rag_nomic_problems.json", "w", encoding="utf-8") as f:
    json.dump(problems, f, indent=2)


JSONDecodeError: Expecting value: line 1 column 1 (char 0)

In [11]:
import json
import numpy as np
import faiss
import subprocess
import time

# Load your dataset
with open("neetcode_problems.json", "r", encoding="utf-8") as f:
    problems = json.load(f)

# Prepare input texts (title + description + code)
texts = [f"{p['title']}. {p['description']} {p['optimal_code']}" for p in problems]

def get_embedding_ollama(text: str):
    prompt = {
        "model": "nomic-embed-text",
        "prompt": text
    }
    try:
        result = subprocess.run(
            ["C:/Users/prana/AppData/Local/Programs/Ollama/ollama.exe", "embeddings", "--model", "nomic-embed-text"],
            input=json.dumps(prompt),
            capture_output=True,
            text=True,
            timeout=20  # seconds
        )
        if result.returncode != 0:
            raise RuntimeError(f"Ollama failed: {result.stderr}")
        response = json.loads(result.stdout)
        return np.array(response["embedding"], dtype="float32")
    except Exception as e:
        print(f"❌ Error embedding text: {text[:50]}... \n{e}")
        return None

# Generate and collect all embeddings
valid_problems = []
all_embeddings = []

for i, text in enumerate(texts):
    print(f"🔎 Embedding problem {i + 1}/{len(texts)}: {problems[i]['title']}")
    emb = get_embedding_ollama(text)
    if emb is not None:
        all_embeddings.append(emb)
        valid_problems.append(problems[i])
    else:
        print(f"⚠️ Skipping: {problems[i]['title']}")
    time.sleep(0.1)  # Be kind to system load

embeddings = np.vstack(all_embeddings).astype("float32")

# Create and save FAISS index
dim = embeddings.shape[1]
index = faiss.IndexFlatL2(dim)
index.add(embeddings)
faiss.write_index(index, "rag_nomic_index.faiss")

# Save only the successfully embedded problems
with open("rag_nomic_problems.json", "w", encoding="utf-8") as f:
    json.dump(valid_problems, f, indent=2)

print(f"✅ Index built for {len(valid_problems)} problems and saved successfully.")


🔎 Embedding problem 1/396: Two Sum
❌ Error embedding text: Two Sum. No description available. class Solution:... 
Ollama failed: Error: unknown command "embeddings" for "ollama"

⚠️ Skipping: Two Sum
🔎 Embedding problem 2/396: Add Two Numbers
❌ Error embedding text: Add Two Numbers. No description available. class S... 
Ollama failed: Error: unknown command "embeddings" for "ollama"

⚠️ Skipping: Add Two Numbers
🔎 Embedding problem 3/396: Longest Substring Without Repeating Characters
❌ Error embedding text: Longest Substring Without Repeating Characters. No... 
Ollama failed: Error: unknown command "embeddings" for "ollama"

⚠️ Skipping: Longest Substring Without Repeating Characters
🔎 Embedding problem 4/396: Median Of Two Sorted Arrays
❌ Error embedding text: Median Of Two Sorted Arrays. No description availa... 
Ollama failed: Error: unknown command "embeddings" for "ollama"

⚠️ Skipping: Median Of Two Sorted Arrays
🔎 Embedding problem 5/396: Longest Palindromic Substring
❌ Error e

ValueError: need at least one array to concatenate

In [13]:
import json
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer
from tqdm import tqdm

# Load model
model = SentenceTransformer("all-MiniLM-L6-v2")

def build_faiss_index(json_path, faiss_index_path, updated_json_path):
    # Load problem JSON
    with open(json_path, "r", encoding="utf-8") as f:
        problems = json.load(f)

    texts = []
    valid_problems = []

    for p in problems:
        title = p.get("title", "Untitled")
        description = p.get("description", "")
        code = p.get("optimal_code", "").strip()
        
        if not code:
            print(f"⚠️ Skipping {title}: no code")
            continue
        
        # Construct text input
        full_text = f"{title}. {description} {code}".strip()
        texts.append(full_text)
        valid_problems.append(p)

    if not texts:
        print("❌ No valid problems to embed.")
        return

    print(f"🔍 Generating embeddings for {len(texts)} problems...")
    embeddings = model.encode(texts, show_progress_bar=True, convert_to_numpy=True)

    # Build FAISS index
    dim = embeddings.shape[1]
    index = faiss.IndexFlatL2(dim)
    index.add(embeddings)
    faiss.write_index(index, faiss_index_path)

    # Save the updated (valid) JSON
    with open(updated_json_path, "w", encoding="utf-8") as f:
        json.dump(valid_problems, f, indent=2, ensure_ascii=False)

    print(f"\n✅ FAISS index saved to {faiss_index_path}")
    print(f"📄 Filtered problem dataset saved to {updated_json_path}")

# --- Run this script ---
if __name__ == "__main__":
    build_faiss_index(
        json_path="neetcode_problems.json",
        faiss_index_path="rag_minilm_index.faiss",
        updated_json_path="rag_minilm_problems.json"
    )



🔍 Generating embeddings for 396 problems...


Batches:   0%|          | 0/13 [00:00<?, ?it/s]


✅ FAISS index saved to rag_minilm_index.faiss
📄 Filtered problem dataset saved to rag_minilm_problems.json


In [14]:
import json
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer

# Load model & data
model = SentenceTransformer("all-MiniLM-L6-v2")
index = faiss.read_index("rag_minilm_index.faiss")

with open("rag_minilm_problems.json", "r", encoding="utf-8") as f:
    problems = json.load(f)

def retrieve_similar_problems(query, k=5):
    query_vec = model.encode([query], convert_to_numpy=True)
    distances, indices = index.search(query_vec, k)
    results = [problems[i] for i in indices[0]]
    return results


In [15]:
def build_prompt(retrieved, query):
    context = ""
    for p in retrieved:
        context += f"### {p['title']}\n"
        context += f"{p['description']}\n"
        context += f"Optimal Code:\n{p['optimal_code']}\n"
        context += f"Time: {p['time_complexity']} | Space: {p['space_complexity']}\n\n"

    prompt = f"""You are a coding assistant. Based on the following context, answer the user's question.

{context}
--- End of context ---

User question: {query}
Answer:"""
    return prompt


In [16]:
import subprocess

OLLAMA_PATH = "C:/Users/prana/AppData/Local/Programs/Ollama/ollama.exe"
MODEL_NAME = "llama3"

def generate_answer(prompt):
    result = subprocess.run(
        [OLLAMA_PATH, "run", MODEL_NAME],
        input=prompt,
        capture_output=True,
        text=True,
        timeout=60
    )
    return result.stdout.strip()


In [22]:
if __name__ == "__main__":
    query = input("🔍 Ask a coding question: ").strip()

    print("🔎 Retrieving relevant problems...")
    retrieved = retrieve_similar_problems(query, k=5)

    print("✍️  Building prompt...")
    prompt = build_prompt(retrieved, query)

    print("🧠 Generating answer...")
    response = generate_answer(prompt)

    print("\n✅ Answer:\n")
    print(response)


🔎 Retrieving relevant problems...
✍️  Building prompt...
🧠 Generating answer...

✅ Answer:

A new challenge!

To efficiently solve the "Kth Smallest Product of Two Sorted Arrays" problem, we can use a similar approach to the "Find K Closest Elements" problem.

Here's an optimal solution:
```python
class Solution:
    def kthSmallestProduct(self, arr1: List[int], arr2: List[int], k: int) -> int:
        n = len(arr1)
        m = len(arr2)
        res = float('inf')

        l, r = 0, n - 1
        while l <= r:
            mid = (l + r) // 2
            prod = arr1[mid] * arr2[0]
            if prod < k:
                l = mid + 1
            elif prod > k:
                r = mid - 1
            else:
                break

        for i in range(k):
            if l == n:
                return -1
            res = min(res, arr1[l] * arr2[i])

        return res
```
Here's how it works:

1. Find the middle element of `arr1` and calculate its product with the first element of `arr2`. 

In [1]:
import json
import subprocess
from tqdm import tqdm

# Load your JSON file
with open("rag_minilm_problems.json", "r", encoding="utf-8") as f:
    problems = json.load(f)

def get_approach(code: str) -> str:
    prompt = f"""Given the following optimal solution, what is the main algorithmic approach used?
Choose from common terms like: "Hash Map", "Two Pointers", "Sliding Window", "DFS", "BFS", "Sorting", "Binary Search", "Greedy", "Dynamic Programming", etc.

Code:
{code}

Just return the main approach name."""
    
    proc = subprocess.run(
        [r"C:\Users\prana\AppData\Local\Programs\Ollama\ollama.exe", "run", "llama3"],
        input=prompt.encode(), stdout=subprocess.PIPE
    )
    return proc.stdout.decode().strip().splitlines()[0]

# Add approach field to each problem
for p in tqdm(problems):
    if "approach" not in p or not p["approach"].strip():
        try:
            p["approach"] = get_approach(p["optimal_code"])
        except:
            p["approach"] = "Unknown"

# Save back to file
with open("rag_minilm_problems.json", "w", encoding="utf-8") as f:
    json.dump(problems, f, indent=2, ensure_ascii=False)

print("✅ Done tagging all problems with approaches.")


100%|██████████| 396/396 [07:12<00:00,  1.09s/it]

✅ Done tagging all problems with approaches.



