In [71]:
%pip install pygithub lancedb sentence-transformers

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Note: you may need to restart the kernel to use updated packages.


In [72]:
from github import Github
from itertools import islice    #con of popular repos

In [None]:
g=Github("your_github_token")

In [74]:
repo = g.get_repo("NVIDIA/physicsnemo")    

In [75]:
raw_issues = [
    {
        "number": issue.number,
        "title": issue.title,
        "body": issue.body or "",
        "url": issue.html_url
    }
    for issue in islice(repo.get_issues(state="all"), 20)
    if issue.pull_request is None
]

In [76]:
#embeddings
from sentence_transformers import SentenceTransformer

In [77]:
model =SentenceTransformer("all-MiniLM-L6-v2")
texts = [f"{issue['title']} {issue['body']}" for issue in raw_issues]
embeddings = model.encode(texts)

In [89]:
#lancedb baby
import lancedb
import pandas as pd
import numpy as np

# Convert embeddings to float32 for proper vector storage
embeddings = np.array(embeddings, dtype=np.float32)


In [90]:
db=lancedb.connect("github-issues")
df = pd.DataFrame([
    {
        "id": i,
        "text": texts[i],
        "embedding": embeddings[i].tolist(),  # Ensure it's a list of floats (vector)
        "url": raw_issues[i]["url"],
        "number": raw_issues[i]["number"]
    }
    for i in range(len(texts))
])


In [91]:
table = db.create_table("issues", data=df, mode="overwrite")

In [92]:
#for similarity search we are using faiss,cause there were issue with vector search with lancedb
import faiss
index = faiss.IndexFlatL2(embeddings.shape[1])  # L2 distance metric
index.add(embeddings)

In [102]:
#query
query = "authentication error"
query_embedding = model.encode([query])[0].astype(np.float32)
k = 5
D, I = index.search(np.array([query_embedding]), k)
results = [raw_issues[i]["title"] for i in I[0]]
print(results)

["🐛[BUG]: TypeError: RegressionLoss.__call__() got an unexpected keyword argument 'use_patch_grad_acc'", 'Question : CorrDiff Loss Function (SSE or MSE)', '🐛[BUG]: @StaticCaptureEvaluateNoGrad decorator can cause NaN values to show up during inference', '📚[DOC]: CorrDiff Validation and Early Stopping', '🚀[FEA]: Shall I use HRRR datasets in natural/hybrid model levels when training the StormCast model?']


In [103]:
context = "\n\n".join([f"Issue {i+1}:\n{body}" for i, body in enumerate(results)])

In [104]:
prompt = f"""
You are a helpful assistant. Here are some GitHub issues:

{context}
Summarize the common problems or patterns described in these issues.
"""

In [None]:
import os
os.environ["GOOGLE_API_KEY"] = "api key"

In [106]:
from agno.agent import Agent
from agno.models.google import Gemini  
from agno.tools.reasoning import ReasoningTools

In [None]:
agent = Agent(
    model=Gemini(id="models/gemini-2.0-flash-lite"),
    tools=[ReasoningTools(add_instructions=True)],
    instructions=["Identify important themes, group similar issues, and highlight core problems in one line"],
    markdown=True,
)

In [110]:
response = agent.run(prompt)
print("\n=== Gemini Summary ===\n")
print(response)


=== Gemini Summary ===

RunResponse(content="Okay, I've reviewed the issues. Here's a breakdown of the common problems and patterns:\n\n*   **Bug Reports:** Issues 1 and 3 are clearly bug reports. Issue 1 describes a `TypeError` related to an unexpected keyword argument, while Issue 3 mentions `NaN` values appearing during inference due to a decorator. These are specific technical issues within the code.\n*   **Loss Function and Training:** Issues 2 and 4 relate to the loss function and the training process. Issue 2 asks a question about the loss function, specifically whether to use SSE or MSE, and Issue 4 discusses validation and early stopping related to the CorrDiff loss function.\n*   **Feature Request:** Issue 5 is a feature request concerning the use of HRRR datasets within a model during training.\n\nBased on this, the core problems or patterns seem to be:\n\n1.  **Software Bugs:** Specific errors and unexpected behavior within the code (Issues 1 and 3).\n2.  **Model Training/