In [2]:
import os
# Point both certificate-related environment variables at the same local CA
# certificate file so HTTPS clients in this kernel use that bundle.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# not run unchanged on another machine.
os.environ['REQUESTS_CA_BUNDLE'] = r"E:\rnakka\Downloads\ca.premierinc.goskope.crt"
os.environ['SSL_CERT_FILE'] = r"E:\rnakka\Downloads\ca.premierinc.goskope.crt"

In [1]:
import os 
# Display the configured certificate path as the cell output (sanity check).
# Subscript access raises KeyError if the variable is not set in this kernel.
os.environ['SSL_CERT_FILE']

'E:\\rnakka\\Downloads\\ca.premierinc.goskope.crt'

Step 1: Fetching Data from GitHub

In [2]:
import os
from github import Github, GithubException
from dotenv import load_dotenv

# Load variables from a .env file (if one is found) so GITHUB_TOKEN can be
# supplied without being hard-coded in the notebook.
load_dotenv()
# --- Configuration ---
# Personal access token; None when the variable is not set (checked before use below).
GITHUB_TOKEN = os.environ.get("GITHUB_TOKEN") # Recommended to use environment variables
# Organization whose repositories are crawled.
ORGANIZATION_NAME = "PremierInc"
# CA certificate handed to the GitHub client for TLS verification.
# NOTE(review): absolute, machine-specific path; consider reading it from the environment.
CERTIFICATE_PATH = r"E:\rnakka\Downloads\ca.premierinc.goskope.crt"
def get_all_repositories(g, org_name):
    """Return the repositories of the GitHub organization ``org_name``.

    Args:
        g: Client object exposing ``get_organization(name)``.
        org_name: Login name of the organization to look up.

    Returns:
        Whatever ``organization.get_repos()`` returns on success, or an empty
        list when the lookup raises ``GithubException`` (the error is printed
        rather than propagated).
    """
    try:
        repositories = g.get_organization(org_name).get_repos()
    except GithubException as e:
        # Best-effort: report the failure and let the caller iterate over nothing.
        print(f"Error fetching organization {org_name}: {e}")
        repositories = []
    return repositories

def get_repository_details(repo, max_commits=10):
    """Collect name, title, description, README text and recent commit messages.

    Args:
        repo: Repository object exposing ``name``, ``full_name``,
            ``description``, ``get_contents(path)`` and ``get_commits()``.
        max_commits: Maximum number of most-recent commit messages to collect
            (defaults to 10, the previously hard-coded value).

    Returns:
        dict with keys ``name``, ``title``, ``description``, ``readme`` and
        ``commits``. ``description`` and ``readme`` fall back to ``""`` and
        ``commits`` to ``[]`` when the data is missing or cannot be fetched.

    GitHub API errors are printed and swallowed so that one problematic
    repository does not abort a crawl over the whole organization.
    """
    repo_details = {
        "name": repo.name,
        "title": repo.full_name,
        "description": repo.description or "",
        "readme": "",
        "commits": []
    }

    # Get README content
    try:
        readme_content = repo.get_contents("README.md")
        # errors="replace": a README that is not valid UTF-8 must not raise
        # UnicodeDecodeError (which is not a GithubException) and kill the run.
        repo_details["readme"] = readme_content.decoded_content.decode(
            "utf-8", errors="replace"
        )
    except GithubException as e:
        # Only a 404 really means "no README.md"; report anything else
        # (rate limiting, permissions, server errors) as what it is.
        if getattr(e, "status", None) == 404:
            print(f"No README.md found in {repo.name}")
        else:
            print(f"Could not fetch README.md for {repo.name}: {e}")

    # Get recent commit messages
    try:
        # Slicing the commit listing limits how many commits are iterated.
        for commit in repo.get_commits()[:max_commits]:
            repo_details["commits"].append(commit.commit.message)
    except GithubException as e:
        print(f"Could not fetch commits for {repo.name}: {e}")

    return repo_details

if __name__ == "__main__":
    if GITHUB_TOKEN:
        # Authenticated client; TLS verification uses the configured CA certificate.
        g = Github(GITHUB_TOKEN,verify=CERTIFICATE_PATH)
        repos = get_all_repositories(g, ORGANIZATION_NAME)

        # One details dict per repository, with progress logged around each fetch.
        all_repo_data = []
        for repo in repos:
            print(f"Fetching details for repository: {repo.full_name}...")
            details = get_repository_details(repo)
            all_repo_data.append(details)
            print(f"Successfully fetched details for {repo.full_name}.")

        # `all_repo_data` is the input consumed by the indexing step that follows.
        print(f"\nFetched data for {len(all_repo_data)} repositories.")
    else:
        # Without a token nothing is fetched and `all_repo_data` is not defined.
        print("Please set the GITHUB_TOKEN environment variable.")

Fetching details for repository: PremierInc/fireball-sandbox...
Successfully fetched details for PremierInc/fireball-sandbox.
Fetching details for repository: PremierInc/fireball-shared-sandbox...
No README.md found in fireball-shared-sandbox
Successfully fetched details for PremierInc/fireball-shared-sandbox.
Fetching details for repository: PremierInc/fireball-common-service...
No README.md found in fireball-common-service
Successfully fetched details for PremierInc/fireball-common-service.
Fetching details for repository: PremierInc/fireball-pdfgen-app...
No README.md found in fireball-pdfgen-app
Successfully fetched details for PremierInc/fireball-pdfgen-app.
Fetching details for repository: PremierInc/fireball-common-ui...
Successfully fetched details for PremierInc/fireball-common-ui.
Fetching details for repository: PremierInc/fireball-common-objects...
No README.md found in fireball-common-objects
Successfully fetched details for PremierInc/fireball-common-objects.
Fetching det

Step 2: Embedding and Storing Data in ChromaDB

In [3]:
import chromadb
from sentence_transformers import SentenceTransformer

# --- Configuration ---
# Directory used by the persistent Chroma client created below.
CHROMA_DB_PATH = "./repo_db"
# Collection that receives every repository document.
CHROMA_COLLECTION_NAME = "github_repositories"
# NOTE(review): not referenced by the indexing code in this cell. No embedding
# function or precomputed embeddings are passed to Chroma here, so documents are
# embedded by whatever the collection uses by default; confirm it matches this model.
EMBEDDING_MODEL = 'all-MiniLM-L6-v2' # A good starting model
BATCH_SIZE = 100 # Process 100 documents at a time for scalability

# --- Initialize ChromaDB ---
# Open (or create) the on-disk database at CHROMA_DB_PATH.
client = chromadb.PersistentClient(path=CHROMA_DB_PATH)


# Reuse the collection if it already exists, otherwise create it. The metadata
# requests cosine distance for the index; the query code relies on this when it
# converts distances to similarities.
# NOTE(review): confirm the metadata also applies when the collection pre-exists.
collection = client.get_or_create_collection(
    name=CHROMA_COLLECTION_NAME,
    metadata={"hnsw:space": "cosine"} 
)


def process_and_store_data(repo_data):
    """Turn repository records into text documents and upsert them in batches.

    For each repository up to three kinds of documents are produced:

    * one "description" document (title + description),
    * one "readme_chunk" document per non-blank README paragraph
      (paragraphs are separated by a blank line),
    * one "commits" document listing the first line of each recent commit
      message, ignoring messages that start with "Merge pull request".

    Documents are written with ``collection.upsert`` using deterministic ids
    (``repo_<name>_desc``, ``repo_<name>_readme_<i>``, ``repo_<name>_commits``),
    so re-running the function overwrites earlier entries instead of
    duplicating them. No embeddings are computed here; only raw text,
    metadata and ids are handed to the collection.

    Args:
        repo_data: Iterable of dicts with the keys ``name``, ``title``,
            ``description``, ``readme`` and ``commits``.

    Returns:
        None. Progress and the final collection size are printed.
    """
    documents = []
    metadatas = []
    ids = []

    def upsert_batch():
        """Write the pending documents to the collection and reset the buffers."""
        if not documents:
            return

        collection.upsert(
            documents=documents,
            metadatas=metadatas,
            ids=ids
        )
        print(f"Upserted a batch of {len(documents)} documents.")
        # Clear the lists for the next batch
        documents.clear()
        metadatas.clear()
        ids.clear()

    def add_document(text, repo_name, source, doc_id):
        """Queue one document; flush as soon as the batch is full."""
        documents.append(text)
        metadatas.append({"repo_name": repo_name, "source": source})
        ids.append(doc_id)
        # Checking after every document (not once per repository) keeps a
        # single upsert from exceeding BATCH_SIZE when a README is long.
        if len(documents) >= BATCH_SIZE:
            upsert_batch()

    for repo in repo_data:
        repo_name = repo["name"]

        # 1. Title and Description
        if repo["title"]:
            add_document(
                f"Repository: {repo['title']}\nDescription: {repo['description']}",
                repo_name,
                "description",
                f"repo_{repo_name}_desc",
            )

        # 2. README, chunked by paragraph. The original paragraph index is kept
        #    in the id even when blank paragraphs are skipped.
        if repo["readme"]:
            for i, chunk in enumerate(repo["readme"].split("\n\n")):
                if chunk.strip():
                    add_document(
                        chunk,
                        repo_name,
                        "readme_chunk",
                        f"repo_{repo_name}_readme_{i}",
                    )

        # 3. Commit messages: subject line only, merge-PR commits dropped.
        if repo["commits"]:
            clean_commits = [
                msg.split('\n')[0]
                for msg in repo["commits"]
                if not msg.startswith('Merge pull request')
            ]
            # Skip the document entirely when only merge commits were present;
            # a header with no messages carries no useful content.
            if clean_commits:
                commit_text = "\n".join(f"- {msg}" for msg in clean_commits)
                add_document(
                    f"Recent commit messages for {repo['title']}:\n{commit_text}",
                    repo_name,
                    "commits",
                    f"repo_{repo_name}_commits",
                )

    # Process any remaining documents that didn't make a full batch
    upsert_batch()

    print(f"\nFinished processing all repositories. Total documents in collection: {collection.count()}")


# --- To run this part, you would first get `all_repo_data` from Step 1 ---

if __name__ == "__main__":
    # `all_repo_data` is created by the Step 1 cell and only when a GitHub token
    # was available. Look it up defensively so that running this cell on a fresh
    # kernel (or after Step 1 bailed out) gives a clear message instead of a
    # bare NameError.
    repo_records = globals().get("all_repo_data")
    if repo_records is None:
        print("`all_repo_data` is not defined. Run Step 1 (with GITHUB_TOKEN set) first.")
    else:
        print("Starting data processing and storage...")
        process_and_store_data(repo_records)
        print("Data indexing complete.")

  from .autonotebook import tqdm as notebook_tqdm


Starting data processing and storage...
Upserted a batch of 59 documents.

Finished processing all repositories. Total documents in collection: 59
Data indexing complete.


In [6]:
import chromadb
from sentence_transformers import SentenceTransformer
from collections import defaultdict

# --- Configuration ---
# NOTE(review): these constants repeat the values defined in the indexing cell;
# the two copies must be kept in sync by hand.
CHROMA_DB_PATH = "./repo_db"
CHROMA_COLLECTION_NAME = "github_repositories"
EMBEDDING_MODEL = 'all-MiniLM-L6-v2'

# --- Initialize ---
# Re-open the persisted database written by the indexing step.
client = chromadb.PersistentClient(path=CHROMA_DB_PATH)
# Sentence-transformers model used to embed the query text in find_relevant_repo.
model = SentenceTransformer(EMBEDDING_MODEL)
# Unlike the indexing cell (get_or_create_collection), this looks the collection
# up without creating it; presumably fails if indexing has never run (confirm).
collection = client.get_collection(name=CHROMA_COLLECTION_NAME)

def find_relevant_repo(issue_description, n_results=5):
    """Return the name of the repository that best matches an issue description.

    The description is embedded with the module-level ``model``, the
    ``n_results`` nearest documents are fetched from ``collection``, and each
    hit contributes ``1 - distance`` to the score of the repository named in
    its metadata. The repository with the highest summed score wins, so a
    repository with several good hits can outrank one with a single best hit.

    Args:
        issue_description: Free-text description of the issue.
        n_results: Number of nearest documents to retrieve and score.

    Returns:
        str: The winning ``repo_name``. When nothing usable is returned by the
        query, one of the messages "No relevant documents found in the
        database." or "No relevant repository found." is returned instead.
        NOTE(review): these messages are non-empty strings, so a plain
        truthiness check by a caller cannot tell them from a repository name.

    NOTE(review): the indexing code visible in this notebook passes neither
    embeddings nor an embedding function to the collection, so documents are
    embedded by the collection's default; confirm it matches the model used
    for the query here.
    """
    query_embedding = model.encode(issue_description).tolist()

    # Query using the embedding vector, not the text
    results = collection.query(
        query_embeddings=[query_embedding],
        n_results=n_results,
        include=["metadatas", "distances"]
    )

    # One inner list per query is returned and exactly one query was sent.
    # Tolerate a missing/None "ids" entry instead of failing on None[0].
    id_rows = (results or {}).get("ids") or [[]]
    if not id_rows[0]:
        return "No relevant documents found in the database."

    metadata_row = (results.get("metadatas") or [[]])[0] or []
    distance_row = (results.get("distances") or [[]])[0] or []

    repo_scores = defaultdict(float)
    for metadata, distance in zip(metadata_row, distance_row):
        # Hits without usable metadata cannot be attributed to a repository.
        repo_name = (metadata or {}).get("repo_name")
        if not repo_name or distance is None:
            continue

        # The collection is created with cosine space, so similarity is
        # 1 - distance: a smaller distance means a higher score.
        repo_scores[repo_name] += 1.0 - distance

    if not repo_scores:
        return "No relevant repository found."

    # Return the repository with the highest total score
    return max(repo_scores, key=repo_scores.get)



if __name__ == "__main__":
    # Sample issue text used to exercise the retrieval end to end.
    issue = (
        "I'm having trouble with the fireball application, "
        "particularly related to GridTable sort Icon alignment issue "
        "when column header has ellipses."
    )

    print(f"Querying for issue: '{issue}'")

    # Look up the best-matching repository and report it when a value comes back.
    relevant_repo = find_relevant_repo(issue)
    if relevant_repo:
        print(f"\n✅ The issue most likely belongs to the '{relevant_repo}' repository.")

Querying for issue: 'I'm having trouble with the fireball application, particularly related to GridTable sort Icon alignment issue when column header has ellipses.'

✅ The issue most likely belongs to the 'fireball-playground' repository.


# Activity map to implement to understand retrieval

2. MCP resource to connect to the Copilot agent