<a href="https://colab.research.google.com/github/rishidadia/github-dev-rag-analyzer/blob/main/GithubRag.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

DOWNLOADS, IMPORTS

---



In [None]:
!pip install -U llama-index
!pip install sentence-transformers
!pip install PyGithub
!pip install -U llama-index-embeddings-huggingface

In [None]:
from github import Auth, Github, GithubException
from llama_index.core import Document, VectorStoreIndex
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

SETTING UP GITHUB LIBRARY

In [None]:
from google.colab import userdata

# Read the GitHub personal access token from the Colab secret named
# 'GITHUB_TOKEN'. The value is a secret: never print or log it.
token_name=userdata.get('GITHUB_TOKEN')

# Authenticate through the explicit Auth API; passing the token as a bare
# positional argument is the deprecated form in current PyGithub releases.
g=Github(auth=Auth.Token(token_name))

# Show the core REST rate-limit status as a quick credential check.
print(g.get_rate_limit().resources.core)

In [None]:
# Account to analyse. Swap the literal for the input() call to choose
# the account interactively.
# user_name=input("Enter username")
user_name = 'viraj-ap'

# Look the account up through the authenticated client `g`.
user = g.get_user(user_name)

CREATING DOCUMENTS FOR THE PULLED DATA

In [None]:
from llama_index.core import Document
def make_doc(
    *,
    text:str,
    node_type:str,
    username:str,
    repo:str|None=None,
    year:int|None=None,
    extra_meta:str|None=None,
):
  metadata={
      "type":node_type,
      "username":username,
  }
  if repo:
    metadata['repo']=repo
  if year:
    metadata['year']=year
  if extra_meta:
    metadata['extra_meta']=extra_meta
  return Document(
      text=text.strip(),
      metadata=metadata
  )

In [None]:
# Render the account's public profile fields (login, name, bio, company,
# location, repo/follower/following counts) into one labelled text block.
profile_text=f"""
  TYPE:UserProfile
  USERNAME:{user.login}

  Name:{user.name}
  Bio:{user.bio}
  Company:{user.company}
  Location:{user.location}
  Public Repositiries:{user.public_repos}
  Followers:{user.followers}
  Following:{user.following}

  Summary:
  Doc for Github profile and it's public identity
"""
# Wrap the text as a "user_profile" document; no repo/year metadata is set.
user_profile_doc=make_doc(
    text=profile_text,
    node_type="user_profile",
    username=user.login
)

In [None]:
# Per-repository limits that bound document size and the amount of commit
# history walked for each repository.
README_CHAR_LIMIT=4000
MAX_COMMITS_PER_REPO=50
SAMPLE_COMMIT_MESSAGES=10

# One entry per repository is appended to each list, in the same order, so
# index i of all three lists refers to the same repository.
repo_docs=[]
readme_docs=[]
commit_docs=[]
for repo in user.get_repos():
  #-----REPO SUMMARY-------
  repo_summary_text=f"""
      TYPE:RepoSummary
      REPO:{repo.name}

      Description:{repo.description}
      Primary Language:{repo.language}
      Forks:{repo.forks_count}
      Open Issues:{repo.open_issues_count}
      Created at:{repo.created_at}
      Last updated:{repo.updated_at}

      Summary:
      This repo represents a core repo made and maintained by the user
  """
  repo_summary_doc=make_doc(
      text=repo_summary_text,
      node_type="repo_summary",
      username=user.login,
      repo=repo.name
  )
  repo_docs.append(repo_summary_doc)

  #----README SUMMARY----
  # Best effort: any failure to fetch or decode the README falls back to a
  # placeholder. `Exception` (not a bare except) keeps KeyboardInterrupt and
  # SystemExit working so the cell can still be interrupted.
  try:
    readme=repo.get_readme().decoded_content.decode('utf-8')[:README_CHAR_LIMIT]
  except Exception:
    readme="ReadMe not available"

  readme_summary_text=f"""
      TYPE:ReadMESummary
      REPO:{repo.name}

      README CONTENT:{readme}

      Sumarry: This document explains the intent, scope, tech stack and notes from the user regarding the repo
  """

  readme_summary_doc=make_doc(
      text=readme_summary_text,
      node_type="repo_documentation",
      username=user.login,
      repo=repo.name,
  )
  readme_docs.append(readme_summary_doc)


  #---COMMIT SUMMARY---
  commit_messages=[]
  additions=deletions=0

  # Listing commits can fail for a single repository (for example one with no
  # commits yet). Keep whatever was collected and move on instead of aborting
  # the whole loop, so every repository still gets its three documents.
  try:
    for commit in repo.get_commits()[:MAX_COMMITS_PER_REPO]:
        # Only the subject line (first line) of each message is kept.
        commit_messages.append(commit.commit.message.split("\n")[0])
        # NOTE(review): reading `stats` may cost one extra API request per
        # commit — confirm against the rate limit for large accounts.
        stats=commit.stats
        if stats:
            additions += stats.additions
            deletions += stats.deletions
  except GithubException as exc:
    print(f"Commit history unavailable for {repo.name}: {exc}")

  commit_behavior_text = f"""
  TYPE: CommitBehavior
  REPO: {repo.name}

  Total Commits Analyzed: {len(commit_messages)}
  Total Additions: {additions}
  Total Deletions: {deletions}

  Sample Commit Messages:
  - """ + "\n- ".join(commit_messages[:SAMPLE_COMMIT_MESSAGES]) + """

  Analysis:
  This document reflects the coding activity, work intensity, and contribution depth of the user.
  """

  commit_behavior_doc = make_doc(
      text=commit_behavior_text,
      node_type="commit_behavior",
      username=user.login,
      repo=repo.name
  )
  commit_docs.append(commit_behavior_doc)


In [None]:
# Flat corpus: the profile document first, then every repo summary,
# README document and commit-behaviour document, in that order.
all_docs=[user_profile_doc,*repo_docs,*readme_docs,*commit_docs]

In [None]:
# Sanity check on the corpus: print the type of the first entry and whether
# that entry is itself a list.
print(type(all_docs[0]))
print(isinstance(all_docs[0], list))

SUMMARY FOR THE USER ACCOUNT

In [None]:
# Number of repositories whose documents are fed to the summary prompt; each
# contributes three documents (summary, README, commit behaviour).
EVIDENCE_REPO_LIMIT=5

# Interleave the three per-repository lists (they are index-aligned) so the
# capped evidence always has the summary, README and commit document of each
# selected repo. Slicing a plain concatenation would yield only repo summaries
# for accounts with many repositories.
evidence_bloc=[
    doc
    for docs_for_repo in zip(repo_docs, readme_docs, commit_docs)
    for doc in docs_for_repo
]
evidence_text="\n\n--\n\n".join(
    doc.text for doc in evidence_bloc[:EVIDENCE_REPO_LIMIT*3]
)

In [None]:
# Closed vocabulary of domains interpolated into the evaluation prompt.
# Order is preserved because the list's repr is embedded verbatim.
allowed_domains=[
    "machine learning", "automation", "data science",
    "backend systems", "frontend", "devops", "mlops",
    "fintech", "cybersecurity", "embedded systems",
    "full stack app developer", "full stack web developer",
    "iot", "blockchain", "ar/vr", "robotics", "quantum computing",
]

In [None]:
# Evaluation prompt for the summary model. It interpolates two values:
#   - allowed_domains: the list of permitted domain labels (rendered as its repr)
#   - evidence_text:   the joined repository evidence documents
# and ends with a strict output format the model is asked to follow.
domain_prompt=f"""
  You are a technical evaluator.
  You are provided with repository level information that includes readme, commit history and overall type of repository.

  You're task is to evaluate domain coverage, what in that domain is covered, and how deep they have gone in that domain.
  Accordingly signal as Beginner/ Intermediate or Pro.
  You are to give evidence for this by calling which repo you are referring to and what in that repo made you signal what you did.
  This evidence is to be positive only, even if someone is a beginner, you will not mention the things that are absent, only what is present.
  Also mention the number of repos you analyzed in the beginning.

  Allowed domains={allowed_domains}

  You will also mention the programming languages used in order of usage.

  Lastly from all this information, you will give what market or sector the user is best fit to work in. What sort of companies can the user target
  based on their level of understanding of the domain. The level of the company has to be synonymous the skill level of the user. A beginner will probably not get into MAANG.

  This is the information given to you of the user:{evidence_text}

  Output format(strict):
  domain_name:level(Beginner/ Intermediate or Pro)
  proof for it
  what in that specific domain has been covered by the username.

  Frequently used programming languages:
  language:what is coded in that language

  Industry fit:
  What industry can they work in.
  What companies can they aim for.
"""

In [None]:
!pip install -q google-generativeai

In [None]:
from google.colab import userdata
# Read the Gemini API key from the Colab secret named 'GEMINI_KEY'.
# The value is a secret: do not print it.
gemini_key=userdata.get('GEMINI_KEY')

In [None]:
import google.generativeai as genai
# Register the API key with the google.generativeai client.
genai.configure(api_key=gemini_key)

In [None]:
# Model used for the one-off account summary; temperature is pinned to 0.
summary_generation_config=dict(temperature=0)
model=genai.GenerativeModel(
    model_name="gemini-2.5-flash",
    generation_config=summary_generation_config,
)

In [None]:
# Send the evaluation prompt to the model and keep the text of the reply
# in `ans` for the summary document built later.
response=model.generate_content(domain_prompt)
ans=response.text
print(ans)

In [None]:
# Wrap the generated summary text as a "user_summary" document.
user_summary_doc=make_doc(
    text=ans,
    username=user.login,
    node_type="user_summary"
)
# NOTE(review): the append below is disabled, so user_summary_doc is not added
# to all_docs in this cell — confirm whether the summary should be indexed.
# all_docs.append(user_summary_doc)
# Prints the type of whatever is currently the last entry of all_docs.
print(type(all_docs[-1]))

SETTING UP GEMINI FOR THE RAG

In [None]:
pip install -q llama-index-llms-gemini


In [None]:
from llama_index.llms.gemini import Gemini
from google.colab import userdata

# LlamaIndex LLM wrapper used to answer the final questions. It reuses the
# key already held in `gemini_key`.
gemini_llm=Gemini(
    api_key=gemini_key,
    model="gemini-2.5-flash",
    temperature=0.5,
)

In [None]:
from llama_index.core import VectorStoreIndex

# Local sentence-transformers model used to embed every document.
embed_model=HuggingFaceEmbedding(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Build the vector index over the whole corpus. The LLM is supplied later,
# when a query engine is created from this index.
index=VectorStoreIndex(all_docs, embed_model=embed_model)

SETTING UP VALHALLA FOR QUESTION-NODE CONNECTION

In [None]:
pip install -q transformers torch

In [None]:
# Candidate labels handed to the zero-shot classifier that routes a question.
ROUTE_LABELS = [
    "global_profile_question",
    "developer_overview",
    "domain_expertise",
    "repository_detail",
    "contribution_behavior"
]

# Route label -> values of the "type" metadata key to retrieve from.
# The values must be the node_type strings stored on the documents
# ("user_profile", "repo_summary", "repo_documentation", "commit_behavior"),
# not the names of the Python lists holding them. Every route label has an
# entry so routing never hits a missing key.
LABEL_TO_NODE_TYPES = {
  "global_profile_question": ["user_profile"],
  "developer_overview": ["user_profile"],
  # README documents carry each repo's intent, scope and tech stack.
  "domain_expertise": ["repo_documentation"],
  "repository_detail": ["repo_summary"],
  "contribution_behavior": ["commit_behavior"]
}


In [None]:
from transformers import pipeline

# Zero-shot classifier that assigns each question to one of the route labels.
router = pipeline("zero-shot-classification", model="valhalla/distilbart-mnli-12-1", device=-1)


In [None]:
# Try the classifier on a sample question and print its raw output.
result = router("How many repositories does this developer have?", candidate_labels=ROUTE_LABELS)

print(result)


In [None]:
def route_question(question: str, threshold=0.23):
    """Pick the document node types that should answer ``question``.

    The question is classified against ``ROUTE_LABELS`` and the first label
    in the classifier output is translated through ``LABEL_TO_NODE_TYPES``.

    Args:
        question: Natural-language question to route.
        threshold: Minimum score the first label must reach to be used.

    Returns:
        The list of node-type strings mapped to the chosen label, or ``None``
        when the score is below ``threshold`` or the label has no mapping.
        Callers treat ``None`` as "do not filter".
    """
    result = router(
        question,
        candidate_labels=ROUTE_LABELS,
        multi_label=False
    )

    label = result["labels"][0]
    score = result["scores"][0]

    if score < threshold:
        return None

    # .get() rather than indexing: a label without a mapping entry falls back
    # to unfiltered retrieval instead of raising KeyError.
    return LABEL_TO_NODE_TYPES.get(label)
# Try the router on a sample question and print the node types it selects.
print(route_question("Tell me about this user's repos"))

FINAL RAG

In [None]:
from llama_index.core.vector_stores.types import (
    FilterCondition,
    MetadataFilters,
    MetadataFilter
)
question = "Tell me about this user's repos"

node_types = route_question(question)

# Default to unfiltered retrieval. `filters` must always be bound because both
# the query engine and the retriever below read it, including when routing
# returns nothing.
filters = None
if node_types:
    # OR the per-type filters: a document has exactly one "type" value, so the
    # default AND could never match more than one requested type.
    filters = MetadataFilters(
        filters=[MetadataFilter(key="type", value=t) for t in node_types],
        condition=FilterCondition.OR,
    )

qe = index.as_query_engine(llm=gemini_llm, filters=filters)

# Stand-alone retriever with the same filters, used to inspect which
# documents back the answer.
retriever = index.as_retriever(
    filters=filters,
    similarity_top_k=5
)

nodes = retriever.retrieve(question)

# print("NUMBER OF DOCS:", len(nodes))
# for i, n in enumerate(nodes):
#     print(f"\n--- DOC {i} ---")
#     print("TYPE:", n.node.metadata)
#     print("TEXT LENGTH:", len(n.node.text))
#     print(n.node.text)

print(qe.query(question))
