In [85]:
import requests
import pymongo
import json
import os
import base64

from typing import TypedDict
from google import genai
from google.genai import types
from dotenv import load_dotenv
from pymongo.operations import SearchIndexModel

load_dotenv()

True

In [75]:
google_client = genai.Client(api_key=os.getenv("GEMINI"))

In [76]:
uri = os.getenv("MONGODB_URI")
mongo_client = pymongo.MongoClient(uri, server_api=pymongo.server_api.ServerApi(
   version="1", strict=True, deprecation_errors=True))

Prakharbase = mongo_client["Prakharbase"]
vector_database = Prakharbase["vector_database"]

In [77]:
GITHUB_USERNAME = "prakhargaming"
GITHUB_TOKEN = os.getenv("REPO")

headers = {
    "Authorization": f"token {GITHUB_TOKEN}",
    "Accept": "application/vnd.github.v3+json"
}

In [78]:
class __repo_old:
    def __init__(self, name, url, languages, topics, readme):
        self.name = name
        self.url = url
        self.languages = languages
        self.topics = topics
        self.readme = readme

class repo(TypedDict):
    name: str
    url: str
    languages: dict[str, int]
    topics: list[str]
    readme: str
    embedding: list[float]

In [79]:
GITHUB_USERNAME = "prakhargaming"
GITHUB_TOKEN = os.getenv("REPO")

headers = {
    "Authorization": f"token {GITHUB_TOKEN}",
    "Accept": "application/vnd.github.v3+json"
}

In [80]:
def generate_desc(name="", url="", languages="", tags="", readme=""):
    return f"""
# METADATA
Repository name: {name}
Repository URL: {url}
Repository languages: {languages}
Repository topics: {tags}

# README:
{readme}"""

In [81]:
def auto_tag(readme_text, languages) -> list[str]:
    tags = []

    # Keywords for different skill areas
    computer_vision_keywords = ["opencv", "cnn", "image", "vision", "detection", "segmentation", "recognition"]
    nlp_keywords = ["bert", "transformer", "token", "nlp", "text classification", "language model"]
    web_dev_keywords = ["react", "flask", "django", "express", "api", "frontend", "backend", "web app"]
    data_science_keywords = ["pandas", "numpy", "dataframe", "analysis", "plot", "visualization"]
    ai_keywords = ["deep learning", "machine learning", "reinforcement learning", "model", "training"]

    text = readme_text.lower()

    # Helper function
    def contains_any(keywords):
        return any(keyword in text for keyword in keywords)

    # Tagging based on content
    if contains_any(computer_vision_keywords) or 'OpenCV' in languages:
        tags.append("computer-vision")
    if contains_any(nlp_keywords):
        tags.append("nlp")
    if contains_any(web_dev_keywords):
        tags.append("web-development")
    if contains_any(data_science_keywords):
        tags.append("data-science")
    if contains_any(ai_keywords):
        tags.append("artificial-intelligence")

    return tags

In [82]:
def fetch_public_repo_information(username: str, generate_embeddings=False, generate_files=False) -> dict[str, repo]:
    repo_url = f"https://api.github.com/users/{username}/repos"
    request_repo = requests.get(repo_url, headers=headers)
    if request_repo.status_code != 200:
        print(f"Request Failed (request_repo): {request_repo.status_code} \n {repo_url}")
        return request_repo.status_code
    data = request_repo.json()
    repo_info = []
    if generate_files:
        directory = "github_repos_info"
        os.makedirs(directory, exist_ok=True)
    for repos in data:
        repo_name = repos["name"]
        repo_url = repos["url"]
        language_url = f"https://api.github.com/repos/{username}/{repo_name}/languages"
        readme_url = f"https://api.github.com/repos/{username}/{repo_name}/readme"

        request_languages = requests.get(language_url, headers=headers)
        if request_languages.status_code == 200:     
            repo_languages = request_languages.json()
        else:
            print(f"Request Failed (request_languages): {request_languages.status_code} \n {language_url}")
            repo_languages = {}

        request_readme = requests.get(readme_url, headers=headers)
        if request_readme.status_code == 200:
            readme_content = request_readme.json()
            repo_readme = base64.b64decode(readme_content["content"]).decode('utf-8')
        else:
            print(f"Request Failed (request_readme): {request_readme.status_code} \n {readme_url}")
            repo_readme = ""
        
        repo_tags = auto_tag(repo_readme, repo_languages)
        
        if generate_embeddings:
            to_embed = generate_desc(repo_name, repo_url, repo_languages, repo_tags, repo_readme)
            result = google_client.models.embed_content(
                model="text-embedding-004",
                contents=to_embed,
                config=types.EmbedContentConfig(task_type="SEMANTIC_SIMILARITY")
            )
            repo_embedding = result.embeddings[0].values
        else:
            repo_embedding = []

        if generate_files:
            file_path = f"github_repos_info\\REPO_INFO_{repo_name}.txt"
            file_contents = generate_desc(repo_name, repo_url, repo_languages, repo_tags, repo_readme)
            try:
                with open(file_path, "w") as file:
                    file.write(file_contents)
                print(f"File '{file_path}' created successfully.")
            except Exception as e:
                print(f"An error occurred: {e}")

        repo_info.append(
            repo(
                name=repo_name,
                url=repo_url,
                languages=repo_languages,
                topics=auto_tag(repo_readme, repo_languages),
                readme=repo_readme,
                embedding=repo_embedding
            )
        )

    return repo_info

In [83]:
repos = fetch_public_repo_information(GITHUB_USERNAME, generate_embeddings=True, generate_files=True)

Request Failed (request_readme): 404 
 https://api.github.com/repos/prakhargaming/amazonInterview/readme
File 'github_repos_info\REPO_INFO_amazonInterview.txt' created successfully.
File 'github_repos_info\REPO_INFO_Data-Visualization-Web-Dev-Project.txt' created successfully.
File 'github_repos_info\REPO_INFO_FastSAM-needle-biopsy.txt' created successfully.
An error occurred: 'charmap' codec can't encode character '\u0259' in position 1143: character maps to <undefined>
File 'github_repos_info\REPO_INFO_flask-react-template.txt' created successfully.
File 'github_repos_info\REPO_INFO_GenAI_Catagorization_Engine.txt' created successfully.
Request Failed (request_readme): 404 
 https://api.github.com/repos/prakhargaming/Lab-thingy/readme
File 'github_repos_info\REPO_INFO_Lab-thingy.txt' created successfully.
Request Failed (request_readme): 404 
 https://api.github.com/repos/prakhargaming/musicdiscordplaylistbot/readme
File 'github_repos_info\REPO_INFO_musicdiscordplaylistbot.txt' creat

In [84]:
vector_database.insert_many(repos)

InsertManyResult([ObjectId('681119164e98ea887be6b50d'), ObjectId('681119164e98ea887be6b50e'), ObjectId('681119164e98ea887be6b50f'), ObjectId('681119164e98ea887be6b510'), ObjectId('681119164e98ea887be6b511'), ObjectId('681119164e98ea887be6b512'), ObjectId('681119164e98ea887be6b513'), ObjectId('681119164e98ea887be6b514'), ObjectId('681119164e98ea887be6b515'), ObjectId('681119164e98ea887be6b516'), ObjectId('681119164e98ea887be6b517'), ObjectId('681119164e98ea887be6b518'), ObjectId('681119164e98ea887be6b519'), ObjectId('681119164e98ea887be6b51a'), ObjectId('681119164e98ea887be6b51b'), ObjectId('681119164e98ea887be6b51c'), ObjectId('681119164e98ea887be6b51d'), ObjectId('681119164e98ea887be6b51e'), ObjectId('681119164e98ea887be6b51f'), ObjectId('681119164e98ea887be6b520'), ObjectId('681119164e98ea887be6b521'), ObjectId('681119164e98ea887be6b522'), ObjectId('681119164e98ea887be6b523')], acknowledged=True)

In [None]:
search_index_model = SearchIndexModel(
    definition={
        "fields": [
            {
                "type": "vector",
                "path": "embedding",
                "similarity": "cosine",  # or "cosine", depending on your use case
                "numDimensions": 768         # this must match your embedding model output size
            }
        ]
    },
    name="vector_index",
    type="vectorSearch"
)

vector_database.create_search_index(model=search_index_model)