In [None]:
import requests
import re
import chardet

In [None]:
def get_user_repositories(github_url):
    """Return the repositories of the user a GitHub profile URL points to.

    Args:
        github_url: Profile URL such as ``https://github.com/<username>``.
            Trailing slashes are tolerated.

    Returns:
        A list of ``{"name": ..., "url": ...}`` dicts built from the ``name``
        and ``html_url`` fields of each entry in the API response, or an
        empty list when the request does not return HTTP 200.
    """
    # Strip trailing slashes first so "https://github.com/user/" still yields
    # "user" instead of an empty username.
    username = github_url.rstrip("/").split("/")[-1]

    url = f"https://api.github.com/users/{username}/repos"
    headers = {"Accept": "application/vnd.github.v3+json"}
    # A timeout keeps a stalled connection from blocking forever.
    # NOTE: only a single request is made, so only the first page of
    # results returned by the API is read.
    response = requests.get(url, headers=headers, timeout=10)

    if response.status_code != 200:
        # Handle API request errors
        print("Error: Failed to fetch user repositories.")
        return []

    return [
        {"name": repo["name"], "url": repo["html_url"]}
        for repo in response.json()
    ]



In [None]:
def preprocess_code(repository):
    """Download and preprocess every supported file of a repository.

    Args:
        repository: Repository dict that is passed on to ``preprocess_files``.

    Returns:
        A list with one preprocessed result per file whose type is
        ``"jupyter_notebook"``, ``"package_file"`` or ``"regular_file"``.
        Files of any other type, and files whose content could not be
        downloaded (content is ``None``), are skipped.
    """
    # Map each supported file type to the function that preprocesses it.
    handlers = {
        "jupyter_notebook": preprocess_jupyter_notebook,
        "package_file": preprocess_package_file,
        "regular_file": preprocess_regular_file,
    }

    preprocessed_contents = []
    for file in preprocess_files(repository):
        content = file["content"]
        if content is None:
            # The download failed; the preprocessors cannot handle None.
            continue
        handler = handlers.get(file["type"])
        if handler is not None:
            preprocessed_contents.append(handler(content))

    return preprocessed_contents

def preprocess_files(repository):
    """Fetch the file listing of a repository and attach each file's content.

    Args:
        repository: Repository dict that is passed on to
            ``fetch_repository_files``.

    Returns:
        A list of ``{"name", "type", "content"}`` dicts, one per listed file,
        where ``content`` is whatever ``fetch_file_content`` returned for the
        file's ``download_url``.
    """

    def _with_content(entry):
        # Read the name, download the body, then assemble the record.
        name = entry["name"]
        body = fetch_file_content(entry["download_url"])
        return {"name": name, "type": entry["type"], "content": body}

    return [_with_content(entry) for entry in fetch_repository_files(repository)]


In [None]:

def _repository_owner(repository, default="suhasml"):
    """Return the owner segment of ``repository["url"]``, or *default*."""
    segments = [part for part in (repository.get("url") or "").split("/") if part]
    # A GitHub HTML URL splits into [scheme, host, owner, name]; only trust
    # the owner segment when the URL really ends with this repository's name.
    if len(segments) >= 4 and segments[-1] == repository["name"]:
        return segments[-2]
    return default


def fetch_repository_files(repository, owner=None):
    """List the files of a GitHub repository, descending into directories.

    Args:
        repository: Dict with a ``"name"`` key and, optionally, a ``"url"``
            key holding the repository's HTML URL
            (``https://github.com/<owner>/<name>``).
        owner: Account that owns the repository. When omitted, the owner is
            taken from ``repository["url"]`` if it has the expected shape,
            and falls back to ``"suhasml"`` otherwise.

    Returns:
        The list of file dicts collected by ``fetch_files_recursive``, or an
        empty list when the contents request does not return HTTP 200.
    """
    if owner is None:
        owner = _repository_owner(repository)

    url = f"https://api.github.com/repos/{owner}/{repository['name']}/contents"
    headers = {"Accept": "application/vnd.github.v3+json"}
    response = requests.get(url, headers=headers, timeout=10)

    if response.status_code != 200:
        print(f"Error: Failed to fetch files in repository {repository['name']}.")
        return []

    files = []
    fetch_files_recursive(response.json(), files)
    return files

def fetch_files_recursive(data, files):
    """Append every relevant file found in a contents listing to *files*.

    Args:
        data: Iterable of listing entries; each entry has a ``"type"`` and a
            ``"name"``, file entries also a ``"download_url"`` and directory
            entries a ``"url"`` for their own listing.
        files: List that is extended in place with
            ``{"name", "type", "download_url"}`` dicts.

    Entries that are neither ``"file"`` nor ``"dir"`` are ignored, as are
    files whose extension is in the skip list below. Directories are listed
    with a further API request and processed recursively; a failed request is
    reported and that directory is skipped.
    """
    skipped_extensions = (
        "jpg", "jpeg", "png", "gif", "ico", "h5", "pkl", "gitignore", "json", "node",
    )

    for entry in data:
        kind = entry["type"]

        if kind == "file":
            name = entry["name"]
            suffix = name.split(".")[-1].lower()
            if suffix in skipped_extensions:
                continue
            files.append(
                {
                    "name": name,
                    "type": determine_file_type(name),
                    "download_url": entry["download_url"],
                }
            )
            continue

        if kind != "dir":
            continue

        listing = requests.get(
            entry["url"],
            headers={"Accept": "application/vnd.github.v3+json"},
            timeout=10,
        )
        if listing.status_code != 200:
            print(f"Error: Failed to fetch files in directory {entry['name']}.")
            continue
        fetch_files_recursive(listing.json(), files)



In [None]:
def determine_file_type(file_name):
    """Classify a file by the (case-sensitive) suffix of its name.

    Returns ``"jupyter_notebook"`` for ``.ipynb``, ``"package_file"`` for
    ``.py``, ``"binary_file"`` for ``.h5`` / ``.pkl`` and ``"regular_file"``
    for everything else.
    """
    # Checked in order; the first matching suffix group wins.
    suffix_kinds = (
        ((".ipynb",), "jupyter_notebook"),
        ((".py",), "package_file"),
        ((".h5", ".pkl"), "binary_file"),
    )
    for suffixes, kind in suffix_kinds:
        if file_name.endswith(suffixes):
            return kind
    return "regular_file"

def fetch_file_content(download_url):
    """Download a file and return its raw bytes.

    Args:
        download_url: URL the file content is fetched from.

    Returns:
        The response body as ``bytes`` on HTTP 200, otherwise ``None`` after
        printing an error message.
    """
    # Use the same 10-second timeout as the other requests in this file so a
    # stalled download cannot hang the whole run.
    response = requests.get(download_url, timeout=10)

    if response.status_code != 200:
        print(f"Error: Failed to fetch file content from {download_url}.")
        return None

    return response.content


In [None]:

def preprocess_jupyter_notebook(content):
    """Extract the preprocessed source of every code cell in a notebook.

    Args:
        content: Serialized notebook JSON.

    Returns:
        A list with the result of ``preprocess_code_cell`` for each cell
        whose ``cell_type`` is ``"code"``, in notebook order.
    """
    # Imported here because this function is the only user of nbformat and
    # the top-of-file imports do not include it.
    import nbformat

    # Convert to format version 4 so the notebook always exposes ``.cells``;
    # without conversion an older-format notebook has no such attribute.
    notebook = nbformat.reads(content, as_version=4)

    return [
        preprocess_code_cell(cell)
        for cell in notebook.cells
        if cell.cell_type == "code"
    ]

def preprocess_package_file(content, max_tokens=1000):
    """Truncate file content to at most *max_tokens* whitespace-separated tokens.

    Args:
        content: File content as ``str`` or ``bytes``.
        max_tokens: Maximum number of tokens to keep (default 1000).

    Returns:
        *content* unchanged when it has at most *max_tokens* tokens;
        otherwise the first *max_tokens* tokens joined by single spaces, in
        the same type (text or bytes) as the input.
    """
    tokens = content.split()
    if len(tokens) <= max_tokens:
        return content

    # Join with a separator of the matching type: a str separator cannot
    # join the bytes tokens produced by splitting downloaded raw content.
    separator = b" " if isinstance(content, (bytes, bytearray)) else " "
    return separator.join(tokens[:max_tokens])

def preprocess_regular_file(content, max_tokens=500):
    """Decode raw file bytes to text and cap it at *max_tokens* tokens.

    Args:
        content: Raw file content as ``bytes``.
        max_tokens: Maximum number of whitespace-separated tokens to keep
            (default 500).

    Returns:
        The decoded text, truncated to the first *max_tokens* tokens joined
        by single spaces when it is longer than that. Undecodable bytes are
        dropped rather than raising.
    """
    # chardet reports None when it cannot guess an encoding.
    encoding = chardet.detect(content)["encoding"] or "utf-8"

    try:
        decoded_content = content.decode(encoding, errors="ignore")
    except (LookupError, UnicodeDecodeError):
        # The detected encoding name may have no matching Python codec;
        # fall back to a lossy UTF-8 decode instead of returning nothing.
        decoded_content = content.decode("utf-8", errors="ignore")

    tokens = decoded_content.split()
    if len(tokens) > max_tokens:
        decoded_content = " ".join(tokens[:max_tokens])

    return decoded_content

def preprocess_code_cell(cell):
    """Cap a notebook code cell's source at 200 whitespace-separated tokens.

    When the source is longer, ``cell["source"]`` is overwritten in place
    with the first 200 tokens joined by single spaces. The (possibly
    updated) source is returned.
    """
    token_limit = 200
    words = cell["source"].split()

    if len(words) > token_limit:
        cell["source"] = " ".join(words[:token_limit])

    return cell["source"]