In [3]:
import requests
import os
import re
import pandas as pd
from urllib.parse import urlparse, quote
from urllib.parse import urlparse

def parse_markdown_to_csv(md_content, csv_file_path):
    heading_pattern = re.compile(r'^(#+)\s*(.*)', re.MULTILINE)
    headings_contents = []
    current_heading = None
    current_content = []
    
    for line in md_content.split('\n'):
        match = heading_pattern.match(line)
        if match:
            if current_heading is not None:
                headings_contents.append([current_heading, ' '.join(current_content).strip()])
            current_heading = match.group(2).strip()
            current_content = []
        else:
            if line.strip():
                current_content.append(line.strip())
    
    if current_heading is not None:
        headings_contents.append([current_heading, ' '.join(current_content).strip()])
    
    df = pd.DataFrame(headings_contents, columns=['Title', 'Content'])
    df.to_csv(csv_file_path, index=False, encoding='utf-8')

def fetch_and_convert_readme_to_csv(repo_urls, output_dir):
    # Ensure the output directory exists
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    
    # GitHub API endpoint for fetching the contents of the README file
    for url in repo_urls:
        parsed_url = urlparse(url)
        parts = parsed_url.path.strip('/').split('/')
        repo_user, repo_name = parts[0], parts[1]
        api_url = f"https://api.github.com/repos/{repo_user}/{repo_name}/readme"
        
        # Set up appropriate headers for GitHub API including the token for authorization
        headers = {
            'Accept': 'application/vnd.github.v3.raw',
            'Authorization': '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'  # Replace 'YOUR_GITHUB_TOKEN' with your actual GitHub token
        }
        
        response = requests.get(api_url, headers=headers)
        if response.status_code == 200:
            readme_content = response.text
            csv_file_path = os.path.join(output_dir, f"{repo_name}.csv")
            parse_markdown_to_csv(readme_content, csv_file_path)
            print(f"Processed {repo_name}.csv")
        else:
            print(f"Failed to fetch README for {repo_name}: {response.status_code}")

# Example usage:
repo_urls = [
    'https://github.com/context-labs/autodoc'
]

fetch_and_convert_readme_to_csv(repo_urls, 'output_csv_files')


Processed autodoc.csv


In [1]:
import requests
import os
import pandas as pd
import base64
from urllib.parse import urlparse

def fetch_and_concatenate_source_code(repo_urls, output_dir, token):
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    
    headers = {
        'Authorization': f'token {token}',
        'Accept': 'application/vnd.github.v3.raw'  # Requests raw content directly
    }

    for url in repo_urls:
        parsed_url = urlparse(url)
        parts = parsed_url.path.strip('/').split('/')
        repo_user, repo_name = parts[0], parts[1]

        # Fetch the default branch
        repo_info_url = f'https://api.github.com/repos/{repo_user}/{repo_name}'
        repo_info_response = requests.get(repo_info_url, headers=headers)
        if repo_info_response.status_code == 200:
            default_branch = repo_info_response.json()['default_branch']
        else:
            print(f'Failed to fetch repo info for {repo_name}: {repo_info_response.status_code}')
            continue

        api_url = f'https://api.github.com/repos/{repo_user}/{repo_name}/git/trees/{default_branch}?recursive=true'
        response = requests.get(api_url, headers={'Authorization': f'token {token}', 'Accept': 'application/vnd.github.v3+json'})
        
        if response.status_code == 200:
            data = response.json()
            all_files_content = []

            for file in data['tree']:
                if file['type'] == 'blob' and file['path'].endswith(('.py', '.c', '.cpp', '.java', '.js', '.ts', '.go')):
                    file_url = f"https://api.github.com/repos/{repo_user}/{repo_name}/contents/{file['path']}?ref={default_branch}"
                    file_response = requests.get(file_url, headers=headers)
                    if file_response.status_code == 200:
                        file_content = file_response.text
                        all_files_content.append(file_content)

            concatenated_content = "\n".join(all_files_content)
            df = pd.DataFrame([concatenated_content], columns=['SourceCode'])
            df.to_csv(os.path.join(output_dir, f'{repo_name}_context.csv'), index=False)
            print(f'Saved {repo_name}_context.csv')
        else:
            print(f'Failed to fetch repository data for {repo_name}: {response.status_code}')

# Example usage:
repo_urls = [
    "https://github.com/context-labs/autodoc"
]
output_directory = 'output_csv_files'
github_token = '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'  # Replace with your GitHub access token

fetch_and_concatenate_source_code(repo_urls, output_directory, github_token)


Saved autodoc_context.csv


In [1]:
import requests
import os
import re
import pandas as pd
from urllib.parse import urlparse

# Fetch a list of 200 best GitHub repositories written in Python
def fetch_python_repo_urls():
    repo_urls = []
    headers = {
        'Authorization': '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',  # Replace with your GitHub token
        'Accept': 'application/vnd.github.v3+json'
    }
    query = 'language:python'
    per_page = 100
    total_repos = 300

    for page in range(1, (total_repos // per_page) + 2):
        search_url = f'https://api.github.com/search/repositories?q={query}&sort=stars&order=desc&per_page={per_page}&page={page}'
        response = requests.get(search_url, headers=headers)
        if response.status_code == 200:
            repo_data = response.json()['items']
            repo_urls.extend([repo['html_url'] for repo in repo_data])
        else:
            print(f"Failed to fetch repositories: {response.status_code}")
            break

    return repo_urls

# Parse the README.md content into a CSV
def parse_markdown_to_csv(md_content, csv_file_path):
    heading_pattern = re.compile(r'^(#+)\s*(.*)', re.MULTILINE)
    headings_contents = []
    current_heading = None
    current_content = []

    for line in md_content.split('\n'):
        match = heading_pattern.match(line)
        if match:
            if current_heading is not None:
                headings_contents.append([current_heading, ' '.join(current_content).strip()])
            current_heading = match.group(2).strip()
            current_content = []
        else:
            if line.strip():
                current_content.append(line.strip())

    if current_heading is not None:
        headings_contents.append([current_heading, ' '.join(current_content).strip()])

    df = pd.DataFrame(headings_contents, columns=['Title', 'Content'])
    df.to_csv(csv_file_path, index=False, encoding='utf-8')

# Fetch and convert README files to CSV
def fetch_and_convert_readme_to_csv(repo_urls, output_dir, github_token):
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    headers = {
        'Accept': 'application/vnd.github.v3.raw',
        'Authorization': f'token {github_token}'
    }

    for url in repo_urls:
        parsed_url = urlparse(url)
        parts = parsed_url.path.strip('/').split('/')
        repo_user, repo_name = parts[0], parts[1]
        api_url = f'https://api.github.com/repos/{repo_user}/{repo_name}/readme'

        response = requests.get(api_url, headers=headers)
        if response.status_code == 200:
            readme_content = response.text
            csv_file_path = os.path.join(output_dir, f"{repo_name}.csv")
            parse_markdown_to_csv(readme_content, csv_file_path)
            print(f"Processed {repo_name}.csv")
        else:
            print(f"Failed to fetch README for {repo_name}: {response.status_code}")

# Usage
github_token = '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'  # Replace with your actual GitHub Personal Access Token
repo_urls = fetch_python_repo_urls()
output_directory = 'output_csv_files'
fetch_and_convert_readme_to_csv(repo_urls, output_directory, github_token)


Processed system-design-primer.csv
Processed awesome-python.csv
Processed Python.csv
Processed stable-diffusion-webui.csv
Processed youtube-dl.csv
Processed transformers.csv
Processed HelloGitHub.csv
Processed langchain.csv
Processed thefuck.csv
Processed pytorch.csv
Processed django.csv
Processed models.csv
Processed yt-dlp.csv
Processed core.csv
Processed flask.csv
Processed funNLP.csv
Processed ansible.csv
Processed keras.csv
Processed whisper.csv
Processed scikit-learn.csv
Processed d2l-zh.csv
Processed llama.csv
Processed private-gpt.csv
Processed requests.csv
Processed gpt-engineer.csv
Processed screenshot-to-code.csv
Processed faceswap.csv
Processed you-get.csv
Processed openpilot.csv
Processed rich.csv
Processed big-list-of-naughty-strings.csv
Processed pandas.csv
Processed CppCoreGuidelines.csv
Processed MetaGPT.csv
Processed python-patterns.csv
Processed PaddleOCR.csv
Processed ailearning.csv
Processed black.csv
Processed Deep-Learning-Papers-Reading-Roadmap.csv
Processed sen

In [8]:
import requests
import os
import pandas as pd
import base64
import time
from urllib.parse import urlparse

# Fetch 200 best Python GitHub repo URLs
def fetch_python_repo_urls(token, total_repos=300):
    repo_urls = []
    headers = {
        'Authorization': f'token {token}',
        'Accept': 'application/vnd.github.v3+json'
    }
    query = 'language:python'
    per_page = 100

    for page in range(1, (total_repos // per_page) + 2):
        search_url = f'https://api.github.com/search/repositories?q={query}&sort=stars&order=desc&per_page={per_page}&page={page}'
        response = requests.get(search_url, headers=headers)
        if response.status_code == 200:
            repo_data = response.json()['items']
            repo_urls.extend([repo['html_url'] for repo in repo_data])
        else:
            print(f"Failed to fetch repositories: {response.status_code}, {response.text}")
            break

    return repo_urls

# Function to check API rate limit
def check_rate_limit(headers):
    rate_limit_url = "https://api.github.com/rate_limit"
    response = requests.get(rate_limit_url, headers=headers)
    if response.status_code == 200:
        rate_limit_data = response.json()
        remaining = rate_limit_data["resources"]["core"]["remaining"]
        print(f"Remaining API requests: {remaining}")
        return remaining
    else:
        print(f"Failed to check rate limit: {response.status_code}, {response.text}")
        return 0

# Retry logic wrapper
def retry_request(url, headers, retries=3, delay=2):
    for attempt in range(retries):
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response
        print(f"Attempt {attempt + 1} failed: {response.status_code}, {response.text}. Retrying in {delay} seconds...")
        time.sleep(delay)
    return response

# Function to fetch and concatenate source code, and save to CSV
def fetch_and_concatenate_source_code(repo_urls, output_dir, token):
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    headers = {
        'Authorization': f'token {token}',
        'Accept': 'application/vnd.github.v3.raw'  # Requests raw content directly
    }

    if check_rate_limit(headers) == 0:
        print("API rate limit exceeded. Try again later.")
        return

    for url in repo_urls:
        parsed_url = urlparse(url)
        parts = parsed_url.path.strip('/').split('/')
        repo_user, repo_name = parts[0], parts[1]

        # Fetch the default branch
        repo_info_url = f'https://api.github.com/repos/{repo_user}/{repo_name}'
        repo_info_response = retry_request(repo_info_url, headers)
        if repo_info_response.status_code == 200:
            default_branch = repo_info_response.json().get('default_branch', 'main')
        else:
            print(f'Failed to fetch repo info for {repo_name}: {repo_info_response.status_code}, {repo_info_response.text}')
            continue

        api_url = f'https://api.github.com/repos/{repo_user}/{repo_name}/git/trees/{default_branch}?recursive=true'
        response = retry_request(api_url, headers={'Authorization': f'token {token}', 'Accept': 'application/vnd.github.v3+json'})

        if response.status_code == 200:
            try:
                data = response.json()
                all_files_content = []

                for file in data['tree']:
                    if file['type'] == 'blob' and file['path'].endswith(('.py', '.c', '.cpp', '.java', '.js', '.ts', '.go')):
                        file_url = f"https://api.github.com/repos/{repo_user}/{repo_name}/contents/{file['path']}?ref={default_branch}"
                        file_response = retry_request(file_url, headers)
                        if file_response.status_code == 200:
                            file_data = file_response.json()
                            if 'content' in file_data:
                                try:
                                    file_content = base64.b64decode(file_data['content'])
                                    try:
                                        file_content = file_content.decode('utf-8')
                                        if file_content:
                                            all_files_content.append(file_content)
                                        else:
                                            print(f"File {file['path']} from {repo_name} is empty. Skipping...")
                                    except UnicodeDecodeError:
                                        print(f"Failed to decode file {file['path']} from {repo_name} as UTF-8. Skipping...")
                                except (ValueError, UnicodeDecodeError) as e:
                                    print(f"Failed to decode file {file['path']} from {repo_name}: {e}. Skipping...")
                            else:
                                print(f"File {file['path']} from {repo_name} does not have content. Skipping...")
                        else:
                            print(f"Failed to fetch file {file['path']} from {repo_name}: {file_response.status_code}, {file_response.text}")

                if all_files_content:
                    concatenated_content = "\n".join(all_files_content)
                    df = pd.DataFrame([concatenated_content], columns=['SourceCode'])
                    df.to_csv(os.path.join(output_dir, f'{repo_name}_context.csv'), index=False)
                    print(f'Saved {repo_name}_context.csv')
                else:
                    print(f"No files could be processed for {repo_name}. Skipping...")
            except Exception as e:
                print(f"Error processing {repo_name}: {e}")
        else:
            print(f'Failed to fetch repository data for {repo_name}: {response.status_code}, {response.text}')

# Fetch 200 best Python repositories
github_token = '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'  # Replace with your GitHub Personal Access Token
repo_urls = fetch_python_repo_urls(github_token, total_repos=300)

# Process the repositories and save source code to CSV
output_directory = 'context_csv_files'
fetch_and_concatenate_source_code(repo_urls, output_directory, github_token)

Remaining API requests: 4319
Error processing public-apis: Expecting value: line 1 column 1 (char 0)
Error processing system-design-primer: Expecting value: line 1 column 1 (char 0)
Error processing awesome-python: Expecting value: line 1 column 1 (char 0)
Error processing Python: Expecting value: line 1 column 1 (char 0)
Error processing Python-100-Days: Extra data: line 1 column 3 (char 2)
Error processing stable-diffusion-webui: Expecting value: line 1 column 1 (char 0)
Error processing youtube-dl: Expecting value: line 1 column 1 (char 0)
Error processing transformers: Expecting value: line 1 column 1 (char 0)
Error processing HelloGitHub: Expecting value: line 1 column 1 (char 0)
Error processing langchain: Expecting value: line 1 column 1 (char 0)
Error processing thefuck: Expecting value: line 1 column 1 (char 0)
Error processing pytorch: Expecting value: line 1 column 1 (char 0)
Error processing django: Expecting value: line 1 column 1 (char 0)
Error processing models: Expectin

KeyboardInterrupt: 

In [11]:
import pickle
import os


# Save the list to a pickle file
with open('repo_urls.pickle', 'wb') as file:
    pickle.dump(repo_urls, file)
print(repo_urls)


['https://github.com/public-apis/public-apis', 'https://github.com/donnemartin/system-design-primer', 'https://github.com/vinta/awesome-python', 'https://github.com/TheAlgorithms/Python', 'https://github.com/jackfrued/Python-100-Days', 'https://github.com/AUTOMATIC1111/stable-diffusion-webui', 'https://github.com/ytdl-org/youtube-dl', 'https://github.com/huggingface/transformers', 'https://github.com/521xueweihan/HelloGitHub', 'https://github.com/langchain-ai/langchain', 'https://github.com/nvbn/thefuck', 'https://github.com/pytorch/pytorch', 'https://github.com/django/django', 'https://github.com/tensorflow/models', 'https://github.com/yt-dlp/yt-dlp', 'https://github.com/tiangolo/fastapi', 'https://github.com/home-assistant/core', 'https://github.com/pallets/flask', 'https://github.com/fighting41love/funNLP', 'https://github.com/bregman-arie/devops-exercises', 'https://github.com/josephmisiti/awesome-machine-learning', 'https://github.com/ansible/ansible', 'https://github.com/keras-te