In [22]:
!pip install pandas requests dotenv

Collecting dotenv
  Using cached dotenv-0.9.9-py2.py3-none-any.whl.metadata (279 bytes)
Collecting python-dotenv (from dotenv)
  Downloading python_dotenv-1.2.1-py3-none-any.whl.metadata (25 kB)
Using cached dotenv-0.9.9-py2.py3-none-any.whl (1.9 kB)
Downloading python_dotenv-1.2.1-py3-none-any.whl (21 kB)
Installing collected packages: python-dotenv, dotenv
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2/2[0m [dotenv]
[1A[2KSuccessfully installed dotenv-0.9.9 python-dotenv-1.2.1

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1.1[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [24]:
import os
from dotenv import load_dotenv

# Load variables from a .env file into the process environment, then read the
# GitHub token that authenticates every API request made by this notebook.
load_dotenv()
GITHUB_TOKEN = os.environ.get('GITHUB_TOKEN')

# Fail fast when the token is missing or empty.
if not GITHUB_TOKEN:
    raise RuntimeError(
        "GITHUB_TOKEN não encontrado no .env. Adicione uma linha: GITHUB_TOKEN=seu_token"
    )

# Authorization header shared by all requests to the GitHub API.
HEADERS = {'Authorization': 'token ' + GITHUB_TOKEN}

# Gerar CSV para usar na análise dos repositórios

In [None]:
import requests
import csv
import time

# Search GitHub code for files that import k6 (prints debug output per request).
def buscar_repositorios(paginas=5, max_tentativas=5):
    """Search GitHub code for k6 imports and group matching files by repository.

    Requests the ``/search/code`` endpoint page by page (100 results per page)
    using the module-level ``HEADERS`` for authentication.

    Args:
        paginas: Maximum number of result pages to request.
        max_tentativas: Maximum number of consecutive attempts for one page
            that may end in HTTP 403 before the search is abandoned.

    Returns:
        dict mapping each repository ``full_name`` to
        ``{'url': <html_url>, 'arquivos': [<file path>, ...]}``. Whatever was
        collected before an early stop is still returned.
    """
    repositorios = {}
    url = 'https://api.github.com/search/code'
    query = 'import "k6" OR import \'k6\' language:JavaScript OR language:TypeScript'

    # A while loop is required here: decrementing the loop variable of a
    # `for ... in range(...)` has no effect, so a rate-limited page would be
    # skipped instead of retried.
    page = 1
    tentativas = 0  # consecutive 403 responses received for the current page
    while page <= paginas:
        params = {'q': query, 'per_page': 100, 'page': page}
        print(f'Buscando página {page} com query: {params["q"]}')  # DEBUG
        response = requests.get(url, headers=HEADERS, params=params)
        print(f'Status code: {response.status_code}')  # DEBUG

        if response.status_code == 403:
            tentativas += 1
            if tentativas >= max_tentativas:
                # Give up instead of looping forever on a persistent 403.
                print(f'Limite de taxa persistiu após {max_tentativas} tentativas na página {page}. Encerrando busca.')
                break
            print('Limite de taxa atingido. Aguardando 60 segundos.')
            time.sleep(60)
            continue  # request the same page again
        tentativas = 0

        if response.status_code == 200:
            data = response.json()
            if not data.get('items'):
                print(f'Nenhum resultado encontrado na página {page}. Encerrando busca.')
                break
            for item in data['items']:
                repo_name = item['repository']['full_name']
                repo_url = item['repository']['html_url']
                entry = repositorios.setdefault(repo_name, {'url': repo_url, 'arquivos': []})
                entry['arquivos'].append(item['path'])
            page += 1
        elif response.status_code == 422:
            print(f'Erro 422 (Unprocessable Entity) na página {page}. Isso geralmente indica que o limite de 1000 resultados da API de busca do GitHub foi atingido ou a consulta é inválida. Encerrando busca.')
            break  # Stop if we hit this error
        else:
            print(f'Erro inesperado na requisição na página {page}: {response.status_code}. Encerrando busca.')
            break

    return repositorios

# Persist the search results as a CSV file.
def salvar_csv(repositorios, arquivo_saida='repositorios_k6.csv'):
    """Write one CSV row per repository and print the output file name.

    Args:
        repositorios: dict mapping a repository name to a dict with the keys
            ``'url'`` and ``'arquivos'`` (a list of file paths).
        arquivo_saida: Path of the CSV file to create or overwrite.

    The file has the columns ``repositório``, ``url`` and ``arquivos``; the
    file paths of a repository are joined with ``'; '`` into a single cell.
    """
    colunas = ['repositório', 'url', 'arquivos']
    linhas = (
        {'repositório': nome, 'url': dados['url'], 'arquivos': '; '.join(dados['arquivos'])}
        for nome, dados in repositorios.items()
    )
    with open(arquivo_saida, mode='w', newline='', encoding='utf-8') as destino:
        escritor = csv.DictWriter(destino, fieldnames=colunas)
        escritor.writeheader()
        escritor.writerows(linhas)
    print(f'Resultados salvos em {arquivo_saida}')


# Run the search over the first five result pages, report how many distinct
# repositories were found, and write them to the default CSV file.
repositorios = buscar_repositorios(5)
print('Total de repositórios encontrados: {}'.format(len(repositorios)))
salvar_csv(repositorios)

# Gerar o CSV pra uso no TCC

In [7]:

import pandas as pd
import base64
import requests
import csv
import time

## Funções auxiliares

In [13]:
def make_github_api_request(url, params=None, max_retries=5, timeout=30):
    """GET a GitHub API URL and return the decoded JSON body, or None on failure.

    Uses the module-level ``HEADERS`` for authentication.

    Args:
        url: Full URL of the API resource.
        params: Optional query-string parameters passed to ``requests.get``.
        max_retries: Maximum number of attempts before giving up.
        timeout: Seconds to wait for the server before the attempt is treated
            as a network error; without it a stalled connection would block
            the notebook indefinitely.

    Returns:
        The parsed JSON response on HTTP 200. ``None`` on 404, on any other
        unexpected status code, or once all attempts are exhausted.

    Retry policy: 403 and 429 (both used by GitHub to signal rate limiting)
    wait 60 seconds and try again; ``requests`` exceptions wait 5 seconds and
    try again.
    """
    for retry_num in range(max_retries):
        retries_left = max_retries - 1 - retry_num
        try:
            response = requests.get(url, headers=HEADERS, params=params, timeout=timeout)
            if response.status_code == 200:
                return response.json()
            elif response.status_code in (403, 429):
                print(f"Rate limit exceeded. Waiting 60 seconds. Retries left: {retries_left}")
                time.sleep(60)
            elif response.status_code == 404:
                print(f"Resource not found at {url}. Skipping.")
                return None
            else:
                print(f"Error making request to {url}: Status code {response.status_code}, Response: {response.text}")
                return None
        except requests.exceptions.RequestException as e:
            print(f"Request failed: {e}. Retries left: {retries_left}")
            time.sleep(5)  # Shorter wait for network errors
    print(f"Failed to make request to {url} after {max_retries} retries.")
    return None

In [14]:
def get_repo_details(repo_full_name):
    """Fetch the default branch and star count of a repository.

    Args:
        repo_full_name: Repository identifier in ``owner/repo`` form.

    Returns:
        dict with the keys ``'default_branch'`` and ``'stargazers_count'``
        taken from the API response, or ``None`` when the request yields no
        data.
    """
    owner, repo = repo_full_name.split('/')
    payload = make_github_api_request(f'https://api.github.com/repos/{owner}/{repo}')
    if not payload:
        return None
    wanted_fields = ('default_branch', 'stargazers_count')
    return {field: payload.get(field) for field in wanted_fields}

In [15]:
def get_file_tree(repo_full_name, sha):
    """List the non-markdown blob entries of a repository tree.

    Args:
        repo_full_name: Repository identifier in ``owner/repo`` form.
        sha: Tree reference passed to the ``git/trees`` endpoint, which is
            requested with ``recursive=1``.

    Returns:
        A list of ``{'path': ..., 'sha': ...}`` dicts, one per entry of type
        ``'blob'`` whose path does not end in ``.md`` or ``.markdown``
        (compared case-insensitively). An empty list when the request yields
        no data or the response has no ``'tree'`` key.
    """
    owner, repo = repo_full_name.split('/')
    tree_data = make_github_api_request(
        f'https://api.github.com/repos/{owner}/{repo}/git/trees/{sha}?recursive=1'
    )
    if not tree_data or 'tree' not in tree_data:
        return []

    skipped_suffixes = ('.md', '.markdown')
    return [
        {'path': entry['path'], 'sha': entry['sha']}
        for entry in tree_data['tree']
        if entry['type'] == 'blob' and not entry['path'].lower().endswith(skipped_suffixes)
    ]

In [16]:
def get_file_content(repo_full_name, file_sha):
    """Download a git blob and return its text, or None when it is unusable.

    Args:
        repo_full_name: Repository identifier in ``owner/repo`` form.
        file_sha: SHA of the blob to fetch from the ``git/blobs`` endpoint.

    Returns:
        The blob content as a string when the response encoding is
        ``'base64'`` (decoded, then interpreted as UTF-8) or ``'utf-8'``
        (returned as is). ``None`` when the request yields no data, the
        response has no content, the encoding is anything else, or decoding
        fails; a message is printed in each of those cases except when the
        request helper itself returned ``None``.
    """
    owner, repo = repo_full_name.split('/')
    blob_data = make_github_api_request(
        f'https://api.github.com/repos/{owner}/{repo}/git/blobs/{file_sha}'
    )

    # The request helper returned None: nothing more to report here.
    if blob_data is None:
        return None
    # Any other empty response is reported explicitly.
    if not blob_data:
        print(f"Could not get blob data for SHA {file_sha} in {repo_full_name}. Skipping.")
        return None

    encoding = blob_data.get('encoding')
    payload = blob_data.get('content')

    if payload is None:
        print(f"No content data found for SHA {file_sha} in {repo_full_name}. Skipping.")
        return None

    if encoding == 'utf-8':
        # Content is already a text string.
        return payload

    if encoding != 'base64':
        print(f"Unknown or unsupported encoding '{encoding}' for SHA {file_sha} in {repo_full_name}. Skipping.")
        return None

    try:
        # Undo the base64 transport encoding, then interpret the bytes as UTF-8.
        return base64.b64decode(payload).decode('utf-8')
    except UnicodeDecodeError:
        print(f"UnicodeDecodeError for SHA {file_sha} in {repo_full_name}. Content appears binary or non-UTF8. Skipping.")
        return None
    except Exception as e:
        print(f"Error decoding base64 content for SHA {file_sha} in {repo_full_name}: {e}. Skipping.")
        return None

In [17]:
def count_lines_of_code(content):
    """Return the number of lines in ``content``; ``None`` counts as zero lines.

    Every line reported by ``str.splitlines`` is counted, including blank
    ones.
    """
    return 0 if content is None else len(content.splitlines())

## Obter os dados e salvar no CSV

In [27]:
# Load the CSV written by the search step and keep one record per distinct
# (repository, url) pair as a list of dicts.
df_repos = pd.read_csv('repositorios_k6.csv')
unique_repos = (
    df_repos.loc[:, ['repositório', 'url']]
    .drop_duplicates()
    .to_dict(orient='records')
)

In [None]:
# Start from an empty list so re-running this cell does not duplicate rows.
repo_data = []

# Walk every unique repository; to analyse only a sample, iterate over
# unique_repos[:2] instead.
for repo_info in unique_repos:
    repo_full_name, repo_url = repo_info['repositório'], repo_info['url']
    owner, repo_name_only = repo_full_name.split('/')

    print(f"Processing repository: {repo_full_name}")

    # Repository metadata: default branch and star count.
    details = get_repo_details(repo_full_name)
    if not details:
        print(f"Could not get details for {repo_full_name}. Skipping.")
        continue
    default_branch, stargazers_count = details['default_branch'], details['stargazers_count']

    # Resolve the default branch to the SHA of its head commit.
    branch_url = f'https://api.github.com/repos/{owner}/{repo_name_only}/branches/{default_branch}'
    branch_data = make_github_api_request(branch_url)
    if not (branch_data and 'commit' in branch_data and 'sha' in branch_data['commit']):
        print(f"Could not get default branch SHA for {repo_full_name}. Skipping.")
        continue
    default_branch_sha = branch_data['commit']['sha']

    # Each entry is a dict with the 'path' and 'sha' of a non-markdown file.
    non_markdown_files = get_file_tree(repo_full_name, default_branch_sha)
    if not non_markdown_files:
        print(f"No non-markdown files found or could not get file tree for {repo_full_name}. Skipping LOC counting.")

    # Sum the line counts of every file whose content could be fetched and
    # decoded; an empty file list leaves the total at zero.
    fetched_contents = (
        get_file_content(repo_full_name, file_item['sha'])
        for file_item in non_markdown_files
    )
    total_loc = sum(
        count_lines_of_code(content)
        for content in fetched_contents
        if content is not None
    )

    repo_data.append({
        'repositório': repo_full_name,
        'url': repo_url,
        'stargazers_count': stargazers_count,
        'total_loc': total_loc,
    })

print(f"Finished processing all repositories. Collected data for {len(repo_data)} repositories.")

Processing repository: HariSekhon/Templates
Processing repository: grafana/setup-k6-action
UnicodeDecodeError for SHA 2a0941e524719b003da742ddee916477da4446a8 in grafana/setup-k6-action. Content appears binary or non-UTF8. Skipping.
Processing repository: gzydong/go-chat
Processing repository: hari-p8-io/RestVsGrpc
Processing repository: grafana/k6-jslib-aws
Processing repository: CintyaAprilaFandini/PPL_KUIS-1
UnicodeDecodeError for SHA 8dd8f57e57030c02f261a1b8b1ef31d3973a4a71 in CintyaAprilaFandini/PPL_KUIS-1. Content appears binary or non-UTF8. Skipping.
UnicodeDecodeError for SHA c45cced76d0fca7dc3213effa805fe94ea7fb6e6 in CintyaAprilaFandini/PPL_KUIS-1. Content appears binary or non-UTF8. Skipping.
UnicodeDecodeError for SHA 0c4fd17dfafb7e3bfba40095065f75dd60902230 in CintyaAprilaFandini/PPL_KUIS-1. Content appears binary or non-UTF8. Skipping.
UnicodeDecodeError for SHA d4806ff726c0a09813a12fbef4eaf5653bd4351d in CintyaAprilaFandini/PPL_KUIS-1. Content appears binary or non-UTF8.

In [None]:
# Build the final table from the collected records, shorten the column names
# ('url' keeps its name), and write it to disk without the index column.
df_processed_repos_limited = pd.DataFrame(repo_data).rename(
    columns={'repositório': 'repo', 'total_loc': 'loc', 'stargazers_count': 'stars'}
)
df_processed_repos_limited.to_csv('processed_k6_repos.csv', index=False)

print("Processed data saved to processed_k6_repos.csv")
print(df_processed_repos_limited.head())

Processed data saved to processed_k6_repos.csv
                      repo                                         url  stars  \
0     HariSekhon/Templates     https://github.com/HariSekhon/Templates    164   
1  grafana/setup-k6-action  https://github.com/grafana/setup-k6-action     18   

     loc  
0  14182  
1  39109  
