In [1]:
%pip install pandas requests dotenv

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import os
from dotenv import load_dotenv

load_dotenv()
GITHUB_TOKEN = os.getenv('GITHUB_TOKEN')

if not GITHUB_TOKEN:
    raise RuntimeError("GITHUB_TOKEN não encontrado no .env. Adicione uma linha: GITHUB_TOKEN=seu_token")

HEADERS = {'Authorization': f'token {GITHUB_TOKEN}'}

# QUERY = r'/import\s+(?:\w+|\* as\s*\w+|\{[^}]+\})\s+from\s+[\'"`]k6[^\'"`]*[\'"`]/' + ' AND (language:JavaScript OR language:TypeScript) '
# QUERY = 'import AND from AND ("k6 OR \'k6) AND (language:JavaScript OR language:TypeScript)'
QUERY = 'import \"k6\" OR import \'k6\' language:JavaScript OR language:TypeScript'

# Gerar CSV pra usar na analise dos repositorios

In [3]:
import requests
import csv
import time

# Função para buscar repositórios, agora com debug e URL encoding
def buscar_repositorios(paginas=5, query = QUERY):
    repositorios = {}
    url = 'https://api.github.com/search/code'

    for page in range(1, paginas + 1):
        params = {'q': query, 'per_page': 100, 'page': page}
        print(f'Buscando página {page} com query: {params["q"]}')  # DEBUG
        response = requests.get(url, headers=HEADERS, params=params)
        print(f'Status code: {response.status_code}')  # DEBUG

        if response.status_code == 200:
            data = response.json()
            if 'items' not in data or len(data['items']) == 0:
                print(f'Nenhum resultado encontrado na página {page}. Encerrando busca.')
                break
            for item in data['items']:
                repo_name = item['repository']['full_name']
                repo_url = item['repository']['html_url']
                file_path = item['path']
                if repo_name not in repositorios:
                    repositorios[repo_name] = {'url': repo_url, 'arquivos': []}
                repositorios[repo_name]['arquivos'].append(file_path)
        elif response.status_code == 403:
            print(f'Limite de taxa atingido. Aguardando 60 segundos.')
            time.sleep(60)
            page -= 1  # Retry current page
            continue
        elif response.status_code == 422:
            print(f'Erro 422 (Unprocessable Entity) na página {page}. Isso geralmente indica que o limite de 1000 resultados da API de busca do GitHub foi atingido ou a consulta é inválida. Encerrando busca.')
            break  # Stop if we hit this error
        else:
            print(f'Erro inesperado na requisição na página {page}: {response.status_code}. Encerrando busca.')
            break

    return repositorios

# Função para salvar CSV permanece igual
def salvar_csv(repositorios, arquivo_saida='repositorios_k6.csv'):
    with open(arquivo_saida, mode='w', newline='', encoding='utf-8') as csv_file:
        fieldnames = ['repositório', 'url', 'arquivos']
        writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
        writer.writeheader()
        for repo_name, info in repositorios.items():
            writer.writerow({'repositório': repo_name, 'url': info['url'], 'arquivos': '; '.join(info['arquivos'])})
    print(f'Resultados salvos em {arquivo_saida}')


repositorios = buscar_repositorios(paginas=5)
print(f'Total de repositórios encontrados: {len(repositorios)}')
salvar_csv(repositorios, 'output/repositorios_k6.csv')

Buscando página 1 com query: import "k6" OR import 'k6' language:JavaScript OR language:TypeScript
Status code: 200
Buscando página 2 com query: import "k6" OR import 'k6' language:JavaScript OR language:TypeScript
Status code: 200
Buscando página 3 com query: import "k6" OR import 'k6' language:JavaScript OR language:TypeScript
Status code: 200
Buscando página 4 com query: import "k6" OR import 'k6' language:JavaScript OR language:TypeScript
Status code: 200
Buscando página 5 com query: import "k6" OR import 'k6' language:JavaScript OR language:TypeScript
Status code: 200
Total de repositórios encontrados: 433
Resultados salvos em output/repositorios_k6.csv


In [4]:
names = repositorios.keys()
for name in names:
    files = buscar_repositorios(paginas=5, query=QUERY + ' repo:'+name)
    print(files)
    if not files or name not in files:
        print(f"Ignorando {name}")
        continue
    new_files = files[name]['arquivos']
    previous_files = repositorios[name]['arquivos']
    should_break = False
    # print(new_files)
    for new_file in new_files:
        if new_file not in previous_files:
            print('novo arquivo')
            print(new_file)
            print(repositorios[name])
            repositorios[name]['arquivos'].append(new_file)
            print(repositorios[name])
    #         should_break = True
    #         break
    # if should_break:
    #     break
# names = repositorios['repositório']
# names.head()
salvar_csv(repositorios, 'output/repositorios_k6.csv')


Buscando página 1 com query: import "k6" OR import 'k6' language:JavaScript OR language:TypeScript repo:HariSekhon/Templates
Status code: 200
Buscando página 2 com query: import "k6" OR import 'k6' language:JavaScript OR language:TypeScript repo:HariSekhon/Templates
Status code: 200
Nenhum resultado encontrado na página 2. Encerrando busca.
{'HariSekhon/Templates': {'url': 'https://github.com/HariSekhon/Templates', 'arquivos': ['k6.js']}}
Buscando página 1 com query: import "k6" OR import 'k6' language:JavaScript OR language:TypeScript repo:grafana/setup-k6-action
Status code: 200
Buscando página 2 com query: import "k6" OR import 'k6' language:JavaScript OR language:TypeScript repo:grafana/setup-k6-action
Status code: 200
Nenhum resultado encontrado na página 2. Encerrando busca.
{'grafana/setup-k6-action': {'url': 'https://github.com/grafana/setup-k6-action', 'arquivos': ['src/k6.ts', 'dev/protocol.js']}}
novo arquivo
dev/protocol.js
{'url': 'https://github.com/grafana/setup-k6-actio

# Gerar o CSV pra uso no TCC

In [3]:

import pandas as pd
import base64
import requests
import csv
import time

## Funções auxiliares

In [4]:
def make_github_api_request(url, params=None, max_retries=5):
    for retry_num in range(max_retries):
        try:
            response = requests.get(url, headers=HEADERS, params=params)
            if response.status_code == 200:
                return response.json()
            elif response.status_code == 403:
                print(f"Rate limit exceeded. Waiting 60 seconds. Retries left: {max_retries - 1 - retry_num}")
                time.sleep(60)
            elif response.status_code == 404:
                print(f"Resource not found at {url}. Skipping.")
                return None
            else:
                print(f"Error making request to {url}: Status code {response.status_code}, Response: {response.text}")
                return None
        except requests.exceptions.RequestException as e:
            print(f"Request failed: {e}. Retries left: {max_retries - 1 - retry_num}")
            time.sleep(5) # Shorter wait for network errors
    print(f"Failed to make request to {url} after {max_retries} retries.")
    return None

In [5]:
def get_repo_details(repo_full_name):
    owner, repo = repo_full_name.split('/')
    url = f'https://api.github.com/repos/{owner}/{repo}'
    repo_data = make_github_api_request(url)
    if repo_data:
        return {
            'default_branch': repo_data.get('default_branch'),
            'stargazers_count': repo_data.get('stargazers_count')
        }
    return None

In [6]:
def get_file_tree(repo_full_name, sha):
    owner, repo = repo_full_name.split('/')
    url = f'https://api.github.com/repos/{owner}/{repo}/git/trees/{sha}?recursive=1'
    tree_data = make_github_api_request(url)
    if tree_data and 'tree' in tree_data:
        non_markdown_files = []
        markdown_extensions = ('.md', '.markdown')
        for item in tree_data['tree']:
            # Return path and sha for non-markdown blob files
            if item['type'] == 'blob' and not item['path'].lower().endswith(markdown_extensions):
                non_markdown_files.append({'path': item['path'], 'sha': item['sha']})
        return non_markdown_files
    return []

In [7]:
def get_file_content(repo_full_name, file_sha):
    owner, repo = repo_full_name.split('/')
    url = f'https://api.github.com/repos/{owner}/{repo}/git/blobs/{file_sha}'

    blob_data = make_github_api_request(url)

    if blob_data:
        content_encoding = blob_data.get('encoding')
        content_data = blob_data.get('content')

        if content_data is None:
            print(f"No content data found for SHA {file_sha} in {repo_full_name}. Skipping.")
            return None

        if content_encoding == 'base64':
            try:
                # Base64 decode, then try UTF-8 decode
                decoded_bytes = base64.b64decode(content_data)
                decoded_content = decoded_bytes.decode('utf-8')
                return decoded_content
            except UnicodeDecodeError:
                print(f"UnicodeDecodeError for SHA {file_sha} in {repo_full_name}. Content appears binary or non-UTF8. Skipping.")
                return None
            except Exception as e:
                print(f"Error decoding base64 content for SHA {file_sha} in {repo_full_name}: {e}. Skipping.")
                return None
        elif content_encoding == 'utf-8':
            # Content is already UTF-8 encoded string
            return content_data
        else:
            print(f"Unknown or unsupported encoding '{content_encoding}' for SHA {file_sha} in {repo_full_name}. Skipping.")
            return None
    elif blob_data is None:
        # make_github_api_request already printed error/skip message
        return None
    else:
        print(f"Could not get blob data for SHA {file_sha} in {repo_full_name}. Skipping.")
        return None

In [8]:
def count_lines_of_code(content):
    if content is None:
        return 0
    return len(content.splitlines())

## Obter os dados e salvar no csv

In [9]:
df_repos = pd.read_csv('output/repositorios_k6.csv')
unique_repos = df_repos[['repositório', 'url']].drop_duplicates().to_dict('records')
df_repos.shape

(433, 3)

In [10]:
# Parallelized cloning: clones multiple repos (shallow), counts lines, then removes clones
import tempfile
import shutil
import subprocess
import os
import time
from concurrent.futures import ThreadPoolExecutor, as_completed

# Configuration - ajuste conforme disponibilidade de rede / disco
MAX_WORKERS = 8  # número de clones paralelos
CLONE_TIMEOUT = 60*10  # segundos por clone

repo_data = []

def count_lines_in_tree(path):
    """Conta linhas em todos os arquivos do diretório `path`, ignorando arquivos binários e .git."""
    total = 0
    for root, dirs, files in os.walk(path):
        # evitar entrar em .git
        dirs[:] = [d for d in dirs if d != '.git']
        for fname in files:
            fpath = os.path.join(root, fname)
            try:
                # rápido teste binário: se contém null byte nos primeiros 1KB, pule
                with open(fpath, 'rb') as fh:
                    head = fh.read(1024)
                    if b'\x00' in head:
                        continue
                # contar quebras de linha em modo binário (eficiente)
                count = 0
                with open(fpath, 'rb') as fh:
                    for chunk in iter(lambda: fh.read(8192), b''):
                        count += chunk.count(b'\n')
                total += count
            except Exception:
                # ignora arquivos que não podem ser lidos
                continue
    return total


def process_repo(repo_info):
    """Task para clonar um repositório via SSH, contar linhas e remover o clone."""
    repo_full_name = repo_info['repositório'] if isinstance(repo_info, dict) else repo_info
    repo_url = repo_info.get('url') if isinstance(repo_info, dict) else None
    owner, repo_name_only = repo_full_name.split('/')

    result = {
        'repositório': repo_full_name,
        'url': repo_url,
        'stargazers_count': 0,
        'total_loc': 0,
    }

    details = get_repo_details(repo_full_name)
    if not details:
        print(f"Could not get details for {repo_full_name}. Skipping.")
        return None
    result['stargazers_count'] = details.get('stargazers_count', 0)

    # SSH clone URL (requer chave SSH configurada)
    clone_url = f'git@github.com:{owner}/{repo_name_only}.git'

    tempdir = tempfile.mkdtemp(prefix='repo_clone_')
    try:
        # clonar shallow
        subprocess.run(['git', 'clone', '--depth', '1', clone_url, tempdir], check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, timeout=CLONE_TIMEOUT)
    except subprocess.TimeoutExpired:
        print(f"Clone timed out for {repo_full_name}. Skipping.")
        try:
            shutil.rmtree(tempdir)
        except Exception:
            pass
        return None
    except subprocess.CalledProcessError as e:
        # mostrar mensagem curta de erro
        err = None
        try:
            err = e.stderr.decode('utf-8', errors='replace') if e.stderr else str(e)
        except Exception:
            err = str(e)
        print(f"git clone failed for {repo_full_name}: {err[:400]}. Skipping.")
        try:
            shutil.rmtree(tempdir)
        except Exception:
            pass
        return None

    try:
        loc = count_lines_in_tree(tempdir)
        result['total_loc'] = loc
    finally:
        # cleanup sempre
        try:
            shutil.rmtree(tempdir)
        except Exception:
            pass

    return result

# Construir lista de repositórios preservando a URL do CSV
repos_list = []
for r in unique_repos:
    if isinstance(r, dict):
        repos_list.append({'repositório': r['repositório'], 'url': r.get('url')})
    else:
        repos_list.append({'repositório': r, 'url': None})

print(f"Starting parallel processing of {len(repos_list)} repositories with {MAX_WORKERS} workers...")

with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
    future_to_repo = {executor.submit(process_repo, repo_info): repo_info['repositório'] for repo_info in repos_list}
    for fut in as_completed(future_to_repo):
        repo_name = future_to_repo[fut]
        try:
            res = fut.result()
            if res:
                print(f"processed {res['repositório']}")
                repo_data.append(res)
        except Exception as exc:
            print(f"Repository {repo_name} generated an exception: {exc}")

print(f"Finished processing all repositories. Collected data for {len(repo_data)} repositories.")

Starting parallel processing of 433 repositories with 8 workers...
processed ymotongpoo/cndf2023
processed grafana/k6-jslib-aws
processed oreyon/nestjs-11-mikro-orm-10
processed grafana/setup-k6-action
processed hari-p8-io/RestVsGrpc
processed HariSekhon/Templates
processed 6-BARK/neighborhood
git clone failed for JohnnyKamigashima/PGATS-k6: Cloning into 'C:\Users\Samuel\AppData\Local\Temp\repo_clone_movyxiqm'...
error: invalid path 'swagger: "2.yml'
fatal: unable to checkout working tree
You can inspect what was checked out with 'git status'
and retry with 'git restore --source=HEAD :/'

. Skipping.
processed gzydong/go-chat
processed pigfox/grpc-vs-rest
processed maxoliverbr/k6GithubActions
processed armydep/shorten-url
processed nikoresu/http-concurrency
processed Hermijj/K6Code
processed JonathanGunawan30/golang-redis-jwt-session
processed chan4lk/k6-browser-perf-tests
processed CintyaAprilaFandini/PPL_KUIS-1
processed bendbennett/go-api-demo
processed Fan-Lo/SENG533
processed akra

In [11]:
df_processed_repos_limited = pd.DataFrame(repo_data)
df_processed_repos_limited = df_processed_repos_limited.rename(columns={'repositório': 'repo', 'total_loc': 'loc', 'stargazers_count': 'stars', 'url': 'url'})

df_processed_repos_limited.to_csv('output/processed_k6_repos.csv', index=False)

print("Processed data saved to processed_k6_repos.csv")
print(df_processed_repos_limited.shape)
df_processed_repos_limited.head()

Processed data saved to processed_k6_repos.csv
(421, 4)


Unnamed: 0,repo,url,stars,loc
0,ymotongpoo/cndf2023,https://github.com/ymotongpoo/cndf2023,3,886
1,grafana/k6-jslib-aws,https://github.com/grafana/k6-jslib-aws,24,9838
2,oreyon/nestjs-11-mikro-orm-10,https://github.com/oreyon/nestjs-11-mikro-orm-10,0,19955
3,grafana/setup-k6-action,https://github.com/grafana/setup-k6-action,18,39193
4,hari-p8-io/RestVsGrpc,https://github.com/hari-p8-io/RestVsGrpc,0,21572
