In [None]:
import os
import time
from datetime import datetime
from pathlib import Path

import pandas as pd
import requests

class GitHubRepoExtractor:
    """Search GitHub repositories through the REST API and export them to CSV."""

    # Limits of GitHub's search endpoint: the largest page it serves and the
    # deepest result a single query can page to.
    MAX_PER_PAGE = 100
    MAX_SEARCH_RESULTS = 1000
    # How long to pause after a rate-limit response, and how many consecutive
    # rate-limit responses are tolerated before giving up.
    RATE_LIMIT_WAIT_SECONDS = 60
    MAX_RATE_LIMIT_RETRIES = 5
    # Upper bound on a single HTTP request so a stalled connection cannot hang
    # the notebook forever.
    REQUEST_TIMEOUT_SECONDS = 30

    def __init__(self, token=None):
        """
        Initialize the GitHub repository extractor.

        Args:
            token: Optional GitHub access token. When given it is sent in the
                ``Authorization`` header of every request.
        """
        self.token = token
        self.base_url = "https://api.github.com"
        self.headers = {
            'Accept': 'application/vnd.github.v3+json',
            'User-Agent': 'Python-GitHub-Extractor'
        }

        if token:
            self.headers['Authorization'] = f'token {token}'

    def search_repositories(self, query, sort='stars', order='desc', per_page=100, max_results=1000):
        """
        Page through ``/search/repositories`` and collect the raw result items.

        Args:
            query: Search expression passed as the ``q`` parameter.
            sort: Field to sort by (``sort`` parameter).
            order: ``'asc'`` or ``'desc'`` (``order`` parameter).
            per_page: Requested page size; clamped to 1..MAX_PER_PAGE.
            max_results: Maximum number of items to return.

        Returns:
            A list of at most ``max_results`` repository dicts, in the order
            the API returned them. On a request error the items collected so
            far are returned.
        """
        # The page size must stay constant for the whole query: the API derives
        # the offset from (page - 1) * per_page, so shrinking per_page on the
        # last request would shift the window and re-fetch earlier results.
        page_size = max(1, min(per_page, self.MAX_PER_PAGE))
        url = f"{self.base_url}/search/repositories"

        repositories = []
        page = 1
        rate_limit_hits = 0

        while len(repositories) < max_results:
            params = {
                'q': query,
                'sort': sort,
                'order': order,
                'per_page': page_size,
                'page': page
            }

            try:
                response = requests.get(
                    url,
                    headers=self.headers,
                    params=params,
                    timeout=self.REQUEST_TIMEOUT_SECONDS,
                )

                if response.status_code in (403, 429):
                    # A 403 can also be a permanent refusal, so the retry
                    # count is bounded instead of looping forever.
                    rate_limit_hits += 1
                    if rate_limit_hits > self.MAX_RATE_LIMIT_RETRIES:
                        print(f"Error: HTTP {response.status_code} persisted after "
                              f"{self.MAX_RATE_LIMIT_RETRIES} retries")
                        break
                    print("Rate limit reached. Waiting...")
                    time.sleep(self.RATE_LIMIT_WAIT_SECONDS)
                    continue
                rate_limit_hits = 0

                response.raise_for_status()
                data = response.json()

                items = data.get('items') or []
                if not items:
                    break

                repositories.extend(items)

                print(f"Obtained {len(repositories)} repositories...")

                # Stop once the window covers everything the query can yield;
                # asking for a page past that point only produces an error.
                covered = page * page_size
                total_count = data.get('total_count')
                if covered >= self.MAX_SEARCH_RESULTS:
                    break
                if total_count is not None and covered >= total_count:
                    break

                # Without a token, pace the requests.
                if not self.token:
                    time.sleep(1)

                page += 1

            except requests.exceptions.RequestException as e:
                print(f"Error: {e}")
                break

        # The last page may overshoot max_results; trim here rather than by
        # changing the page size mid-query.
        return repositories[:max_results]

    def extract_repo_data(self, repo):
        """
        Flatten one raw repository dict into the columns written to the CSV.

        Args:
            repo: A repository item as returned by ``search_repositories``.

        Returns:
            A flat dict. ``license``, ``owner`` and ``owner_type`` are ``None``
            when the corresponding nested object is missing or null.
        """
        # The nested objects can be absent or explicitly null; normalise both
        # to an empty dict so the lookups below cannot raise.
        owner = repo.get('owner') or {}
        license_info = repo.get('license') or {}

        return {
            'name': repo.get('name'),
            'full_name': repo.get('full_name'),
            'description': repo.get('description'),
            'url': repo.get('html_url'),
            'language': repo.get('language'),
            'stars': repo.get('stargazers_count', 0),
            'forks': repo.get('forks_count', 0),
            'watchers': repo.get('watchers_count', 0),
            'size': repo.get('size', 0),
            'created_at': repo.get('created_at'),
            'updated_at': repo.get('updated_at'),
            'topics': repo.get('topics', []),
            'license': license_info.get('name'),
            'owner': owner.get('login'),
            'owner_type': owner.get('type'),
            'is_fork': repo.get('fork', False),
            'has_issues': repo.get('has_issues', False),
            'has_wiki': repo.get('has_wiki', False),
            'has_pages': repo.get('has_pages', False),
        }

    def save_to_csv(self, repositories, filename=None):
        """
        Save repositories to a CSV file.

        Args:
            repositories: Iterable of raw repository dicts.
            filename: Destination path. Defaults to a timestamped file under
                ``../data/``.

        Returns:
            The ``pandas.DataFrame`` that was written.
        """
        if not filename:
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            filename = f"../data/github_repositories_{timestamp}.csv"

        # Create the target folder on demand so a fresh checkout without a
        # data directory does not fail at the very last step.
        if isinstance(filename, (str, os.PathLike)):
            parent = os.path.dirname(os.fspath(filename))
            if parent:
                os.makedirs(parent, exist_ok=True)

        processed_repos = [self.extract_repo_data(repo) for repo in repositories]

        df = pd.DataFrame(processed_repos)

        df.to_csv(filename, index=False, encoding='utf-8')
        print(f"Data saved in {filename}")

        return df


Buscando repositorios de citizen science...
Obtenidos 100 repositorios...
Obtenidos 200 repositorios...
Obtenidos 300 repositorios...
Obtenidos 400 repositorios...
Obtenidos 500 repositorios...
Obtenidos 600 repositorios...
Obtenidos 700 repositorios...
Obtenidos 800 repositorios...
Obtenidos 900 repositorios...
Obtenidos 918 repositorios...
Obtenidos 1000 repositorios...
Encontrados 1000 repositorios
Datos guardados en github_repositories_20250616_155622.csv

=== ESTADÍSTICAS ===
Total de repositorios: 1000


In [None]:
# Build the extractor without credentials; pass token="..." to have an
# Authorization header sent with each request.
extractor = GitHubRepoExtractor()

# Query the search API for "citizen science", ordered by stars, descending
print("Buscando repositorios de citizen science...")
repositories = extractor.search_repositories(
    "citizen science", sort="stars", order="desc", max_results=1000
)
print(f"Encontrados {len(repositories)} repositorios")

# Write the results to CSV; save_to_csv returns the DataFrame it wrote
df = extractor.save_to_csv(repositories)

# Report the number of rows in that DataFrame
print(f"Total number of repositories: {len(df)}")

In [None]:
# Remove duplicates and save final csv
#
# save_to_csv() writes a file whose name carries the run's timestamp, so the
# newest export is looked up at run time instead of hard-coding one run's
# file name. Both ../data (the default target of save_to_csv) and the current
# directory are searched.
export_paths = [
    path
    for folder in (Path("../data"), Path("."))
    for path in folder.glob("github_repositories_*.csv")
]
if not export_paths:
    raise FileNotFoundError(
        "No github_repositories_*.csv export found in ../data or the current directory"
    )
latest_export = max(export_paths, key=lambda path: path.stat().st_mtime)
print(f"Reading {latest_export}")

df = pd.read_csv(latest_export)

# Reading back from CSV turns the list-valued 'topics' column into plain
# strings, which lets drop_duplicates() compare whole rows.
df.drop_duplicates().to_csv("github_repositories.csv", index=False)