Merged
39 changes: 39 additions & 0 deletions .gitignore
@@ -0,0 +1,39 @@
# Python
__pycache__/
*.py[cod]
*$py.class
*.so
.Python
env/
venv/
ENV/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg

# Data files (optional - uncomment if you want to track them)
# repos_data.json
# repos_categorized.json

# IDE
.vscode/
.idea/
*.swp
*.swo
*~

# OS
.DS_Store
Thumbs.db
1,898 changes: 1,897 additions & 1 deletion README.md


124 changes: 124 additions & 0 deletions USAGE.md
@@ -0,0 +1,124 @@
# Usage Guide

## How to Update the Collection

This repository uses Python scripts to fetch and organize Python repositories from GitHub.

### Prerequisites

```bash
pip install -r requirements.txt
```

### Step 1: Fetch Repository Data

Run the data collection script:

```bash
python3 fetch_repos.py
```

This will:
- Fetch the top 1000 Python repositories from GitHub
- Categorize them by topic
- Save data to `repos_data.json` and `repos_categorized.json`

**Optional:** Set a GitHub token to increase API rate limits:

```bash
export GITHUB_TOKEN="your_github_token_here"
python3 fetch_repos.py
```

### Step 2: Generate README

After fetching the data, generate the README:

```bash
python3 generate_readme.py
```

This will create a formatted `README.md` with:
- Statistics about the collection
- A table of contents
- Categorized repositories, with the top 10 shown per category
- Collapsible sections for the remaining repositories
- Proper Markdown formatting for GitHub
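
The collapsible sections are plain HTML `<details>` blocks, which GitHub renders natively inside Markdown. A minimal sketch of how such a section could be emitted (the function name `render_section` and the repo-dict fields are illustrative, not the actual `generate_readme.py` API):

```python
def render_section(category, repos, show_top=10):
    """Render one category: top repos visible, the rest inside a <details> block."""
    lines = [f"## {category}", ""]
    for repo in repos[:show_top]:
        lines.append(f"- [{repo['name']}]({repo['html_url']}) - ⭐ {repo['stars']:,}")
    hidden = repos[show_top:]
    if hidden:
        lines.append("")
        lines.append(f"<details><summary>Show {len(hidden)} more</summary>")
        lines.append("")
        for repo in hidden:
            lines.append(f"- [{repo['name']}]({repo['html_url']}) - ⭐ {repo['stars']:,}")
        lines.append("")
        lines.append("</details>")
    return "\n".join(lines)
```

A blank line before and after the inner list matters: GitHub only renders Markdown inside `<details>` when it is separated from the HTML tags by blank lines.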

### Customization

#### Modify Categories

Edit the `CATEGORIES` dictionary in `fetch_repos.py` to add or modify categories:

```python
CATEGORIES = {
    'Your Category': ['keyword1', 'keyword2', ...],
    ...
}
```

#### Change Display Count

Modify the `show_top` parameter in `generate_readme.py` to change how many repos are shown before the collapsible section:

```python
generate_category_section(category, repos, show_top=10) # Change 10 to your preference
```

#### Fetch More Repositories

The GitHub search API caps each query at 1,000 results. To collect more repositories, modify the script to issue multiple queries over different star ranges.
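
Since each distinct `stars:A..B` query gets its own 1,000-result budget, slicing the search by star ranges is the usual workaround. A sketch of building the query strings (the boundary values are illustrative; in practice you would tune them so each slice stays under 1,000 results):

```python
def star_range_queries(boundaries):
    """Build one GitHub search query per star range.

    Each range gets its own 1,000-result cap, so N ranges can yield
    up to N * 1000 repositories in total.
    """
    queries = []
    for lo, hi in zip(boundaries, boundaries[1:]):
        queries.append(f"language:python stars:{lo}..{hi - 1}")
    # Open-ended range for everything above the last boundary
    queries.append(f"language:python stars:>={boundaries[-1]}")
    return queries
```

Each resulting string can be passed as the `q` parameter in place of the single query that `fetch_repos_by_stars` currently builds.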

### Automation

You can set up a GitHub Action to automatically update the collection weekly or monthly. Example workflow:

```yaml
name: Update Collection

on:
  schedule:
    - cron: '0 0 * * 0'  # Weekly on Sunday
  workflow_dispatch:

jobs:
  update:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v2
      - uses: actions/setup-python@v2
        with:
          python-version: '3.x'
      - run: pip install -r requirements.txt
      - run: python3 fetch_repos.py
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
      - run: python3 generate_readme.py
      - run: |
          git config user.name github-actions
          git config user.email github-actions@github.com
          git add README.md repos_data.json repos_categorized.json
          git commit -m "Update collection"
          git push
```

## File Structure

```
.
├── README.md               # Generated collection (do not edit manually)
├── USAGE.md                # This file
├── fetch_repos.py          # Script to fetch GitHub data
├── generate_readme.py      # Script to generate README
├── requirements.txt        # Python dependencies
├── repos_data.json         # Raw repository data
└── repos_categorized.json  # Categorized repository data
```

## Notes

- The collection is sorted by star count (highest to lowest)
- Each repository may appear in multiple categories if relevant
- The last update date is shown for each repository
- GitHub API has rate limits (60 requests/hour without auth, 5000 with token)
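
When the API returns 403 for rate limiting, the `X-RateLimit-Reset` response header gives the Unix time at which the quota refills, so sleeping until that moment is more precise than the fixed 60-second wait in `fetch_repos.py`. A sketch of computing the wait (the header name is GitHub's documented one; the 1-second buffer is an arbitrary safety margin):

```python
import time

def seconds_until_reset(headers, now=None):
    """Seconds to sleep before retrying, from GitHub rate-limit headers."""
    now = time.time() if now is None else now
    reset = int(headers.get("X-RateLimit-Reset", 0))
    # Never return a negative wait; add a small buffer past the reset moment
    return max(0.0, reset - now) + 1.0
```

In the fetch loop this would replace `time.sleep(60)` with `time.sleep(seconds_until_reset(response.headers))`.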
193 changes: 193 additions & 0 deletions fetch_repos.py
@@ -0,0 +1,193 @@
#!/usr/bin/env python3
"""
GitHub Python Repository Collector
Fetches top Python repositories from GitHub API and organizes them by category
"""

import requests
import json
import time
from datetime import datetime
from typing import List, Dict
import os

# GitHub API configuration
GITHUB_API_URL = "https://api.github.com/search/repositories"
GITHUB_TOKEN = os.environ.get('GITHUB_TOKEN', '') # Optional, but increases rate limit

# Categories based on common topics/keywords
CATEGORIES = {
    'Web Frameworks': ['django', 'flask', 'fastapi', 'tornado', 'pyramid', 'bottle', 'web-framework', 'aiohttp', 'sanic'],
    'Machine Learning & AI': ['machine-learning', 'deep-learning', 'neural-network', 'tensorflow', 'pytorch', 'keras', 'scikit-learn', 'ai', 'artificial-intelligence'],
    'Data Science & Analytics': ['data-science', 'data-analysis', 'pandas', 'numpy', 'jupyter', 'analytics', 'data-visualization', 'matplotlib'],
    'DevOps & Automation': ['devops', 'automation', 'ansible', 'docker', 'kubernetes', 'ci-cd', 'deployment', 'infrastructure'],
    'Testing': ['testing', 'pytest', 'unittest', 'test-automation', 'selenium', 'testing-tools'],
    'Web Scraping': ['web-scraping', 'scraping', 'crawler', 'beautifulsoup', 'scrapy', 'selenium'],
    'CLI Tools': ['cli', 'command-line', 'terminal', 'console', 'cli-app'],
    'API Development': ['api', 'rest-api', 'graphql', 'api-wrapper', 'api-client'],
    'Security & Cryptography': ['security', 'cryptography', 'penetration-testing', 'cybersecurity', 'encryption'],
    'Computer Vision': ['computer-vision', 'image-processing', 'opencv', 'image-recognition'],
    'Natural Language Processing': ['nlp', 'natural-language-processing', 'text-processing', 'language-model'],
    'Game Development': ['game', 'pygame', 'game-development', 'game-engine'],
    'Networking': ['networking', 'network', 'socket', 'http', 'tcp', 'udp'],
    'Database': ['database', 'sql', 'orm', 'sqlalchemy', 'mongodb', 'redis', 'postgresql'],
    'GUI Applications': ['gui', 'desktop', 'tkinter', 'pyqt', 'wxpython', 'kivy'],
    'Scientific Computing': ['scientific-computing', 'scipy', 'simulation', 'mathematics', 'physics'],
    'Audio & Video': ['audio', 'video', 'multimedia', 'music', 'sound'],
    'Utilities & Tools': ['utility', 'tools', 'helper', 'library'],
}

def fetch_repos_by_stars(min_stars: int = 1, max_results: int = 1000, per_page: int = 100) -> List[Dict]:
    """
    Fetch Python repositories sorted by stars.
    GitHub API allows max 1000 results per search query.
    """
    headers = {
        'Accept': 'application/vnd.github.v3+json',
    }
    if GITHUB_TOKEN:
        headers['Authorization'] = f'token {GITHUB_TOKEN}'

    all_repos = []
    page = 1

    while len(all_repos) < max_results:
        params = {
            'q': f'language:python stars:>={min_stars}',
            'sort': 'stars',
            'order': 'desc',
            'per_page': per_page,
            'page': page
        }

        print(f"Fetching page {page}...")

        try:
            response = requests.get(GITHUB_API_URL, headers=headers, params=params)

            if response.status_code == 403:
                print("Rate limit exceeded. Waiting...")
                time.sleep(60)
                continue

            response.raise_for_status()
            data = response.json()

            items = data.get('items', [])
            if not items:
                break

            for repo in items:
                repo_data = {
                    'name': repo['name'],
                    'full_name': repo['full_name'],
                    'description': repo['description'] or 'No description provided',
                    'html_url': repo['html_url'],
                    'stars': repo['stargazers_count'],
                    'last_updated': repo['updated_at'],
                    'language': repo['language'],
                    'topics': repo.get('topics', []),
                    'homepage': repo.get('homepage', ''),
                    'license': repo['license'].get('name', 'N/A') if repo.get('license') else 'N/A',
                }
                all_repos.append(repo_data)

            print(f"Collected {len(all_repos)} repositories so far...")

            # Check if we've reached the last page
            if len(items) < per_page:
                break

            page += 1

            # Respect rate limits
            time.sleep(1)

        except requests.exceptions.RequestException as e:
            print(f"Error fetching data: {e}")
            break

    return all_repos

def categorize_repo(repo: Dict) -> List[str]:
    """Categorize a repository based on its topics and description"""
    categories = []
    repo_topics = [t.lower() for t in repo['topics']]
    repo_desc = repo['description'].lower()
    repo_name = repo['name'].lower()

    for category, keywords in CATEGORIES.items():
        for keyword in keywords:
            if (keyword in repo_topics or
                    keyword in repo_desc or
                    keyword in repo_name):
                categories.append(category)
                break

    # If no category found, put in "Utilities & Tools"
    if not categories:
        categories.append('Utilities & Tools')

    return categories

def organize_by_category(repos: List[Dict]) -> Dict[str, List[Dict]]:
    """Organize repositories by category"""
    categorized = {cat: [] for cat in CATEGORIES.keys()}

    for repo in repos:
        categories = categorize_repo(repo)
        for category in categories:
            if category in categorized:
                categorized[category].append(repo)

    # Sort each category by stars
    for category in categorized:
        categorized[category].sort(key=lambda x: x['stars'], reverse=True)

    # Remove empty categories
    categorized = {k: v for k, v in categorized.items() if v}

    return categorized

def save_data(data, filename: str = 'repos_data.json'):
    """Save repository data (list or dict) to a JSON file"""
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=2, ensure_ascii=False)
    print(f"Data saved to {filename}")

def main():
    print("Starting GitHub Python Repository Collection...")
    print("=" * 60)

    # Fetch repositories (GitHub API limits to 1000 results per query)
    # To get more, we'll need to make multiple queries with different star ranges
    all_repos = []

    # Fetch top repositories (this will get the most popular ones first)
    print("\nFetching top Python repositories...")
    repos = fetch_repos_by_stars(min_stars=1, max_results=1000)
    all_repos.extend(repos)

    print(f"\nTotal repositories collected: {len(all_repos)}")

    # Save raw data
    save_data(all_repos, 'repos_data.json')

    # Organize by category
    print("\nOrganizing repositories by category...")
    categorized = organize_by_category(all_repos)

    # Save categorized data
    save_data(categorized, 'repos_categorized.json')

    # Print summary
    print("\n" + "=" * 60)
    print("SUMMARY")
    print("=" * 60)
    for category, repos in sorted(categorized.items(), key=lambda x: len(x[1]), reverse=True):
        print(f"{category}: {len(repos)} repositories")

    print("\nData collection complete!")

if __name__ == '__main__':
    main()