In [1]:
import requests
import os
# https://docs.github.com/en/rest?apiVersion=2022-11-28
# get the current working directory
os.getcwd()

'/Users/tnathu-ai/VSCode/Government-Digital-Transformation/notebooks/scrape'

In [2]:
import sys
import os

# Add the directory containing config.py to the Python path
sys.path.append(os.path.abspath(os.path.join('..', '..')))

from config import EXCEL_KEYWORDS_LIST, GITHUB_RAW_DATA_DIR


# Refractor code

In [4]:
import os
import requests
import pandas as pd
import time

def fetch_and_save_github_repos(search_keywords, save_directory):
    query = ' OR '.join(search_keywords) + ' location:Australia'
    params = {
        'q': query,
        'sort': 'stars',  # Sort by stars to get popular repositories first
        'order': 'desc',
    }
    headers = {
        'Accept': 'application/vnd.github.v3+json',
    }

    try:
        response = requests.get('https://api.github.com/search/repositories', headers=headers, params=params)
        print(f"Initial response status: {response.status_code}")  # Print the initial response status

        if response.status_code == 403:  # HTTP 403 for rate limit exceeded
            print("Rate limit exceeded. Waiting 60 seconds...")
            time.sleep(60)
            response = requests.get('https://api.github.com/search/repositories', headers=headers, params=params)
            print(f"Response status after waiting: {response.status_code}")  # Print the response status after waiting

        response.raise_for_status()  # This will raise an exception for HTTP errors
        data = response.json()
        repos = []

        for item in data['items']:
            print(f"Processing repository: {item['name']}")  # Print the name of the repository being processed
            repos.append({
                "Repository Name": item['name'],
                "Description": item.get('description'),
                "Tags": ', '.join(item.get('topics', [])),
                "URL": item['html_url'],
                "Stars": item['stargazers_count'],
                "Forks": item['forks_count'],
                "Open Issues": item['open_issues_count'],
                "Language": item.get('language', 'Not specified'),
                "Date Published": item['created_at'],
                "Last Updated": item['updated_at'],
                "Latest Release": item.get('latest_release', 'Not specified')
            })

        if repos:
            df = pd.DataFrame(repos)
            file_name = f"{query.replace(' ', '_').replace(':', '_').replace('|', '_')}_{params['sort']}_github_repos.csv"
            # Ensure the directory exists
            os.makedirs(save_directory, exist_ok=True)
            save_path = os.path.join(save_directory, file_name)
            df.to_csv(save_path, index=False)
            print(f"Exported to {save_path}")
        else:
            print("No repositories found.")

    except requests.RequestException as e:
        print(f"Request failed: {e}")
    except requests.HTTPError as e:
        print(f"HTTP error occurred: {e}")  # HTTP error occurred
    except Exception as e:
        print(f"An unexpected error occurred: {e}")




In [5]:
%%time

keywords = ['Australian government apps', 'Government Azure', 'COVID', 'crime', 'bushfire']
fetch_and_save_github_repos(keywords, GITHUB_RAW_DATA_DIR)

Initial response status: 200
Processing repository: Horizon-dashboard
Processing repository: AustralianRainfallPrediction
Processing repository: Road-Accident-Analysis
Processing repository: Ruritania
Processing repository: -Taxi-Market-
Exported to ../../data/raw/github/Australian_government_apps_OR_Government_Azure_OR_COVID_OR_crime_OR_bushfire_location_Australia_stars_github_repos.csv
CPU times: user 55.5 ms, sys: 4.03 ms, total: 59.6 ms
Wall time: 124 ms


# Scrape in batch

+ **Load Keywords from Excel:** The script starts by defining a function load_keywords that reads the 'Keywords' column from the specified Excel file path and returns these keywords as a list.
+ **Fetch and Save GitHub Repositories:** The fetch_and_save_github_repos function is updated to loop through each keyword. For each keyword, it builds a search query specifically for repositories in Australia.
+ **Error Handling:** It handles rate limiting by waiting 60 seconds when a 403 status code is received and continues with retries.
+ **Save Data:** For each keyword, the fetched data is saved as a CSV file in a specified directory.


In [2]:
import os
import requests
import pandas as pd
import time

# Load keywords from Excel file
def load_keywords(file_path):
    df = pd.read_excel(file_path)
    return df['Keywords'].tolist()

def fetch_and_save_github_repos(search_keywords, save_directory):
    for keyword in search_keywords:
        query = keyword + ' location:Australia'
        params = {
            'q': query,
            'sort': 'stars',  # Sort by stars to get popular repositories first
            'order': 'desc',
        }
        headers = {
            'Accept': 'application/vnd.github.v3+json',
        }

        try:
            response = requests.get('https://api.github.com/search/repositories', headers=headers, params=params)
            if response.status_code == 403:  # HTTP 403 for rate limit exceeded
                print("Rate limit exceeded. Waiting 60 seconds...")
                time.sleep(60)
                response = requests.get('https://api.github.com/search/repositories', headers=headers, params=params)

            response.raise_for_status()  # This will raise an exception for HTTP errors
            data = response.json()
            repos = []

            for item in data['items']:
                repos.append({
                    "Repository Name": item['name'],
                    "Description": item.get('description'),
                    "Tags": ', '.join(item.get('topics', [])),
                    "URL": item['html_url'],
                    "Stars": item['stargazers_count'],
                    "Forks": item['forks_count'],
                    "Open Issues": item['open_issues_count'],
                    "Language": item.get('language', 'Not specified'),
                    "Date Published": item['created_at'],
                    "Last Updated": item['updated_at'],
                    "Latest Release": item.get('latest_release', 'Not specified')
                })

            if repos:
                df = pd.DataFrame(repos)
                file_name = f"{keyword.replace(' ', '_').replace(':', '_').replace('|', '_')}_{params['sort']}_github_repos.csv"
                os.makedirs(save_directory, exist_ok=True)
                save_path = os.path.join(save_directory, file_name)
                df.to_csv(save_path, index=False)
                print(f"Exported {len(repos)} repositories to {save_path}")
            else:
                print(f"No repositories found for keyword: {keyword}")

        except requests.RequestException as e:
            print(f"Request failed for keyword: {keyword}, {e}")
        except requests.HTTPError as e:
            print(f"HTTP error occurred for keyword: {keyword}, {e}")  # HTTP error occurred
        except Exception as e:
            print(f"An unexpected error occurred for keyword: {keyword}, {e}")




In [3]:
%%time
# Path to the Excel file with keywords
excel_path = EXCEL_KEYWORDS_LIST
keywords = load_keywords(excel_path)
fetch_and_save_github_repos(keywords, GITHUB_RAW_DATA_DIR)

No repositories found for keyword: Australia Digital Transformation
No repositories found for keyword: Australia Digital Transformation
Exported 5 repositories to ../../data/raw/github/Australia_Government__stars_github_repos.csv
No repositories found for keyword: Australia Digital Identity 
No repositories found for keyword: Parliament of Australia
No repositories found for keyword: Centrelink
No repositories found for keyword: Australian Medicare 
No repositories found for keyword: Service New South Wales
No repositories found for keyword: MyGov Australia
No repositories found for keyword: JobSeeker Australia
Rate limit exceeded. Waiting 60 seconds...
No repositories found for keyword: NDIS Australia
Exported 1 repositories to ../../data/raw/github/Home_Affairs_Australia_stars_github_repos.csv
No repositories found for keyword: Smart Cities Australia
Exported 1 repositories to ../../data/raw/github/Department_of_Health__Australia_stars_github_repos.csv
No repositories found for keywo