In [None]:
# Install all necessary libraries, update if necessary
!pip install requests --upgrade --quiet
!pip install beautifulsoup4 --upgrade --quiet
!pip install pandas --upgrade --quiet
!pip install selenium --upgrade --quiet

In [2]:
# Import libraries
import os
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

In [3]:
topics_url = 'https://github.com/topics' # Topics listing page that is scraped
base_url = 'https://github.com' # Site root; relative hrefs scraped from pages are appended to it

In [4]:
# Browser session used to load the topics page; it is closed with driver.quit()
# in the `finally` block of the page-loading code that follows.
driver = webdriver.Safari() # Initialize Safari Driver

In [5]:
# XPath of the pagination button that appends more topics to the page.
LOAD_MORE_XPATH = "//button[contains(@class, 'ajax-pagination-btn')]"
MAX_LOAD_MORE_CLICKS = 5   # how many times to press the pagination button
WAIT_TIMEOUT_SECONDS = 10  # max wait for the button to appear / become clickable
POST_CLICK_PAUSE_SECONDS = 1  # short pause after each click before the next wait

try:
    driver.get(topics_url)

    load_more_locator = (By.XPATH, LOAD_MORE_XPATH)

    for _ in range(MAX_LOAD_MORE_CLICKS):
        try:
            WebDriverWait(driver, WAIT_TIMEOUT_SECONDS).until(
                EC.presence_of_element_located(load_more_locator)
            )

            # Scroll to the bottom so the button is in view before clicking
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

            # Wait until the button is clickable
            button_element = WebDriverWait(driver, WAIT_TIMEOUT_SECONDS).until(
                EC.element_to_be_clickable(load_more_locator)
            )
        except TimeoutException:
            # The button did not show up in time; stop paging and keep
            # whatever has been loaded so far instead of discarding it.
            print("Pagination button not found; using the topics loaded so far.")
            break

        button_element.click()

        time.sleep(POST_CLICK_PAUSE_SECONDS)

    page_contents = driver.page_source

except Exception as e:
    print(f"An error occurred: {e}")
    # Re-raise so later cells do not run with a missing or stale `page_contents`.
    raise

finally:
    driver.quit()  # Always close the driver

In [6]:
# Parse the page source captured from the browser using Beautiful Soup's 'html.parser'.
# `doc` is read as a module-level global by scrape_topics().
doc = BeautifulSoup(page_contents, 'html.parser')

In [7]:
def parse_star_count(stars_str):
    """ Parses a star count string and converts it to an integer.

    The string may carry a 'k' (thousands) or 'm' (millions) suffix in either
    case (e.g., '145k', '1.2K', '2m') and may contain thousands separators
    (e.g., '1,234'). Surrounding whitespace is ignored.

    Args:
        stars_str (str): The star count as a string (e.g., '145k', '300').

    Returns:
        int: The numerical value of the star count (e.g., 145000, 300).

    Raises:
        ValueError: If the string is empty or is not a valid number.
    """
    cleaned = stars_str.strip().replace(',', '')
    if not cleaned:
        raise ValueError('Star count string is empty')

    multipliers = {'k': 1_000, 'm': 1_000_000}
    multiplier = multipliers.get(cleaned[-1].lower())
    if multiplier is None:
        return int(cleaned)

    # round() rather than int(): the float product can land just below the
    # intended whole number, and truncation would then drop a unit.
    return round(float(cleaned[:-1]) * multiplier)

In [8]:
def get_topic_page(topic_url, timeout=30):
    """ Fetches and parses the HTML document of a GitHub topic page.

    Args:
        topic_url (str): The URL of the GitHub topic page.
        timeout (float): Seconds to wait for the server before giving up.
            Without a timeout the request could block indefinitely.

    Returns:
        BeautifulSoup: Parsed HTML document of the topic page.

    Raises:
        Exception: If the page fails to load (non-200 status code).
    """
    # Download the page
    response = requests.get(topic_url, timeout=timeout)

    # Check successful response
    if response.status_code != 200:
        raise Exception(f'Failed to load page {topic_url}')

    # Parse using Beautiful Soup
    return BeautifulSoup(response.text, 'html.parser')

In [9]:
def get_repo_info(h1_tag, star_tag):
    """ Pulls the owner, name, star count and URL of one repository.

    Args:
        h1_tag (BeautifulSoup tag): Heading tag whose first link is the owner
            and whose second link is the repository.
        star_tag (BeautifulSoup tag): Tag whose text is the star count.

    Returns:
        tuple: (username, repository name, star count, repository URL).
    """
    links = h1_tag.find_all('a')

    # First link: repository owner
    username = links[0].text.strip()

    # Second link: repository name and its relative href
    repo_link = links[1]
    repo_name = repo_link.text.strip()
    repo_url = base_url + repo_link['href']

    star_count = parse_star_count(star_tag.text.strip())

    return username, repo_name, star_count, repo_url

In [10]:
def get_topic_repos(topic_doc):
    """ Builds a table of the repositories listed on a GitHub topic page.

    Args:
        topic_doc (BeautifulSoup): Parsed HTML document of the topic page.

    Returns:
        pd.DataFrame: One row per repository with the columns
            'username', 'repo_name', 'stars' and 'repo_url'.
    """
    # Heading tags holding the owner link and the repository link
    heading_class = 'f3 color-fg-muted text-normal lh-condensed'
    heading_tags = topic_doc.find_all('h3', {'class': heading_class})

    # Star counters, matched to the headings by position
    counter_tags = topic_doc.find_all('span', {'class': 'Counter js-social-count'})

    rows = [
        get_repo_info(heading, counter_tags[position])
        for position, heading in enumerate(heading_tags)
    ]

    column_names = ('username', 'repo_name', 'stars', 'repo_url')
    columns = {
        column: [row[position] for row in rows]
        for position, column in enumerate(column_names)
    }
    return pd.DataFrame(columns)

In [11]:
def scrape_topic(topic_url, path):
    """ Downloads one topic's repositories and writes them to a .csv file.

    Nothing is downloaded when a file already exists at `path`; a notice is
    printed instead.

    Args:
        topic_url (str): The URL of the GitHub topic page.
        path (str): The file path to save the scraped data.

    Returns:
        None
    """
    if not os.path.exists(path):
        topic_doc = get_topic_page(topic_url)
        repos_df = get_topic_repos(topic_doc)
        repos_df.to_csv(path, index=None)
    else:
        print(f"The file {path} already exists. Skipping...")

In [12]:
def get_topic_titles(doc):
    """ Collects the text of every topic title on the GitHub topics page.

    Args:
        doc (BeautifulSoup): Parsed HTML document of the topics page.

    Returns:
        list: The topic titles, in document order.
    """
    title_class = 'f3 lh-condensed mb-0 mt-1 Link--primary'
    title_tags = doc.find_all('p', {'class': title_class})
    return [title_tag.text for title_tag in title_tags]

In [13]:
def get_topic_descs(doc):
    """ Extracts the topic descriptions from the GitHub topics page.

    Args:
        doc (BeautifulSoup): Parsed HTML document of the topics page.

    Returns:
        list: A list of topic descriptions with surrounding whitespace removed.
    """
    desc_selector = 'f5 color-fg-muted mb-0 mt-1'
    # The attribute filter must be a dict ({'class': value}); a set literal
    # ({'class', value}) is not an attribute mapping.
    topic_desc_tags = doc.find_all('p', {'class': desc_selector})

    return [tag.text.strip() for tag in topic_desc_tags]

In [14]:
def get_topic_urls(doc):
    """ Builds the absolute URL of every topic linked on the GitHub topics page.

    Args:
        doc (BeautifulSoup): Parsed HTML document of the topics page.

    Returns:
        list: Absolute GitHub topic URLs, in document order.
    """
    site_root = 'https://github.com'
    link_class = 'no-underline flex-1 d-flex flex-column'
    anchors = doc.find_all('a', {'class': link_class})

    # Each href is relative to the site root
    return [site_root + anchor['href'] for anchor in anchors]

In [15]:
def scrape_topics():
    """ Scrapes all GitHub topics and their metadata from the topics page.

    The topics page is requested once to confirm it is reachable. The titles,
    descriptions and URLs themselves are read from the module-level `doc`
    (the already-parsed topics page), not from this response.

    Args:
        None

    Returns:
        pd.DataFrame: A DataFrame containing topic titles, descriptions, and URLs.

    Raises:
        Exception: If the topics page fails to load (non-200 status code).
    """
    topics_url = 'https://github.com/topics'
    # A timeout keeps an unresponsive server from blocking the notebook forever.
    response = requests.get(topics_url, timeout=30)
    if response.status_code != 200:
        # Uses `topics_url`; the previous message referenced an undefined name.
        raise Exception(f'Failed to load page {topics_url}')
    topics_dict = {
        'title': get_topic_titles(doc),
        'description': get_topic_descs(doc),
        'url': get_topic_urls(doc)
    }
    return pd.DataFrame(topics_dict)

In [16]:
def scrape_topics_repos():
    """ Scrapes repositories from all topics on GitHub and saves them as .csv files.

    One file per topic is written to the 'data_v3' directory, named after the
    topic title. Path separators in a title are replaced with '_' so the title
    always maps to a single file inside that directory.

    Args:
        None

    Returns:
        None
    """
    print('Scraping list of topics:')
    topics_df = scrape_topics()

    os.makedirs('data_v3', exist_ok=True)

    for title, url in zip(topics_df['title'], topics_df['url']):
        # Double quotes inside the single-quoted f-string are plain text, so
        # this line also parses on Python versions older than 3.12.
        print(f'Scraping top repos for "{title}"')
        file_stem = title.replace('/', '_').replace(os.sep, '_')
        scrape_topic(url, 'data_v3/{}.csv'.format(file_stem))

In [17]:
# Starts the web scraping process for all topics and repositories.
# One CSV per topic is written under data_v3/; topics whose CSV already exists are skipped.
scrape_topics_repos()

Scraping list of topics:
Scraping top repos for "3D"
The file data_v3/3D.csv already exists. Skipping...
Scraping top repos for "Ajax"
The file data_v3/Ajax.csv already exists. Skipping...
Scraping top repos for "Algorithm"
The file data_v3/Algorithm.csv already exists. Skipping...
Scraping top repos for "Amp"
The file data_v3/Amp.csv already exists. Skipping...
Scraping top repos for "Android"
The file data_v3/Android.csv already exists. Skipping...
Scraping top repos for "Angular"
The file data_v3/Angular.csv already exists. Skipping...
Scraping top repos for "Ansible"
The file data_v3/Ansible.csv already exists. Skipping...
Scraping top repos for "API"
The file data_v3/API.csv already exists. Skipping...
Scraping top repos for "Arduino"
The file data_v3/Arduino.csv already exists. Skipping...
Scraping top repos for "ASP.NET"
Scraping top repos for "Awesome Lists"
Scraping top repos for "Amazon Web Services"
Scraping top repos for "Azure"
Scraping top repos for "Babel"
Scraping top r