In [None]:
!pip install requests --upgrade --quiet
!pip install beautifulsoup4 --upgrade --quiet
!pip install pandas --upgrade --quiet

In [2]:
import requests
from bs4 import BeautifulSoup
import os
import pandas as pd

In [3]:
# Listing page of GitHub topics; downloaded in the next cell.
topics_url = 'https://github.com/topics'
# Site root, prepended to the relative hrefs found in scraped repository links.
base_url = 'https://github.com'

In [4]:
# Download the topics listing page. The status code is not checked in this cell.
response = requests.get(topics_url)

In [5]:
# Body of the response as text (the HTML to be parsed).
page_contents = response.text

In [6]:
# Parse the HTML with the 'html.parser' backend into a searchable document.
doc = BeautifulSoup(page_contents, 'html.parser')

In [7]:
def parse_star_count(stars_str):
    """Convert a star-count label such as '87', '1,234' or '12.5k' to an int.

    Args:
        stars_str: Text of the star counter. Surrounding whitespace is
            ignored, ',' thousands separators are removed, and a trailing
            'k' (thousands) or 'm' (millions) suffix is accepted in either
            case.

    Returns:
        The star count as an integer.

    Raises:
        ValueError: If the text is empty or does not contain a number.
    """
    text = stars_str.strip().replace(',', '')
    if not text:
        # Indexing text[-1] on an empty string would raise an opaque
        # IndexError; fail with the same exception type int('') would give.
        raise ValueError('Empty star count')

    multipliers = {'k': 1000, 'm': 1000000}
    suffix = text[-1].lower()
    if suffix in multipliers:
        # round() rather than int(): a float product may land a hair below
        # the intended integer, and truncation would then lose a unit.
        return round(float(text[:-1]) * multipliers[suffix])
    return int(text)

In [8]:
def get_topic_page(topic_url, timeout=30):
    """Download a topic page and return it as a parsed BeautifulSoup document.

    Args:
        topic_url: URL of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, requests may wait indefinitely on a stalled connection.

    Returns:
        The page parsed with the 'html.parser' backend.

    Raises:
        Exception: If the response status code is not 200.
    """
    # Download the page
    response = requests.get(topic_url, timeout=timeout)
    # Check successful response
    if response.status_code != 200:
        raise Exception('Failed to load page {}'.format(topic_url))
    # Parse using Beautiful Soup
    return BeautifulSoup(response.text, 'html.parser')

def get_repo_info(h1_tag, star_tag):
    """Extract the owner, name, star count and URL of one repository.

    Args:
        h1_tag: Heading element whose first two <a> children are the owner
            link and the repository link, in that order.
        star_tag: Element whose text is the star counter.

    Returns:
        A tuple (username, repo_name, stars, repo_url).
    """
    links = h1_tag.find_all('a')
    owner_link = links[0]
    repo_link = links[1]

    owner = owner_link.text.strip()
    name = repo_link.text.strip()
    # The href is site-relative, so prefix it with the site root.
    url = base_url + repo_link['href']
    star_total = parse_star_count(star_tag.text.strip())
    return owner, name, star_total, url

def get_topic_repos(topic_doc):
    """Collect the repositories listed on a parsed topic page.

    Args:
        topic_doc: Parsed topic page exposing `find_all`.

    Returns:
        A pandas DataFrame with the columns 'username', 'repo_name',
        'stars' and 'repo_url', one row per repository heading found.
    """
    # Repository headings are <h3> elements holding the owner and repo links.
    heading_class = 'f3 color-fg-muted text-normal lh-condensed'
    headings = topic_doc.find_all('h3', {'class': heading_class})
    # Star counters, matched to the headings by position.
    counters = topic_doc.find_all('span', {'class': 'Counter js-social-count'})

    columns = ('username', 'repo_name', 'stars', 'repo_url')
    # Index into counters (instead of zip) so a missing counter still raises.
    rows = [
        get_repo_info(heading, counters[pos])
        for pos, heading in enumerate(headings)
    ]
    table = {
        column: [row[col_pos] for row in rows]
        for col_pos, column in enumerate(columns)
    }
    return pd.DataFrame(table)

def scrape_topic(topic_url, path):
    """Scrape one topic's repositories into a CSV file, unless it already exists.

    Args:
        topic_url: URL of the topic page to scrape.
        path: Destination CSV path. When a file is already there, nothing is
            downloaded and a notice is printed instead.
    """
    if not os.path.exists(path):
        topic_doc = get_topic_page(topic_url)
        get_topic_repos(topic_doc).to_csv(path, index=None)
        return
    print("The file {} already exists. Skipping...".format(path))

In [9]:
def get_topic_titles(doc):
    """Return the text of every topic-title <p> tag in the parsed page.

    Args:
        doc: Parsed topics listing page exposing `find_all`.

    Returns:
        A list with the unmodified text of each matching tag, in page order.
    """
    title_tags = doc.find_all('p', {'class': 'f3 lh-condensed mb-0 mt-1 Link--primary'})
    return [title_tag.text for title_tag in title_tags]

def get_topic_descs(doc):
    """Return the whitespace-stripped text of every topic-description <p> tag.

    Args:
        doc: Parsed topics listing page exposing `find_all`.

    Returns:
        A list of description strings, in page order.
    """
    desc_selector = 'f5 color-fg-muted mb-0 mt-1'
    # The attrs filter must be a dict ({'class': ...}); the previous
    # {'class', desc_selector} was a set literal and only matched by accident.
    topic_desc_tags = doc.find_all('p', {'class': desc_selector})
    return [tag.text.strip() for tag in topic_desc_tags]

def get_topic_urls(doc):
    """Return the absolute URL of every topic link in the parsed page.

    Args:
        doc: Parsed topics listing page exposing `find_all`.

    Returns:
        A list of URLs built by prefixing each link's href with the site root.
    """
    link_class = 'no-underline flex-1 d-flex flex-column'
    site_root = 'https://github.com'
    return [
        site_root + link['href']
        for link in doc.find_all('a', {'class': link_class})
    ]

def scrape_topics():
    """Download the GitHub topics listing and return it as a DataFrame.

    Returns:
        A pandas DataFrame with the columns 'title', 'description' and 'url',
        one row per topic found on the page.

    Raises:
        Exception: If the response status code is not 200.
    """
    topics_url = 'https://github.com/topics'
    # A timeout keeps a stalled connection from hanging the notebook.
    response = requests.get(topics_url, timeout=30)
    if response.status_code != 200:
        # Previously formatted the undefined name `topic_url`, which raised
        # NameError instead of this error.
        raise Exception('Failed to load page {}'.format(topics_url))
    # Parse the page just downloaded rather than relying on a global `doc`
    # left over from an earlier cell.
    topics_doc = BeautifulSoup(response.text, 'html.parser')
    topics_dict = {
        'title': get_topic_titles(topics_doc),
        'description': get_topic_descs(topics_doc),
        'url': get_topic_urls(topics_doc)
    }
    return pd.DataFrame(topics_dict)

def scrape_topic(topic_url, topic_name):
    """Scrape one topic's repositories into a CSV file, unless it already exists.

    This definition replaces any earlier `scrape_topic` once its cell runs,
    so it keeps that version's skip-if-present guard.

    Args:
        topic_url: URL of the topic page to scrape.
        topic_name: Output file name or path. '.csv' is appended only when
            it is not already present, so both 'Python' and 'data/Python.csv'
            are accepted (the latter no longer becomes 'data/Python.csv.csv').
    """
    if topic_name.lower().endswith('.csv'):
        path = topic_name
    else:
        path = topic_name + '.csv'

    # Skip the download when the output is already on disk.
    if os.path.exists(path):
        print("The file {} already exists. Skipping...".format(path))
        return

    topic_df = get_topic_repos(get_topic_page(topic_url))
    topic_df.to_csv(path, index=None)

In [10]:
def scrape_topics_repos():
    """Scrape the topics listing, then the repositories of each topic.

    Creates the 'data' directory if needed and calls `scrape_topic` once per
    topic with the target 'data/<title>.csv', printing progress as it goes.
    """
    print('Scraping list of topics')
    topics_df = scrape_topics()

    os.makedirs('data', exist_ok=True)

    # Walk the two columns in lockstep; the row index is not needed.
    for title, url in zip(topics_df['title'], topics_df['url']):
        print('Scraping top repos for "{}"'.format(title))
        scrape_topic(url, 'data/{}.csv'.format(title))

In [11]:
# Run the full scrape; progress is printed per topic and output goes under data/.
scrape_topics_repos()

Scraping list of topics
Scraping top repos for "3D"
Scraping top repos for "Ajax"
Scraping top repos for "Algorithm"
Scraping top repos for "Amp"
Scraping top repos for "Android"
Scraping top repos for "Angular"
Scraping top repos for "Ansible"
Scraping top repos for "API"
Scraping top repos for "Arduino"
Scraping top repos for "ASP.NET"
Scraping top repos for "Awesome Lists"
Scraping top repos for "Amazon Web Services"
Scraping top repos for "Azure"
Scraping top repos for "Babel"
Scraping top repos for "Bash"
Scraping top repos for "Bitcoin"
Scraping top repos for "Bootstrap"
Scraping top repos for "Bot"
Scraping top repos for "C"
Scraping top repos for "Chrome"
Scraping top repos for "Chrome extension"
Scraping top repos for "Command-line interface"
Scraping top repos for "Clojure"
Scraping top repos for "Code quality"
Scraping top repos for "Code review"
Scraping top repos for "Compiler"
Scraping top repos for "Continuous integration"
Scraping top repos for "C++"
Scraping top repos 