In [3]:
!pip install requests --upgrade --quiet
!pip install beautifulsoup4 --upgrade --quiet
import requests
from bs4 import BeautifulSoup
topic_url="https://github.com/topics"
def get_topic_page():
    """Download the page at the module-level ``topic_url`` and parse it.

    Returns:
        The ``BeautifulSoup`` document built from the response text with
        the ``html.parser`` backend.

    Raises:
        Exception: if the response status code is anything other than 200.
    """
    page = requests.get(topic_url)

    # Only a plain 200 counts as success; every other status aborts.
    if page.status_code == 200:
        return BeautifulSoup(page.text, 'html.parser')

    raise Exception('Failed to load page {}'.format(topic_url))


In [4]:
doc=get_topic_page()

In [5]:
def get_topic_titles(doc):
    """Return the text of every topic-title ``<p>`` tag in ``doc``.

    Args:
        doc: parsed page exposing ``find_all`` (a BeautifulSoup document
            in this notebook).

    Returns:
        list: the ``text`` of each matching tag, in the order ``find_all``
        yields them. The text is returned as-is (not stripped).
    """
    title_classes = 'f3 lh-condensed mb-0 mt-1 Link--primary'
    return [p_tag.text for p_tag in doc.find_all('p', {'class': title_classes})]

In [6]:
titles=get_topic_titles(doc)

In [7]:
len(titles)

30

In [8]:
titles[:5]

['3D', 'Ajax', 'Algorithm', 'Amp', 'Android']

In [9]:
# Get the description for each topic
def get_topic_descs(doc):
    """Return the whitespace-stripped text of every topic-description ``<p>`` tag.

    Args:
        doc: parsed page exposing ``find_all`` (a BeautifulSoup document
            in this notebook).

    Returns:
        list: one stripped string per matching tag, in the order
        ``find_all`` yields them.
    """
    desc_classes = 'f5 color-fg-muted mb-0 mt-1'
    return [p_tag.text.strip() for p_tag in doc.find_all('p', {'class': desc_classes})]

In [10]:
descs=get_topic_descs(doc)

In [11]:
len(descs)

30

In [12]:
descs[:1]

['3D modeling is the process of virtually developing the surface and structure of a 3D object.']

In [13]:
def get_topic_urls(doc):
    """Build the absolute URL of every topic link in ``doc``.

    Args:
        doc: parsed page exposing ``find_all`` (a BeautifulSoup document
            in this notebook).

    Returns:
        list: ``'https://github.com'`` concatenated with the ``href`` of
        each matching ``<a>`` tag, in the order ``find_all`` yields them.
    """
    site_root = 'https://github.com'
    link_tags = doc.find_all('a', {'class': 'no-underline flex-1 d-flex flex-column'})
    # The hrefs are plain-concatenated onto the site root, exactly as found.
    return [site_root + link['href'] for link in link_tags]


In [14]:
urls=get_topic_urls(doc)

In [15]:
len(urls)

30

In [16]:
urls[:1]

['https://github.com/topics/3d']

#### Putting all of this together in a single function

In [62]:
def scrape_topics():
    """Scrape the GitHub topics listing into a DataFrame.

    Downloads ``https://github.com/topics``, parses it with BeautifulSoup
    and combines the results of ``get_topic_titles``, ``get_topic_descs``
    and ``get_topic_urls``.

    Returns:
        pandas.DataFrame: one row per topic with the columns ``title``,
        ``description`` and ``url``. pandas requires the three extracted
        lists to have the same length.

    Raises:
        Exception: if the response status code is anything other than 200.
    """
    topics_url = 'https://github.com/topics'
    response = requests.get(topics_url)
    if response.status_code != 200:
        # Report the URL that was actually requested. The original message
        # formatted the unrelated global ``topic_url``, which only worked
        # while that global happened to exist with the same value.
        raise Exception('Failed to load page {}'.format(topics_url))
    doc = BeautifulSoup(response.text, 'html.parser')
    topics_dict = {
        'title': get_topic_titles(doc),
        'description': get_topic_descs(doc),
        'url': get_topic_urls(doc)
    }
    return pd.DataFrame(topics_dict)
    

#### Getting top 25 repositories from a topic page

In [18]:
def get_topic_page(topic_url='https://github.com/topics', timeout=30):
    """Download a page and return it parsed with BeautifulSoup.

    This definition replaces the earlier zero-argument ``get_topic_page``.
    ``topic_url`` defaults to the topics listing so that existing
    ``get_topic_page()`` calls keep working after this cell has run.

    Args:
        topic_url: URL of the page to download.
        timeout: seconds passed to ``requests.get`` as its ``timeout``.
            Without one, a stalled connection can block indefinitely.

    Returns:
        The ``BeautifulSoup`` document built from the response text with
        the ``html.parser`` backend.

    Raises:
        Exception: if the response status code is anything other than 200.
    """
    # Download the page
    response = requests.get(topic_url, timeout=timeout)
    # Check successful response
    if response.status_code != 200:
        raise Exception('Failed to load page {}'.format(topic_url))
    topic_doc = BeautifulSoup(response.text, 'html.parser')
    return topic_doc


In [19]:
# Topic pages live under /topics/<name> (the form get_topic_urls builds),
# not /topic/<name>.
docu=get_topic_page("https://github.com/topics/3d")

In [20]:
def parse_star_count(stars_str):
    """Convert a star-count label such as ``'1.2k'`` into an integer.

    Surrounding whitespace and thousands separators (``,``) are ignored.
    A trailing ``k`` or ``m`` (either case) multiplies the number by
    1000 or 1000000 respectively.

    Args:
        stars_str: the label text, e.g. ``'532'``, ``'1,234'`` or ``'81.5k'``.

    Returns:
        int: the star count.

    Raises:
        ValueError: if the label is empty or is not a valid number.
    """
    cleaned = stars_str.strip().replace(',', '')
    if not cleaned:
        # Indexing an empty string below would raise an opaque IndexError.
        raise ValueError('Empty star count string')

    multipliers = {'k': 1000, 'm': 1000000}
    suffix = cleaned[-1].lower()
    if suffix in multipliers:
        # round() rather than int(): a float product such as 1.001 * 1000
        # can land just below the true value, and int() would truncate it.
        return int(round(float(cleaned[:-1]) * multipliers[suffix]))
    return int(cleaned)

In [67]:
def get_repo_info(h3_tag,star_tag):
    """Extract one repository's details from its heading and star-counter tags.

    Args:
        h3_tag: tag whose first ``<a>`` holds the username and whose
            second ``<a>`` holds the repository name and relative link.
        star_tag: tag whose text is the star-count label.

    Returns:
        tuple: ``(username, repo_name, stars, repo_url)`` where ``stars``
        is the result of ``parse_star_count`` and ``repo_url`` is
        ``'https://github.com'`` concatenated with the second link's ``href``.
    """
    links = h3_tag.find_all('a')
    owner = links[0].text.strip()
    repo_link = links[1]
    name = repo_link.text.strip()
    url = "https://github.com" + repo_link['href']
    star_total = parse_star_count(star_tag.text.strip())
    return owner, name, star_total, url

In [22]:
import pandas as pd

In [57]:
def get_topic_repos(topic_doc):
    """Tabulate the repositories listed on a parsed topic page.

    Args:
        topic_doc: parsed topic page exposing ``find_all`` (a
            BeautifulSoup document in this notebook).

    Returns:
        pandas.DataFrame: one row per repository heading, with the columns
        ``username``, ``repo_name``, ``stars`` and ``repo_url``.
    """
    heading_tags = topic_doc.find_all(
        'h3', {'class': 'f3 color-fg-muted text-normal lh-condensed'})
    # Star counters are matched to headings purely by position.
    counter_tags = topic_doc.find_all('span', {'class': 'Counter js-social-count'})

    columns = ('username', 'repo_name', 'stars', 'repo_url')
    collected = {column: [] for column in columns}
    for position, heading in enumerate(heading_tags):
        info = get_repo_info(heading, counter_tags[position])
        # get_repo_info returns its values in the same order as `columns`.
        for column, value in zip(columns, info):
            collected[column].append(value)
    return pd.DataFrame(collected)


In [58]:
import os

In [59]:
def scrape_topic(topic_url, path):
    """Scrape one topic page into a CSV file unless that file already exists.

    Args:
        topic_url: URL handed to ``get_topic_page``.
        path: destination CSV path. If it already exists, a notice is
            printed and nothing is downloaded or written.

    Returns:
        None
    """
    if not os.path.exists(path):
        page = get_topic_page(topic_url)
        repos_table = get_topic_repos(page)
        # index=None keeps the row index out of the written file.
        repos_table.to_csv(path, index=None)
        return
    print("The file {} already exists. Skipping...".format(path))
    

##### Putting it all together
1. We have a function to get the list of topics
2. We have a function to create a CSV file of the repos scraped from a topic page
3. Let's create a function to put them together

In [65]:
def scrape_topics_repos():
    """Scrape the topic list, then write one CSV of repositories per topic.

    Files are written to ``data/<title>.csv``; the ``data`` directory is
    created if it does not exist. Progress is printed for each topic.

    Returns:
        None
    """
    print("Scrapping list of topics")
    topics_table = scrape_topics()

    os.makedirs('data', exist_ok=True)
    # Walk the two needed columns in lockstep rather than row by row.
    for title, url in zip(topics_table['title'], topics_table['url']):
        print("Scrapping top repositories of {}".format(title))
        scrape_topic(url, 'data/{}.csv'.format(title))

In [68]:
scrape_topics_repos()

Scrapping list of topics
Scrapping top repositories of 3D
Scrapping top repositories of Ajax
Scrapping top repositories of Algorithm
Scrapping top repositories of Amp
Scrapping top repositories of Android
Scrapping top repositories of Angular
Scrapping top repositories of Ansible
Scrapping top repositories of API
Scrapping top repositories of Arduino
Scrapping top repositories of ASP.NET
Scrapping top repositories of Atom
Scrapping top repositories of Awesome Lists
Scrapping top repositories of Amazon Web Services
Scrapping top repositories of Azure
Scrapping top repositories of Babel
Scrapping top repositories of Bash
Scrapping top repositories of Bitcoin
Scrapping top repositories of Bootstrap
Scrapping top repositories of Bot
Scrapping top repositories of C
Scrapping top repositories of Chrome
Scrapping top repositories of Chrome extension
Scrapping top repositories of Command line interface
Scrapping top repositories of Clojure
Scrapping top repositories of Code quality
Scrapping t