# Scraping Top Repositories for Topics on GitHub

In [349]:
import requests
from bs4 import BeautifulSoup

In [350]:
topics_url = 'https://github.com/topics'

###### HTTP status code classes: informational responses (100-199) | successful responses (200-299) | redirects (300-399) | client errors (400-499) | server errors (500-599)

In [351]:
# Download the topics listing page; the last expression displays the HTTP status code.
response = requests.get(topics_url)
response.status_code

200

In [352]:
page_contents = response.text

In [353]:
page_contents[:1000]

'\n\n<!DOCTYPE html>\n<html lang="en" data-color-mode="auto" data-light-theme="light" data-dark-theme="dark" data-a11y-animated-images="system">\n  <head>\n    <meta charset="utf-8">\n  <link rel="dns-prefetch" href="https://github.githubassets.com">\n  <link rel="dns-prefetch" href="https://avatars.githubusercontent.com">\n  <link rel="dns-prefetch" href="https://github-cloud.s3.amazonaws.com">\n  <link rel="dns-prefetch" href="https://user-images.githubusercontent.com/">\n  <link rel="preconnect" href="https://github.githubassets.com" crossorigin>\n  <link rel="preconnect" href="https://avatars.githubusercontent.com">\n\n  <link crossorigin="anonymous" media="all" rel="stylesheet" href="https://github.githubassets.com/assets/light-719f1193e0c0.css" /><link crossorigin="anonymous" media="all" rel="stylesheet" href="https://github.githubassets.com/assets/dark-0c343b529849.css" /><link data-color-theme="dark_dimmed" crossorigin="anonymous" media="all" rel="stylesheet" data-href="https:/

### Use Beautiful Soup to parse and extract information

In [354]:
from bs4 import BeautifulSoup

In [355]:
doc = BeautifulSoup(page_contents, 'html.parser')

In [356]:
# Class string carried by the <p> tags that hold each topic's title.
selection_class = 'f3 lh-condensed mb-0 mt-1 Link--primary'

# Collect every matching <p> tag and display how many were found.
topic_title_tags = doc.find_all('p', {'class': selection_class})
len(topic_title_tags)

30

In [357]:
topic_title_tags[:5]

[<p class="f3 lh-condensed mb-0 mt-1 Link--primary">3D</p>,
 <p class="f3 lh-condensed mb-0 mt-1 Link--primary">Ajax</p>,
 <p class="f3 lh-condensed mb-0 mt-1 Link--primary">Algorithm</p>,
 <p class="f3 lh-condensed mb-0 mt-1 Link--primary">Amp</p>,
 <p class="f3 lh-condensed mb-0 mt-1 Link--primary">Android</p>]

In [358]:
# Class string carried by the <p> tags that hold each topic's short description.
desc_selector = "f5 color-fg-muted mb-0 mt-1"
topic_desc_tags = doc.find_all('p', {'class': desc_selector})

In [359]:
topic_desc_tags[:5]

[<p class="f5 color-fg-muted mb-0 mt-1">
           3D refers to the use of three-dimensional graphics, modeling, and animation in various industries.
         </p>,
 <p class="f5 color-fg-muted mb-0 mt-1">
           Ajax is a technique for creating interactive web applications.
         </p>,
 <p class="f5 color-fg-muted mb-0 mt-1">
           Algorithms are self-contained sequences that carry out a variety of tasks.
         </p>,
 <p class="f5 color-fg-muted mb-0 mt-1">
           Amp is a non-blocking concurrency library for PHP.
         </p>,
 <p class="f5 color-fg-muted mb-0 mt-1">
           Android is an operating system built by Google designed for mobile devices.
         </p>]

In [360]:
topic_title_tag0 = topic_title_tags[0]

In [361]:
div_tag = topic_title_tag0.parent

In [362]:
topic_link_tags = doc.find_all('a', {'class': 'no-underline flex-1 d-flex flex-column'})

In [363]:
len(topic_link_tags)

30

In [364]:
topic_link_tags[0]

<a class="no-underline flex-1 d-flex flex-column" href="/topics/3d">
<p class="f3 lh-condensed mb-0 mt-1 Link--primary">3D</p>
<p class="f5 color-fg-muted mb-0 mt-1">
          3D refers to the use of three-dimensional graphics, modeling, and animation in various industries.
        </p>
</a>

In [365]:
topic0_url = "https://github.com" + topic_link_tags[0]['href']
print(topic0_url)

https://github.com/topics/3d


In [366]:


# Pull the display text out of every topic title tag, keeping page order.
topic_titles = [title_tag.text for title_tag in topic_title_tags]

print(topic_titles)

['3D', 'Ajax', 'Algorithm', 'Amp', 'Android', 'Angular', 'Ansible', 'API', 'Arduino', 'ASP.NET', 'Atom', 'Awesome Lists', 'Amazon Web Services', 'Azure', 'Babel', 'Bash', 'Bitcoin', 'Bootstrap', 'Bot', 'C', 'Chrome', 'Chrome extension', 'Command line interface', 'Clojure', 'Code quality', 'Code review', 'Compiler', 'Continuous integration', 'COVID-19', 'C++']


In [367]:
# The description text is surrounded by newlines and indentation in the HTML;
# strip() trims whitespace from both ends.
topic_descs = [desc_tag.text.strip() for desc_tag in topic_desc_tags]

topic_descs[:5]

['3D refers to the use of three-dimensional graphics, modeling, and animation in various industries.',
 'Ajax is a technique for creating interactive web applications.',
 'Algorithms are self-contained sequences that carry out a variety of tasks.',
 'Amp is a non-blocking concurrency library for PHP.',
 'Android is an operating system built by Google designed for mobile devices.']

In [368]:
base_url = 'https://github.com'
# Each href on the topics page is site-relative (e.g. '/topics/3d'), so prefix the host.
topic_urls = [base_url + link_tag['href'] for link_tag in topic_link_tags]

topic_urls

['https://github.com/topics/3d',
 'https://github.com/topics/ajax',
 'https://github.com/topics/algorithm',
 'https://github.com/topics/amphp',
 'https://github.com/topics/android',
 'https://github.com/topics/angular',
 'https://github.com/topics/ansible',
 'https://github.com/topics/api',
 'https://github.com/topics/arduino',
 'https://github.com/topics/aspnet',
 'https://github.com/topics/atom',
 'https://github.com/topics/awesome',
 'https://github.com/topics/aws',
 'https://github.com/topics/azure',
 'https://github.com/topics/babel',
 'https://github.com/topics/bash',
 'https://github.com/topics/bitcoin',
 'https://github.com/topics/bootstrap',
 'https://github.com/topics/bot',
 'https://github.com/topics/c',
 'https://github.com/topics/chrome',
 'https://github.com/topics/chrome-extension',
 'https://github.com/topics/cli',
 'https://github.com/topics/clojure',
 'https://github.com/topics/code-quality',
 'https://github.com/topics/code-review',
 'https://github.com/topics/compil

In [369]:
import pandas as pd

In [370]:
# Column name -> list of scraped values for that column.
# NOTE(review): assumes the three lists have the same length (one entry per topic) — confirm.
topics_dict = {
    'title': topic_titles,
    'description': topic_descs,
    'url': topic_urls
}

In [371]:
topics_df = pd.DataFrame(topics_dict)

In [372]:
topics_df

Unnamed: 0,title,description,url
0,3D,3D refers to the use of three-dimensional grap...,https://github.com/topics/3d
1,Ajax,Ajax is a technique for creating interactive w...,https://github.com/topics/ajax
2,Algorithm,Algorithms are self-contained sequences that c...,https://github.com/topics/algorithm
3,Amp,Amp is a non-blocking concurrency library for ...,https://github.com/topics/amphp
4,Android,Android is an operating system built by Google...,https://github.com/topics/android
5,Angular,Angular is an open source web application plat...,https://github.com/topics/angular
6,Ansible,Ansible is a simple and powerful automation en...,https://github.com/topics/ansible
7,API,An API (Application Programming Interface) is ...,https://github.com/topics/api
8,Arduino,Arduino is an open source platform for buildin...,https://github.com/topics/arduino
9,ASP.NET,ASP.NET is a web framework for building modern...,https://github.com/topics/aspnet


## Create a CSV file

In [373]:
# index=False keeps the DataFrame's row index out of the file. The previous
# index=None only had that effect because None is falsy; the parameter is a bool.
topics_df.to_csv('topics.csv', index=False)

## Getting information out of a topic page

In [374]:
topic_page_url= topic_urls[0]
topic_page_url

'https://github.com/topics/3d'

In [375]:
response= requests.get(topic_page_url)

In [376]:
response.status_code

200

In [377]:
len(response.text)

455644

In [378]:
topic_doc= BeautifulSoup(response.text,'html.parser')


In [379]:
# Class string of the <h3> heading that wraps each repository's owner and name links.
h3_selection_class = 'f3 color-fg-muted text-normal lh-condensed'
repo_tags = topic_doc.find_all('h3', {'class': h3_selection_class} )

In [380]:
repo_tags[0]

<h3 class="f3 color-fg-muted text-normal lh-condensed">
<a data-hydro-click='{"event_type":"explore.click","payload":{"click_context":"REPOSITORY_CARD","click_target":"OWNER","click_visual_representation":"REPOSITORY_OWNER_HEADING","actor_id":null,"record_id":97088,"originating_url":"https://github.com/topics/3d","user_id":null}}' data-hydro-click-hmac="4bdbc49d3c05ae7f70b531fbce709a384200b0768554e0172950286a8db30940" data-turbo="false" data-view-component="true" href="/mrdoob">
            mrdoob
</a>          /
          <a class="text-bold wb-break-word" data-hydro-click='{"event_type":"explore.click","payload":{"click_context":"REPOSITORY_CARD","click_target":"REPOSITORY","click_visual_representation":"REPOSITORY_NAME_HEADING","actor_id":null,"record_id":576201,"originating_url":"https://github.com/topics/3d","user_id":null}}' data-hydro-click-hmac="517d3d5cb9d89752156923904a4238816bc9b51ab7772f3e3644ce897d8dd4e5" data-turbo="false" data-view-component="true" href="/mrdoob/three.js

In [381]:
len(repo_tags)

20

In [382]:
repo_tags[0]

<h3 class="f3 color-fg-muted text-normal lh-condensed">
<a data-hydro-click='{"event_type":"explore.click","payload":{"click_context":"REPOSITORY_CARD","click_target":"OWNER","click_visual_representation":"REPOSITORY_OWNER_HEADING","actor_id":null,"record_id":97088,"originating_url":"https://github.com/topics/3d","user_id":null}}' data-hydro-click-hmac="4bdbc49d3c05ae7f70b531fbce709a384200b0768554e0172950286a8db30940" data-turbo="false" data-view-component="true" href="/mrdoob">
            mrdoob
</a>          /
          <a class="text-bold wb-break-word" data-hydro-click='{"event_type":"explore.click","payload":{"click_context":"REPOSITORY_CARD","click_target":"REPOSITORY","click_visual_representation":"REPOSITORY_NAME_HEADING","actor_id":null,"record_id":576201,"originating_url":"https://github.com/topics/3d","user_id":null}}' data-hydro-click-hmac="517d3d5cb9d89752156923904a4238816bc9b51ab7772f3e3644ce897d8dd4e5" data-turbo="false" data-view-component="true" href="/mrdoob/three.js

In [383]:
a_tags = repo_tags[0].find_all('a')

In [384]:
a_tags[0]

<a data-hydro-click='{"event_type":"explore.click","payload":{"click_context":"REPOSITORY_CARD","click_target":"OWNER","click_visual_representation":"REPOSITORY_OWNER_HEADING","actor_id":null,"record_id":97088,"originating_url":"https://github.com/topics/3d","user_id":null}}' data-hydro-click-hmac="4bdbc49d3c05ae7f70b531fbce709a384200b0768554e0172950286a8db30940" data-turbo="false" data-view-component="true" href="/mrdoob">
            mrdoob
</a>

In [385]:
base_url = 'https://github.com'
# The second <a> in the heading holds the site-relative repository path
# (e.g. '/mrdoob/three.js'); prefix the host to get an absolute URL.
repo_url = base_url + a_tags[1]['href']
print(repo_url)

https://github.com/mrdoob/three.js


In [386]:
star_tags = topic_doc.find_all('span',{'class':'Counter js-social-count'})

In [387]:
len(star_tags)

20

In [388]:
star_tags[0].text.strip()

'89.1k'

In [389]:
def parse_star_count(stars_str):
    """Convert a star count as displayed on the page into an integer.

    Accepts plain integers ('891'), values with thousands separators
    ('1,234') and abbreviated values with a 'k' (thousand) or 'm' (million)
    suffix in either case ('89.1k', '2K', '1.5m'). Surrounding whitespace is
    ignored.

    Args:
        stars_str: the text of a star counter element.

    Returns:
        The star count as an int.

    Raises:
        ValueError: if the text is empty or is not a recognisable number.
    """
    text = stars_str.strip().replace(',', '')
    if not text:
        # Indexing text[-1] on an empty string would raise an unhelpful IndexError.
        raise ValueError('empty star count')
    multipliers = {'k': 1000, 'm': 1000000}
    suffix = text[-1].lower()
    if suffix in multipliers:
        # round() before int(): the float product can land a hair below the
        # intended integer, and plain int() truncation would then lose one.
        return int(round(float(text[:-1]) * multipliers[suffix]))
    return int(text)

Worked examples for `parse_star_count`:

stars_str = '89.1k' | stars_str[-1] is 'k' | float(stars_str[:-1]) * 1000 → 89100

-----------------------------
stars_str = '891' | no 'k' suffix |
int(stars_str) → 891

In [390]:
parse_star_count(star_tags[0].text.strip())

89100

In [391]:
def get_repo_info(h3_tag, star_tag):
    """Extract (username, repo_name, stars, repo_url) for one repository.

    `h3_tag` is the repository heading: its first <a> is the owner link and
    its second <a> is the repository link. `star_tag` is the element whose
    text is the star count.
    """
    links = h3_tag.find_all('a')
    owner_link = links[0]
    repo_link = links[1]
    owner = owner_link.text.strip()
    name = repo_link.text.strip()
    # The href is site-relative, so prefix it with the notebook-level base_url.
    url = base_url + repo_link['href']
    star_count = parse_star_count(star_tag.text.strip())
    return owner, name, star_count, url

In [392]:
get_repo_info(repo_tags[0], star_tags[0])

('mrdoob', 'three.js', 89100, 'https://github.com/mrdoob/three.js')

In [394]:
# One list per output column; every repository appends one value to each list.
repo_columns = ('username', 'repo_name', 'stars', 'repo_url')
topic_repos_dict = {column: [] for column in repo_columns}

# Headings and star counters are paired by position on the page.
for position, repo_h3 in enumerate(repo_tags):
    repo_info = get_repo_info(repo_h3, star_tags[position])
    for column, value in zip(repo_columns, repo_info):
        topic_repos_dict[column].append(value)

In [395]:
topic_repos_df= pd.DataFrame(topic_repos_dict)
topic_repos_df

Unnamed: 0,username,repo_name,stars,repo_url
0,mrdoob,three.js,89100,https://github.com/mrdoob/three.js
1,pmndrs,react-three-fiber,21400,https://github.com/pmndrs/react-three-fiber
2,libgdx,libgdx,21100,https://github.com/libgdx/libgdx
3,BabylonJS,Babylon.js,19400,https://github.com/BabylonJS/Babylon.js
4,ssloy,tinyrenderer,16100,https://github.com/ssloy/tinyrenderer
5,aframevr,aframe,15100,https://github.com/aframevr/aframe
6,lettier,3d-game-shaders-for-beginners,14600,https://github.com/lettier/3d-game-shaders-for...
7,FreeCAD,FreeCAD,13200,https://github.com/FreeCAD/FreeCAD
8,CesiumGS,cesium,9900,https://github.com/CesiumGS/cesium
9,metafizzy,zdog,9600,https://github.com/metafizzy/zdog


## final code

In [396]:
def get_topic_page(topic_url):
    """Download a topic page and return it parsed with BeautifulSoup.

    Raises:
        Exception: if the response status code is not 200.
    """
    page_response = requests.get(topic_url)
    if page_response.status_code == 200:
        # Successful download: hand the HTML to the parser.
        return BeautifulSoup(page_response.text, 'html.parser')
    raise Exception('Failed to load page {}'.format(topic_url))

def get_repo_info(h3_tag, star_tag):
    """Return a (username, repo_name, stars, repo_url) tuple for one repository.

    The heading's first <a> supplies the username; its second <a> supplies
    the repository name and the site-relative href. The star count is parsed
    from the text of `star_tag`.
    """
    anchors = h3_tag.find_all('a')
    username = anchors[0].text.strip()
    repo_anchor = anchors[1]
    return (
        username,
        repo_anchor.text.strip(),
        parse_star_count(star_tag.text.strip()),
        # base_url is the notebook-level 'https://github.com' prefix.
        base_url + repo_anchor['href'],
    )

def get_topic_repos(topic_doc):
    """Build a DataFrame of the repositories listed on a parsed topic page.

    The result has the columns 'username', 'repo_name', 'stars' and
    'repo_url', one row per repository heading found in `topic_doc`.
    """
    # <h3> headings that contain the owner link and the repository link
    heading_class = 'f3 color-fg-muted text-normal lh-condensed'
    repo_headings = topic_doc.find_all('h3', {'class': heading_class})
    # <span> counters that hold the star count text
    star_counters = topic_doc.find_all('span', {'class': 'Counter js-social-count'})

    collected = {'username': [], 'repo_name': [], 'stars': [], 'repo_url': []}
    # Headings and counters are matched by their position on the page.
    for position, heading in enumerate(repo_headings):
        username, repo_name, stars, repo_url = get_repo_info(heading, star_counters[position])
        collected['username'].append(username)
        collected['repo_name'].append(repo_name)
        collected['stars'].append(stars)
        collected['repo_url'].append(repo_url)

    return pd.DataFrame(collected)

def scape_topic(topic_url, topic_name):
    """Scrape one topic page and save its repositories as a CSV file.

    Args:
        topic_url: URL of the topic page to download.
        topic_name: output file name or path, with or without a trailing
            '.csv'. The extension is added only when it is missing.
    """
    topic_df = get_topic_repos(get_topic_page(topic_url))
    # Appending '.csv' unconditionally turned a name that already carried the
    # extension (e.g. 'data/3D.csv') into 'data/3D.csv.csv'.
    if topic_name.lower().endswith('.csv'):
        csv_path = topic_name
    else:
        csv_path = topic_name + '.csv'
    # index=False keeps the DataFrame's row index out of the file.
    topic_df.to_csv(csv_path, index=False)

# Write a single function to:

1. Get the list of topics from the topics page
2. Get the list of top repos from the individual topic pages
3. For each topic, create a CSV of the top repos for the topic

In [397]:
def get_topic_titles(doc):
    """Return the text of every topic title <p> tag found in `doc`, in order."""
    title_class = 'f3 lh-condensed mb-0 mt-1 Link--primary'
    return [title_tag.text for title_tag in doc.find_all('p', {'class': title_class})]

def get_topic_descs(doc):
    """Return every topic description found in `doc`, with surrounding whitespace removed."""
    description_class = "f5 color-fg-muted mb-0 mt-1"
    description_tags = doc.find_all('p', {'class': description_class})
    # strip() trims whitespace at both ends of each tag's text.
    return [description_tag.text.strip() for description_tag in description_tags]

def get_topic_urls(doc):
    """Return the absolute URL of every topic link found in `doc`, in order."""
    github_root = 'https://github.com'
    link_tags = doc.find_all('a', {'class': 'no-underline flex-1 d-flex flex-column'})
    # Each href is appended to the site root as-is.
    return [github_root + link_tag['href'] for link_tag in link_tags]
    

# to get list of topics
def scrape_topics():
    """Download the GitHub topics page and return its topics as a DataFrame.

    Returns:
        A DataFrame with the columns 'title', 'description' and 'url'.

    Raises:
        Exception: if the response status code is not 200.
    """
    topics_url = 'https://github.com/topics'
    response = requests.get(topics_url)
    if response.status_code != 200:
        # The message previously referenced the undefined name `topic_url`,
        # which raised NameError instead of this Exception.
        raise Exception('Failed to load page {}'.format(topics_url))
    # Parse the page that was just downloaded. Without this line the function
    # read whatever `doc` happened to exist at notebook level.
    doc = BeautifulSoup(response.text, 'html.parser')
    topics_dict = {
        'title': get_topic_titles(doc),
        'description': get_topic_descs(doc),
        'url': get_topic_urls(doc)
    }
    return pd.DataFrame(topics_dict)

In [398]:
import os
help(os.makedirs)

Help on function makedirs in module os:

makedirs(name, mode=511, exist_ok=False)
    makedirs(name [, mode=0o777][, exist_ok=False])
    
    Super-mkdir; create a leaf directory and all intermediate ones.  Works like
    mkdir, except that any intermediate path segment (not just the rightmost)
    will be created if it does not exist. If the target directory already
    exists, raise an OSError if exist_ok is False. Otherwise no exception is
    raised.  This is recursive.



In [399]:
def scrape_topics_repos():
    """Scrape the list of topics, then save each topic's top repositories as a CSV.

    Files are written into a 'data' directory, which is created if it does
    not exist yet. One progress line is printed per topic.
    """
    print('Scraping list of topics')
    topics_df = scrape_topics()
    # create folder here to store files
    os.makedirs('data', exist_ok=True)
    for _, row in topics_df.iterrows():
        print("Scraping top repositories for '{}'".format(row['title']))
        # Pass the path without the extension: scape_topic supplies '.csv',
        # and including it here produced files named '<title>.csv.csv'.
        scape_topic(row['url'], 'data/{}'.format(row['title']))

In [400]:
scrape_topics_repos()

Scraping list of topics
Scraping top repositories for '3D'
Scraping top repositories for 'Ajax'
Scraping top repositories for 'Algorithm'
Scraping top repositories for 'Amp'
Scraping top repositories for 'Android'
Scraping top repositories for 'Angular'
Scraping top repositories for 'Ansible'
Scraping top repositories for 'API'
Scraping top repositories for 'Arduino'
Scraping top repositories for 'ASP.NET'
Scraping top repositories for 'Atom'
Scraping top repositories for 'Awesome Lists'
Scraping top repositories for 'Amazon Web Services'
Scraping top repositories for 'Azure'
Scraping top repositories for 'Babel'
Scraping top repositories for 'Bash'
Scraping top repositories for 'Bitcoin'
Scraping top repositories for 'Bootstrap'
Scraping top repositories for 'Bot'
Scraping top repositories for 'C'
Scraping top repositories for 'Chrome'
Scraping top repositories for 'Chrome extension'
Scraping top repositories for 'Command line interface'
Scraping top repositories for 'Clojure'
Scrapin