# Import all libraries

In [2]:
import os

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

## Scrape the topic list and write it to a CSV file
#### First store each topic's name, description and URL; these are used later to scrape the repositories listed under each individual topic

In [3]:
topics_url = 'https://github.com/topics'
# A timeout keeps this cell from hanging forever if the server never answers.
response = requests.get(topics_url, timeout=30)
# Displayed as the cell output; 200 means the page was fetched successfully.
response.status_code

200

In [4]:
# Keep the response body as text and display its length as a quick sanity check
page_contents = response.text
len(page_contents)

126394

In [5]:
# saving html file
# The encoding is set explicitly: the platform default (e.g. cp1252 on Windows)
# can raise UnicodeEncodeError when the page contains non-ASCII characters.
with open('webpage.html', 'w', encoding='utf-8') as f:
    f.write(page_contents)

In [6]:
# Parse the downloaded HTML with the built-in 'html.parser' so it can be searched for tags
doc = BeautifulSoup(page_contents, 'html.parser')

In [7]:
# Class attribute of the <p> tags whose text is read as the topic title
selection_class = 'f3 lh-condensed mb-0 mt-1 Link--primary'
topic_title_tags = doc.find_all('p', {'class':selection_class})

In [8]:
# Class attribute of the <p> tags whose text is read as the topic description
desc_selector = 'f5 color-text-secondary mb-0 mt-1'
topic_desc_tags = doc.find_all('p', {'class': desc_selector})

In [9]:
# Class attribute of the <a> tags whose href is read as the (relative) topic link
topic_link_selector = 'd-flex no-underline'
topic_link_tags = doc.find_all('a', {'class':topic_link_selector})

In [10]:
# Text of every topic title tag; reused later in the final scraping loop
topic_titles = [title_tag.text for title_tag in topic_title_tags]
print(topic_titles)

['3D', 'Ajax', 'Algorithm', 'Amp', 'Android', 'Angular', 'Ansible', 'API', 'Arduino', 'ASP.NET', 'Atom', 'Awesome Lists', 'Amazon Web Services', 'Azure', 'Babel', 'Bash', 'Bitcoin', 'Bootstrap', 'Bot', 'C', 'Chrome', 'Chrome extension', 'Command line interface', 'Clojure', 'Code quality', 'Code review', 'Compiler', 'Continuous integration', 'COVID-19', 'C++']


In [11]:
# Whitespace-trimmed text of every topic description tag
topic_descs = [desc_tag.text.strip() for desc_tag in topic_desc_tags]

# Preview the first two descriptions
topic_descs[:2]

['3D modeling is the process of virtually developing the surface and structure of a 3D object.',
 'Ajax is a technique for creating interactive web applications.']

In [12]:
# Absolute topic URLs (site root + each link's href); reused later in the final scraping loop
base_url = 'https://github.com'
topic_urls = [base_url + link_tag['href'] for link_tag in topic_link_tags]

# Preview the first five URLs
topic_urls[:5]

['https://github.com/topics/3d',
 'https://github.com/topics/ajax',
 'https://github.com/topics/algorithm',
 'https://github.com/topics/amphp',
 'https://github.com/topics/android']

In [13]:
# Column name -> list of values collected above, one entry per topic
topics_dict = dict(
    title=topic_titles,
    description=topic_descs,
    url=topic_urls,
)

In [14]:
# Turn the dictionary of lists into a table (keys become the column names)
topics_df = pd.DataFrame(topics_dict)

In [15]:
# Persist the topic table to topics.csv in the working directory
topics_df.to_csv('topics.csv', index=None)

## Now let's scrape the data for each individual topic
- username
- reponame
- repolink
- star

In [22]:
def parse_star_count(stars_str):
    """Convert a star-count label such as '70.2k' or '1,234' into an integer.

    Parameters
    ----------
    stars_str : str
        Text of the star counter. Surrounding whitespace, thousands
        separators (',') and the case of the suffix are ignored. A trailing
        'k' multiplies by 1,000 and a trailing 'm' by 1,000,000.

    Returns
    -------
    int
        The star count as a whole number.

    Raises
    ------
    ValueError
        If the text is empty or is not a valid number.
    """
    text = stars_str.strip().lower().replace(',', '')
    if not text:
        raise ValueError("star count string is empty")

    multipliers = {'k': 1000, 'm': 1000000}
    suffix = text[-1]
    if suffix in multipliers:
        # round() instead of plain truncation: binary floating point can land
        # just below the exact value (e.g. 32.3 * 1000 -> 32299.999...), and
        # int() alone would then lose one star.
        return int(round(float(text[:-1]) * multipliers[suffix]))
    return int(text)

def scrape_and_save(topic_url, title):
    """Scrape the repositories listed on a topic page and save them as a CSV file.

    Parameters
    ----------
    topic_url : str
        URL of the topic page to download.
    title : str
        Topic title; stored in the ``title`` column of every row and used as
        the name of the CSV file written to ``repo_data/``.

    Returns
    -------
    pandas.DataFrame or None
        Table with the columns ``title``, ``username``, ``reponame``,
        ``repourl`` and ``star``. ``None`` is returned, after printing a
        message, when the response status code is not 200.

    Raises
    ------
    ValueError
        If the number of star counters found on the page differs from the
        number of repositories found, since the two lists could then not be
        paired row by row.
    """
    # A timeout keeps one stalled connection from blocking the whole run.
    response = requests.get(topic_url, timeout=30)
    if response.status_code != 200:
        print("Something wrong! Response is unsuccessful")
        return None
    topic_doc = BeautifulSoup(response.text, 'html.parser')

    base_url = 'https://github.com'
    h1_selection_class = 'f3 color-text-secondary text-normal lh-condensed'
    repo_tags = topic_doc.find_all('h1', {'class': h1_selection_class})

    username = []
    reponame = []
    repourl = []
    for repo_tag in repo_tags:
        # Within each heading the first link is read as the user name and
        # the second one as the repository (text and href).
        a_tags = repo_tag.find_all('a')
        username.append(a_tags[0].text.strip())
        reponame.append(a_tags[1].text.strip())
        repourl.append(base_url + a_tags[1]['href'])

    star_tags = topic_doc.find_all('a', {'class': 'social-count float-none'})
    star = [parse_star_count(star_tag.text.strip()) for star_tag in star_tags]

    # Stars are collected independently of the repository headings, so make
    # sure both lists line up before they are combined into one table.
    if len(star) != len(reponame):
        raise ValueError(
            f"Found {len(reponame)} repositories but {len(star)} star counts "
            f"on {topic_url}"
        )

    topic_repo_df = pd.DataFrame({
        'title': title,
        'username': username,
        'reponame': reponame,
        'repourl': repourl,
        'star': star,
    })

    # Create the output folder on first use so to_csv cannot fail on a
    # missing directory.
    output_dir = 'repo_data'
    os.makedirs(output_dir, exist_ok=True)
    topic_repo_df.to_csv(os.path.join(output_dir, f'{title}.csv'), index=False)
    return topic_repo_df


In [23]:
# Try the scraper on a single topic first; the returned table is shown as the cell output
scrape_and_save('https://github.com/topics/3d','3d')

Unnamed: 0,title,username,reponame,repourl,star
0,3d,mrdoob,three.js,https://github.com/mrdoob/three.js,70200
1,3d,libgdx,libgdx,https://github.com/libgdx/libgdx,18400
2,3d,BabylonJS,Babylon.js,https://github.com/BabylonJS/Babylon.js,13900
3,3d,pmndrs,react-three-fiber,https://github.com/pmndrs/react-three-fiber,13100
4,3d,aframevr,aframe,https://github.com/aframevr/aframe,12700
5,3d,ssloy,tinyrenderer,https://github.com/ssloy/tinyrenderer,10600
6,3d,lettier,3d-game-shaders-for-beginners,https://github.com/lettier/3d-game-shaders-for...,9800
7,3d,FreeCAD,FreeCAD,https://github.com/FreeCAD/FreeCAD,9300
8,3d,metafizzy,zdog,https://github.com/metafizzy/zdog,8400
9,3d,CesiumGS,cesium,https://github.com/CesiumGS/cesium,7000


## Now it's time to scrape all of them

In [34]:
# Scrape every topic collected earlier, pairing each title with its URL.
for topic_title, topic_url in zip(topic_titles, topic_urls):
    print(topic_title, topic_url)
    print(f"The topic {topic_title} is scrapping.......")
    topic_repo_df = scrape_and_save(topic_url, topic_title)
    # scrape_and_save returns None when the request was unsuccessful, so only
    # report success when a table actually came back.
    if topic_repo_df is None:
        print(f"The topic {topic_title} could not be scraped, skipping it")
        continue
    print(f"The topic {topic_title} is scrapped and saved sucessfully :)")

3D https://github.com/topics/3d
The topic 3D is scrapping.......
The topic 3D is scrapped and saved sucessfully :)
Ajax https://github.com/topics/ajax
The topic Ajax is scrapping.......
The topic Ajax is scrapped and saved sucessfully :)
Algorithm https://github.com/topics/algorithm
The topic Algorithm is scrapping.......
The topic Algorithm is scrapped and saved sucessfully :)
Amp https://github.com/topics/amphp
The topic Amp is scrapping.......
The topic Amp is scrapped and saved sucessfully :)
Android https://github.com/topics/android
The topic Android is scrapping.......
The topic Android is scrapped and saved sucessfully :)
Angular https://github.com/topics/angular
The topic Angular is scrapping.......
The topic Angular is scrapped and saved sucessfully :)
Ansible https://github.com/topics/ansible
The topic Ansible is scrapping.......
The topic Ansible is scrapped and saved sucessfully :)
API https://github.com/topics/api
The topic API is scrapping.......
The topic API is scrapped