# Project Outline:
- We are going to scrape https://github.com/topics
- We'll get a list of topics. For each topic, we'll get topic title, topic page URL and topic description
- For each topic, we'll get the top 25 repositories in the topic from the topic page
- For each repository, we'll grab the repo name, username, stars and repo URL
- For each topic we will create a CSV file in the following format
     
 Repo Name,Username,Stars,Repo URL
    


## Use the requests library to download web pages

In [16]:
import requests 

In [406]:
import pandas as pd

In [17]:
# Address of the GitHub topics index page that the first part of the notebook scrapes.
topics_url = 'https://github.com/topics'

In [18]:
# Download the topics index page; the status code and body are inspected in the next cells.
response = requests.get(topics_url)

In [19]:
# Show the HTTP status of the download (the functions below treat anything other than 200 as a failure).
response.status_code

200

In [20]:
 # Number of characters in the downloaded HTML.
 len(response.text)

138620

In [29]:
# Keep the HTML text under its own name; it is saved to disk and parsed below.
page_contents = response.text

In [32]:
# Save the downloaded HTML to webpage.html (UTF-8).
with open('webpage.html', mode='w', encoding='utf-8') as html_file:
    html_file.write(page_contents)


## Use Beautiful Soup to parse and extract information

In [7]:
from bs4 import BeautifulSoup 

In [42]:
# Parse the downloaded HTML with Python's built-in 'html.parser' backend.
doc = BeautifulSoup(page_contents,'html.parser')

In [60]:
# <p> tags selected by this class string; their text is used as the topic titles below.
topic_title_tags = doc.find_all('p', class_ = 'f3 lh-condensed mb-0 mt-1 Link--primary')

In [62]:
# <p> tags selected by this class string; their text is used as the topic descriptions below.
topic_desc_tags = doc.find_all('p',class_ = 'f5 color-fg-muted mb-0 mt-1')

In [65]:
# <a> tags selected by this class string; their href values are site-relative paths
# that get prefixed with 'https://github.com' below.
# NOTE(review): get_topic_url further down selects links with a different class string
# ('no-underline flex-1 d-flex flex-column') - confirm both match the same anchors.
topic_url_page = doc.find_all('a', class_ = 'no-underline flex-grow-0')

In [89]:
# for i in range(len(topic_url_page)):
#     print('https://github.com' + topic_url_page[i]['href'])
    

In [90]:
# for i in topic_url_page:
#     url = i.get('href')
#     print('https://github.com' +url)

In [106]:
# Text of each title tag, taken as-is (no whitespace stripping).
topic_titles = [title_tag.text for title_tag in topic_title_tags]

In [117]:
# Text of each description tag with newlines removed and surrounding whitespace trimmed.
topic_descriptions = [
    desc_tag.text.replace('\n', '').strip()
    for desc_tag in topic_desc_tags
]

In [110]:
# Turn each link's relative href into an absolute GitHub URL.
topic_urls = ['https://github.com' + link['href'] for link in topic_url_page]
    

In [131]:
# Bundle the three scraped lists under the column names used for the DataFrame built next.
topic ={'topic': topic_titles, 'desc': topic_descriptions, 'urls': topic_urls}

In [398]:
# One row per topic, one column per key of the dict above.
df = pd.DataFrame(topic)

In [423]:
# Let pandas write the file itself. Writing the string returned by to_csv()
# through a text-mode handle translates the line endings a second time on
# platforms where os.linesep is not '\n', which leaves blank lines between rows.
df.to_csv('Github_Topics.csv', index=False, encoding="utf-8")

# Now let's make our code cleaner

In [401]:
def get_topic_page():
    """Download the GitHub topics index page and return it parsed.

    Returns:
        The BeautifulSoup document built from the response body using the
        'html.parser' backend.

    Raises:
        Exception: if the response status code is anything other than 200.

    NOTE: a later cell defines another ``get_topic_page`` that takes a
    ``topic_url`` argument; once that cell has run, this zero-argument
    version is no longer reachable under this name.
    """
    topic_url = 'https://github.com/topics'
    response = requests.get(topic_url)
    if response.status_code != 200:
        # The message previously interpolated an undefined name (`url`), so a
        # failed download surfaced as a NameError instead of this exception.
        raise Exception(f'Failed to load the {topic_url}')
    doc = BeautifulSoup(response.text, 'html.parser')
    return doc
    

In [402]:
# Parsed topics index page.
# NOTE: get_topic_page is redefined further down with a required topic_url
# parameter; re-running this cell after that definition fails with a TypeError.
doc = get_topic_page()

In [403]:
def get_topic_titles(doc):
    """Return the text of every topic-title paragraph in ``doc``.

    Titles are the ``<p>`` tags selected by the class string below. Each
    tag's text is returned unchanged (no whitespace stripping), in the order
    ``doc.find_all`` yields the tags.
    """
    title_class = 'f3 lh-condensed mb-0 mt-1 Link--primary'
    return [paragraph.text for paragraph in doc.find_all('p', class_=title_class)]
    

In [404]:
def get_topic_desc(doc):
    """Return the whitespace-trimmed text of every topic description in ``doc``.

    Descriptions are the ``<p>`` tags selected by the class string below;
    leading and trailing whitespace is stripped from each one.
    """
    desc_class = 'f5 color-fg-muted mb-0 mt-1'
    paragraphs = doc.find_all('p', class_=desc_class)
    return [paragraph.text.strip() for paragraph in paragraphs]
    

In [405]:
def get_topic_url(doc):
    """Return the absolute GitHub URL of every topic link in ``doc``.

    Links are the ``<a>`` tags selected by the class string below; each
    tag's ``href`` is appended to ``https://github.com``.
    """
    link_class = 'no-underline flex-1 d-flex flex-column'
    site_root = 'https://github.com'
    anchors = doc.find_all('a', class_=link_class)
    return [site_root + anchor.get('href') for anchor in anchors]
        

In [462]:
def scrap_topics():
    """Scrape the GitHub topics index page into a DataFrame and a CSV file.

    Downloads https://github.com/topics, extracts the topic titles,
    descriptions and URLs with the helper functions defined above, and
    writes the table to ``Github_Topics.csv`` (UTF-8, no index column).

    Returns:
        pandas.DataFrame with the columns ``Title``, ``Description`` and
        ``Urls``.

    Raises:
        Exception: if the response status code is anything other than 200.
    """
    topics_url = 'https://github.com/topics'
    response = requests.get(topics_url)
    if response.status_code != 200:
        raise Exception(f'The urls {topics_url} cannot be reached')
    doc = BeautifulSoup(response.text, 'html.parser')
    titles = {
        'Title': get_topic_titles(doc),
        'Description': get_topic_desc(doc),
        'Urls': get_topic_url(doc),
    }
    df = pd.DataFrame(titles)
    # Let pandas write the file itself: pushing the string from to_csv()
    # through a text-mode handle translates line endings a second time on
    # platforms where os.linesep is not '\n'.
    df.to_csv('Github_Topics.csv', index=False, encoding="utf-8")
    return df

In [425]:
# Run the topics scrape: writes Github_Topics.csv and returns the DataFrame shown below.
scrap_topics()

Unnamed: 0,Title,Description,Urls
0,3D,3D modeling is the process of virtually develo...,https://github.com/topics/3d
1,Ajax,Ajax is a technique for creating interactive w...,https://github.com/topics/ajax
2,Algorithm,Algorithms are self-contained sequences that c...,https://github.com/topics/algorithm
3,Amp,Amp is a non-blocking concurrency library for ...,https://github.com/topics/amphp
4,Android,Android is an operating system built by Google...,https://github.com/topics/android
5,Angular,Angular is an open source web application plat...,https://github.com/topics/angular
6,Ansible,Ansible is a simple and powerful automation en...,https://github.com/topics/ansible
7,API,An API (Application Programming Interface) is ...,https://github.com/topics/api
8,Arduino,Arduino is an open source hardware and softwar...,https://github.com/topics/arduino
9,ASP.NET,ASP.NET is a web framework for building modern...,https://github.com/topics/aspnet


### Getting information out of a topic page

In [408]:
# Download one topic page (the "3d" topic) to work out how repositories are marked up.
response  = requests.get('https://github.com/topics/3d')

In [409]:
# Parse the topic page HTML with the 'html.parser' backend.
soup = BeautifulSoup(response.text, 'html.parser')

In [410]:
# <h3> headings selected by this class string; the loop below reads two <a> links
# from each one (first: username, second: repository).
names = soup.find_all('h3', 'f3 color-fg-muted text-normal lh-condensed' )

In [411]:
# Split every repository heading into username, repository name and absolute URL.
repo_name, username, urls = [], [], []
for heading in names:
    links = heading.find_all('a')
    owner_link = links[0]
    repo_link = links[1]
    username.append(owner_link.text.replace('\n', '').strip())
    repo_name.append(repo_link.text.replace('\n', '').strip())
    urls.append('https://github.com' + repo_link['href'])
    

In [415]:
# <span> tags selected by this class string; their text is the star count (e.g. '80.6k' in the output below).
stars = soup.find_all('span', class_='Counter js-social-count')

In [416]:
# Star counts as the text shown on the page (not converted to numbers).
repo_stars = [counter.text for counter in stars]

In [426]:
# Column name -> scraped list, in the column order used for the DataFrame.
repo_details = {
    'Repo_Name': repo_name,
    'Username': username,
    'Stars': repo_stars,
    'URL': urls,
}

In [428]:
# One row per repository, one column per key of repo_details.
dataframe = pd.DataFrame(repo_details)

In [429]:
# Display the repository table (output below).
dataframe

Unnamed: 0,Repo_Name,Username,Stars,URL
0,three.js,mrdoob,80.6k,https://github.com/mrdoob/three.js
1,libgdx,libgdx,19.8k,https://github.com/libgdx/libgdx
2,react-three-fiber,pmndrs,17.4k,https://github.com/pmndrs/react-three-fiber
3,Babylon.js,BabylonJS,16.2k,https://github.com/BabylonJS/Babylon.js
4,aframe,aframevr,14k,https://github.com/aframevr/aframe
5,tinyrenderer,ssloy,13.3k,https://github.com/ssloy/tinyrenderer
6,3d-game-shaders-for-beginners,lettier,12.5k,https://github.com/lettier/3d-game-shaders-for...
7,FreeCAD,FreeCAD,11k,https://github.com/FreeCAD/FreeCAD
8,zdog,metafizzy,9.1k,https://github.com/metafizzy/zdog
9,cesium,CesiumGS,8.5k,https://github.com/CesiumGS/cesium


# Now let's write it in a cleaner form


In [441]:
def get_topic_page(topic_url):
    """Download ``topic_url`` and return the page parsed with BeautifulSoup.

    The body is parsed with the 'html.parser' backend. An ``Exception`` is
    raised when the response status code is not 200.

    NOTE: this definition replaces the zero-argument ``get_topic_page``
    defined earlier in the notebook.
    """
    page = requests.get(topic_url)
    if page.status_code == 200:
        return BeautifulSoup(page.text, 'html.parser')
    raise Exception(f'The url {topic_url} is not available')
    

In [442]:
# Rebind doc to the parsed "3d" topic page (it previously held the topics index page).
doc = get_topic_page('https://github.com/topics/3d')

In [452]:
def get_reponame_and_username(doc):
    """Extract repository names, usernames and repository URLs from ``doc``.

    Each ``<h3>`` heading selected by the class string below is expected to
    hold at least two ``<a>`` links: the first gives the username, the
    second gives the repository name and the repository's relative href.
    A heading with fewer links raises ``IndexError``.

    Returns:
        A tuple ``(repo_names, usernames, urls)`` of three parallel lists.
        Names have newlines removed and surrounding whitespace trimmed;
        URLs are prefixed with ``https://github.com``.
    """
    headings = doc.find_all('h3', 'f3 color-fg-muted text-normal lh-condensed')
    repo_names, usernames, urls = [], [], []
    for heading in headings:
        links = heading.find_all('a')
        owner_link = links[0]
        repo_link = links[1]
        usernames.append(owner_link.text.replace('\n', '').strip())
        repo_names.append(repo_link.text.replace('\n', '').strip())
        urls.append('https://github.com' + repo_link['href'])
    return repo_names, usernames, urls
    
    

In [454]:
def get_stars(doc):
    """Return the star-count text of every repository listed in ``doc``.

    Counts are read from the ``<span>`` tags selected by the class string
    below and returned as the raw tag text, not as numbers.
    """
    counters = doc.find_all('span', class_='Counter js-social-count')
    return [counter.text for counter in counters]

In [459]:
def scrape(doc):
    """Build the repository table for one parsed topic page and save it.

    Args:
        doc: parsed topic page, as returned by ``get_topic_page(topic_url)``.

    Returns:
        pandas.DataFrame with the columns ``Repo_Name``, ``Username``,
        ``Stars`` and ``URL``. The same table is written to
        ``Repo_details.csv`` (UTF-8, no index column).
    """
    # Walk the headings once and unpack, instead of re-running the whole
    # extraction three times to pick one element of the result each time.
    repo_names, usernames, urls = get_reponame_and_username(doc)
    repo_details = {
        'Repo_Name': repo_names,
        'Username': usernames,
        'Stars': get_stars(doc),
        'URL': urls,
    }
    dataframe = pd.DataFrame(repo_details)
    # Let pandas write the file itself: pushing the string from to_csv()
    # through a text-mode handle translates line endings a second time on
    # platforms where os.linesep is not '\n'.
    dataframe.to_csv('Repo_details.csv', index=False, encoding="utf-8")
    return dataframe
    

In [460]:
# Run the repository scrape on the parsed "3d" page: writes Repo_details.csv and returns the DataFrame shown below.
scrape(doc)

Unnamed: 0,Repo_Name,Username,Stars,URL
0,three.js,mrdoob,80.6k,https://github.com/mrdoob/three.js
1,libgdx,libgdx,19.8k,https://github.com/libgdx/libgdx
2,react-three-fiber,pmndrs,17.4k,https://github.com/pmndrs/react-three-fiber
3,Babylon.js,BabylonJS,16.2k,https://github.com/BabylonJS/Babylon.js
4,aframe,aframevr,14k,https://github.com/aframevr/aframe
5,tinyrenderer,ssloy,13.3k,https://github.com/ssloy/tinyrenderer
6,3d-game-shaders-for-beginners,lettier,12.5k,https://github.com/lettier/3d-game-shaders-for...
7,FreeCAD,FreeCAD,11k,https://github.com/FreeCAD/FreeCAD
8,zdog,metafizzy,9.1k,https://github.com/metafizzy/zdog
9,cesium,CesiumGS,8.5k,https://github.com/CesiumGS/cesium
