# Scraping the top repositories for various topics on GitHub

**TODO:**<br>
- Introduction about web scraping
- Introduction about github and the problem statement
- The tools we will be using: Python, requests, Beautiful Soup, pandas

**Project Strategy:**
* We will scrape: https://github.com/topics (To get a list of topics)
* For each topic we will get: Topic title, Topic URL, Topic description
* For each topic we will get the top 30 repositories
* For each repository we will get: 
  repo name, username, stars, repo url
* We will store all this data in a csv file, with a separate csv file for each topic

```

```

### Scrape the list of topics from GitHub's topics page

In [None]:
# Install/upgrade requests

!pip install requests --upgrade --quiet

In [None]:
# Install/upgrade beautifulsoup

!pip install beautifulsoup4 --upgrade --quiet

### Imports 

In [None]:
import requests

from bs4 import BeautifulSoup

import pandas as pd

import os

import time

### Get info on all the topics from the topics page(s)


In [None]:
# Download the topics listing pages (uses the 3 helper functions defined in the next cell)

def scrape_topic_titles(num_pages=6, max_retries=30, retry_delay=1):
    """Scrape GitHub's paginated topics listing into a DataFrame.

    The pages ``https://github.com/topics?page=1`` up to ``page=num_pages``
    are downloaded and parsed, and the titles, descriptions and URLs found
    on each page are accumulated in page order.

    Args:
        num_pages: Number of listing pages to fetch, starting at page 1.
        max_retries: How many times a page is requested again after a
            non-200 response before giving up. ``None`` retries forever.
        retry_delay: Seconds to sleep before each retry.

    Returns:
        A pandas DataFrame with the columns ``title``, ``description``
        and ``url``.

    Raises:
        Exception: If a page still does not answer with HTTP 200 after
            ``max_retries`` retries.
    """
    topics_dict = {
        'title': [],
        'description': [],
        'url': [],
    }

    for page_num in range(1, num_pages + 1):
        topics_url = f'https://github.com/topics?page={page_num}'
        response = requests.get(topics_url)
        retries = 0
        # Retry transient failures, but stop eventually: a permanent error
        # (e.g. a 404) would otherwise keep this loop spinning forever.
        while response.status_code != 200:
            if max_retries is not None and retries >= max_retries:
                raise Exception(f'Failed to load page {topics_url}')
            time.sleep(retry_delay)
            response = requests.get(topics_url)
            retries += 1

        doc = BeautifulSoup(response.text, 'html.parser')

        topics_dict['title'] += get_topic_titles(doc)
        topics_dict['description'] += get_topic_descriptions(doc)
        topics_dict['url'] += get_topic_urls(doc)

    return pd.DataFrame(topics_dict)

# NOTE: get_topic_titles, get_topic_descriptions and get_topic_urls are defined
# in the next cell; run that cell first, otherwise this call raises NameError.
scrape_topic_titles()

In [None]:
# Helper functions to help download the topic page

# get titles
def get_topic_titles(doc):
    """Return the text of every topic-title ``<p>`` tag found in ``doc``."""
    title_class = "f3 lh-condensed mb-0 mt-1 Link--primary"
    title_tags = doc.find_all('p', {'class': title_class})
    return [tag.text for tag in title_tags]

# get urls
def get_topic_urls(doc):
    """Return the absolute URL of every topic listed in ``doc``.

    The link target is read from the ``href`` of the element that encloses
    each topic-title ``<p>`` tag and is prefixed with the GitHub host.
    """
    host = 'https://github.com'
    title_class = "f3 lh-condensed mb-0 mt-1 Link--primary"
    title_tags = doc.find_all('p', {'class': title_class})
    return [host + tag.parent['href'] for tag in title_tags]

# get descriptions
def get_topic_descriptions(doc):
    """Return the whitespace-stripped text of every topic-description ``<p>`` tag in ``doc``."""
    descr_class = "f5 color-fg-muted mb-0 mt-1"
    descriptions = []
    for tag in doc.find_all('p', {'class': descr_class}):
        descriptions.append(tag.text.strip())
    return descriptions

In [None]:
# (helper fn) converts "num of stars" from string to number 

def parse_stars_count(num_stars):
    """Convert a displayed star count such as ``"1.2k"`` or ``"987"`` to an int.

    Surrounding whitespace and thousands separators (``,``) are ignored and
    the ``k`` suffix (either case) multiplies the number by 1000.

    Args:
        num_stars: The star count as text.

    Returns:
        The star count as an ``int``.

    Raises:
        ValueError: If the text is empty or is not a number.
    """
    text = num_stars.strip().replace(',', '')
    # Slicing (instead of indexing) keeps an empty string from raising
    # IndexError; it falls through to int('') and a ValueError instead.
    if text[-1:].lower() == 'k':
        # round() rather than int(): the float product can land a hair below
        # the intended integer (32.3 * 1000 == 32299.999999999996), and
        # truncating it would lose a star.
        return round(float(text[:-1]) * 1000)
    return int(text)

In [None]:
# (helper fn) downloads the page at a given url and returns its HTML parsed with BeautifulSoup

def get_topic_doc(topic_url):
    """Download ``topic_url`` and return its HTML parsed by BeautifulSoup.

    Raises:
        Exception: If the response status code is anything other than 200.
    """
    page = requests.get(topic_url)
    if page.status_code == 200:
        # Successful download: hand back the parsed document
        return BeautifulSoup(page.text, 'html.parser')
    raise Exception(f'Failed to load page {topic_url}')

In [None]:
# (Main fn) pass the parsed HTML of a topic page and it will return a dataframe with info on its repositories

def get_topic_info(topic_doc):
    """Return a dataframe containing info on all repos for the chosen topic.

    Args:
        topic_doc: Parsed HTML of a topic page (an object offering
            ``find_all``, as returned by ``get_topic_doc``).

    Returns:
        A pandas DataFrame with the columns ``username``, ``repo_name``,
        ``stars`` and ``url``, one row per repository heading found.

    Raises:
        IndexError: If a repository heading holds fewer than two links.
    """
    base_url = 'https://github.com'

    # h3 tags containing usernames, repo names and urls
    repo_tags = topic_doc.find_all('h3', {'class': 'f3 color-fg-muted text-normal lh-condensed'})

    usernames = []
    repo_names = []
    repo_urls = []
    for repo_tag in repo_tags:
        # Look the links up once per heading: the first link's text is the
        # username, the second link carries the repo name and its path.
        links = repo_tag.find_all('a')
        usernames.append(links[0].text.strip())
        repo_names.append(links[1].text.strip())
        repo_urls.append(base_url + links[1]['href'])

    # span tags containing num of stars
    # NOTE: these are collected independently of the headings above; the
    # DataFrame below can only be built when both searches match equally often.
    stars_tags = topic_doc.find_all('span', {'class': 'Counter js-social-count'})
    stars = [parse_stars_count(stars_tag.text) for stars_tag in stars_tags]

    return pd.DataFrame({
        'username': usernames,
        'repo_name': repo_names,
        'stars': stars,
        'url': repo_urls,
    })


### Get the top 30 repositories from a topic page 

In [None]:
# Using topic url and title, scrape repos of that topic and store them as "topic_title.csv"

def scrape_topic(topic_url, topic_title):
    """Scrape one topic page and save its repositories as a CSV file.

    The data is written to ``./topics_csv/<topic_title>.csv``. If that file
    already exists, nothing is downloaded and the file is left untouched.

    Args:
        topic_url: URL of the topic page to scrape.
        topic_title: Topic title, used as the CSV file name.

    Returns:
        None.
    """
    fname = './topics_csv/' + topic_title + '.csv'
    if os.path.exists(fname):
        print(f'The file: "{fname}", already exists. Skipping...')
        return
    topic_df = get_topic_info(get_topic_doc(topic_url))
    # Make sure the output directory is there, so this function also works
    # when it is called on its own rather than via scrape_topic_repos().
    os.makedirs('./topics_csv', exist_ok=True)
    topic_df.to_csv(fname, index=False)

### Function for scraping, using all the above defined functions


In [None]:

def scrape_topic_repos():
    """Scrape the topics listing, then scrape and save the repositories of every topic.

    Creates the ``topics_csv`` directory when it does not exist yet and
    calls ``scrape_topic`` once per topic, in listing order.
    """
    all_topics = scrape_topic_titles()
    print('Scraping top repos of all topics from github')

    output_dir = "topics_csv"
    if not os.path.exists(output_dir):
        os.mkdir(output_dir)

    # Walk the url and title columns side by side, one pair per topic
    for topic_url, topic_title in zip(all_topics['url'], all_topics['title']):
        print(f'Scraping top repositories for topic: {topic_title}...')
        scrape_topic(topic_url, topic_title)
        

### Execute scraping 

In [None]:
# Run the whole pipeline: fetch the topic list, then save each topic's
# repositories to topics_csv/<topic title>.csv (existing files are skipped).
scrape_topic_repos()