# <font color=green>GitHub Topics Web Scraping Project</font>

    Date: 17/05/2022
    Owner: Rajesh Sinha
    
***
## <font color=purple>Project Description</font>
- Extract information on all the topics available on GitHub
- Extract information about the first 1000 repositories of each topic

- For every topic available,
    - Extract topic name
    - Extract topic description
    - Extract total number of repositories available
    - Extract URL of that particular topic page
    - Save these data into a csv file

- For every repository available,
    - Extract name of the repository
    - Extract owner name of the repository
    - Extract stars available on the repository
    - Extract URL of the repository
    - Save this data for each topic in a CSV file

## <font color=purple>Technology Description</font>
- Python
- Jupyter Notebook
- Selenium
- BeautifulSoup4
- Pandas

## <font color=purple>Future Improvements</font>
- Add multi-threading to reduce time taken to fetch data (current time ~ 6H)
- Fetch more public information about the topics
- Fetch more public information about the repositories
- Fetch information about repository owners

## <font color=purple>About The Project</font>
* __THIS PROJECT IS INTENDED FOR PRACTICE AND EDUCATIONAL PURPOSES ONLY__<br>
* __ONLY INFORMATION THAT IS PUBLICLY AVAILABLE ON [GitHub](https://github.com/topics) HAS BEEN COLLECTED IN THIS PROJECT__<br>
* __USAGE OF THIS PROJECT FOR ANY COMMERCIAL PURPOSE IS STRICTLY PROHIBITED__
***

## <font color=green>Install Python Libraries</font>

In [None]:
pip install selenium

In [None]:
pip install webdriver-manager

In [None]:
pip install beautifulsoup4

In [None]:
pip install pandas

***

## <font color=green>Import Python Modules</font>

In [None]:
from selenium.webdriver import Chrome
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager

from selenium.webdriver.common.by import By

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException

from bs4 import BeautifulSoup

import requests
import pandas as pd

import os
from datetime import date
import shutil

***

## <font color=green>Open Topics Page and Save Topic URLs</font>

In [None]:
# Resolve the ChromeDriver executable once via webdriver-manager; the returned
# path is handed to every Service built in _get_web_driver, so all drivers
# created in this notebook share the same executable.
_executable_webdriver_path = ChromeDriverManager().install()

# function to get webdriver
def _get_web_driver(_headless: bool=False):
    """Create a Chrome webdriver that uses the pre-resolved ChromeDriver executable.

    Args:
        _headless: When True, Chrome is started without a visible window.

    Returns:
        A new ``Chrome`` driver instance. The caller is responsible for
        calling ``quit()`` on it when done.
    """
    _options = Options()
    if _headless:
        # Pass the Chrome switch directly instead of assigning
        # ``_options.headless``: that setter was deprecated in Selenium 4.8 and
        # later removed, after which the assignment silently has no effect and
        # a visible browser window opens. The argument form works on both old
        # and new Selenium releases.
        _options.add_argument("--headless")

    # create webdriver with options
    return Chrome(
        service=Service(executable_path=_executable_webdriver_path),
        options=_options,
    )
        
# function to check if element is visible
def _is_element_is_visible(_wait: WebDriverWait, _by: By, _selector: str):
    """Report whether the element located by (_by, _selector) is present and visible.

    Both conditions are awaited, in that order, through ``_wait``.

    Args:
        _wait: Wait object used to poll for each condition.
        _by: Locator strategy.
        _selector: Locator value interpreted according to ``_by``.

    Returns:
        True when both waits succeed; False when either wait raises
        NoSuchElementException or TimeoutException.
    """
    _locator = (_by, _selector)
    _conditions = (
        EC.presence_of_element_located(locator=_locator),
        EC.visibility_of_element_located(locator=_locator),
    )
    try:
        for _condition in _conditions:
            _wait.until(method=_condition)
    except (NoSuchElementException, TimeoutException):
        return False
    return True

# function to click on element
def _click_on_element(_wait: WebDriverWait, _by: By, _selector: str, _driver: Chrome):
    """Scroll the element located by (_by, _selector) into view and click it.

    The element is first looked up on ``_driver`` so the page can be scrolled
    to it; ``_wait`` then blocks until the same locator is clickable before
    the click is issued.

    Args:
        _wait: Wait object used to wait for the element to become clickable.
        _by: Locator strategy.
        _selector: Locator value interpreted according to ``_by``.
        _driver: Driver used to find the element and run the scroll script.

    A NoSuchElementException is ignored (the call becomes a no-op); any other
    exception propagates to the caller.
    """
    try:
        _target = _driver.find_element(by=_by, value=_selector)
        _driver.execute_script("arguments[0].scrollIntoView();", _target)
        _clickable = _wait.until(EC.element_to_be_clickable(mark=(_by, _selector)))
        _clickable.click()
    except NoSuchElementException:
        # best effort: nothing to click
        return

# base URL
_github_url = "https://github.com"

# XPath of the button that loads the next batch of topics
_load_more_button_xpath = ".//button[contains(text(),'Load more')]"

# get a headless webdriver
_driver = _get_web_driver(_headless=True)

try:
    # create wait (each condition is polled for up to 10 seconds)
    _wait = WebDriverWait(_driver, 10)

    # open GitHub Topics Page
    _driver.get(url=_github_url+"/topics")
    _driver.maximize_window()

    # load full page: keep clicking "Load more" until the button no longer shows up
    while _is_element_is_visible(_wait, By.XPATH, _load_more_button_xpath):
        _click_on_element(_wait, By.XPATH, _load_more_button_xpath, _driver)

    # create Beautiful Soup object from the fully expanded page
    _github_topics_page = BeautifulSoup(markup=_driver.page_source, features="html.parser")
finally:
    # quit webdriver even when scraping fails, so no Chrome process is left behind
    _driver.quit()

# list out all topic names, descriptions and URLs
_topic_name_tags = _github_topics_page.find_all(name="a", attrs={"class": "no-underline flex-1 d-flex flex-column"})
_TOPIC_NAMES = []
_TOPIC_DESCRIPTIONS = []
_TOPIC_URLS = []
for _topic_name_tag in _topic_name_tags:
    _name_tag = _topic_name_tag.find(name="p", attrs={"class": "f3 lh-condensed mb-0 mt-1 Link--primary"})
    _description_tag = _topic_name_tag.find(name="p", attrs={"class": "f5 color-fg-muted mb-0 mt-1"})

    # find() returns None when a topic card lacks the paragraph; store an empty
    # string instead of failing so the three lists stay the same length
    _TOPIC_NAMES.append(_name_tag.get_text().strip() if _name_tag is not None else "")
    _TOPIC_DESCRIPTIONS.append(_description_tag.get_text().strip() if _description_tag is not None else "")
    _TOPIC_URLS.append(_github_url+_topic_name_tag["href"])

# filled in later, one entry per topic, while each topic page is scraped
_TOPIC_REPOSITORY_COUNTS = []

print("FINISHED SCRAPING {} TOPICS".format(len(_TOPIC_URLS)))

***

## <font color=green>Open Each Topic Page and Save Repository Data</font>

In [None]:
# create repository data store (topic name -> column name -> list of values)
_REPOSITORY_DATA = {}

# start from an empty count list so that re-running this cell does not append
# to the counts of a previous run, which would leave the columns of
# _TOPIC_DATA with different lengths
_TOPIC_REPOSITORY_COUNTS = []

# XPath of the button that loads the next batch of repositories
_load_more_button_xpath = ".//button[contains(text(),'Load more')]"

# fetch repository data for each topic
for _topic_url in _TOPIC_URLS:
    # get a headless webdriver
    _driver = _get_web_driver(_headless=True)

    try:
        # create wait (each condition is polled for up to 10 seconds)
        _wait = WebDriverWait(_driver, 10)

        # open each topic page
        _driver.get(url=_topic_url)
        _driver.maximize_window()

        # load full page: keep clicking "Load more" until the button no longer shows up
        while _is_element_is_visible(_wait, By.XPATH, _load_more_button_xpath):
            _click_on_element(_wait, By.XPATH, _load_more_button_xpath, _driver)

        # create BeautifulSoup object from the fully expanded page
        _github_topic_details_page = BeautifulSoup(markup=_driver.page_source, features="html.parser")
    finally:
        # quit driver even when scraping fails, so no Chrome process is left behind
        _driver.quit()

    # read topic name; fall back to the last URL segment when the heading is missing
    _topic_heading = _github_topic_details_page.find(name="h1", attrs={"class": "h1"})
    if _topic_heading is not None:
        _TOPIC_NAME = _topic_heading.get_text().strip()
    else:
        _TOPIC_NAME = _topic_url.rstrip("/").rsplit("/", 1)[-1]

    # list all repository usernames, repository names and repository URLs
    _name_headings = _github_topic_details_page.find_all(name="h3",
                                                        attrs={"class": "f3 color-fg-muted text-normal lh-condensed"})
    _REPOSITORY_USERNAMES = []
    _REPOSITORY_NAMES = []
    _REPOSITORY_URLS = []
    for _name_heading in _name_headings:
        # the first link is read as the owner and the second as the repository;
        # look the links up once per heading instead of once per column
        _links = _name_heading.find_all(name="a")
        _REPOSITORY_USERNAMES.append(_links[0].get_text().strip())
        _REPOSITORY_NAMES.append(_links[1].get_text().strip())
        _REPOSITORY_URLS.append(_github_url+_links[1]["href"].strip())

    # list all repository stars
    _star_tags = _github_topic_details_page.find_all(name="span", attrs={"id": "repo-stars-counter-star"})
    _REPOSITORY_STARS = [_star_tag.get_text().strip() for _star_tag in _star_tags]

    # add repository count to list (number of repositories found on the page)
    _TOPIC_REPOSITORY_COUNTS.append(len(_REPOSITORY_NAMES))

    # add fetched data to repository data stores
    _REPOSITORY_DATA[_TOPIC_NAME] = {
        "REPOSITORY USERNAME": _REPOSITORY_USERNAMES,
        "REPOSITORY NAME": _REPOSITORY_NAMES,
        "REPOSITORY STARS": _REPOSITORY_STARS,
        "REPOSITORY URL": _REPOSITORY_URLS
    }

    print("FINISHED SCRAPING DATA FOR TOPIC: {}, {} REPOSITORIES FOUND".format(_TOPIC_NAME, len(_REPOSITORY_URLS)))

# prepare topic data store
_TOPIC_DATA = {
    "TOPIC NAME": _TOPIC_NAMES,
    "TOPIC DESCRIPTION": _TOPIC_DESCRIPTIONS,
    "REPOSITORY COUNT": _TOPIC_REPOSITORY_COUNTS,
    "TOPIC URL": _TOPIC_URLS
}
    

***

## <font color=green>Save Data To CSV Files</font>

In [None]:
# base output directory path
_base_output_directory_path = "output/"

# output directory path: one directory per run date (dd_mm_yyyy)
_output_directory_path = _base_output_directory_path+"github_topics_and_repositories_scraped_data_"+date.today().strftime("%d_%m_%Y")+"/"

# clear the output of an earlier run made on the same date
if os.path.exists(_output_directory_path):
    shutil.rmtree(_output_directory_path)

# create the base directory (if needed) and the dated output directory in one step
os.makedirs(_output_directory_path, exist_ok=True)

# Topic names are used as file names. Replace path separators and characters
# that common file systems reject, so such a name cannot break the save or
# write outside the output directory. Names without these characters are
# unchanged.
_unsafe_file_name_characters = str.maketrans({_character: "_" for _character in '\\/:*?"<>|'})

# save topic data
pd.DataFrame(_TOPIC_DATA).to_csv(path_or_buf=os.path.join(_output_directory_path, "topic_data.csv"), index=False)

# save repository data, one CSV file per topic
for _topic_name, _repository_data in _REPOSITORY_DATA.items():
    _file_name = _topic_name.translate(_unsafe_file_name_characters)+"_data.csv"
    pd.DataFrame(_repository_data).to_csv(path_or_buf=os.path.join(_output_directory_path, _file_name), index=False)
    