# Web Scraping (GitHub)
___

Importing the supplementary libraries

In [1]:
import pandas as pd
import os
import requests
from bs4 import BeautifulSoup

Create a class for scraping the GitHub website to find the GitHub topics and the list of repositories related to a particular topic

In [2]:
class Scrap_GitHub:
    """Scrape GitHub's topics page and the repositories listed for each topic.

    The entry point is :meth:`scrape_all`, which downloads the topics page,
    then writes one CSV file per topic into the ``GitHub_Topics`` folder.
    All other methods are building blocks that can also be used on their own.

    Most methods also store their arguments on the instance (``self.doc``,
    ``self.topic_page_url`` ...). Those attributes are not needed internally;
    they are kept so that existing code reading them keeps working.
    """

    BASE_URL = 'https://github.com'
    TOPICS_URL = 'https://github.com/topics'
    # Seconds to wait for GitHub before giving up; without a timeout
    # ``requests.get`` can block forever on a stalled connection.
    REQUEST_TIMEOUT = 30
    # Multipliers for the abbreviated star counts (e.g. "12.3k").
    _STAR_SUFFIXES = {'k': 1_000, 'm': 1_000_000}

    def _fetch_doc(self, url, check_status=True):
        """Download ``url`` and return it parsed as a BeautifulSoup document.

        Raises ``Exception`` when ``check_status`` is true and the response
        status code is not 200.
        """
        response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
        if check_status and response.status_code != 200:
            raise Exception('Failed to load page {}'.format(url))
        return BeautifulSoup(response.text, 'html.parser')

    def get_topic_title(self, doc):
        """Return the list of topic titles found in the parsed topics page.

        Titles are the text of every ``<p>`` tag carrying the class string
        ``f3 lh-condensed mb-0 mt-1 Link--primary``.
        """
        self.doc = doc
        class_selections = 'f3 lh-condensed mb-0 mt-1 Link--primary'
        topic_title_tags = doc.find_all('p', {'class': class_selections})
        return [tag.text for tag in topic_title_tags]

    def get_desc(self, doc):
        """Return the list of topic descriptions found in the topics page.

        Descriptions are the whitespace-stripped text of every ``<p>`` tag
        carrying the class string ``f5 color-fg-muted mb-0 mt-1``.
        """
        self.doc = doc
        description_selection = 'f5 color-fg-muted mb-0 mt-1'
        description_tags = doc.find_all('p', {'class': description_selection})
        return [tag.text.strip() for tag in description_tags]

    def get_urls(self, doc):
        """Return the list of absolute topic URLs found in the topics page.

        Each URL is ``BASE_URL`` joined with the ``href`` of an ``<a>`` tag
        carrying the class string ``no-underline flex-grow-0``.
        """
        self.doc = doc
        topic_link_tags = doc.find_all('a', {'class': 'no-underline flex-grow-0'})
        return [(self.BASE_URL + tag['href']).strip() for tag in topic_link_tags]

    def topic(self):
        """Download the GitHub topics page and return it as a DataFrame.

        The DataFrame has the columns ``Topics``, ``Description`` and
        ``URLs``, one row per topic.

        Raises ``Exception`` if the page does not answer with status 200.
        pandas raises ``ValueError`` if the three scraped lists do not have
        the same length.
        """
        doc = self._fetch_doc(self.TOPICS_URL)
        return pd.DataFrame({
            'Topics': self.get_topic_title(doc),
            'Description': self.get_desc(doc),
            'URLs': self.get_urls(doc),
        })

    def parse_star_count(self, stars_str):
        """Convert the text of a star-button tag into an integer star count.

        ``stars_str`` is any object with a ``text`` attribute (the original
        caller passes a BeautifulSoup tag). The ``Star`` label, surrounding
        whitespace and thousands separators are removed; a trailing ``k`` or
        ``m`` (any case) is expanded, so ``"1.5k"`` becomes ``1500``.

        Raises ``ValueError`` when no number is left after cleaning.
        """
        self.stars_str = stars_str
        # Remove the label explicitly instead of str.strip(' ...Star...'),
        # which treats its argument as a set of characters, not a word.
        count = stars_str.text.replace('Star', '').replace(',', '').strip()
        if not count:
            raise ValueError(
                'No star count found in {!r}'.format(stars_str.text))
        multiplier = self._STAR_SUFFIXES.get(count[-1].lower())
        if multiplier is None:
            return int(count)
        # round() rather than int(): the float product may land just below
        # the intended integer and int() would truncate it.
        return round(float(count[:-1]) * multiplier)

    def select_topic_x(self, topic_page_url):
        """Download a topic page and return it as a BeautifulSoup document.

        Unlike :meth:`select_topic`, this variant does not check the HTTP
        status code: whatever body the server sends is parsed and returned.
        """
        self.topic_page_url = topic_page_url
        return self._fetch_doc(topic_page_url, check_status=False)

    def select_topic(self, topic_page_url):
        """Download a topic page and return it as a BeautifulSoup document.

        Raises ``Exception`` if the page does not answer with status 200.
        """
        self.topic_page_url = topic_page_url
        return self._fetch_doc(topic_page_url)

    def get_repo_info(self, repo_tag, star_tags):
        """Return ``(username, repo_name, stars, repo_url)`` for one repository.

        ``repo_tag`` is the heading tag of the repository: its first ``<a>``
        gives the user name and its second ``<a>`` gives the repository name
        and relative URL. ``star_tags`` is the single star-button tag that
        belongs to this repository (despite the plural name).

        Raises ``IndexError`` if ``repo_tag`` holds fewer than two ``<a>``
        tags, and whatever :meth:`parse_star_count` raises.
        """
        self.repo_tag = repo_tag
        self.star_tags = star_tags

        a_tags = repo_tag.find_all('a')
        username = a_tags[0].text.strip()
        repo_name = a_tags[1].text.strip()
        repo_url = self.BASE_URL + a_tags[1]['href']
        stars = self.parse_star_count(star_tags)
        return username, repo_name, stars, repo_url

    def repo_info(self, topic_doc):
        """Return a DataFrame describing every repository on a topic page.

        The DataFrame has the columns ``UserName``, ``Repository Name``,
        ``Stars`` and ``URL``. Repository headings and star buttons are
        collected separately and paired by position.

        Raises ``IndexError`` if the page has fewer star buttons than
        repository headings, since the pairing would then be unreliable.
        """
        self.topic_doc = topic_doc
        repo_tags = topic_doc.find_all(
            'h3', {'class': 'f3 color-fg-muted text-normal lh-condensed'})
        star_tags = topic_doc.find_all(
            'a', {'class': 'tooltipped tooltipped-s btn-sm btn BtnGroup-item color-bg-default'})

        if len(star_tags) < len(repo_tags):
            raise IndexError(
                'Found {} repositories but only {} star counts'.format(
                    len(repo_tags), len(star_tags)))

        # zip() stops at the shorter list, so surplus star tags are ignored.
        rows = [
            self.get_repo_info(repo_tag, star_tag)
            for repo_tag, star_tag in zip(repo_tags, star_tags)
        ]
        return pd.DataFrame(
            rows, columns=['UserName', 'Repository Name', 'Stars', 'URL'])

    def supplimentary(self, topic_url, path):
        """Scrape one topic page and save its repositories to ``<path>.csv``.

        ``path`` is given without extension; ``.csv`` is appended. If the
        file already exists nothing is downloaded and a message is printed.
        Returns ``None`` in both cases.
        """
        self.topic_url = topic_url
        self.path = path
        path = path + '.csv'
        if os.path.exists(path):
            print(f'The file {path} already exist. skipping...')
            return
        topic_df = self.repo_info(self.select_topic(topic_url))
        topic_df.to_csv(path, index=False)

    # Main method in this class
    def scrape_all(self):
        """Scrape all topics and write one CSV of repositories per topic.

        This is the main method of the class. It fetches the topics with
        :meth:`topic`, creates the ``GitHub_Topics`` folder if needed, and
        for every topic calls :meth:`supplimentary` to write
        ``GitHub_Topics/<topic title>.csv``. Progress is printed to stdout.
        """
        print("Scraping top topics From GitHub")
        topics_df = self.topic()
        os.makedirs('GitHub_Topics', exist_ok=True)
        for title, url in zip(topics_df['Topics'], topics_df['URLs']):
            print('Scraping top repositories for "{}"'.format(title))
            self.supplimentary(url, 'GitHub_Topics/{}'.format(title))


In [3]:
# Instantiate the scraper; the class defines no __init__, so it takes no arguments.
obj = Scrap_GitHub()

In [1]:
# Scrape every topic and write one CSV per topic into the GitHub_Topics folder.
obj.scrape_all()

This is how we can scrape the GitHub website. ☑️