### GeeksforGeeks Web Scraping


In [0]:
#This will work in Google Colab Python 2.
from google.colab import drive
drive.mount('/content/gdrive')

In [0]:
# Change the working directory to the mounted Google Drive so that the generated HTML file is saved there.
%cd content
%cd gdrive
%cd My Drive
!ls


In [4]:
import requests

from os import system
from sys import exit
from time import sleep
from requests.exceptions import ConnectionError

from bs4 import BeautifulSoup

class Article(object):
    """A single scraped article: its title plus its content.

    The title can be used as the key (label) for the article's link when
    building navigation.
    """

    def __init__(self, title, content):
        # Plain value holder; both fields are stored exactly as given.
        self.title, self.content = title, content


# Root URL of the site; scrape_category() appends a category slug to it.
BASE_URL = 'http://www.geeksforgeeks.org/'
# Article objects appended by scrape_category() and later read by
# save_articles_as_html_and_pdf().
articles = []

# Maps the menu number entered by the user to the category's URL slug
# (the part appended to BASE_URL).
CHOICE_TO_CATEGORY_MAPPING = {
    1: 'c',
    2: 'c-plus-plus',
    3: 'java',
    4: 'python',
    5: 'fundamentals-of-algorithms',
    6: 'data-structures'
}


def display_menu():
    """Print the numbered list of categories that can be scraped."""
    menu_lines = (
        "Choose category to scrape: ",
        "1. C Language",
        "2. C++ Language",
        "3. Java",
        "4. Python",
        "5. Algorithms",
        "6. Data Structures",
    )
    # One print per entry so each menu item lands on its own line.
    for menu_line in menu_lines:
        print(menu_line)


def get_category_choice():
    """Prompt for a menu number and return the matching category URL slug.

    Returns:
        The value stored in CHOICE_TO_CATEGORY_MAPPING for the number the
        user entered.

    Prints "Wrong Choice Entered. Exiting!" and exits the process with
    status 1 when the input is not a number or is not a known menu choice.
    """
    # raw_input exists only on Python 2; on Python 3 input() is the
    # equivalent that returns the typed text without evaluating it.
    try:
        read_line = raw_input
    except NameError:
        read_line = input

    try:
        # int() is inside the try so that non-numeric input gets the same
        # message as an out-of-range number instead of a raw traceback.
        choice = int(read_line("Enter choice: "))
        category_url = CHOICE_TO_CATEGORY_MAPPING[choice]
    except (ValueError, KeyError):
        print("Wrong Choice Entered. Exiting!")
        exit(1)
    return category_url


def save_articles_as_html_and_pdf():
    """Write all scraped articles to one HTML file and convert it to PDF.

    Reads the module-level ``articles`` list and the module-level
    ``category_url`` name (assigned in the ``__main__`` block), so both
    must be set before this function is called.

    Writes ``G4G_<Category>.html`` to the current directory, then runs the
    external ``wkhtmltopdf`` command to produce ``G4G_<Category>.pdf``.
    """
    print("All links scraped, extracting articles")
    category_title = category_url.title()

    # Formatting the html for articles. Pieces are collected in a list and
    # joined once at the end instead of repeatedly growing one string.
    html_parts = [
        (
            '<!DOCTYPE html>'
            '<html><head>'
            '<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />'
            '<link rel="stylesheet" href="style.min.css" type="text/css" media="all" />'
            '<script src="https://cdn.rawgit.com/google/code-prettify/master/loader/run_prettify.js"></script>'
            '</head><body>'
        ),
        '<h1 style="text-align:center;font-size:40px">' +
        category_title + ' Archive</h1><hr>',
        '<h1 style="padding-left:5%;font-size:200%;">Index</h1><br/>',
    ]

    # Index section: one link per article, targeting the numeric anchor id
    # that is emitted on the <hr> preceding that article below.
    for number, article in enumerate(articles, 1):
        html_parts.append(
            '<a href ="#' + str(number) + '">' +
            '<h1 style="padding-left:5%;font-size:20px;">' +
            str(number) + ".\t\t" + article.title + '</h1></a> <br/>'
        )

    # Article bodies, each preceded by its anchor. The content is stored as
    # UTF-8 bytes, so it is decoded back to text before being joined.
    for number, article in enumerate(articles, 1):
        html_parts.append(
            '<hr id="' + str(number) + '">' + article.content.decode("utf-8")
        )

    html_parts.append('''</body></html>''')
    all_articles = ''.join(html_parts)

    html_file_name = 'G4G_' + category_title + '.html'
    # Binary mode because the document is encoded to UTF-8 bytes right here;
    # the with-block closes the file even if the write fails.
    with open(html_file_name, "wb") as html_file:
        html_file.write(all_articles.encode("utf-8"))

    pdf_file_name = 'G4G_' + category_title + '.pdf'
    print("Generating PDF " + pdf_file_name)
    html_to_pdf_command = 'wkhtmltopdf ' + html_file_name + ' ' + pdf_file_name
    system(html_to_pdf_command)


def scrape_category(category_url, request_timeout=30):
    """Scrape every article linked from a category page into ``articles``.

    Args:
        category_url: Category slug that is appended to BASE_URL.
        request_timeout: Seconds to wait on each HTTP request before giving
            up. Without a timeout a stalled server makes the request hang
            indefinitely.

    Side effects:
        Appends one Article per successfully scraped page to the
        module-level ``articles`` list and prints progress messages.
        Exits the process with status 1 when the category page cannot be
        fetched, or when the connection drops before any article was
        scraped.
    """
    print(BASE_URL + category_url)
    try:
        response = requests.get(BASE_URL + category_url, timeout=request_timeout)
        soup = BeautifulSoup(response.text)
    except (ConnectionError, requests.exceptions.Timeout):
        print("Couldn't connect to Internet! Please check your connection & Try again.")
        exit(1)

    # Selecting links which are in the category page
    hrefs = [a.attrs.get('href') for a in soup.select('article li a')]
    # Removing anchors that have no href at all (get() returns None for
    # them) and links for the categories with anchor on same page
    links = [link for link in hrefs
             if link is not None and not link.startswith('#')]

    print("Found: " + str(len(links)) + " links")
    i = 1

    # Traverse each link to find article and save it.
    for link in links:
        # Only follow links that point back at the site itself.
        if BASE_URL not in link:
            continue
        try:
            if i % 10 == 0:
                sleep(5)  # Sleep for 5 seconds before scraping every 10th link
            link = link.strip()
            print("Scraping link no: " + str(i) + " Link: " + link)
            i += 1
            link_soup = BeautifulSoup(
                requests.get(link, timeout=request_timeout).text)
            # Remove the space occupied by Google Ads (Drop script & ins node)
            for node in link_soup(["script", "ins"]):
                node.extract()
            for code_tag in link_soup.find_all('pre'):
                code_tag['class'] = code_tag.get('class', []) + ['prettyprint']
            article = link_soup.find('article')
            if article is None:
                # find() returns None when the page has no <article> element.
                print("No article found at " + link + ", skipping.")
                continue
            # Fall back to the link when the page has no usable <title>, so
            # the entry still gets a label.
            title_tag = link_soup.title
            title = title_tag.string if title_tag is not None else None
            if not title:
                title = link
            # Now add this article to list of all articles
            articles.append(
                Article(title=title, content=article.encode('UTF-8')))
        # Manual escape hatch: Ctrl-C abandons the current link and moves on
        # to the next one.
        except KeyboardInterrupt:
            continue
        except ConnectionError:
            print("Internet disconnected! Please check your connection & Try again.")
            if articles:
                print("Making PDF of links scraped till now.")
                break
            else:
                exit(1)
        # Listed after ConnectionError so that connect timeouts keep being
        # treated as a lost connection; what reaches here is a server that
        # accepted the connection but stopped responding.
        except requests.exceptions.Timeout:
            print("Timed out fetching " + link + ", skipping.")
            continue


if __name__ == '__main__':
    # Script entry point: show the menu, read the choice, scrape that
    # category, then export what was collected.
    display_menu()
    # NOTE: category_url is a module-level name here.
    # save_articles_as_html_and_pdf() takes no arguments and reads it as a
    # global, so it must stay bound under this name.
    category_url = get_category_choice()
    scrape_category(category_url)
    save_articles_as_html_and_pdf()

Choose category to scrape: 
1. C Language
2. C++ Language
3. Java
4. Python
5. Algorithms
6. Data Structures
http://www.geeksforgeeks.org/c-plus-plus
Found: 478 links
Scraping link no: 1 Link: http://www.geeksforgeeks.org/setting-c-development-environment/
Scraping link no: 2 Link: http://www.geeksforgeeks.org/writing-first-c-program-hello-world-example/
Scraping link no: 3 Link: http://www.geeksforgeeks.org/fine-write-void-main-cc/
Scraping link no: 4 Link: http://www.geeksforgeeks.org/c-data-types/
Scraping link no: 5 Link: http://www.geeksforgeeks.org/basic-input-output-c/
Scraping link no: 6 Link: http://www.geeksforgeeks.org/cc-preprocessors/
Scraping link no: 7 Link: http://www.geeksforgeeks.org/operators-c-c/
Scraping link no: 8 Link: http://www.geeksforgeeks.org/loops-in-c
Scraping link no: 9 Link: http://www.geeksforgeeks.org/decision-making-c-c-else-nested-else/
Scraping link no: 10 Link: http://www.geeksforgeeks.org/execute-else-statements-cc-simultaneously/
Scraping link no

In [0]:

!ls
