# Crawler Web pages

In [1]:
# load dependency libraries
import requests
import os
import re
import pickle
from bs4 import BeautifulSoup
from collections import deque

In [2]:
domain = "uic.edu"
start_url = "https://cs.uic.edu"                

pages_folder = "../FetchedPages/"

# file extensions to ignore while crawling pages
ignore_ext = [
    '.pdf', '.doc', '.docx', '.ppt', '.pptx', '.xls', '.xlsx', '.css', '.js',
    '.aspx', '.png', '.jpg', '.jpeg', '.gif', '.svg', '.ico', '.mp4',
    '.avi', '.tar', '.gz', '.tgz', '.zip'
]

crawl_limit = 6000

# to make sure error log file is initially empty
error_file = "../error_log.txt"
f = open(error_file, "w+")
# f.truncate()
f.close()

# queue to perform BFS web traversal
url_q = deque()
url_q.append(start_url)

# list to keep track of traversed URLs
urls_crawled = []
urls_crawled.append(start_url)

# dict to track pages fetched and stored in folder
pages_crawled = {}
page_no = 0

while url_q:
    
    try:
        url = url_q.popleft()               # fetch the first URL from the queue
        rqst = requests.get(url)            # get html code of web page

        if (rqst.status_code == 200):
            
            soup = BeautifulSoup(rqst.text, 'lxml')
            tags_extracted = soup.find_all('a')                 # extract all 'a' tags from page
            
            if len(tags_extracted) != 0:                        # to reject pages which don't link to another page
                pages_crawled[page_no] = url    

                output_file = pages_folder + str(page_no)

                os.makedirs(os.path.dirname(output_file), exist_ok=True)     # create file to store html code

                with open(output_file, "w", encoding="utf-8") as file:
                    file.write(rqst.text)
                file.close()

                for tag in tags_extracted:

                    link = tag.get('href')                  

                    if link is not None and link.startswith("http") and not any(ext in link.lower() for ext in ignore_ext):
                        
                        link = link.lower()

                        link = link.split('#')[0]
                        link = link.split('?', maxsplit=1)[0]
                        link = link.rstrip('/')
                        link = link.strip()

                        if link not in urls_crawled and domain in link:
#                             print(link)
                            url_q.append(link)                 # valid URL to append to the queue
                            urls_crawled.append(link)

                if (len(pages_crawled) > crawl_limit):
                    break                                       # stop crawling when reached limit

                page_no += 1
            

    except Exception as e:
        with open(error_file, "a+") as log:                  # add error message to error log
            log.write(f"Could not connect to {url}")
            log.write(f"\nError occured: {e}\n\n")
        file.close()

        print("Could not connect to ", url)
        print("Error occured: ", e, " \n")
        continue

Could not connect to  https://vcsaonline.uic.edu/lms
Error occured:  HTTPSConnectionPool(host='vcsaonline.uic.edu', port=443): Max retries exceeded with url: /lms (Caused by SSLError(SSLCertVerificationError("hostname 'vcsaonline.uic.edu' doesn't match 'dev.sa.uic.edu'")))  

Could not connect to  https://www.ready.uic.edu
Error occured:  HTTPSConnectionPool(host='www.ready.uic.edu', port=443): Max retries exceeded with url: / (Caused by SSLError(SSLCertVerificationError("hostname 'www.ready.uic.edu' doesn't match '*.accc.uic.edu'")))  

Could not connect to  https://vcsa.uic.edu
Error occured:  HTTPSConnectionPool(host='vcsa.uic.edu', port=443): Max retries exceeded with url: / (Caused by SSLError(SSLCertVerificationError("hostname 'vcsa.uic.edu' doesn't match '*.accc.uic.edu'")))  

Could not connect to  http://www.psych.uic.edu
Error occured:  HTTPSConnectionPool(host='www.psych.uic.edu', port=443): Max retries exceeded with url: / (Caused by SSLError(SSLError("bad handshake: Error(

Could not connect to  http://cbrl.lab.uic.edu
Error occured:  HTTPSConnectionPool(host='cbrl.lab.uic.edu', port=443): Max retries exceeded with url: / (Caused by SSLError(SSLError("bad handshake: Error([('SSL routines', 'tls_process_server_certificate', 'certificate verify failed')])")))  

Could not connect to  http://liu.lab.uic.edu
Error occured:  HTTPSConnectionPool(host='liu.lab.uic.edu', port=443): Max retries exceeded with url: / (Caused by SSLError(SSLError("bad handshake: Error([('SSL routines', 'tls_process_server_certificate', 'certificate verify failed')])")))  

Could not connect to  https://listserv.uic.edu/cgi-bin/wa
Error occured:  HTTPSConnectionPool(host='listserv.uic.edu', port=443): Max retries exceeded with url: /cgi-bin/wa (Caused by SSLError(SSLError("bad handshake: Error([('SSL routines', 'tls_process_server_certificate', 'certificate verify failed')])")))  

Could not connect to  https://web-travel-prod.cc.uic.edu/index.html
Error occured:  HTTPSConnectionPool(

In [8]:
pickle_folder = "../PickleFiles/"
os.makedirs(pickle_folder, exist_ok=True)

# Pickling the dict of crawled pages
with open(pickle_folder + '6000_pages_crawled.pickle', 'wb') as f:
    pickle.dump(pages_crawled,f)

In [9]:
len(pages_crawled)

6001

In [4]:
pickle_folder = "../PickleFiles/"
os.makedirs(pickle_folder, exist_ok=True)

In [5]:
with open(pickle_folder + '6000_pages_crawled.pickle', 'rb') as f:
    pages = pickle.load(f)

In [1]:
pages_crawled == pages

True