In [1]:
from collections import defaultdict
from urllib.parse import urlparse

import requests
from bs4 import BeautifulSoup

In [2]:
url = "https://pu_dev.ingham.org/departments_and_officials/purchasing/bid_center.php"

In [3]:
# Fetch the page. The timeout stops a hung connection from blocking the
# notebook forever, and raise_for_status() makes an HTTP error fail here
# instead of letting an error page be analysed as if it were the bid center.
res = requests.get(url, timeout=30)
res.raise_for_status()
# Name the parser explicitly: without it bs4 picks whichever parser happens
# to be installed (and emits a warning), so the parsed tree could differ
# from one environment to the next.
soup = BeautifulSoup(res.text, "html.parser")


class Report:
    """Sort anchor tags into buckets according to the ``href`` they carry.

    Every bucket starts empty and is filled by :meth:`scanlink`.
    """

    def __init__(self):
        # Anchors that passed every filter; these are also indexed below.
        self.links = []
        # Anchors with no href attribute, or whose href is exactly "#".
        self.no_url = []
        # Anchors whose href contains the substring "todo".
        self.todo = []
        # Anchors whose href contains "../".
        self.relative = []
        # Accepted anchors grouped by urlparse(...).netloc ("" when absent).
        self.domains = defaultdict(list)
        # Accepted anchors grouped by urlparse(...).scheme ("" when absent).
        self.schemes = defaultdict(list)
        # Anchors whose href is a mailto: URL.
        self.mailto = []

    def scanlink(self, link):
        """File ``link`` into exactly one bucket.

        Args:
            link: an object exposing a dict-like ``attrs`` mapping; only the
                ``"href"`` entry is read.

        Returns:
            True when the link was accepted (appended to ``links`` and
            indexed under ``domains`` and ``schemes``); False when it was
            diverted to ``no_url``, ``todo``, ``relative`` or ``mailto``.
        """
        url = link.attrs.get("href")
        if url is None or url == "#":
            self.no_url.append(link)
            return False
        if "todo" in url:
            self.todo.append(link)
            return False
        if "../" in url:
            self.relative.append(link)
            return False
        # Only the leading scheme decides whether this is a mail link. A plain
        # substring test would also catch http(s) links that merely carry
        # "mailto:" in their query string, and would miss "MAILTO:".
        if url.lstrip().lower().startswith("mailto:"):
            self.mailto.append(link)
            return False

        self.links.append(link)
        parsed = urlparse(url)
        self.domains[parsed.netloc].append(link)
        self.schemes[parsed.scheme].append(link)
        return True


# Run every <a> tag on the page through the classifier.
r = Report()
anchor_tags = soup.find_all("a")
for link in anchor_tags:
    r.scanlink(link)

In [4]:
# Anchors that have no href attribute, or whose href is exactly "#".
r.no_url

[<a name="faq_rz660"></a>,
 <a name="faq_rz661"></a>,
 <a name="faq_rz662"></a>,
 <a name="faq_rz664"></a>]

In [5]:
# Anchors whose href contains the substring "todo".
r.todo

[]

In [6]:
# Anchors that Report.scanlink filed as mailto links.
r.mailto

[<a href="mailto:jbuckmaster@ingham.org" target="_blank">jbuckmaster@ingham.org</a>,
 <a href="mailto:packetresponse@ingham.org" target="_blank">packetresponse@ingham.org</a>,
 <a href="mailto:ingham-bids@grangerconstruction.com?subject=Request%20for%20Bid%20Opening%20and-or%20Results">Click Here to email a request for Bid Opening &amp; Results</a>,
 <a href="mailto:ingham-bids@grangerconstruction.com?subject=Request%20for%20Bid%20Opening%20and-or%20Results">Click Here to email a request for Bid Opening &amp; Results</a>,
 <a href="mailto:tom.shanley@kramermg.com">For Addendums<br/>contact Kramer<br/>Management<br/>Group, <br/>Tom Shanley</a>,
 <a href="mailto:tom.shanley@kramermg.com">tom.shanley@kramermg.com</a>,
 <a href="mailto:tom.shanley@kramermg.com?subject=Packet%20196-19">Contact<br/>Kramer Mgmt</a>]

In [7]:
# Show each "../" link together with its parent element, then two blank lines.
for link in r.relative:
    print(link, link.parent, "", "", sep="\n")

<a href="departments_and_officials/purchasing/../index.php">Departments &amp; Officials</a>
<div id="breadcrumbs">
<a href="./">Home</a>   <a href="departments_and_officials/purchasing/../index.php">Departments &amp; Officials</a>   <a href="departments_and_officials/purchasing/index.php">Purchasing</a>   Bid Center
        </div>




In [8]:
# URL schemes seen among the accepted links ('' means the href had no scheme).
r.schemes.keys()

dict_keys(['', 'http', 'https'])

In [9]:
# Accepted links whose href has no scheme.
r.schemes[""]

[<a href="departments_and_officials/purchasing/bid_center.php#main" id="skip" tabindex="0">Skip to content</a>,
 <a class="Afrikaans" data-lang="Afrikaans" href="departments_and_officials/purchasing/bid_center.php#">Afrikaans</a>,
 <a class="Albanian" data-lang="Albanian" href="departments_and_officials/purchasing/bid_center.php#">Albanian</a>,
 <a class="Arabic" data-lang="Arabic" href="departments_and_officials/purchasing/bid_center.php#">Arabic</a>,
 <a class="Armenian" data-lang="Armenian" href="departments_and_officials/purchasing/bid_center.php#">Armenian</a>,
 <a class="Azerbaijani" data-lang="Azerbaijani" href="departments_and_officials/purchasing/bid_center.php#">Azerbaijani</a>,
 <a class="Basque" data-lang="Basque" href="departments_and_officials/purchasing/bid_center.php#">Basque</a>,
 <a class="Belarusian" data-lang="Belarusian" href="departments_and_officials/purchasing/bid_center.php#">Belarusian</a>,
 <a class="Bengali" data-lang="Bengali" href="departments_and_official

In [10]:
# Accepted links whose href uses the plain "http" scheme.
r.schemes["http"]

[<a class="menuA level1 menuHidden" href="http://pe.ingham.org" target="_new">Human Resources</a>,
 <a class="menuA level1 menuHidden" href="http://va.ingham.org" target="_new">Veterans Affairs</a>,
 <a class="menuA level1 menuHidden" href="http://parks.ingham.org" target="_new">Ingham County Parks</a>,
 <a class="menuA level1 menuHidden" href="http://cms3.revize.com/revize/inghamfair/index.php" target="_new">Ingham County Fairgrounds</a>,
 <a class="menuA level0 menuDisplay" href="http://emergency.ingham.org" target="_new">Emergencies</a>,
 <a href="http://bc.ingham.org/Resolutions/tabid/2220/articleType/CategoryView/CategoryID/7/currentpage/3/Default.aspx" target="_blank">Resolution<br/>20-219</a>,
 <a href="http://bc.ingham.org/Resolutions/tabid/2220/articleType/ArchiveView/year/2019/Default.aspx">See Resolution<br/>19-029</a>,
 <a href="http://bc.ingham.org/Resolutions/tabid/2220/articleType/CategoryView/CategoryID/17/currentpage/8/Default.aspx">See Resolution<br/>19-044</a>,
 <a h

In [11]:
# One row per domain: the name padded to 34 columns, a tab, then the link count.
for domain, tagged in r.domains.items():
    padded = domain.ljust(34)
    print(f"{padded}\t{len(tagged)}")

                                  	141
pe.ingham.org                     	1
va.ingham.org                     	1
parks.ingham.org                  	1
cms3.revize.com                   	2
potterparkzoo.org                 	1
emergency.ingham.org              	1
apps.ingham.org                   	2
www.irs.gov                       	1
pu.ingham.org                     	1
inghamcnty.sharepoint.com         	464
inghamcnty-my.sharepoint.com      	4
bc.ingham.org                     	4
grangerconstruction-my.sharepoint.com	2
www.facebook.com                  	1
www.twitter.com                   	1
www.reddit.com                    	1


In [None]:
# Dump every accepted link that has no domain, each followed by two blank lines.
for link in r.domains[""]:
    # print(link.parent.parent)
    print(link, end="\n\n\n")

In [None]:
# Print the hrefs of the accepted links, beginning at the BidQuestions page
# and leaving out the two boilerplate self-links.
# The per-link value is called `href` rather than `url` so that this cell no
# longer overwrites the notebook-level `url` that the fetch cell reads.
started = False
for tag in r.links:
    href = tag.attrs["href"]
    if href in ("departments_and_officials/purchasing/bid_center.php#", "./"):
        continue
    if not started:
        # Everything before the marker page is skipped; the marker itself is printed.
        if href != "departments_and_officials/purchasing/BidQuestions.php":
            continue
        started = True
    print(href)

In [27]:
# Cache for check_link: full URL -> True/False outcome of its check.
s = {}

In [27]:
def check_link(url, base="https://pu_dev.ingham.org/", timeout=10):
    """Return True when ``base + url`` answers a GET request with a 2xx status.

    Results are memoised in the notebook-level dict ``s``, keyed by the full
    URL, so each address is requested at most once.

    Args:
        url: path (or other suffix) appended verbatim to ``base``.
        base: prefix prepended to ``url``; defaults to the dev site root.
        timeout: seconds passed to ``requests.get`` before giving up.

    Returns:
        True for a 2xx response. False for any other status, and also when
        the request itself fails (connection error, timeout, ...).
    """
    url = base + url
    if url in s:
        return s[url]
    print("checking " + url)
    try:
        res = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # An unreachable link is a broken link; record it and keep scanning
        # rather than letting one failure abort the whole run.
        s[url] = False
        return False
    s[url] = 200 <= res.status_code < 300
    return s[url]

In [31]:
# Check every domain-less link against the site and print the ones that fail.
relative_hrefs = [tag.attrs["href"] for tag in r.domains[""]]
for u in relative_hrefs:
    if check_link(u):
        continue
    print(u)