In [None]:
"""Functions for plugging into Pipulate-frameworks for conducting SEO investigations."""

In [None]:
import functools
import re
from collections import namedtuple
from html.parser import HTMLParser

import requests
# Lightweight result record returned by the scraping functions in this file.
# Its field names (ok, status_code, text) match the attributes of the same
# name on a requests response object.
Response = namedtuple('Response', 'ok status_code text')

In [None]:
class MLStripper(HTMLParser):
    """HTML parser that throws away markup and keeps only the text nodes.

    Feed it HTML with ``feed()``, call ``close()`` to flush anything still
    buffered, then read the concatenated text with ``get_data()``.
    """

    def __init__(self):
        # Run the real HTMLParser constructor so that every piece of internal
        # parser state is initialised, rather than relying on reset() alone.
        super().__init__(convert_charrefs=True)
        # Retained from the original implementation for anything that still
        # reads it; modern HTMLParser itself does not consult this attribute.
        self.strict = False
        # Text fragments collected so far, in document order.
        self.fed = []

    def handle_data(self, d):
        """Collect one chunk of text found between tags."""
        self.fed.append(d)

    def get_data(self):
        """Return all collected text joined into a single string."""
        return ''.join(self.fed)


def strip_tags(html):
    """Return ``html`` with all markup removed and character references decoded.

    Args:
        html: HTML source as a string.

    Returns:
        The concatenated text content of ``html``.
    """
    s = MLStripper()
    s.feed(html)
    # With convert_charrefs enabled the parser holds back trailing text that
    # contains an unterminated '&' (e.g. "AT&T") in case more input follows.
    # close() tells it the input is complete so that text is not lost.
    s.close()
    return s.get_data()

In [None]:
def url(passed_in_func):
    """Decorator for functions like Title to pre-extract html text.

    The decorated function is written to accept a single ``html`` keyword
    argument. The wrapper that replaces it accepts arbitrary keyword
    arguments (a row of values), reads ``row_dict['response'].text`` and
    passes that string on as ``html``. All other keyword arguments are
    ignored.

    Args:
        passed_in_func: Callable accepting ``html=<str>``.

    Returns:
        A keyword-only wrapper around ``passed_in_func``.

    Raises:
        KeyError: (from the wrapper) if no ``response`` keyword is supplied.
    """
    # functools.wraps keeps the decorated function's __name__ and __doc__ on
    # the wrapper, so decorated functions stay identifiable and documented.
    @functools.wraps(passed_in_func)
    def requests_wrapper(**row_dict):
        html = row_dict['response'].text
        return passed_in_func(html=html)
    return requests_wrapper

In [None]:
def text_node(html, tag):
    """Return the text of the first ``<tag>...</tag>`` element in ``html``.

    Simplifies common scraping functions for tags like Title.

    Matching is case-insensitive and tolerates attributes on the opening tag
    (``<TITLE lang="en">``), since both are common in real-world HTML.

    Args:
        html: HTML source as a string.
        tag: Tag name to look for, e.g. ``"title"``.

    Returns:
        A ``Response`` with ``ok=True`` and ``status_code='200'``. ``text`` is
        the whitespace-stripped content of the first matching element, or
        ``None`` when no such element exists.
    """
    # re.escape keeps a tag name containing regex metacharacters literal.
    # "(?:\s[^>]*)?" allows optional attributes while still rejecting longer
    # tag names that merely start with ``tag`` (e.g. <titles> for "title").
    pattern = r'<{0}(?:\s[^>]*)?>(.*?)</{0}\s*>'.format(re.escape(tag))
    # DOTALL lets the content span multiple lines.
    match = re.search(pattern, html, flags=re.DOTALL | re.IGNORECASE)
    text = match.group(1).strip() if match else None
    return Response(ok=True, status_code='200', text=text)

In [None]:
@url
def breadcrumb(html):
    """Extract the breadcrumb trail from the page's breadcrumbs list.

    Looks for the first ``<ul id="breadcrumbs" class="breadcrumbs
    hide-for-small-only">`` element, strips its markup, and concatenates the
    individual crumb labels (each whitespace-stripped) with no separator.

    Args:
        html: HTML source as a string (supplied by the ``url`` decorator).

    Returns:
        A ``Response`` with ``ok=True`` and ``status_code='200'``. ``text`` is
        the concatenated trail, or ``None`` when the list is not present.
    """
    pattern = r'<ul id="breadcrumbs" class="breadcrumbs hide-for-small-only">(.*?)</ul>'
    # DOTALL lets the list body span multiple lines.
    match = re.search(pattern, html, flags=re.DOTALL)
    if match is None:
        return Response(ok=True, status_code='200', text=None)
    text = match.group(1).strip()
    # Crumbs are separated by this exact run of CRLFs and indentation once
    # the tags are removed; the separator is tied to the target markup.
    path_list = strip_tags(text).split("\r\n        \r\n        ")
    trail = ''.join(x.strip() for x in path_list)
    return Response(ok=True, status_code='200', text=trail)

In [None]:
@url
def Title(html):
    """Return the result of looking up the ``title`` element in ``html``."""
    title_result = text_node(tag="title", html=html)
    return title_result

In [None]:
if __name__ == '__main__':
    # Manual smoke test: fetch one page and print the Title result for it.
    test_url = 'http://mikelev.in/'
    # requests waits indefinitely unless a timeout is given; bound the call
    # (in seconds) so an unresponsive server cannot hang this script.
    response = requests.get(test_url, timeout=10)
    print(Title(url=test_url, response=response))