# Pipulate Functions: Stuff you can do from a Google Sheet

In [None]:
"""Functions for conducting SEO and other data investigations."""

In [None]:
import requests, re, sys, os         #Modules that should be available to ALL pipulate functions
from collections import namedtuple   #Allows creation of a highly readable response object API for functions
from html.parser import HTMLParser   #Needed for MLStripper to inherit from
from urllib.parse import urlparse    #Very commonly used for getting the parts out of a URL
import notebook_finder               #Allows importing of other .ipynb files as if they were .py file modules
import goodsheet                     #Provides OAuth2 login for Google Sheets and other Google Services

## Define the standard return value for Pipulate functions

In [None]:
# Standard API for the pipulate response object: every Pipulate function
# returns one of these three-field tuples.
Response = namedtuple('Response', ['ok', 'status_code', 'text'])

## Pipulate funcs return Response(ok=True, status_code='200', text='string')

In [None]:
def foo(**kwargs):
    """Minimal template to copy and paste when writing a new Pipulate function.

    A function defined in this file can be used as a column name in a
    Google Sheet; the text it returns is what gets written into the sheet.
    Unless a decorator sits above it, a Pipulate function has to accept
    **kwargs. Decorators and support functions keep such functions short.
    At the very least, a Pipulate function returns this 3-element tuple."""
    bare_bones = Response(ok=True, status_code='200', text='bar')
    return bare_bones

In [None]:
def url(passed_in_func):
    """This is the frequently-used decorator function that passes along pre-fetched HTML.

    This decorator function allows you to use @url above any Pipulate function.
    This in turn allows "html" to be used as the argument instead of **kwargs.
    Swapping out full HTML for a URL is great for screen-scraping functions like title.
    This function must appear in this file before anything decorated by it.

    Args:
        passed_in_func: Callable accepting a single ``html`` keyword argument.

    Returns:
        A wrapper accepting ``**row_dict``. It reads ``row_dict['response'].text``
        and forwards it to ``passed_in_func`` as ``html``; every other key in
        the row is ignored. The wrapper keeps the decorated function's
        ``__name__`` and ``__doc__``.

    Raises:
        KeyError: When the wrapper is called without a 'response' entry.
    """
    # Imported locally so this block does not depend on a file-level import.
    from functools import wraps

    @wraps(passed_in_func)  # keep the decorated function's name and docstring
    def requests_wrapper(**row_dict):
        html = row_dict['response'].text
        return passed_in_func(html=html)
    return requests_wrapper

In [None]:
@url
def Title(html):
    """Grab the title tag's text from a page: the canonical scraping example.

    Because it is decorated, this function takes ``html`` rather than **kwargs:
    the @url decorator filters the row's keyword arguments and hands over only
    the pre-fetched HTML. The same pattern works for pulling any TEXT NODE out
    of HTML (it does not cover attributes). Pipulate function names are case
    insensitive, so this could just as well be called 'title'."""
    title_response = extract_text_node(tag='title', html=html)
    return title_response

In [None]:
def realurl(**row_dict):
    """Return the actual URL retrieved after resolving all redirects.

    Args:
        **row_dict: Row values keyed by name; ``row_dict['url']`` is fetched.

    Returns:
        Response(ok=True, status_code='200', text=<final URL>) when the request
        succeeds, otherwise Response(ok=False, status_code='500',
        text=<error message>).

    Raises:
        KeyError: When no 'url' entry is supplied.
    """
    # Local name differs from "url" so the @url decorator is not shadowed.
    target = row_dict['url']
    try:
        response = requests.get(target)
    except requests.exceptions.RequestException as error:
        # No response object exists when the request itself fails, so the
        # error message is reported instead of a status code.
        return Response(ok=False, status_code='500', text=str(error))
    return Response(ok=True, status_code='200', text=response.url)

In [None]:
def apexdomain(**row_dict):
    """Usually returns the apex or registered domain, given an URL.

    This is a length heuristic, not a public-suffix lookup: when the
    second-to-last hostname label is shorter than four characters (as in
    "co.uk") the last three labels are kept, otherwise the last two.

    Args:
        **row_dict: Row values keyed by name; ``row_dict['url']`` is parsed.

    Returns:
        Response whose text is the guessed domain, "Can't find" when no usable
        hostname can be extracted, or 'No input' (status_code='400') when the
        url value is empty.

    Raises:
        KeyError: When no 'url' entry is supplied.
    """
    path = row_dict['url']
    if not path:
        return Response(ok=True, status_code='400', text='No input')
    try:
        hostname = urlparse(path).hostname
    except ValueError:
        # urlparse rejects some malformed netlocs (e.g. a broken IPv6 literal).
        hostname = None
    # hostname is None for scheme-less input such as "example.com/page".
    labels = hostname.split(".") if hostname else []
    if len(labels) < 2:
        return Response(ok=True, status_code='200', text="Can't find")
    apex = labels[-3:] if len(labels[-2]) < 4 else labels[-2:]
    return Response(ok=True, status_code='200', text=".".join(apex))

In [None]:
def gaorganic(**kwargs):
    """Look up the ga:organicSearches value for one URL's path in Google Analytics.

    Args:
        **kwargs: Row values keyed by name. Reads 'url' (only its path is used
            in the filter), 'ids', 'startdate' and 'enddate'; the last three
            are passed straight through to the Analytics query.

    Returns:
        Response(ok=True, status_code='200', text=<second column of the first
        returned row>) when rows come back,
        Response(ok=False, status_code='500', text="Did not execute") when the
        query raises, and Response(ok=False, status_code='200',
        text='Not found') when the response holds no rows.

    Raises:
        KeyError: When one of the required keys is missing.
    """
    page_path = urlparse(kwargs['url']).path
    # Literal commas in the path are backslash-escaped for the filter string
    # (presumably because a bare comma is an operator there; see the Core
    # Reporting API filter syntax).
    path_filter = "ga:pagePath==%s" % page_path.replace(",", "\\,")
    service = create_google_service(api_name="analytics", version="v3")
    ga_request = service.data().ga().get(
        ids=kwargs['ids'],
        start_date=kwargs['startdate'],
        end_date=kwargs['enddate'],
        metrics='ga:organicSearches',
        dimensions='ga:pagePath',
        filters=path_filter,
        start_index='1',
        max_results='100'
    )
    try:
        ga_response = ga_request.execute()
    except Exception:
        # Best effort: any API failure becomes an error cell, while
        # KeyboardInterrupt and SystemExit still propagate.
        return Response(ok=False, status_code='500', text="Did not execute")
    if ga_response and 'rows' in ga_response:
        return Response(ok=True, status_code='200', text=ga_response['rows'][0][1])
    return Response(ok=False, status_code='200', text='Not found')

## Populate() funcs return lists of lists [['A1','B1','C1'],['A2','B2','C2']]

In [None]:
def populate_from_gsc(site, start, end):
    """Fetch query/page search-analytics rows for a site as a list of lists.

    Results are requested in up to four pages of 5000 rows each; paging stops
    early as soon as a response comes back without a 'rows' entry.

    Args:
        site: Passed as ``siteUrl`` to the searchanalytics query.
        start: Start date; converted with ``str()`` for the request body.
        end: End date; converted with ``str()`` for the request body.

    Returns:
        A list of rows. The first row is the header
        ['keyword', 'url', 'position', 'impressions', 'clicks', 'ctr'] and each
        following row holds those values for one query/page pair.
    """
    service = create_google_service(api_name="webmasters", version="v3")
    units = 5000
    results = []
    for start_row in range(0, units * 4, units):
        request = {
            'startDate': str(start),
            'endDate': str(end),
            'dimensions': ['query', 'page'],
            'rowLimit': str(units),
            'startRow': str(start_row)
        }
        page = service.searchanalytics().query(siteUrl=site, body=request).execute()
        if not page or 'rows' not in page:
            break
        # Extend in place: flattening with sum(list_of_lists, []) afterwards
        # would re-copy the accumulated rows for every page.
        results.extend(page['rows'])
    list_of_rows = [['keyword', 'url', 'position', 'impressions', 'clicks', 'ctr']]
    list_of_rows.extend(
        [item['keys'][0], item['keys'][1], item['position'],
         item['impressions'], item['clicks'], item['ctr']]
        for item in results
    )
    return list_of_rows

In [None]:
def populate_from_ga(prepath, profileid, start, end):
    """Fetch pages with organic searches from Google Analytics as a list of lists.

    Experiment at https://ga-dev-tools.appspot.com/query-explorer/

    Args:
        prepath: String prepended to every returned ga:pagePath value to build
            the 'url' column.
        profileid: Passed as the ``ids`` parameter of the query.
        start: Passed as ``start_date``.
        end: Passed as ``end_date``.

    Returns:
        A list of rows whose first row is the header
        ['url', 'organicsearches', 'sessions', 'bounces'], or [['ERROR']] when
        the query raises or the response holds no rows.
    """
    service = create_google_service(api_name="analytics", version="v3")
    ga_request = service.data().ga().get(
        ids=profileid,
        start_date=start,
        end_date=end,
        metrics='ga:organicSearches,ga:sessions,ga:bounces',
        dimensions='ga:pagePath',
        sort='-ga:organicSearches,-ga:sessions',
        filters='ga:organicSearches>0',
        samplingLevel='HIGHER_PRECISION',
        start_index='1',
        max_results='10000'
    )
    try:
        ga_response = ga_request.execute()
    except Exception:
        # Best effort: report the failure in the sheet, but let
        # KeyboardInterrupt and SystemExit propagate.
        return [['ERROR']]
    # An empty or None response is treated the same as one without rows.
    if not ga_response or 'rows' not in ga_response:
        return [['ERROR']]
    header = [['url', 'organicsearches', 'sessions', 'bounces']]
    return header + [[prepath + a, b, c, d] for a, b, c, d in ga_response['rows']]

In [None]:
def one_page_crawl(url, regex=None):
    """Return the unique same-site links found on a page, optionally filtered.

    A link is kept when its href is a string that contains the page's hostname
    and is longer than that hostname. When ``regex`` is given, the href must
    also match it (case-insensitive search).

    Args:
        url: Address of the page to fetch and scan.
        regex: Optional regular-expression string used to filter the hrefs.

    Returns:
        A list of single-item rows: the header ['URL'] followed by one row per
        unique href in sorted order, or [['ERROR']] when the page cannot be
        fetched or no link qualifies.
    """
    from bs4 import BeautifulSoup
    try:
        html = requests.get(url).text
    except Exception:
        # Best effort: any fetch failure becomes an error cell, while
        # KeyboardInterrupt and SystemExit still propagate.
        return [['ERROR']]
    lookfor = urlparse(url).hostname
    if not lookfor:
        return [['ERROR']]
    # Compile once instead of searching with the raw pattern for every link.
    matcher = re.compile(regex, flags=re.I) if regex else None
    soup = BeautifulSoup(html, "lxml")
    nodupes = set()
    for link in soup.find_all("a"):
        href = link.get("href")
        if not isinstance(href, str):
            continue
        if len(href) <= len(lookfor) or lookfor not in href:
            continue
        if matcher is None or matcher.search(href):
            nodupes.add(href)
    if not nodupes:
        return [['ERROR']]
    # Sorted so the rows come out in the same order on every run.
    return [['URL']] + [[href] for href in sorted(nodupes)]

## Support Functions that are not called from sheets or workflows

In [None]:
def extract_text_node(html, tag):
    """This is the helper function that extracts text-nodes like title tags from HTML.

    This function is for simple text-node screen scraping, such as for the Title tag.
    It is different from many support functions in that the return values get wrapped
    in the same Response() named tuple as functions that are meant to be called from
    spreadsheets directly (like Title), so that those can be short and clear like this:

        @url
        def Title(html):
            return extract_text_node(html=html, tag="title")

    Tag matching ignores case and tolerates attributes on the opening tag, so
    ``<TITLE>`` and ``<title id="x">`` are both found.

    Args:
        html: The markup to search.
        tag: Name of the element whose text is wanted, e.g. "title".

    Returns:
        Response(ok=False, status_code='400', text=None) when html or tag is
        empty; otherwise Response(ok=True, status_code='200', text=...) where
        text is the stripped content of the first matching element, or None
        when the element is absent.
    """
    if not html or not tag:
        return Response(ok=False, status_code='400', text=None)
    # The "(?:\s[^>]*)?" part allows attributes but still requires whitespace
    # or ">" right after the name, so tag "a" cannot match "<abbr>".
    pattern = r'<{0}(?:\s[^>]*)?>(.*?)</{0}\s*>'.format(re.escape(tag))
    match = re.search(pattern, html, flags=re.DOTALL | re.IGNORECASE)
    text = match.group(1).strip() if match else None
    return Response(ok=True, status_code='200', text=text)

In [None]:
def create_google_service(api_name, version):
    """Build a client for the named Google API using stored credentials.

    Args:
        api_name: Service name handed to ``build`` (e.g. "analytics").
        version: API version handed to ``build`` (e.g. "v3").

    Returns:
        The service object produced by ``apiclient.discovery.build``.
    """
    import httplib2
    from oauth2client import file, tools
    from apiclient.discovery import build
    # '__file__' is a quoted literal here, so realpath() resolves it against
    # the current working directory; the credentials file named by
    # goodsheet.filename is looked up in that directory.
    here = os.path.dirname(os.path.realpath('__file__'))
    storage = file.Storage(os.path.join(here, goodsheet.filename))
    authorized_http = storage.get().authorize(http=httplib2.Http())
    return build(api_name, version, authorized_http)

In [None]:
class MLStripper(HTMLParser):
    """HTML parser that keeps only the text it finds between tags.

    Adapted from
    http://stackoverflow.com/questions/753052/strip-html-from-strings-in-python
    """

    def __init__(self):
        # Let HTMLParser set up its own state (its __init__ calls reset());
        # convert_charrefs=True delivers character references such as &amp;
        # to handle_data() already decoded.
        super().__init__(convert_charrefs=True)
        self.strict = False  # legacy attribute, retained for compatibility
        self.fed = []

    def handle_data(self, d):
        """Collect one chunk of text content."""
        self.fed.append(d)

    def get_data(self):
        """Return everything collected so far as one string."""
        return ''.join(self.fed)


def strip_tags(html):
    """Return ``html`` with all markup removed, leaving only its text."""
    s = MLStripper()
    s.feed(html)
    # close() flushes text the parser is still buffering; without it, trailing
    # text containing an unterminated "&" (e.g. "R&D") is silently dropped.
    s.close()
    return s.get_data()


def normalize_whitespace(string):
    """Collapse every run of whitespace (spaces, tabs, newlines) into one space.

    Leading and trailing whitespace is collapsed to a single space as well,
    not removed.
    """
    # Raw string avoids the invalid "\s" escape; \s already covers \r and \n.
    return re.sub(r'\s+', ' ', string)