In [None]:
"""Functions for plugging into Pipulate-frameworks for conducting SEO investigations."""

In [None]:
import requests, re, os
from collections import namedtuple
from html.parser import HTMLParser
from datetime import date, time, datetime, timedelta
import notebook_finder
import goodsheet
import pandas as pd
Response = namedtuple('Response', 'ok status_code text') # Shape the standard pipulate response object

In [None]:
class make_date_markers():
    """Create a selection of dates that can be used as arguments to Analytics or Search Console.

    Every marker is stored on the instance as a ``str()`` of a date
    (ISO ``YYYY-MM-DD`` form):

    * ``today``                -- the reference date.
    * ``day``                  -- ``today`` minus ``shift`` days.
    * ``first_day_this_month`` -- first day of the month containing ``day``.
    * ``month_start`` / ``month_end`` -- first and last day of the month
      before the one containing ``day``.
    * ``first_week_of_year``   -- the Monday that starts ISO week 1 of the
      ISO year containing ``day``.
    * ``week_start`` / ``week_end`` -- seven days running from the day before
      the Monday of ``day``'s ISO week (a Sunday) to the following Saturday.
    * ``start_90_days`` / ``end_90_days`` -- ``end_90_days`` is ``day``;
      ``start_90_days`` is ``90 + (shift - 1)`` days earlier.
    """
    def __init__(self, today=None, shift=3):
        """Compute the markers.

        Args:
            today: ``datetime.date`` to use as the reference date. Defaults to
                ``date.today()``; pass a fixed date for reproducible output.
            shift: number of days ``day`` lags behind ``today`` (default 3,
                presumably to allow for reporting delay -- confirm).
        """
        if today is None:
            today = date.today()
        day = today - timedelta(days=shift)

        # ISO week numbers belong to the ISO year, which is not always the
        # calendar year (2021-01-02 is in week 53 of ISO year 2020). The week
        # anchor must therefore be built from the ISO year, otherwise dates
        # around New Year produce a week that is a year out.
        iso_year, week = day.isocalendar()[:2]

        first_day_this_month = day.replace(day=1)

        # Monday of ISO week 1: if Jan 1 falls on Fri/Sat/Sun, week 1 starts on
        # the following Monday, otherwise on the Monday on or before Jan 1.
        jan_first = date(iso_year, 1, 1)
        if jan_first.weekday() > 3:
            first_week_of_year = jan_first + timedelta(days=7 - jan_first.weekday())
        else:
            first_week_of_year = jan_first - timedelta(days=jan_first.weekday())

        # Monday of day's ISO week, then shift the window back one day so it
        # runs Sunday..Saturday.
        week_monday = first_week_of_year + timedelta(weeks=week - 1)
        week_start = week_monday - timedelta(days=1)
        week_end = week_monday + timedelta(days=5)

        # The "month" window is the complete month before day's month.
        month_end = first_day_this_month - timedelta(days=1)
        month_start = month_end.replace(day=1)

        start_90_days = day - timedelta(days=90) - timedelta(days=shift - 1)
        end_90_days = day

        markers = {
            'first_week_of_year': first_week_of_year,
            'today': today,
            'day': day,
            'first_day_this_month': first_day_this_month,
            'month_start': month_start,
            'month_end': month_end,
            'week_start': week_start,
            'week_end': week_end,
            'start_90_days': start_90_days,
            'end_90_days': end_90_days,
        }
        for name, value in markers.items():
            setattr(self, name, str(value))

#dates = make_date_markers()
#['%s: %s' % (x, eval('dates.%s' % x)) for x in dir(dates) if x[0] != '_']

In [None]:
def create_google_service(filename, api_name, version):
    """Build an authorized Google API client from stored OAuth2 credentials.

    Args:
        filename: base name of the credential store; ``.dat`` is appended and
            the file is looked up in the current working directory.
        api_name: service name handed to ``build`` (e.g. ``"webmasters"``).
        version: API version handed to ``build`` (e.g. ``"v3"``).

    Returns:
        The service object returned by ``apiclient.discovery.build``.

    Raises:
        RuntimeError: if the credential store yields no credentials.
    """
    import httplib2
    from oauth2client import file
    from apiclient.discovery import build
    # '__file__' is deliberately a quoted literal, not the module variable:
    # realpath() resolves the relative name against the current working
    # directory, so dirname() gives that directory.
    path = os.path.dirname(os.path.realpath('__file__'))
    filename = '%s/%s.dat' % (path, filename)
    credentials = file.Storage(filename).get()
    if credentials is None:
        # Without this check the failure is an opaque AttributeError on None.
        raise RuntimeError('No stored OAuth2 credentials found in %s' % filename)
    http = credentials.authorize(http=httplib2.Http())
    return build(api_name, version, http=http)

In [None]:
def search_console(site, start, end):
    """Fetch Search Analytics rows for ``site`` between ``start`` and ``end``.

    The query uses the ``query`` and ``page`` dimensions and is paged in
    blocks of 5000 rows at start rows 0, 5000, 10000 and 15000. Paging stops
    at the first response that is empty or has no ``rows`` entry.

    Returns:
        One flat list holding the ``rows`` of every page fetched, in order.
    """
    service = create_google_service(filename="oauth", api_name="webmasters", version="v3")
    page_size = 5000
    collected = []
    for offset in range(0, page_size*4, page_size):
        body = {
            'startDate': str(start),
            'endDate': str(end),
            'dimensions': ['query', 'page'],
            'rowLimit': str(page_size),
            'startRow': str(offset)
        }
        reply = service.searchanalytics().query(siteUrl=site, body=body).execute()
        if not reply or 'rows' not in reply:
            break
        collected.extend(reply['rows'])
    return collected
#dates = make_date_markers()
#results = search_console(site='mikelev.in', start=dates.month_start, end=dates.month_end)
#print(results)

In [None]:
def correlate_timerange_averages(site):
    """Return one DataFrame comparing Search Console metrics over four date ranges.

    ``search_console`` is queried for ``site`` over four ranges taken from
    ``make_date_markers``: the single ``day``, the week, the month and the
    90-day window. Each result becomes a frame keyed by ``url`` and
    ``keyword`` whose metric columns carry a suffix (``dy``, ``wk``, ``mo``,
    ``90``). The four frames are outer-merged on ``url`` and ``keyword``.

    Returns:
        pandas.DataFrame with columns ``url``, ``keyword``, then ``position*``,
        ``clicks*``, ``impressions*`` and ``ctr*`` for each suffix, sorted by
        ``impressionsdy`` in descending order.
    """
    from functools import reduce
    dates = make_date_markers()
    key_columns = ['url', 'keyword']
    metrics = ['position', 'clicks', 'impressions', 'ctr']
    # (column suffix, start date, end date) for each range, in merge order.
    date_ranges = [
        ('dy', dates.day, dates.day),
        ('wk', dates.week_start, dates.week_end),
        ('mo', dates.month_start, dates.month_end),
        ('90', dates.start_90_days, dates.end_90_days),
    ]
    responses = [search_console(site, start, end) for _, start, end in date_ranges]

    list_of_frames = []
    for (suffix, _, _), rows in zip(date_ranges, responses):
        # The query dimensions are ['query', 'page'], so keys[0] is the
        # keyword and keys[1] is the url.
        table = [(row['keys'][1],
                  row['keys'][0],
                  row['position'],
                  row['clicks'],
                  row['impressions'],
                  row['ctr']) for row in rows]
        columns = key_columns + [metric + suffix for metric in metrics]
        list_of_frames.append(pd.DataFrame(data=table, columns=columns))

    joined_results = reduce(
        lambda l, r: pd.merge(l, r, on=key_columns, how="outer"),
        list_of_frames)
    ordered_columns = key_columns + [metric + suffix
                                     for metric in metrics
                                     for suffix, _, _ in date_ranges]
    # Return the sorted frame instead of sorting a column slice in place,
    # which can raise SettingWithCopyWarning.
    return joined_results[ordered_columns].sort_values(
        'impressionsdy', axis=0, ascending=False)

In [None]:
# Build the combined report for one site. correlate_timerange_averages() goes
# through search_console(), so running this cell issues live API requests.
results = correlate_timerange_averages('mikelev.in')

In [None]:
# Show the value returned by correlate_timerange_averages() as this cell's output.
results

## Decorator to simplify functions that need the HTML of a URL-fetch as an argument

In [None]:
def url(passed_in_func):
    """Decorator for functions like Title to pre-extract html text.

    The decorated function is called with keyword arguments; the wrapper reads
    ``.text`` from the ``response`` argument and calls
    ``passed_in_func(html=<that text>)``. Every other keyword argument is
    ignored. A missing ``response`` argument raises ``KeyError``.

    The wrapper keeps the decorated function's ``__name__`` and ``__doc__``.
    """
    from functools import wraps

    @wraps(passed_in_func)
    def requests_wrapper(**row_dict):
        html = row_dict['response'].text
        return passed_in_func(html=html)
    return requests_wrapper

## Generic all-purpose helper-functions to be called from within all other functions

In [None]:
class MLStripper(HTMLParser):
    """http://stackoverflow.com/questions/753052/strip-html-from-strings-in-python

    HTML parser that ignores markup and collects the text passed to
    ``handle_data``; ``get_data`` returns that text joined together.
    """
    def __init__(self):
        # Run the base initializer so all parser state is set up by HTMLParser
        # itself (it also calls reset()); character references are converted
        # to text before reaching handle_data.
        super().__init__(convert_charrefs=True)
        # Kept from the original recipe; a plain attribute on parsers that no
        # longer have a strict mode.
        self.strict = False
        self.fed = []
    def handle_data(self, d):
        """Record one chunk of text found between tags."""
        self.fed.append(d)
    def get_data(self):
        """Return all text collected so far as a single string."""
        return ''.join(self.fed)

def strip_tags(html):
    """Return the text content of ``html`` with all markup removed."""
    s = MLStripper()
    s.feed(html)
    # The parser may hold back trailing text (e.g. after an unterminated '&')
    # while waiting for more input; close() forces that buffered text out.
    s.close()
    return s.get_data()

In [None]:
def normalize_whitespace(string):
    """Replace every run of whitespace (spaces, tabs, CR, LF, ...) with one space.

    Leading and trailing runs are collapsed too, not removed.
    """
    # Raw string: '\s' in a plain literal is an invalid escape sequence.
    # \s already covers \r and \n.
    return re.sub(r'\s+', ' ', string)

In [None]:
def extract_text_node(html, tag):
    """Returns text node string for tags like Title. Simplifies common scraping functions.

    Finds the first ``<tag ...>...</tag>`` element in ``html`` and returns its
    content with surrounding whitespace stripped. Tag names match
    case-insensitively and the opening tag may carry attributes.

    Args:
        html: markup to search.
        tag: element name, e.g. ``"title"``.

    Returns:
        ``Response(ok=True, status_code='200', text=<content>)``; ``text`` is
        ``None`` when no such element is found.
    """
    # re.escape keeps an unusual tag name from being read as regex syntax.
    # (?:\s[^>]*)? allows attributes but not a longer tag name.
    pattern = r'<{0}(?:\s[^>]*)?>(.*?)</{0}\s*>'.format(re.escape(tag.lower()))
    found = re.search(pattern, html, flags=re.DOTALL | re.IGNORECASE)
    text = found.group(1).strip() if found else None
    return Response(ok=True, status_code='200', text=text)

## Pipulate Functions that can be used as column-labels of spreadsheets

In [None]:
@url
def Title(html):
    """Pipulate column function: the text of the page's <title> element.

    Receives the page html through the ``url`` decorator and returns the
    Response produced by ``extract_text_node`` for the ``title`` tag.
    """
    title_response = extract_text_node(html=html, tag="title")
    return title_response

In [None]:
@url
def breadcrumb(html):
    """Pipulate column function: the page's breadcrumb trail as a '/'-joined string.

    Looks for the first ``<ul>`` whose ``class`` or ``id`` attribute value
    starts with ``breadcrumb``, strips the markup inside it, collapses
    whitespace, and trims each '/'-separated part.

    Returns:
        ``Response(ok=True, status_code='200', text=<trail>)``; ``text`` is
        ``None`` when no such list is found.
    """
    # [^>]* keeps the attribute search inside the opening <ul ...> tag. With
    # '.*?' under DOTALL the match could start at an unrelated <ul and run
    # across other elements to reach a later class/id="breadcrumb...".
    pattern = r'<ul\s[^>]*?(?:class|id)="breadcrumb[^>]*>(?P<return>.*?)</ul>'
    found = re.search(pattern, html, flags=re.DOTALL)
    if found is None:
        return Response(ok=True, status_code='200', text=None)
    scrubbed_text = normalize_whitespace(strip_tags(found.group('return')))
    trail = '/'.join(part.strip() for part in scrubbed_text.split("/"))
    return Response(ok=True, status_code='200', text=trail)

In [None]:
#if __name__ == '__main__':
    #test_url = 'http://mikelev.in/'
    #response = requests.get(test_url)
    #print(Title(url=test_url, response=response))
    #test_url = 'noyb'
    #response = requests.get(test_url)
    #print(breadcrumb(**{'response' : response}))