# SEO Notebook

In [1]:
"""A framework for conducting SEO investigations, using Google Spreadsheets for I/O."""

'A framework for conducting SEO investigations, using Google Spreadsheets for I/O.'

In [2]:
import notebook_finder       # Allows .ipynb files to be loaded like filename.py modules
import goodsheet, sys        # Handles OAuth login for all relevant Google services
from imp import reload       # Keeps in-memory functions reflecting your pipulate.ipynb edits
import shelve, requests      # For fetching & archiving HTTP responses to URL input-columns

In [3]:
google_sheet_name = 'Search Friendly URL Monitor'   # <-- set to name of File!!!
google_sheet_tab  = 'Sheet1'                        # <-- set to name of Tab!!!
google_sheet_args = 'Config' # Tab of OPTIONAL key/value pairs (read from A1:B20) passed as extra **kwargs to pipulate functions; a falsy value skips it
cache_html = True            # Whether to reuse locally cached (shelved) responses for URLs that were fetched before
rows_to_batch = 10           # Number of rows to process before "seeing" update in Google Sheets
pervasive_data = None        # Set to object that you want globally available (default = None)

In [4]:
if __name__ == '__main__':   # The entire Pipulate implementation (minus the functions) "lives" here
    # Overview: connect to the configured Google Sheet, treat row 1 as column
    # names, and for every data row call the pipulate function whose lower-cased
    # name matches a column header whenever that column's cell is empty. Results
    # are written back to the sheet one chunk of rows_to_batch rows at a time.
    import pipulate
    reload(pipulate)
    import pipulate          # 2nd import intentional (for notebook_finder nuances)
    print("You are logged in as %s" % pipulate.get_email())

    # Printed when a batch update fails, so check filename & tabname carefully & read tracebacks
    bad_api_message = "Can't reach Spreadsheet or nothing to Pipulate. Double-check & try again."
    chunk_limit = 0 # Make system only process x-chunks, useful while testing new functions
    arg_cells = None
    try:
        # Create the main connection to GSheets and gather some important sheet stats.
        # The spreadsheet is opened once and reused for both the data tab and the args tab.
        spreadsheet = goodsheet.connect.open(google_sheet_name)
        worksheet = spreadsheet.worksheet(google_sheet_tab)
        rows = worksheet.row_count
        cols = worksheet.col_count
        end_range = worksheet.get_addr_int(rows, cols)
        col_count = cols
        if google_sheet_args:
            arg_sheet = spreadsheet.worksheet(google_sheet_args)
            arg_cells = arg_sheet.range("A1:B20")
    except Exception:
        # Any failure to connect is fatal; show the exception type and stop the cell.
        print(sys.exc_info()[0])
        raise SystemExit()

    # If we're using optional key/value pairs for global arguments, build the arg_dict.
    # Column 1 holds the argument name and column 2 its value (assumes the range is
    # returned row by row, as the original loop did). The first blank name ends the
    # list; a name with a blank value maps to None.
    arg_dict = {}
    temp_name = None
    if arg_cells:
        for an_arg in arg_cells:
            if an_arg.col == 1:
                if not an_arg.value:
                    break
                temp_name = an_arg.value
            else:
                arg_dict[temp_name] = an_arg.value or None

    # Create dictionary of Pipulate functions invokable with their string-name keys
    pipulate_funcs = [x for x in dir(pipulate) if not x.startswith('_')]
    func_dict = {x.lower(): getattr(pipulate, x) for x in pipulate_funcs}

    print('Examining "%s" in "%s"...' % (google_sheet_tab, google_sheet_name))

    # Row 1 is special; containing potential function-names needed later for row dict index
    row1_range = 'A1:%s' % worksheet.get_addr_int(1, col_count)
    cell_range = worksheet.range(row1_range)
    col_names = [x.value.lower() for x in cell_range]

    # To pick up where left off on long jobs, get first row with a blank cell
    list_of_rows = worksheet.get_all_values() #Expensive but worth it
    first_row_with_blank = rows #Inhibits processing unless valid blanks are found
    for row_dex, arow in enumerate(list_of_rows):
        # Compare against the lower-cased header names, exactly as the processing
        # loop below does, so "Title" and "title" are treated the same here too.
        if any(not acell and name in func_dict for name, acell in zip(col_names, arow)):
            first_row_with_blank = row_dex+1
            break
    list_of_rows = None #Too big to keep around

    # Split spreadsheet into a series of ranges (which need to get into A2:B2 range notation).
    chunk_ranges = [(x+1, x+rows_to_batch+1) for x in list(range(rows-1)) if x%rows_to_batch == 0]
    unprocessed_chunks = [(x,y) for x,y in chunk_ranges if y > first_row_with_blank]

    # Every range chunk becomes its own chunk_range for batch updates.
    for chunk_dex, (row_start, row_end) in enumerate(unprocessed_chunks):
        if chunk_limit > 0 and chunk_dex >= chunk_limit:
            break

        # Create A2:B2 range notation for each chunk
        top_left = worksheet.get_addr_int(row_start+1, 1)
        lower_right = worksheet.get_addr_int(row_end, cols)
        range_string = "%s:%s" % (top_left, lower_right)

        # The last chunk may be uneven: never let it run past the sheet's final row.
        # (Checked against the sheet size so it also holds when resuming a job.)
        if row_end >= rows:
            range_string = "%s:%s" % (top_left, end_range)
        print("Pipulating range %s of %s (%s)" % (chunk_dex+1, len(unprocessed_chunks), range_string))
        print("%s updating in: " % range_string, end="")

        # We create a chunk_range for the chunk for both reading and writing-back values.
        try:
            chunk_range = worksheet.range(range_string)
        except Exception:
            print(sys.exc_info()[0])
            break

        # Now, we step through each cell in the current chunk.
        row_dict = {} #This object is the "row memory" and key to the pipulation process
        count_down = len(chunk_range)//col_count + 1
        for cell_dex, acell in enumerate(chunk_range): #Working with cells, but interested in rows
            row, col, val = acell.row, acell.col, acell.value
            # Now we pipulate this row's row_dict with column-name/cell-value pairs
            row_dict[col_names[col-1]] = val
            if col%col_count != 0: #Not yet at the last cell of the row
                continue

            count_down = count_down - 1
            print("%s, " % count_down, end="")
            requests_response = None #HTML-cache object loop-leak prevention
            Pipulate_Response = None #Function-return loop-leak prevention
            # Why fetch the HTML for a URL more than once, if you don't have to?
            if 'url' in row_dict:
                url = row_dict['url']
                # NOTE(review): shelve stores pickles; only open a 'urls' shelf you trust.
                with shelve.open('urls') as urls:
                    if cache_html and url in urls:
                        requests_response = urls[url]
                    else:
                        try:
                            requests_response = requests.get(url)
                            urls[url] = requests_response #The moment of pickling
                        except requests.exceptions.RequestException:
                            print("(bad url) ", end="")
                            row_dict = {} # Skip this row; start the next one clean
                            continue
                # We now make the ENTIRE response object available to Pipulate functions.
                row_dict['response'] = requests_response
            # If pervasive_data exists, we make it mutably available (for memory) to every row
            if pervasive_data:
                row_dict['pervasive_data'] = pervasive_data

            # Index in chunk_range of this row's first cell (we are on its last cell now).
            row_offset = cell_dex-cols+1
            for key, val in list(row_dict.items()):
                # Only process empty cells in function-named columns
                if val or key not in func_dict:
                    continue
                # Build the keyword arguments per call so that every function
                # column in the row sees the full row, not just the first one.
                call_kwargs = row_dict
                if arg_dict:
                    call_kwargs = {**arg_dict, **row_dict}
                    call_kwargs.pop('', None)
                    # NOTE(review): the response is withheld whenever sheet args are
                    # in use (behavior kept from the original) - confirm intent.
                    call_kwargs.pop('response', None)
                try:
                    Pipulate_Response = func_dict[key](**call_kwargs) #Pipulate!
                except Exception:
                    print("%s error in %s." % (sys.exc_info()[0], key))
                    continue
                # We pipulate at end-of-row but update back to earlier cells on that same row.
                cell_to_update = row_offset + col_names.index(key)
                if Pipulate_Response.ok:
                    chunk_range[cell_to_update].value = Pipulate_Response.text #uncommitted
                else:
                    chunk_range[cell_to_update].value = "Err: %s" % Pipulate_Response.status_code
            row_dict = {} # Blank the now-used row_dict before the next row starts
        try:
            # Batch update Google Sheets with the modified chunk_range.
            worksheet.update_cells(chunk_range)
            print("Range updated!")
        except Exception:
            print(bad_api_message)
    print("Pipulation complete!") #do a little dance
    worksheet = None

You are logged in as mike_levin@ziffdavis.com
Examining "Sheet1" in "Search Friendly URL Monitor"...
Pipulating range 1 of 4 (A2:R11)
A2:R11 updating in: 10, (bad url) 9, 8, 7, 6, 5, 4, 3, 2, 1, (bad url) Range updated!
Pipulating range 2 of 4 (A12:R21)
A12:R21 updating in: 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, Range updated!
Pipulating range 3 of 4 (A22:R31)
A22:R31 updating in: 10, 9, 8, (bad url) 7, 6, 5, 4, 3, 2, 1, Range updated!
Pipulating range 4 of 4 (A32:R34)
A32:R34 updating in: 3, 2, 1, Range updated!
Pipulation complete!
