# SEO Notebook

In [1]:
"""A framework for conducting SEO investigations, using Google Spreadsheets for I/O."""

'A framework for conducting SEO investigations, using Google Spreadsheets for I/O.'

In [2]:
import notebook_finder      # Allows .ipynb files to be loaded like filename.py modules
import goodsheet            # Handles OAuth login for all relevant Google services
from imp import reload      # Keeps in-memory functions reflecting your pipulate.ipynb edits
import shelve, requests     # For fetching & archiving HTTP responses to URL input-columns

In [None]:
rows_to_batch = 10          # Number of rows to process before "seeing" update in Google Sheets
cache_html = True           # Whether to reuse previously fetched, locally cached copies of the URLs
google_sheet_name = '20k'   # Set to name of a Google Spreadsheet File you own
google_sheet_tab  = '1k'    # Set to name of the Tab in the Google Spreadsheet you want to pipulate

In [None]:
if __name__ == '__main__':  # The entire Pipulate implementation (minus the functions) "lives" here
    # Overview: connect to the configured worksheet, find the first row that has an
    # empty cell in a function-named column, then walk the sheet in batches of
    # `rows_to_batch` rows. For each row, fetch (or load from the local shelve cache)
    # the row's URL, call the matching Pipulate function for every empty
    # function-named cell, and push each batch back to the sheet in one update.
    import pipulate
    reload(pipulate)
    import pipulate         # 2nd import intentional (for notebook_finder nuances)

    # Shown for every Google Sheets failure; the underlying error is printed with it.
    bad_api_message = "Can't reach Spreadsheet or nothing to Pipulate. Double-check & try again."
    try:
        # Create the main connection to GSheets and gather some important sheet stats
        worksheet = goodsheet.connect.open(google_sheet_name).worksheet(google_sheet_tab)
        rows = worksheet.row_count
        cols = worksheet.col_count
        end_range = worksheet.get_addr_int(rows, cols)
        col_count = cols
    except Exception as err:  # not a bare except: Ctrl-C must still interrupt
        print(bad_api_message)
        print("Details: %r" % (err,))
        raise SystemExit()

    # Create dictionary of Pipulate functions invokable with their lower-cased string-name keys
    pipulate_funcs = [x for x in dir(pipulate) if not x.startswith('_')]
    func_dict = {x.lower(): getattr(pipulate, x) for x in pipulate_funcs}

    print("Examining spreadsheet...")

    # Row 1 is special; containing potential function-names needed later for row dict index
    row1_range = 'A1:%s' % worksheet.get_addr_int(1, col_count)
    cell_range = worksheet.range(row1_range)
    col_names = [x.value.lower() for x in cell_range]

    # To pick up where left off on long jobs, get first row with a blank cell in a
    # function-named column. Header names are compared lower-cased, like func_dict keys.
    list_of_rows = worksheet.get_all_values()  # Expensive but worth it
    first_row_with_blank = None  # None means "no work found"
    for row_dex, arow in enumerate(list_of_rows):
        if any(not acell and col_names[cell_dex] in func_dict
               for cell_dex, acell in enumerate(arow)):
            first_row_with_blank = row_dex + 1  # sheet rows are 1-indexed
            break
    list_of_rows = None  # Too big to keep around

    # Split spreadsheet into (row_start, row_end) chunks; a chunk covers sheet rows
    # row_start+1 .. row_end inclusive, so row 1 (the header) is never included.
    chunk_ranges = [(x + 1, x + rows_to_batch + 1) for x in range(0, rows - 1, rows_to_batch)]
    if first_row_with_blank is None:
        print("No empty cells found in function-named columns.")
        unprocessed_chunks = []
    else:
        # >= (not >) so a chunk whose only blank sits in its final row is still processed.
        unprocessed_chunks = [(x, y) for x, y in chunk_ranges if y >= first_row_with_blank]

    # Every range chunk becomes its own chunk_range for batch updates.
    for chunk_dex, (row_start, row_end) in enumerate(unprocessed_chunks):

        # Create A2:B2 range notation for each chunk. Only the final chunk can overshoot
        # the sheet, so clamping row_end to `rows` handles the last uneven chunk.
        top_left = worksheet.get_addr_int(row_start + 1, 1)
        lower_right = worksheet.get_addr_int(min(row_end, rows), cols)
        range_string = "%s:%s" % (top_left, lower_right)
        print("Pipulating range %s of %s (%s)" % (chunk_dex + 1, len(unprocessed_chunks), range_string))
        print("Countdown to update: ", end="")

        # We create a chunk_range for the chunk for both reading and writing-back values.
        try:
            chunk_range = worksheet.range(range_string)
        except Exception as err:
            print(bad_api_message)
            print("Details: %r" % (err,))
            raise SystemExit()

        # Now, we step through each cell in the current chunk.
        row_dict = {}  # This object is the "row memory" and key to the pipulation process
        count_down = len(chunk_range) // col_count + 1
        for cell_dex, acell in enumerate(chunk_range):  # Working with cells, but interested in rows
            col, val = acell.col, acell.value
            # Now we pipulate this row's row_dict with column-name/cell-value pairs
            row_dict[col_names[col - 1]] = val
            if col % col_count != 0:  # Not yet at the last cell of the row
                continue

            count_down -= 1
            print("%s, " % count_down, end="")
            requests_response = None  # HTML-cache object loop-leak prevention
            Pipulate_Response = None  # Function-return loop-leak prevention

            # Why fetch the HTML for a URL more than once, if you don't have to?
            if 'url' in row_dict:
                url = row_dict['url']
                # NOTE: shelve stores pickles; only use a 'urls' cache file you created yourself.
                with shelve.open('urls') as urls:
                    if cache_html and url in urls:
                        requests_response = urls[url]
                    else:
                        try:
                            requests_response = requests.get(url)
                            urls[url] = requests_response  # The moment of pickling
                        except requests.exceptions.RequestException as err:
                            print("HTTP request failed. Check URL.")
                            print("Details: %r" % (err,))
                            raise SystemExit()
                # We now make the ENTIRE response object available to Pipulate functions.
                row_dict['response'] = requests_response

            # We pipulate at end-of-row but update back to earlier cells on that same row,
            # so compute the chunk_range index of this row's first cell.
            row_first_dex = cell_dex - cols + 1
            for key, val in row_dict.items():
                # Only process empty cells in function-named columns
                if val or key not in func_dict:
                    continue
                try:
                    # The function receives the whole row as keyword arguments and is
                    # expected to return an object exposing .ok, .text and .status_code.
                    Pipulate_Response = func_dict[key](**row_dict)  # Pipulate!
                except Exception as err:
                    print('Problem in Pipulate function: %s' % key)
                    print("Details: %r" % (err,))
                    raise SystemExit()
                cell_to_update = row_first_dex + col_names.index(key)
                if Pipulate_Response.ok:
                    chunk_range[cell_to_update].value = Pipulate_Response.text  # uncommitted
                else:
                    chunk_range[cell_to_update].value = "Err: %s" % Pipulate_Response.status_code

            # Blank the row memory only after ALL of this row's functions have run, so
            # every function in the row sees the full set of column values.
            row_dict = {}

        try:
            # Batch update Google Sheets with the modified chunk_range.
            worksheet.update_cells(chunk_range)
            print("Range updated!")
        except Exception as err:
            # Best effort: report the failed batch and carry on with the next one.
            print(bad_api_message)
            print("Details: %r" % (err,))
    print("Pipulation complete!") #do a little dance
    worksheet = None

Examining spreadsheet...
Pipulating range 1 of 100 (A2:D11)
Countdown to update: A2:D11
10, 9, 8, 7, 6, 5, 4, 3, 2, 1, Range updated!
Pipulating range 2 of 100 (A12:D21)
Countdown to update: A12:D21
10, 9, 8, 7, 6, 5, 4, 3, 2, 1, Range updated!
Pipulating range 3 of 100 (A22:D31)
Countdown to update: A22:D31
10, 9, 8, 7, 6, 5, 4, 3, 2, 1, Range updated!
Pipulating range 4 of 100 (A32:D41)
Countdown to update: A32:D41
10, 9, 8, 7, 6, 5, 4, 3, 2, 1, Range updated!
Pipulating range 5 of 100 (A42:D51)
Countdown to update: A42:D51
10, 9, 8, 7, 6, 5, 4, 3, 2, 1, Range updated!
Pipulating range 6 of 100 (A52:D61)
Countdown to update: A52:D61
10, 9, 8, 7, 6, 5, 4, 3, 2, 1, Range updated!
Pipulating range 7 of 100 (A62:D71)
Countdown to update: A62:D71
10, 9, 8, 