# SEO Notebook

In [1]:
"""A framework for conducting SEO investigations, using Google Spreadsheets for I/O."""

'A framework for conducting SEO investigations, using Google Spreadsheets for I/O.'

In [2]:
import notebook_finder    # Allows .ipynb files to be loaded like filename.py modules
import goodsheet          # Handles OAuth login for all relevant Google services
from imp import reload    # Keeps in-memory functions reflecting your pipulate.ipynb edits
import shelve, requests

In [3]:
# --- Job configuration: edit these values before running the pipeline cell ---

# Name of the Google Spreadsheet document, and of the tab (worksheet) inside it.
google_sheet_name, google_sheet_tab = '20k', '20k'

# How many spreadsheet rows go into each batch that is read and written back.
rows_to_batch = 100

# When True, a response already stored in the local 'urls' shelf is reused
# instead of requesting the same URL again.
cache_html = True

In [4]:
def first_row_with_blank_cell(rows_of_values, default=1):
    """Return the 0-based index of the first row that contains an empty cell.

    Args:
        rows_of_values: list of rows, each a list of cell values, with the
            header row at index 0 (the shape returned by get_all_values()).
        default: index returned when no row contains an empty cell. The
            default of 1 points at the first row below the header, so the
            whole sheet is examined again.

    Returns:
        int: index of the first row having at least one falsy cell, else default.
    """
    for row_dex, arow in enumerate(rows_of_values):
        if any(not acell for acell in arow):
            return row_dex
    return default


def plan_chunk_ranges(sheet_rows, batch_size, first_pending_row=2):
    """Split the rows below the header into inclusive (start, end) batches.

    All numbers are 1-based spreadsheet row numbers. Row 1 is the header, so
    batches start at row 2. The last batch is clamped to sheet_rows so a range
    never reaches past the bottom of the sheet.

    Args:
        sheet_rows: total number of rows in the worksheet (header included).
        batch_size: maximum number of rows per batch; must be at least 1.
        first_pending_row: first row that still needs work. Batches that end
            before this row are dropped so a long job resumes where it stopped.

    Returns:
        list of (first_row, last_row) tuples, in sheet order.

    Raises:
        ValueError: if batch_size is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("rows_to_batch must be at least 1")
    chunks = []
    for start in range(2, sheet_rows + 1, batch_size):
        end = min(start + batch_size - 1, sheet_rows)
        # ">=" (not ">"): a batch whose very last row is pending must be kept.
        if end >= first_pending_row:
            chunks.append((start, end))
    return chunks


if __name__ == '__main__':
    import pipulate
    reload(pipulate)
    import pipulate # 2nd import intentional

    bad_api_message = "Can't reach Spreadsheet. Double-check file & tab name, or try again later."

    # Create a dictionary of name-addressable functions from the Pipulate module.
    # getattr() replaces eval(); non-callable attributes (e.g. modules that
    # pipulate itself imports) are left out because they can never be "run".
    func_dict = {
        name.lower(): getattr(pipulate, name)
        for name in dir(pipulate)
        if not name.startswith('_') and callable(getattr(pipulate, name))
    }

    print("Examining spreadsheet...")

    # Connect, get the sheet size, the column names and the resume point.
    # "except Exception" (not a bare except) lets Ctrl-C / kernel interrupt through.
    try:
        worksheet = goodsheet.connect.open(google_sheet_name).worksheet(google_sheet_tab)
        rows = worksheet.row_count
        cols = worksheet.col_count

        # Row 1 contains column name values and may name pipulate functions.
        row1_range = 'A1:%s' % worksheet.get_addr_int(1, cols)
        col_names = [x.value.lower() for x in worksheet.range(row1_range)]

        # To pick up where left off on long jobs, get first row with a blank cell.
        # The helper returns a 0-based list index; +1 turns it into a sheet row.
        first_pending_row = first_row_with_blank_cell(worksheet.get_all_values()) + 1
    except Exception:
        print(bad_api_message)
        raise SystemExit()

    # Split the spreadsheet into batches of sheet rows, skipping finished ones.
    unprocessed_chunks = plan_chunk_ranges(rows, rows_to_batch, first_pending_row)

    # Every chunk becomes its own cell range for batch updates.
    for chunk_dex, (row_start, row_end) in enumerate(unprocessed_chunks):

        # Create the Excel-like A2:C101 notation for this chunk. row_end is
        # already clamped to the sheet size, so no last-chunk special case.
        top_left = worksheet.get_addr_int(row_start, 1)
        lower_right = worksheet.get_addr_int(row_end, cols)
        range_string = "%s:%s" % (top_left, lower_right)
        print("Pipulating range %s of %s (%s)" % (chunk_dex+1, len(unprocessed_chunks), range_string))
        print("Countdown to update: ", end="")

        # The same list of cells is used for reading and for writing back values.
        try:
            chunk_range = worksheet.range(range_string)
        except Exception:
            print(bad_api_message)
            raise SystemExit()

        # Now, we step through each cell in the current chunk.
        row_dict = {} # pipulation object: column name -> cell value for one row
        count_down = len(chunk_range) // cols
        for cell_dex, acell in enumerate(chunk_range):
            # Populate row_dict with key/value pairs for every column of row.
            row_dict[col_names[acell.col-1]] = acell.value
            if acell.col % cols != 0:
                continue # not yet at the last column of this row

            # Last column reached: the row is complete and ready to pipulate.
            print("%s, " % count_down, end="")
            count_down -= 1

            # Only empty cells under a column named after a Pipulate function
            # get filled in. Decide this before fetching, so completed rows
            # cost no HTTP request.
            pending = [key for key, val in row_dict.items() if not val and key in func_dict]
            if pending:
                # Why fetch the HTML for a URL more than once, if you don't have to?
                if 'url' in row_dict:
                    url = row_dict['url']
                    with shelve.open('urls') as urls:
                        if cache_html and url in urls:
                            response = urls[url]
                        else:
                            try:
                                response = requests.get(url)
                                urls[url] = response
                            except requests.exceptions.RequestException:
                                print("HTTP request failed. Check URL.")
                                raise SystemExit()
                    # The ENTIRE response object is made available to Pipulate functions.
                    row_dict['response'] = response

                # We are sitting on the row's last cell; cells of this row start
                # this many positions into chunk_range.
                row_first_cell = cell_dex - cols + 1
                for key in pending:
                    try:
                        # Each function gets the whole row as keyword arguments
                        # and returns a (status, text) pair.
                        _status, new_text = func_dict[key](**row_dict)
                    except Exception as err:
                        print('Problem in Pipulate function: %s' % key)
                        print(repr(err)) # show what actually went wrong
                        raise SystemExit()
                    func_dex = col_names.index(key)
                    chunk_range[row_first_cell + func_dex].value = new_text # updates the in-memory object

            # Start the next row clean. This happens once per row, after ALL of
            # its functions ran, so every function sees the full row.
            row_dict = {}

        try:
            # Batch update Google Sheets with the modified chunk_range.
            worksheet.update_cells(chunk_range)
            print("Range updated!")
        except Exception:
            # Best effort: report and move on to the next chunk.
            print(bad_api_message)
    print("Pipulation complete!") #do a little dance
    worksheet = None

Examining spreadsheet...
Pipulating range 1 of 31 (A17002:C17101)
Countdown to update: 100, 99, 98, 97, 96, 95, 94, 93, 92, 91, 90, 89, 88, 87, 86, 85, 84, 83, 82, 81, 80, 79, 78, 77, 76, 75, 74, 73, 72, 71, 70, 69, 68, 67, 66, 65, 64, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, Range updated!
Pipulating range 2 of 31 (A17102:C17201)
Countdown to update: 100, 99, 98, 97, 96, 95, 94, 93, 92, 91, 90, 89, 88, 87, 86, 85, 84, 83, 82, 81, 80, 79, 78, 77, 76, 75, 74, 73, 72, 71, 70, 69, 68, 67, 66, 65, 64, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, Range updated!
Pipulating range 3 of 31 (A17202:C173

SystemExit: 

To exit: use 'exit', 'quit', or Ctrl-D.
