# SEO Notebook

In [1]:
"""A framework for conducting SEO investigations, using Google Spreadsheets for I/O."""

'A framework for conducting SEO investigations, using Google Spreadsheets for I/O.'

In [2]:
import notebook_finder    # Allows .ipynb files to be loaded like filename.py modules
import goodsheet          # Handles OAuth login for all relevant Google services
from imp import reload    # Keeps in-memory functions reflecting your pipulate.ipynb edits
import shelve, requests

In [3]:
# Number of spreadsheet rows grouped into each chunk that is read and batch-updated together.
rows_to_batch = 10
# When True, an HTTP response already stored in the local 'urls' shelf is reused instead of re-fetched.
cache_html = True
# Name of the Google Spreadsheet to open, and of the worksheet (tab) inside it to process.
google_sheet_name = 'Lookups'
google_sheet_tab  = 'Sheet1'

In [4]:
def chunk_row_spans(total_rows, batch_size):
    """Split a sheet's data rows into consecutive (first_row, last_row) spans.

    Row 1 is the header row, so the first span starts at row 2. Every span
    holds at most ``batch_size`` rows and the final span is clipped to
    ``total_rows``; a sheet with no rows below the header yields an empty list.

    Args:
        total_rows: Total number of rows in the worksheet, header included.
        batch_size: Maximum number of rows per span; must be at least 1.

    Returns:
        A list of inclusive, 1-based (first_row, last_row) tuples.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")
    return [(first_row, min(first_row + batch_size - 1, total_rows))
            for first_row in range(2, total_rows + 1, batch_size)]


if __name__ == '__main__':
    # Walk the worksheet chunk by chunk; for every empty cell whose column
    # header names a Pipulate function, call that function with the row's
    # values as keyword arguments and write the result back in one batch
    # update per chunk.
    import pipulate
    # NOTE(review): `reload` comes from the `imp` module in the imports cell;
    # `imp` is deprecated (removed in Python 3.12) and importlib.reload is
    # the replacement.
    reload(pipulate)
    import pipulate # intentional
    bad_api_message   = "Can't reach Google APIs. Please try again later."
    worksheet = goodsheet.connect.open(google_sheet_name).worksheet(google_sheet_tab)
    pipulate_funcs = [x for x in dir(pipulate) if not x.startswith('_')]

    # Create a dictionary of name-addressable functions from the Pipulate module.
    # Only callables are kept, so a column that merely shares its name with a
    # module or constant inside pipulate is treated as ordinary data.
    func_dict = {x.lower(): getattr(pipulate, x)
                 for x in pipulate_funcs if callable(getattr(pipulate, x))}

    # Test our connection to the spreadsheet and get its size.
    try:
        rows = worksheet.row_count
        cols = worksheet.col_count
        end_range = worksheet.get_addr_int(rows, cols)
        col_count = cols
    except Exception:
        print(bad_api_message)
        raise SystemExit()

    # Row 1 contains column name values and may name pipulate functions.
    row1_range = 'A1:%s' % worksheet.get_addr_int(1, col_count)
    cell_range = worksheet.range(row1_range)
    col_names = [x.value.lower() for x in cell_range]

    # Split the data rows (everything below the header) into row spans;
    # each tuple holds the actual first and last sheet row of one chunk.
    chunk_ranges = chunk_row_spans(rows, rows_to_batch)

    # Every row span becomes its own chunk_range for batch updates.
    for chunk_dex, (first_row, last_row) in enumerate(chunk_ranges):
        # Create the Excel-like A2:C11 notation for this chunk.
        top_left = worksheet.get_addr_int(first_row, 1)
        lower_right = worksheet.get_addr_int(last_row, cols)
        range_string = "%s:%s" % (top_left, lower_right)
        print("Pipulating chunk %s of %s (%s)..." % (chunk_dex+1, len(chunk_ranges), range_string))
        # We create a chunk_range for the chunk for both reading and writing-back values.
        try:
            chunk_range = worksheet.range(range_string)
        except Exception:
            # Without fresh cells there is nothing safe to process; skip this
            # chunk rather than reuse the previous chunk's cells.
            print(bad_api_message)
            continue
        row_dict = {}
        # Now, we step through each cell in the current chunk.
        for cell_dex, acell in enumerate(chunk_range):
            row, col, val = acell.row, acell.col, acell.value
            row_dict[col_names[col-1]] = val
            # Once you reach the last column of the current row, you are ready to Pipulate!
            if col % col_count != 0:
                continue
            print("Row: %s of %s" % (row, rows))
            response = None
            # Why fetch the HTML for a URL more than once, if you don't have to?
            if 'url' in row_dict:
                url = row_dict['url']
                with shelve.open('urls') as urls:
                    if cache_html and url in urls:
                        response = urls[url]
                    else:
                        try:
                            response = requests.get(url)
                            urls[url] = response
                        except requests.exceptions.RequestException:
                            print("HTTP request failed. Check URL.")
                            raise SystemExit()
                # The ENTIRE response object is made available to Pipulate functions.
                row_dict['response'] = response
            # Index, within chunk_range, of the first cell of the row just completed.
            row_first_cell = cell_dex - cols + 1
            # To pipulate a row, we visit each of its cells alongside its column name.
            for func_dex, key in enumerate(col_names):
                # Remember, we are no longer actually ON the cell that we're updating.
                target_cell = chunk_range[row_first_cell + func_dex]
                # Only empty cells in columns named after a Pipulate function get filled.
                if target_cell.value or key not in func_dict:
                    continue
                try:
                    success_code, new_text = func_dict[key](**row_dict) # SHAZAM!
                except Exception:
                    print('Problem in Pipulate function: %s' % key)
                    raise SystemExit()
                target_cell.value = new_text
            # Start the next row clean, but only after every function in this
            # row has seen the full row data.
            row_dict = {}
        try:
            # Batch update the chunk range.
            worksheet.update_cells(chunk_range)
            print("Range updated.")
        except Exception:
            print(bad_api_message)
    print("Pipulation complete!")


Pipulating chunk 4 of A2:C11 (1)...
Row: 2 of 37
Row: 3 of 37
Row: 4 of 37
Row: 5 of 37
Row: 6 of 37
Row: 7 of 37
Row: 8 of 37
Row: 9 of 37
Row: 10 of 37
Row: 11 of 37
Range updated.
Pipulating chunk 4 of A12:C21 (2)...
Row: 12 of 37
Row: 13 of 37
Row: 14 of 37
Row: 15 of 37
Row: 16 of 37
Row: 17 of 37
Row: 18 of 37
Row: 19 of 37
Row: 20 of 37
Row: 21 of 37
Range updated.
Pipulating chunk 4 of A22:C31 (3)...
Row: 22 of 37
Row: 23 of 37
Row: 24 of 37
Row: 25 of 37
Row: 26 of 37
Row: 27 of 37
Row: 28 of 37
Row: 29 of 37
Row: 30 of 37
Row: 31 of 37
Range updated.
Pipulating chunk 4 of A32:C37 (4)...
Row: 32 of 37
Row: 33 of 37
Row: 34 of 37
Row: 35 of 37
Row: 36 of 37
Row: 37 of 37
Range updated.
Pipulation complete!
