# SEO Notebook

In [None]:
"""A framework for conducting SEO investigations, using Google Spreadsheets for I/O."""

In [None]:
import notebook_finder    # Allows .ipynb files to be loaded like filename.py modules
import goodsheet          # Handles OAuth login for all relevant Google services
from imp import reload    # Keeps in-memory functions reflecting your pipulate.ipynb edits
import shelve, requests

In [None]:
# --- Run configuration -------------------------------------------------------
# Number of spreadsheet rows read, processed and written back per batch.
rows_to_batch = 100
# When True, a previously stored HTTP response for a URL is reused rather than
# fetching the URL again.
cache_html = True
# Name of the Google Spreadsheet document to open.
google_sheet_name = '20k'
# Name of the worksheet (tab) inside that document.
google_sheet_tab = '20k'

In [None]:
if __name__ == '__main__':
    import pipulate
    reload(pipulate)
    import pipulate # 2nd import intentional
    
    bad_api_message = "Can't reach Spreadsheet. Double-check file & tab name, or try again later."
    # Test our connection to the spreadsheet and get its size.
    try:
        worksheet = goodsheet.connect.open(google_sheet_name).worksheet(google_sheet_tab)
        rows = worksheet.row_count
        cols = worksheet.col_count
        end_range = worksheet.get_addr_int(rows, cols)
        col_count = worksheet.col_count
    except:
        print(bad_api_message)
        raise SystemExit()

    # Create a dictionary of name-addressable functions from the Pipulate module.
    pipulate_funcs = [x for x in dir(pipulate) if x[0] is not '_']
    func_dict = {x.lower():eval('pipulate.%s' % x) for x in pipulate_funcs}

    # Row 1 contains column name values and may name pipulate functions.
    row1_range = 'A1:%s' % worksheet.get_addr_int(1, col_count)
    cell_range = worksheet.range(row1_range)
    col_names = [x.value.lower() for x in cell_range]

    # To pick up where left off on long jobs, get first row with a blank cell
    all_cells = worksheet.get_all_values()
    first_row_with_blank = [(i,x) for i,x in enumerate(all_cells) if not x[2]][0][0]+1
    all_cells = None
    
    # Split spreadsheet into a series of ranges (not yet in Excel-like A2:B10 notation).
    chunk_ranges = [(x+1, x+rows_to_batch) for x in list(range(rows)) if x%rows_to_batch == 0]
    unprocessed_chunk_ranges = [(x,y) for x,y in chunk_ranges if y > first_row_with_blank]
    
    # Every range chunk becomes its own chunk_range for batch updates.
    for chunk_dex, (row_start, row_end) in enumerate(unprocessed_chunk_ranges):

        something_to_update = False # Toggled to True if any cell in range gets pipulated

        # Create Excel-ranges for each chunk
        top_left = worksheet.get_addr_int(row_start+1, 1)
        lower_right = worksheet.get_addr_int(row_end+1, cols)
        range_string = "%s:%s" % (top_left, lower_right)

        # If it's the last chunk, beware odd number of rows!
        if chunk_dex+1 == len(chunk_ranges):
            range_string = "%s:%s" % (top_left, end_range)
        print("Pipulating chunk %s of %s (%s)..." % (chunk_dex+1, len(chunk_ranges), range_string))

        # We create a chunk_range for the chunk for both reading and writing-back values.
        try:
            chunk_range = worksheet.range(range_string)
        except:
            print(bad_api_message)

        # Now, we step through each cell in the current chunk.
        row_dict = {} #pipulation object
        for cell_dex, acell in enumerate(chunk_range):
            row, col, val = acell.row, acell.col, acell.value
            # Populate row_dict with key/value pairs for every column of row.
            row_dict[col_names[col-1]] = val
            if col%col_count == 0: #last row of column / ready to pipulate!
                if something_to_update:
                    print("Row: %s of %s" % (row, rows))
                response = None
                # Why fetch the HTML for a URL more than once, if you don't have to?
                if 'url' in row_dict:
                    with shelve.open('urls') as urls:
                        if cache_html == True and row_dict['url'] in urls.keys():
                            response = urls[row_dict['url']]
                        else:
                            try:
                                response = requests.get(row_dict['url'])
                                urls[row_dict['url']] = response
                            except requests.exceptions.RequestException as e:
                                print("HTTP request failed. Check URL.")
                                raise SystemExit()
                        # We make ENTIRE response object is made available to Pipulate functions.
                        row_dict['response'] = response
                # To pipulate a row, we step through the row_dict that now contains all data from row.
                for key, val in row_dict.items():
                    if not val: #only pipulate empty cells
                        something_to_update = True
                        # Only process columns that use Pipulate function names.
                        if key in [x.lower() for x in dir(pipulate) if x[0] is not '_']:
                            try:
                                success_code, new_text = func_dict[key](**row_dict) # SHAZAM! Think about it.
                            except:
                                print('Problem in Pipulate function: %s' % key)
                                raise SystemExit()
                            # Remember, we are no longer actually ON the cell that we need to update, so we
                            # figure out which cell should updated with the value we just produced.
                            row_start = cell_dex-cols+1
                            func_dex = col_names.index(key)
                            cell_to_update = row_start + func_dex
                            chunk_range[cell_to_update].value = new_text #updates the in-memory object
                            row_dict = {} # Blank the now-used row_dict just for good measure
        try:
            # Batch update Google Sheets with the modified chunk_range.
            if something_to_update:
                worksheet.update_cells(chunk_range)
                print("Range updated.")
            else:
                print("Nothing pipulated in range.")
        except:
            print(bad_api_message)
    print("Pipulation complete!") #do a little dance