<a href="https://colab.research.google.com/github/hamletbatista/weloveseo/blob/main/Automated_Duplicate_Content_Consolidation_with_Python.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
access_token = "INPUT ONCRAWL API TOKEN"

Here are the main steps:

1. We will use OnCrawl’s crawler to collect all the product pages and their SEO metadata (including canonicals).
2. We will use SEMrush to gather color-specific search terms and corresponding product pages.
3. We will define a simple clustering algorithm to group (or not group) products depending on whether they have color searches.
4. We can use Tableau to visualize the clustering changes and understand them better.
5. We will upload our experimental changes to the Cloudflare CDN using the RankSense app.


#Oncrawl API

First, let's manually test the OnCrawl API

http://developer.oncrawl.com/#Authentication

https://github.com/cogniteev/oncrawl-documentation-platform/blob/master/pages/guides/guide_authentication.md


In [None]:
import requests


In [None]:
def list_projects():
  """Fetch the OnCrawl projects endpoint and return the decoded JSON body.

  Authenticates with a Bearer header built from the notebook-level
  `access_token`.
  """
  endpoint = "https://app.oncrawl.com/api/v2/projects"
  auth_headers = { 'Authorization': 'Bearer {ACCESS_TOKEN}'.format(ACCESS_TOKEN=access_token)}

  return requests.get(endpoint, headers=auth_headers).json()


In [None]:
projects= list_projects()
projects

{'meta': {'filters': {}, 'limit': 20, 'offset': 0, 'sort': None, 'total': 1},
 'projects': [{'crawl_config_ids': ['5d002997451c956b65e28f55'],
   'crawl_ids': ['5d090005451c952d4952a59d',
    '5d081ba7451c956f8bf6989b',
    '5d002a74451c950e9b0e701e'],
   'crawl_over_crawl_ids': [],
   'domain': 'davidyurman.com',
   'features': ['advanced_project', 'custom_fields'],
   'id': '5d002997451c950e9b0e6ead',
   'is_verified_by': None,
   'last_crawl_created_at': 1560870917000,
   'last_crawl_id': '5d090005451c952d4952a59d',
   'limits': {'max_custom_chart_count': None,
    'max_custom_dashboard_count': None,
    'max_group_count': None,
    'max_segmentation_count': None,
    'max_speed': None},
   'log_monitoring_data_ready': False,
   'log_monitoring_processing_enabled': False,
   'log_monitoring_ready': False,
   'name': 'davidyurman',
   'settings': {},
   'shared_read_user_ids': [],
   'shared_write_user_ids': [],
   'start_url': 'https://www.davidyurman.com',
   'user_id': '5ce4099245

In [None]:
#http://developer.oncrawl.com/#Launch-a-crawl

def start_crawl(project_id, config_id):
  """Launch a crawl for an OnCrawl project.

  Args:
    project_id: ID of the project, inserted into the request path.
    config_id: ID of the crawl configuration, sent as the `configId`
      query parameter.

  Returns:
    The `requests.Response` of the launch-crawl POST request.
  """
  url = "https://app.oncrawl.com/api/v2/projects/{project_id}/launch-crawl?configId={crawl_config_id}".format(project_id=project_id,
    crawl_config_id=config_id)

  print(url)

  # The Authorization header is deliberately NOT printed: it contains the API
  # token, and cell outputs are saved and shared together with the notebook.
  bearer = 'Bearer {ACCESS_TOKEN}'.format(ACCESS_TOKEN=access_token)

  response = requests.post(url, headers={ 'Authorization': bearer})

  return response






In [None]:
projects["projects"][0]

{'crawl_config_ids': ['5d002997451c956b65e28f55'],
 'crawl_ids': ['5d090005451c952d4952a59d',
  '5d081ba7451c956f8bf6989b',
  '5d002a74451c950e9b0e701e'],
 'crawl_over_crawl_ids': [],
 'domain': 'davidyurman.com',
 'features': ['advanced_project', 'custom_fields'],
 'id': '5d002997451c950e9b0e6ead',
 'is_verified_by': None,
 'last_crawl_created_at': 1560870917000,
 'last_crawl_id': '5d090005451c952d4952a59d',
 'limits': {'max_custom_chart_count': None,
  'max_custom_dashboard_count': None,
  'max_group_count': None,
  'max_segmentation_count': None,
  'max_speed': None},
 'log_monitoring_data_ready': False,
 'log_monitoring_processing_enabled': False,
 'log_monitoring_ready': False,
 'name': 'davidyurman',
 'settings': {},
 'shared_read_user_ids': [],
 'shared_write_user_ids': [],
 'start_url': 'https://www.davidyurman.com',
 'user_id': '5ce40992451c954bf387a12c'}

In [None]:
project_id = projects["projects"][0]["id"]

In [None]:
config_id = projects["projects"][0]["crawl_config_ids"][0]

In [None]:
crawl = start_crawl(project_id, config_id)
crawl

In [None]:
crawl.json()

In [None]:
def list_crawls():
  """Fetch the OnCrawl crawls endpoint and return the decoded JSON body.

  Authenticates with a Bearer header built from the notebook-level
  `access_token`.
  """
  endpoint = "https://app.oncrawl.com/api/v2/crawls"
  auth_headers = { 'Authorization': 'Bearer {ACCESS_TOKEN}'.format(ACCESS_TOKEN=access_token)}

  return requests.get(endpoint, headers=auth_headers).json()


In [None]:
list_crawls()

{'crawls': [{'crawl_config': {'adobe_analytics_params': {},
    'agent_kind': 'web',
    'ajax_crawling': False,
    'allow_query_params': True,
    'alternate_start_urls': [],
    'at_internet_params': {},
    'crawl_subdomains': False,
    'custom_fields': [{'export': {'arity': 'ONE', 'type': 'STRING'},
      'name': 'Product_ID',
      'namespace': 'custom',
      'parse': [{'operations': [{'kind': 'REGEX',
          'rule': '<meta property="product:retailer_item_id" content="(.*)">'}],
        'source': 'PAGE_CONTENT'}],
      'transform': []},
     {'export': {'arity': 'ONE', 'type': 'STRING'},
      'name': 'Product_ID_reverse',
      'namespace': 'custom',
      'parse': [{'operations': [{'kind': 'REGEX',
          'rule': '<meta content=\\"(.*)\\" property=\\"product:retailer_item_id\\"/>'}],
        'source': 'PAGE_CONTENT'}],
      'transform': []}],
    'dns': [],
    'extra_headers': {},
    'filter_query_params': False,
    'google_analytics_params': {},
    'google_search

In [None]:
def get_crawl(crawl_id):
  """Fetch a single OnCrawl crawl by ID and return the decoded JSON body.

  Args:
    crawl_id: ID of the crawl, inserted into the request path.
  """
  endpoint = "https://app.oncrawl.com/api/v2/crawls/{crawl_id}".format(crawl_id=crawl_id)
  auth_headers = { 'Authorization': 'Bearer {ACCESS_TOKEN}'.format(ACCESS_TOKEN=access_token)}

  return requests.get(endpoint, headers=auth_headers).json()


In [None]:
get_crawl("5d090005451c952d4952a59d")

{'crawl': {'crawl_config': {'adobe_analytics_params': {},
   'agent_kind': 'web',
   'ajax_crawling': False,
   'allow_query_params': True,
   'alternate_start_urls': [],
   'at_internet_params': {},
   'crawl_subdomains': False,
   'custom_fields': [{'export': {'arity': 'ONE', 'type': 'STRING'},
     'name': 'Product_ID',
     'namespace': 'custom',
     'parse': [{'operations': [{'kind': 'REGEX',
         'rule': '<meta property="product:retailer_item_id" content="(.*)">'}],
       'source': 'PAGE_CONTENT'}],
     'transform': []},
    {'export': {'arity': 'ONE', 'type': 'STRING'},
     'name': 'Product_ID_reverse',
     'namespace': 'custom',
     'parse': [{'operations': [{'kind': 'REGEX',
         'rule': '<meta content=\\"(.*)\\" property=\\"product:retailer_item_id\\"/>'}],
       'source': 'PAGE_CONTENT'}],
     'transform': []}],
   'dns': [],
   'extra_headers': {},
   'filter_query_params': False,
   'google_analytics_params': {},
   'google_search_console_params': {},
   'h

##data API 
http://developer.oncrawl.com/#Data-API

Let's export our crawl

http://developer.oncrawl.com/#Export-Queries

In [None]:
def export_crawl(crawl_id, fields):
  """POST an export query for the pages of a crawl to the OnCrawl Data API.

  Args:
    crawl_id: ID of the crawl, inserted into the request path.
    fields: List of field names, sent as the `fields` entry of the JSON body.

  Returns:
    The raw `requests.Response`; callers inspect its status and `.json()`.
  """
  export_url = "https://app.oncrawl.com/api/v2/data/crawl/{crawl_id}/pages?export".format(crawl_id=crawl_id)
  auth_headers = { 'Authorization': 'Bearer {ACCESS_TOKEN}'.format(ACCESS_TOKEN=access_token) }
  query = {"fields": fields}

  return requests.post(export_url, headers=auth_headers, json=query)



In [None]:
export = export_crawl("5d090005451c952d4952a59d", ["url", "canonical"])
export

<Response [422]>

In [None]:
export.json()

{'errors': {'_other': ['unknown field: canonical']}}

In [None]:
#if archived, need to restore it

In [None]:
export = export_crawl("5d090005451c952d4952a59d", ["url", "rel_canonical"])
export

<Response [200]>

In [None]:
export.json()["urls"]

[{'rel_canonical': 'https://www.davidyurman.com/products/kids/kids-jewelry/albion-kids-necklace-with-diamonds-4mm-kn1038-ss.pdp.html',
  'url': 'https://www.davidyurman.com/products/kids/kids-jewelry/albion-kids-necklace-with-diamonds-4mm-kn1038-ss.pdp.html'},
 {'rel_canonical': 'https://www.davidyurman.com/products/kids/kids-jewelry/albion-kids-ring-with-diamonds-4mm-kr1038-ss.pdp.html',
  'url': 'https://www.davidyurman.com/products/kids/kids-jewelry/albion-kids-ring-with-diamonds-4mm-kr1038-ss.pdp.html'},
 {'rel_canonical': 'https://www.davidyurman.com/products/kids/kids-jewelry/cable-collectibles-kids-cross-necklace-with-diamonds-kn1044-ss.pdp.html',
  'url': 'https://www.davidyurman.com/products/kids/kids-jewelry/cable-collectibles-kids-cross-necklace-with-diamonds-kn1044-ss.pdp.html'},
 {'rel_canonical': 'https://www.davidyurman.com/products/kids/kids-jewelry/cable-collectibles-kids-dog-charm-necklace-in-18k-gold-with-diam.pdp.html',
  'url': 'https://www.davidyurman.com/products

In [None]:
export.json()["meta"]

{'columns': ['url', 'rel_canonical'],
 'in_cr': False,
 'total_hits': 2355,
 'total_pages': 236}

#Semrush API

Next, let's make API calls to SEMrush

https://www.semrush.com/api-analytics/



In [None]:
key="INPUT SEMrush API key"

In [None]:
from urllib.parse import urlencode, urlparse, urlunparse, quote
import pandas as pd

def get_seo_color_data(color, domain, database="us", export_columns="Ph,Po,Nq,Ur,Tg,Td,Ts", display_limit=10000, display_filter="+|Ph|Co|{color}"):
  """Request the SEMrush `domain_organic` report filtered by a color term.

  Args:
    color: Term substituted for `{color}` in `display_filter`.
    domain: Value sent as the `domain` parameter.
    database: Value sent as the `database` parameter.
    export_columns: Comma-separated column codes sent as `export_columns`.
    display_limit: Value sent as the `display_limit` parameter.
    display_filter: Filter template; `{color}` is replaced by `color`.

  Returns:
    A DataFrame whose columns come from the first line of the response and
    whose rows come from the remaining lines, or None when the HTTP status
    is not 200.
  """
  # The SEMrush API key is read from the notebook-level `key` variable.
  global key

  url_params={"type": "domain_organic",
             "key": key,
              "display_filter": display_filter.format(color=color),
             "display_limit": display_limit,
             "export_columns": export_columns,
             "domain": domain,
             "database": database
             }

  api_url="https://api.semrush.com/"

  qs = urlencode(url_params)

  u = urlparse(api_url)

  api_request = urlunparse((u.scheme, u.netloc, u.path, u.params, qs, u.fragment))

  #print(api_request)

  r = requests.get(api_request)

  if r.status_code == 200:
    results = r.text.split("\r\n") # one entry per response line
    headers = results[0].split(";") # save result headers to list
    table = [x.split(";") for x in results[1:]] #save columns to list of lists

    df = pd.DataFrame(table, columns=headers).dropna() # drop rows with missing values

    return df

  else:
    # The placeholder is named, so the value must be passed by keyword;
    # passing it positionally raised KeyError instead of printing the status.
    print("API call failed with code {code}".format(code=r.status_code))

    return None

In [None]:
database="us"
domain="davidyurman.com"
color="white"

domain_df = get_seo_color_data(color, domain, export_columns="Ph,Po,Tg") # only keyword, position and traffic

#we explicitly convert numbers to integers to be able to perform arithmetic operations later
convert_dict = {'Keyword': str, 'Position': int, 'Traffic': int} 

domain_df = domain_df.astype(convert_dict)

In [None]:
domain_df

Unnamed: 0,Keyword,Position,Traffic
0,david yurman white gold bracelet,1,72
1,white gold,26,60
2,white gold pendant mens,7,40
3,mens white gold necklace,13,39
4,white david yurman bracelet,1,32
5,18k white gold box chain,1,32
6,david yurman white plains ny,1,24
7,white gold band rings,13,21
8,white gold jewelry,12,20
9,david yurman initial necklace white gold,1,16


#Technical Plan 

We are going to automate the advanced duplicate content technique discussed in this article https://www.searchenginejournal.com/advanced-duplicate-content-consolidation-python/314471/

Our plan:

We will trigger function calls by publishing messages to Pub/Sub topics

1. Write a Google Cloud function to export OnCrawl data and save it to a cloud storage bucket
2. Write another function to export SEMrush data and save it to another storage bucket
3. Use the datasets to generate canonical clusters and generate a Google Sheet
4. Publish the Google Sheet to the Cloudflare CDN





#OnCrawl Google Cloud function

Enable Cloud functions
https://console.cloud.google.com/apis/library/cloudfunctions.googleapis.com?pli=1

Enable Cloud Scheduler 
https://console.cloud.google.com/apis/library/cloudscheduler.googleapis.com

Schedule calls https://cloud.google.com/scheduler/


The other services we’ll need (Cloud Storage API, and Pub/Sub API) are already enabled.


In [None]:
!gcloud -v

Google Cloud SDK 259.0.0


To take a quick anonymous survey, run:
  $ gcloud alpha survey



In [None]:
#https://googleapis.dev/python/google-api-core/latest/auth.html
!gcloud auth application-default login


Go to the following link in your browser:

    https://accounts.google.com/o/oauth2/auth?code_challenge=2kGyNbfhgLT6-2V-x3iqC9Hp9nA11GcLOjXY8qn0Yc8&prompt=select_account&code_challenge_method=S256&access_type=offline&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&response_type=code&client_id=764086051850-6qr4p6gpi6hn506pt8ejuq83di341hur.apps.googleusercontent.com&scope=https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fuserinfo.email+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fcloud-platform+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Faccounts.reauth


Enter verification code: 4/rQE7lDkwcY9Dh9VYK_gmNmldyeZ2vegIoDm_OUCg5LTZDvDRE4DLVtk

Credentials saved to file: [/content/.config/application_default_credentials.json]

These credentials will be used by any library that requests
Application Default Credentials.

To generate an access token for other uses, run:
  gcloud auth application-default print-access-token


To take a quick anonymous survey, run:
  $ gcloud alpha survey



In [None]:
#!gcloud auth login --no-launch-browser

In [None]:
!gcloud config set project TYPE YOUR PROJECT NAME HERE

Updated property [core/project].


Create a Google Cloud Storage bucket Go to https://console.cloud.google.com/storage/browser

Set up the scheduled job Go to https://console.cloud.google.com/cloudscheduler

frequency: 0 9 * * 1 - Every Monday at 9am EST
topic: oncrawl_data
payload: all

https://cloud.google.com/scheduler/docs/configuring/cron-job-schedules

Create the Google Cloud function
https://cloud.google.com/functions/docs/writing/#functions-writing-helloworld-background-python
https://cloud.google.com/functions/docs/writing/background

https://console.cloud.google.com/functions/list?_ga=2.230322075.-688741304.1568838388&project=intent-prediction-bert&folder&organizationId=318124297531


#Google Cloud Storage

In [None]:
!pip install google-cloud-storage



This is a utility function to upload files to Google Cloud Storage from Python

In [None]:
#https://googleapis.dev/python/google-api-core/latest/auth.html
#https://pypi.org/project/google-cloud-storage/

def upload_to_bucket(bucket_name, filename, text):
    """Upload `text` as the object `filename` in a Cloud Storage bucket.

    Args:
        bucket_name: Name of the bucket to look up.
        filename: Name of the blob to write inside the bucket.
        text: Content handed to `upload_from_string`.
    """
    from google.cloud import storage

    target_bucket = storage.Client().get_bucket(bucket_name)
    target_bucket.blob(filename).upload_from_string(text)

    print("file uploaded to bucket")


This is our OnCrawl Cloud Function that will be triggered by Pub/Sub messages

In [None]:
def oncrawl_data(event, context):
    """Background Cloud Function to be triggered by Pub/Sub.

    Exports the `url` and `rel_canonical` fields of a fixed OnCrawl crawl and
    uploads the result, serialized as JSON text, to a Cloud Storage bucket.

    Args:
         event (dict):  The dictionary with data specific to this type of
         event. The `data` field contains the PubsubMessage message. The
         `attributes` field will contain custom attributes if there are any.
         context (google.cloud.functions.Context): The Cloud Functions event
         metadata. The `event_id` field contains the Pub/Sub message ID. The
         `timestamp` field contains the publish time.
    """
    import base64
    import json

    print("""This Function was triggered by messageId {} published at {}
    """.format(context.event_id, context.timestamp))

    if 'data' in event:
        name = base64.b64decode(event['data']).decode('utf-8')
    else:
        name = 'World'

    print('Hello {}!'.format(name))

    export = export_crawl("5d090005451c952d4952a59d", ["url", "rel_canonical"])

    #print(export.json()["urls"])
    # The export's "urls" entry is a list of dicts, while upload_to_bucket
    # passes its third argument to Blob.upload_from_string, which needs
    # str or bytes. Serialize the list to JSON text before uploading.
    text = json.dumps(export.json()["urls"])

    upload_to_bucket("ranksense_duplicate_clusters",
                     "canonical_clusters.txt", text)
    
    
    

Another utility function to create CSVs from dictionaries

In [None]:
#https://stackoverflow.com/questions/3086973/how-do-i-convert-this-list-of-dictionaries-to-a-csv-file

def dicts_to_csv(dicts, csv_filename):
  """Write a list of dictionaries to `csv_filename` as CSV with a header row.

  Args:
    dicts: Non-empty list of dictionaries; the keys of the first one are
      used as the CSV columns.
    csv_filename: Path of the file to create (opened for writing).
  """
  import csv

  keys = dicts[0].keys()
  # csv writers produce text, so the file must be opened in text mode
  # ('w', not 'wb'); newline='' lets the writer control line endings itself.
  with open(csv_filename, 'w', newline='') as output_file:
      dict_writer = csv.DictWriter(output_file, keys)
      dict_writer.writeheader()
      dict_writer.writerows(dicts)
      

In this function we hard code the OnCrawl access token. It is a better idea to set it up as an environment variable in the Google Cloud Functions Console

In [None]:
def export_crawl(crawl_id, fields):
    """POST an export query for the pages of a crawl to the OnCrawl Data API.

    Args:
        crawl_id: ID of the crawl, inserted into the request path.
        fields: List of field names, sent as the `fields` entry of the JSON body.

    Returns:
        The raw `requests.Response` of the export request.
    """
    import os

    import requests

    # Prefer the ONCRAWL_ACCESS_TOKEN environment variable so the token does
    # not have to live in the source; the placeholder remains as the fallback
    # for anyone who still edits the value in place.
    access_token = os.environ.get("ONCRAWL_ACCESS_TOKEN", "TYPE YOUR ONCRAWL ACCESS TOKEN")

    response = requests.post("https://app.oncrawl.com/api/v2/data/crawl/{crawl_id}/pages?export".format(crawl_id=crawl_id),
                             headers={ 'Authorization': 'Bearer {ACCESS_TOKEN}'.format(ACCESS_TOKEN=access_token) },
                             json={
                                 "fields": fields
                             }
                            )

    return response


def upload_to_bucket(bucket_name, filename, dicts):
    """Write `dicts` to a CSV under /tmp and upload that file to a bucket.

    Args:
        bucket_name: Name of the Cloud Storage bucket to look up.
        filename: Name used both for the local /tmp file and for the blob.
        dicts: List of dictionaries passed to `dicts_to_csv`.
    """
    from google.cloud import storage

    #save dictionaries locally
    #https://stackoverflow.com/questions/42719793/write-temporary-files-from-google-cloud-function
    local_path = "/tmp/"+filename
    dicts_to_csv(dicts, local_path)

    target_bucket = storage.Client().get_bucket(bucket_name)
    target_bucket.blob(filename).upload_from_filename(filename=local_path)

    print("file uploaded to bucket")
    
    

def oncrawl_data(event, context):
    """Pub/Sub-triggered entry point: export OnCrawl URLs and upload them.

    Logs the triggering message, exports the `url` and `rel_canonical` fields
    of a fixed crawl, and passes the resulting list to `upload_to_bucket`.

    Args:
         event (dict): Event payload; when it has a `data` field, that field
         is base64-decoded and echoed in the greeting.
         context (google.cloud.functions.Context): Event metadata; its
         `event_id` and `timestamp` fields are logged.
    """
    import base64

    print("""This Function was triggered by messageId {} published at {}
    """.format(context.event_id, context.timestamp))

    # Greet with the decoded message payload when there is one.
    name = base64.b64decode(event['data']).decode('utf-8') if 'data' in event else 'World'
    print('Hello {}!'.format(name))

    crawl_id = "5d090005451c952d4952a59d"
    export_fields = ["url", "rel_canonical"]
    export = export_crawl(crawl_id, export_fields)

    dicts = export.json()["urls"]

    upload_to_bucket("ranksense_duplicate_clusters", "canonical_clusters.txt", dicts)
    
    
    
    

In [None]:
#https://stackoverflow.com/questions/3086973/how-do-i-convert-this-list-of-dictionaries-to-a-csv-file
def dicts_to_csv(dicts, csv_filename):
    """Write a list of dictionaries to `csv_filename` as CSV with a header row.

    Args:
        dicts: Non-empty list of dictionaries; the keys of the first one are
            used as the CSV columns.
        csv_filename: Path of the file to create (opened for writing).
    """
    import csv

    keys = dicts[0].keys()

    #https://stackoverflow.com/questions/33054527/typeerror-a-bytes-like-object-is-required-not-str-when-writing-to-a-file-in
    # newline='' is what the csv module asks for: the writer emits its own
    # line terminators, and without it some platforms write extra blank lines.
    with open(csv_filename, 'w', newline='') as output_file:
        dict_writer = csv.DictWriter(output_file, keys)
        dict_writer.writeheader()
        dict_writer.writerows(dicts)

In [None]:
export.json()["urls"]

[{'rel_canonical': 'https://www.davidyurman.com/products/kids/kids-jewelry/albion-kids-necklace-with-diamonds-4mm-kn1038-ss.pdp.html',
  'url': 'https://www.davidyurman.com/products/kids/kids-jewelry/albion-kids-necklace-with-diamonds-4mm-kn1038-ss.pdp.html'},
 {'rel_canonical': 'https://www.davidyurman.com/products/kids/kids-jewelry/albion-kids-ring-with-diamonds-4mm-kr1038-ss.pdp.html',
  'url': 'https://www.davidyurman.com/products/kids/kids-jewelry/albion-kids-ring-with-diamonds-4mm-kr1038-ss.pdp.html'},
 {'rel_canonical': 'https://www.davidyurman.com/products/kids/kids-jewelry/cable-collectibles-kids-cross-necklace-with-diamonds-kn1044-ss.pdp.html',
  'url': 'https://www.davidyurman.com/products/kids/kids-jewelry/cable-collectibles-kids-cross-necklace-with-diamonds-kn1044-ss.pdp.html'},
 {'rel_canonical': 'https://www.davidyurman.com/products/kids/kids-jewelry/cable-collectibles-kids-dog-charm-necklace-in-18k-gold-with-diam.pdp.html',
  'url': 'https://www.davidyurman.com/products

In [None]:
dicts_to_csv(export.json()["urls"], "test.csv")

In [None]:
key="INPUT SEMrush API key here"

#SEMRush Google Cloud function

Same here. Set up the SEMrush API key as an environment variable so there is no need to hard code it.

In [None]:


from urllib.parse import urlencode, urlparse, urlunparse, quote
import pandas as pd
import requests

def get_seo_color_data(color, domain, database="us", export_columns="Ph,Po,Nq,Ur,Tg,Td,Ts", 
                       display_limit=10000, display_filter="+|Ph|Co|{color}"):
  """Request the SEMrush `domain_organic` report filtered by a color term.

  Args:
    color: Term substituted for `{color}` in `display_filter`.
    domain: Value sent as the `domain` parameter.
    database: Value sent as the `database` parameter.
    export_columns: Comma-separated column codes sent as `export_columns`.
    display_limit: Value sent as the `display_limit` parameter.
    display_filter: Filter template; `{color}` is replaced by `color`.

  Returns:
    A list of dictionaries (one per result row, keyed by the header names of
    the response), or None when the HTTP status is not 200.
  """
  # The SEMrush API key is read from the module-level `key` variable.
  global key

  url_params={"type": "domain_organic",
             "key": key,
              "display_filter": display_filter.format(color=color),
             "display_limit": display_limit,
             "export_columns": export_columns,
             "domain": domain,
             "database": database
             }

  api_url="https://api.semrush.com/"

  qs = urlencode(url_params)

  u = urlparse(api_url)

  api_request = urlunparse((u.scheme, u.netloc, u.path, u.params, qs, u.fragment))

  #print(api_request)

  r = requests.get(api_request)

  if r.status_code == 200:
    results = r.text.split("\r\n") # one entry per response line
    headers = results[0].split(";") # save result headers to list
    table = [x.split(";") for x in results[1:]] #save columns to list of lists

    df = pd.DataFrame(table, columns=headers).dropna() # drop rows with missing values

    #https://stackoverflow.com/questions/29815129/pandas-dataframe-to-list-of-dictionaries

    return df.to_dict("records")

  else:
    # The placeholder is named, so the value must be passed by keyword;
    # passing it positionally raised KeyError instead of printing the status.
    print("API call failed with code {code}".format(code=r.status_code))

    return None

def upload_to_bucket(bucket_name, filename, dicts):
    """Write `dicts` to a CSV under /tmp and upload that file to a bucket.

    Args:
        bucket_name: Name of the Cloud Storage bucket to look up.
        filename: Name used both for the local /tmp file and for the blob.
        dicts: List of dictionaries passed to `dicts_to_csv`.
    """
    from google.cloud import storage

    #save dictionaries locally
    #https://stackoverflow.com/questions/42719793/write-temporary-files-from-google-cloud-function
    local_path = "/tmp/"+filename
    dicts_to_csv(dicts, local_path)

    target_bucket = storage.Client().get_bucket(bucket_name)
    target_bucket.blob(filename).upload_from_filename(filename=local_path)

    print("file uploaded to bucket")
    
    

def semrush_data(event, context):
    """Pub/Sub-triggered entry point: fetch SEMrush color data and upload it.

    Logs the triggering message, requests the keyword, position and traffic
    columns for a fixed domain and color, and passes the result to
    `upload_to_bucket` under a `<database>_<domain>_<color>.csv` name.

    Args:
         event (dict): Event payload; when it has a `data` field, that field
         is base64-decoded and echoed in the greeting.
         context (google.cloud.functions.Context): Event metadata; its
         `event_id` and `timestamp` fields are logged.
    """
    import base64

    print("""This Function was triggered by messageId {} published at {}
    """.format(context.event_id, context.timestamp))

    # Greet with the decoded message payload when there is one.
    name = base64.b64decode(event['data']).decode('utf-8') if 'data' in event else 'World'
    print('Hello {}!'.format(name))

    database = "us"
    domain = "davidyurman.com"
    color = "white"

    # only keyword, position and traffic
    dicts = get_seo_color_data(color, domain, export_columns="Ph,Po,Tg")

    target_name = "_".join([database, domain, color]) + ".csv"
    upload_to_bucket("ranksense_duplicate_clusters", target_name, dicts)
    
    
    
    

In [None]:
database="us"
domain="davidyurman.com"
color="white"

dicts = get_seo_color_data(color, domain, export_columns="Ph,Po,Tg") # only keyword, position and traffic


In [None]:
#dicts

#Clustering Cloud function

In [None]:
def download_from_bucket(bucket_name, filename, project="intent-prediction-bert"):
    """Download the object `filename` from a bucket to a local file of the same name.

    Args:
        bucket_name: Name of the Cloud Storage bucket to look up.
        filename: Name of the blob; also used as the local destination path.
        project: Project passed to `storage.Client`. Defaults to the project
            this notebook was written against; pass your own to reuse the
            helper elsewhere.

    Raises:
        FileNotFoundError: If the bucket has no object named `filename`.
    """
    from google.cloud import storage

    client = storage.Client(project=project)
    bucket = client.get_bucket(bucket_name)

    # get_blob returns None for a missing object; fail with a clear message
    # instead of an AttributeError on the next line.
    blob = bucket.get_blob(filename)
    if blob is None:
        raise FileNotFoundError(
            "{} not found in bucket {}".format(filename, bucket_name))

    blob.download_to_filename(filename=filename)

    print("file" + filename + " downloaded from bucket")

In [None]:
download_from_bucket("ranksense_duplicate_clusters", "canonical_clusters.txt")

filecanonical_clusters.txt downloaded from bucket


In [None]:
!cat canonical_clusters.txt

rel_canonical,url
https://www.davidyurman.com/products/kids/kids-jewelry/albion-kids-necklace-with-diamonds-4mm-kn1038-ss.pdp.html,https://www.davidyurman.com/products/kids/kids-jewelry/albion-kids-necklace-with-diamonds-4mm-kn1038-ss.pdp.html
https://www.davidyurman.com/products/kids/kids-jewelry/albion-kids-ring-with-diamonds-4mm-kr1038-ss.pdp.html,https://www.davidyurman.com/products/kids/kids-jewelry/albion-kids-ring-with-diamonds-4mm-kr1038-ss.pdp.html
https://www.davidyurman.com/products/kids/kids-jewelry/cable-collectibles-kids-cross-necklace-with-diamonds-kn1044-ss.pdp.html,https://www.davidyurman.com/products/kids/kids-jewelry/cable-collectibles-kids-cross-necklace-with-diamonds-kn1044-ss.pdp.html
https://www.davidyurman.com/products/kids/kids-jewelry/cable-collectibles-kids-dog-charm-necklace-in-18k-gold-with-diam.pdp.html,https://www.davidyurman.com/products/kids/kids-jewelry/cable-collectibles-kids-dog-charm-necklace-in-18k-gold-with-diam.pdp.html
https://www.davidyurm

We need to process the existing cluster we downloaded

In [None]:
#load to dataframe

import pandas as pd

df = pd.read_csv("canonical_clusters.txt")

test_urls=list(df["url"])[:5]
print(test_urls)

Extract product ids to use for clustering

In [None]:
from urllib.parse import urlparse

import re

#THIS IS SITE SPECIFIC AND WON'T GENERALIZE TO OTHER SITES
def get_product_id(url):
  """Return the first dash-separated piece of the URL path that looks like a product ID.

  The `.pdp.html` suffix is removed from the path, the path is split on `-`,
  and the first piece that starts with non-digits followed by digits is
  returned. Returns None when no piece matches.
  """
  trimmed_path = urlparse(url).path.replace(".pdp.html", "")

  # match words that start with letters and are followed by numbers
  id_pattern = re.compile(r"^\D+\d+.*")

  return next((piece for piece in trimmed_path.split("-") if id_pattern.match(piece)), None)



  

Next, this is how we can add the product ID column to our Dataframe and group the URLs to perform the clustering.

In [None]:
# Spot-check the extractor on the sample URLs before applying it to every row.
for url in test_urls:
  print(url)
  print(get_product_id(url))


# Add the extracted product ID as a new column.
df["product_id"] = df["url"].apply(get_product_id)

# Number of URLs and canonicals in each product group.
df.groupby("product_id")[["url", "rel_canonical"]].count()


Some URLs have no canonical at all. We are going to fix that by adding self-referential canonicals to those URLs.

In [None]:
# This cell must start at column 0: code indented under a column-0 comment
# raises IndentationError.
df[df["rel_canonical"].isnull()]

#output ->list of URls with no canonicals

new_df = df.copy()

#fix missing canonicals by setting them to self referential urls
new_df["rel_canonical"] = new_df["rel_canonical"].fillna(new_df["url"])

#this list should be empty
new_df[new_df["rel_canonical"].isnull()]

#export to CSV 

new_df.to_csv("urls.csv")

#download backup CSV from Google Colab to your local machine

from google.colab import files

files.download("urls.csv")



###Turning Canonical Clusters to Canonicalized




We are going to perform an intermediate step and force all product groups to canonicalize to the first URL in the group.

This is good enough to illustrate the concept, but for production use, we would want to canonicalize to the most popular URL in the group. It could be the most linked page or the one with the most search clicks or impressions.

In [None]:
# Remembers, for each product_id, the first URL seen for it.
first_canonical = {}

def get_first_canonical(row):
  """Return the canonical URL for a row: the first URL seen for its product_id.

  The first row seen for a product_id registers its own URL in
  `first_canonical` and gets that URL back; later rows with the same
  product_id get the registered URL.
  """
  group_id = row["product_id"]
  own_url = row["url"]

  registered = first_canonical.get(group_id)
  if registered is not None:
    return registered

  first_canonical[group_id] = own_url
  return own_url
  
#https://stackoverflow.com/questions/33518124/how-to-apply-a-function-on-every-row-on-a-dataframe
# Point every URL of a product group at the first URL seen for that group.
# (The earlier stand-alone apply whose result was discarded is dropped: it did
# the same pass over the same rows.)
next_df = new_df.copy()
next_df["rel_canonical"] = next_df.apply(get_first_canonical, axis=1)

next_df.to_csv("urls2.csv")

# Download the file written just above; "urls3.csv" is only created in a later cell.
files.download("urls2.csv")


###Turning Some Canonical Clusters to Self-Referential


Now, in this final step, we will learn how many clusters should be self-referential.

As all groups canonicalize to one URL now, we only need to break those clusters where URLs have search traffic for color terms. We will change the canonicals to be self-referential.

In [None]:
from glob import glob

color_files = glob("davidyurman.com*")

#https://stackoverflow.com/questions/20906474/import-multiple-csv-files-into-pandas-and-concatenate-into-one-dataframe

#import all color CSVs into a single pandas dataframe
# (built in a comprehension so the crawl DataFrame `df` is not overwritten)
color_frames = [pd.read_csv(filename, index_col=None, header=0) for filename in color_files]

color_df = pd.concat(color_frames, axis=0, ignore_index=True)

#create a set of unique urls 
color_urls = set(color_df["URL"])

# `last_df` was never defined; start from the canonicalized clusters built in
# the previous step so they are kept intact for comparison.
last_df = next_df.copy()

#override canonical if URL in color URL list (making it self referential)
has_color_traffic = last_df["url"].isin(color_urls)
last_df.loc[has_color_traffic, "rel_canonical"] = last_df.loc[has_color_traffic, "url"]

#export updated cluster urls
last_df.to_csv("urls3.csv")
files.download("urls3.csv")

#Exporting changes to Google Sheets


In [None]:
!pip install --upgrade -q gspread


In [None]:
# Authenticate the Colab user (copy & paste the key as prompted), then read
# the spreadsheet through gspread using the application-default credentials.
from google.colab import auth
auth.authenticate_user()
import gspread
from oauth2client.client import GoogleCredentials
gc = gspread.authorize(GoogleCredentials.get_application_default())

# get all data from the spreadsheet
# NOTE(review): `spreadsheetName` is not defined in any visible cell; it must
# be set (presumably to the title of the Google Sheet) before running this.
worksheet = gc.open(spreadsheetName).sheet1
worksheetRows = worksheet.get_all_values() 

# Convert to a DataFrame and render. 
# (A DataFrame is overkill, but I wanted to play with them more :))
# NOTE: this rebinds the global `df` used by the earlier crawl cells.
import pandas as pd
df = pd.DataFrame.from_records(worksheetRows)
df.rename(columns=df.iloc[0], inplace = True) # use 1st row as column names
df.drop([0], inplace = True) # drop the header row afterwards
df.iloc[:5] # show the first 5 rows to prove it's working

In [None]:
# helper function to update all rows in the spreadsheet with a function

def update_spreadsheet_rows(fieldName, parameterName, functionToCall, forceUpdate=False):
  """Fill one spreadsheet column by applying a function to another column.

  Works on the notebook-level `df` (rows read from the sheet) and `worksheet`
  (the gspread worksheet). For every row, computes
  `functionToCall(row[parameterName])` and stores it in column `fieldName`,
  both in `df` and in the sheet, showing a progress bar while doing so.

  Args:
    fieldName: Name of the output column.
    parameterName: Name of the column whose value is passed to the function.
    functionToCall: Callable taking one cell value and returning the result.
    forceUpdate: When False, rows that already have a value in `fieldName`
      are skipped; when True, every row is recalculated.
  """
  # Imported here because no visible cell of the notebook imports tqdm.
  from tqdm.auto import tqdm

  columnNr = df.columns.get_loc(fieldName) + 1 # column number of output field

  # progressbar maximum value = row count; the context manager closes the bar
  with tqdm(total=df.shape[0]) as pbar:
    # go through each row
    for index, row in df.iterrows():
      # if we already did it, don't recalculate unless 'forceUpdate' is set.
      if forceUpdate or not row[fieldName]:
        result = functionToCall(row[parameterName])
        # Write into df itself: `row` is a copy, so assigning to it would not
        # be visible to a later run's "already done" check.
        df.at[index, fieldName] = result # save locally
        # Assumes the df index is the 0-based sheet row (header row dropped),
        # so sheet row = index + 1.
        worksheet.update_cell(index+1, columnNr, result) # update sheet too
      pbar.update(1) # we did it!
  print("Done.")
  # we did all of it!

In [None]:
#TODO rewrite above code

# Publish Sheet to RankSense

You can find more details here https://help.ranksense.com/en/articles/4101798-how-to-publish-rules-using-our-api

In [None]:
#pip install requests
import requests

# Fill in the RankSense API token, the sheet link, the website id and the tags
# before running.
token = ""
ranksense_import_url = 'https://cf-app.ranksense.com/agile-seo/rules-api/import'
publish_payload = {
    "token": token,
    "sheet_link": "https://docs.google.com/spreadsheets/d/link",
    "website_id": 0,
    "tags": [0, 1, 2]
}

requests.post(url=ranksense_import_url, data=publish_payload)

In [None]:
#https://cloud.google.com/functions/docs/calling/storage
#https://cloud.google.com/functions/docs/tutorials/storage

def bucket_data(data, context):
    """Cloud Storage-triggered function: log the event, then download the file.

    Prints the event ID and type from `context` and the bucket, file name,
    metageneration and timestamps from `data`, then calls
    `download_from_bucket` for the changed object.

    Args:
        data (dict): The Cloud Functions event payload.
        context (google.cloud.functions.Context): Metadata of triggering event.
    Returns:
        None; the log lines are written with print.
    """
    print('Event ID: {}'.format(context.event_id))
    print('Event type: {}'.format(context.event_type))

    # The remaining log lines each echo one field of the event payload.
    payload_lines = (
        ('Bucket: {}', 'bucket'),
        ('File: {}', 'name'),
        ('Metageneration: {}', 'metageneration'),
        ('Created: {}', 'timeCreated'),
        ('Updated: {}', 'updated'),
    )
    for template, field in payload_lines:
        print(template.format(data[field]))

    download_from_bucket(data['bucket'], data['name'])

