# SEO Content Audit 

### Initial Setup 
- This script uses the GSC API and runs a Screaming Frog crawl via the command line
- If you don't have your GSC API Key, watch this video from Jean Chouinard first: 
    - https://www.youtube.com/watch?v=-uy4L4P1Ujs&t=4s
- You will also need a paid subscription to Screaming Frog before proceeding  
- This code was built off June Tao Ching's GSC code and was modified to audit my content accordingly
    - Source: https://towardsdatascience.com/access-google-search-console-data-on-your-site-by-using-python-3c8c8079d6f8

### What does the code do?

- The following code runs a crawl on your website, then pulls data from GSC's API and merges the two datasets together
- It pulls GSC data for a date range you specify, compares it to a previous date range that you also specify, and then checks for the following:
    - Low Traffic Pages (< 50 Clicks and < 100 Impressions)
    - Pages that lost traffic and saw a drop in average position
    - Low Hanging Fruit (Pages ranking on Page 1 for their main keyword (by clicks), but not yet in position 1)
    - Striking Distance (Pages ranking on Page 2 for their main keyword (by clicks))
    
    
    


In [31]:
import pickle
import pandas as pd 
import os

from datetime import datetime, timedelta
from google_auth_oauthlib.flow import InstalledAppFlow
from apiclient.discovery import build

class content_audit:
    """Audit a site's content by joining a Screaming Frog crawl with GSC data.

    The audit pulls page-level Search Console metrics for a current and a
    previous date range, the top keyword (by clicks) for every page, and the
    indexable pages of a Screaming Frog crawl, then labels each page with a
    suggested action in the ``URL Action`` column.

    Args:
        website: Site URL passed to both the GSC API (``siteUrl``) and the
            Screaming Frog ``--crawl`` option.
        output_folder: Folder Screaming Frog exports ``internal_all.csv`` to.
        start_date: First day of the current period, ``YYYY-MM-DD``.
        end_date: Last day (inclusive) of the current period, ``YYYY-MM-DD``.
        prev_start_date: First day of the comparison period, ``YYYY-MM-DD``.
        prev_end_date: Last day (inclusive) of the comparison period.
    """

    # You must edit GSC_CREDENTIALS and PICKLED_CREDENTIALS to include YOUR username
    GSC_CREDENTIALS = r'your credentials '

    # where your pickled credential will be stored (".pickle" is appended)
    PICKLED_CREDENTIALS = r'c:\users\your_username\desktop\pickled_credential'

    OAUTH_SCOPE = ('https://www.googleapis.com/auth/webmasters.readonly', 'https://www.googleapis.com/auth/webmasters')

    # Page size requested from the Search Analytics endpoint.
    MAX_ROWS = 25000

    # Location of the Screaming Frog command line client.
    SF_INSTALL_DIR = r'C:\Program Files (x86)\Screaming Frog SEO Spider'
    SF_EXECUTABLE = 'ScreamingFrogSEOSpiderCli.exe'

    # Page-level metrics that default to 0 when a page has no GSC data.
    _URL_METRIC_COLUMNS = [
        'URL Clicks', 'URL Impressions',
        'URL Clicks Prev', 'URL Impressions Prev',
        'URL Average Position', 'URL Average Position Prev',
    ]

    def __init__(self,website,output_folder,start_date,end_date, prev_start_date, prev_end_date):

        self.website = website
        self.start_date = start_date
        self.end_date = end_date
        self.prev_start_date = prev_start_date
        self.prev_end_date = prev_end_date
        self.output_folder = output_folder

    # ------------------------------------------------------------------
    # Google Search Console helpers
    # ------------------------------------------------------------------

    def _get_service(self):
        """Return an authorised Search Console service, building it on first use."""
        service = getattr(self, '_webmasters_service', None)
        if service is not None:
            return service

        pickle_path = self.PICKLED_CREDENTIALS + ".pickle"
        try:
            # SECURITY: unpickling executes arbitrary code. Only ever point
            # PICKLED_CREDENTIALS at a file this script wrote itself.
            with open(pickle_path, "rb") as handle:
                credentials = pickle.load(handle)
        except OSError:
            # No cached credentials yet: run the interactive OAuth flow once
            # and cache the result for later runs.
            flow = InstalledAppFlow.from_client_secrets_file(self.GSC_CREDENTIALS, scopes=self.OAUTH_SCOPE)
            # NOTE(review): run_console() appears to have been removed from
            # newer google-auth-oauthlib releases; fall back to the local
            # server flow when it is missing - confirm against your version.
            authorize = getattr(flow, 'run_console', None) or flow.run_local_server
            credentials = authorize()
            with open(pickle_path, "wb") as handle:
                pickle.dump(credentials, handle)

        # Connect to Search Console Service using the credentials
        service = build('webmasters', 'v3', credentials=credentials)
        self._webmasters_service = service
        return service

    def _fetch_daily_rows(self, start_date, end_date, dimensions):
        """Query GSC one day at a time and return the raw rows.

        Args:
            start_date: First day to query, ``YYYY-MM-DD``.
            end_date: Last day to query (inclusive), ``YYYY-MM-DD``.
            dimensions: GSC dimensions to group by, e.g. ``['page']``.

        Returns:
            A list of ``[*keys, clicks, impressions, position]`` lists, one
            per API row; the same key may appear once per day.
        """
        service = self._get_service()
        first_day = datetime.strptime(start_date, "%Y-%m-%d")
        last_day = datetime.strptime(end_date, "%Y-%m-%d")
        key_count = len(dimensions)
        output_rows = []

        print('script start date:', first_day)
        current_day = first_day
        while current_day <= last_day:
            day = current_day.strftime("%Y-%m-%d")
            page_index = 0
            # Page through the day's results until the API stops returning rows.
            while True:
                request = {
                    'startDate' : day,
                    'endDate' : day,
                    'dimensions' : list(dimensions),
                    "searchType": "Web",
                    'rowLimit' : self.MAX_ROWS,
                    'startRow' : page_index * self.MAX_ROWS
                }
                response = service.searchanalytics().query(siteUrl=self.website, body=request).execute()
                if not response or 'rows' not in response:
                    break
                for row in response['rows']:
                    output_rows.append(
                        list(row['keys'][:key_count])
                        + [row['clicks'], row['impressions'], row['position']]
                    )
                page_index += 1
            current_day += timedelta(days=1)
        print('script end date:', last_day)
        return output_rows

    @staticmethod
    def _aggregate(rows, key_columns, prefix, suffix=''):
        """Collapse daily rows into one row per key.

        Clicks and impressions are summed, the daily average positions are
        averaged (unweighted), and CTR is recomputed from the totals.

        Args:
            rows: Output of ``_fetch_daily_rows``.
            key_columns: Column names for the leading key values of each row.
            prefix: Metric column prefix, ``'URL'`` or ``'KW'``.
            suffix: Optional metric column suffix, e.g. ``' Prev'``.

        Returns:
            DataFrame with the key columns followed by
            ``<prefix> Clicks``, ``<prefix> Impressions``,
            ``<prefix> Average Position`` and ``<prefix> CTR`` (+ suffix).
        """
        key_columns = list(key_columns)
        clicks = prefix + ' Clicks' + suffix
        impressions = prefix + ' Impressions' + suffix
        position = prefix + ' Average Position' + suffix
        ctr = prefix + ' CTR' + suffix

        df = pd.DataFrame(rows, columns=key_columns + [clicks, impressions, position])
        if df.empty:
            # An empty frame has object columns, which cannot be averaged.
            df = df.astype({clicks: 'float64', impressions: 'float64', position: 'float64'})
        df = df.groupby(key_columns).agg({clicks: 'sum', impressions: 'sum', position: 'mean'}).reset_index()
        df[ctr] = df[clicks] / df[impressions]
        return df

    def url_level_data(self):
        """Return page-level GSC metrics for the current period.

        Columns: ``Address``, ``URL Clicks``, ``URL Impressions``,
        ``URL Average Position``, ``URL CTR``.
        """
        rows = self._fetch_daily_rows(self.start_date, self.end_date, ["page"])
        return self._aggregate(rows, ['Address'], 'URL')

    def prev_url_level_data(self):
        """Return page-level GSC metrics for the previous period.

        Same as ``url_level_data`` but every metric column carries a
        `` Prev`` suffix.
        """
        rows = self._fetch_daily_rows(self.prev_start_date, self.prev_end_date, ["page"])
        return self._aggregate(rows, ['Address'], 'URL', ' Prev')

    def gsc_kw(self):
        """Return page + query level GSC metrics for the current period.

        Columns: ``Address``, ``Main Keyword``, ``KW Clicks``,
        ``KW Impressions``, ``KW Average Position``, ``KW CTR``.
        """
        rows = self._fetch_daily_rows(self.start_date, self.end_date, ["page", 'query'])
        return self._aggregate(rows, ['Address', 'Main Keyword'], 'KW')

    # ------------------------------------------------------------------
    # Screaming Frog
    # ------------------------------------------------------------------

    def screaming_frog_crawl(self):
        """Run a headless Screaming Frog crawl and export the Internal:All tab.

        Returns:
            The exit code of the Screaming Frog process.

        Raises:
            OSError: If the Screaming Frog executable cannot be started.
        """
        import subprocess  # local import keeps this block self-contained

        # An argument list (no shell) keeps websites/folders containing
        # spaces or shell metacharacters intact.
        command = [
            os.path.join(self.SF_INSTALL_DIR, self.SF_EXECUTABLE),
            '--crawl', self.website,
            '--headless',
            '--output-folder', self.output_folder,
            '--export-tabs', 'Internal:All',
        ]
        completed = subprocess.run(command, cwd=self.SF_INSTALL_DIR)
        if completed.returncode != 0:
            print('Screaming Frog exited with code', completed.returncode)
        return completed.returncode

    # ------------------------------------------------------------------
    # Audit
    # ------------------------------------------------------------------

    @staticmethod
    def _recommend_action(row):
        """Return the suggested action for one audited page.

        Checks for thin/low quality content, pages that lost traffic, and
        main keywords within reach of a better ranking. Rules are evaluated
        in order and the first match wins.

        Args:
            row: Mapping (dict or Series) holding the audit columns of a page.

        Returns:
            The action label, or ``None`` when no rule applies (for example
            when the page has no keyword data).
        """
        clicks_url = row['URL Clicks']
        impressions_url = row['URL Impressions']
        clicks_diff_url = row['URL Clicks Diff']
        avg_pos_diff_url = row['URL Average Position Diff']
        main_keyword_rank = row['KW Average Position']
        word_count = row['Word Count']

        if clicks_url < 50 and impressions_url < 100 and word_count < 400:
            return 'Delete or No Index'
        elif clicks_url < 50 and impressions_url < 100 and word_count >= 400:
            return 'Re-evaluate Content (Low Traffic)'
        # Diffs are "current - previous" for clicks and "previous - current"
        # for position, so a loss on both is negative on both.
        elif clicks_diff_url <= -100 and avg_pos_diff_url < 0:
            return 'URL Lost Traffic over 3 months - Re-Evaluate Content'
        elif main_keyword_rank < 2 and main_keyword_rank > 0:
            return 'Main Keyword is Ranking in Position 1 (Leave as is)'
        elif main_keyword_rank >= 2 and main_keyword_rank <= 10:
            return 'Low Hanging Fruit - Main Keyword is Ranking on Page 1'
        elif main_keyword_rank > 10 and main_keyword_rank <= 20:
            return 'Striking Distance Content - Main Keyword is Ranking on Page 2'
        return None

    @classmethod
    def _build_audit_frame(cls, crawl, url_data_now, url_prev_data, keyword_data):
        """Join crawl and GSC data and label every indexable page.

        Args:
            crawl: Screaming Frog ``internal_all.csv`` as a DataFrame.
            url_data_now: Output of ``url_level_data``.
            url_prev_data: Output of ``prev_url_level_data``.
            keyword_data: Output of ``gsc_kw``.

        Returns:
            One row per indexable crawled page with GSC metrics, period
            differences, the main keyword and the ``URL Action`` label.
        """
        metrics = cls._URL_METRIC_COLUMNS

        url_data_gsc = url_data_now.merge(url_prev_data, how='left', on='Address')
        url_data_gsc[metrics] = url_data_gsc[metrics].fillna(0)
        url_data_gsc['URL Clicks Diff'] = url_data_gsc['URL Clicks'] - url_data_gsc['URL Clicks Prev']
        url_data_gsc['URL Impressions Diff'] = url_data_gsc['URL Impressions'] - url_data_gsc['URL Impressions Prev']
        # Positive means the page moved up (a smaller position is better).
        url_data_gsc['URL Average Position Diff'] = url_data_gsc['URL Average Position Prev'] - url_data_gsc['URL Average Position']

        # Keep only the keyword that drives the most clicks for each page.
        main_keywords = keyword_data.sort_values(by='KW Clicks', ascending=False).groupby(['Address']).head(1)

        df = crawl[crawl['Indexability'] == 'Indexable'][['Address', 'Title 1', 'H1-1', 'Status Code', 'Word Count']]
        df = df.merge(url_data_gsc, how='left', on='Address')
        df = df.merge(main_keywords, how='left', on='Address')
        df[metrics] = df[metrics].fillna(0)

        # A plain list also works when the frame is empty, unlike apply(axis=1).
        df['URL Action'] = [cls._recommend_action(record) for record in df.to_dict('records')]
        return df

    ### Method calls GSC API and SF Method then joins the data
    def url_clean_up(self):
        """Run the crawl and the GSC pulls, then return the merged audit table.

        Returns:
            DataFrame with one row per indexable crawled page, including the
            suggested ``URL Action``.
        """
        url_data_now = self.url_level_data()
        url_prev_data = self.prev_url_level_data()
        self.screaming_frog_crawl()
        keyword_data = self.gsc_kw()

        crawl = pd.read_csv(os.path.join(self.output_folder, 'internal_all.csv'))
        return self._build_audit_frame(crawl, url_data_now, url_prev_data, keyword_data)
    

In [None]:
# Folder that will receive the Screaming Frog crawl export
output_folder = r'C:\Users\your_username\Desktop\Competitors'


# Create the audit object: fill in your website plus the current and
# previous date ranges you want to compare
content_audit_seo = content_audit(
    website='yourwebsite',
    output_folder=output_folder,
    start_date='2022-08-01',
    end_date='2022-10-31',
    prev_start_date='2022-05-01',
    prev_end_date='2022-07-31',
)

# Runs the crawl and the GSC pulls, then returns the merged audit table
df = content_audit_seo.url_clean_up()