# Check if your Main / Primary Keyword is not in your Title or H-1

### Initial Setup 
- This script uses the GSC API and runs a Screaming Frog Crawl via the command line
- If you don't have your GSC API Key, watch this video from Jean Chouinard first: https://www.youtube.com/watch?v=-uy4L4P1Ujs&t=4s
- You will also need a paid subscription to Screaming Frog before proceeding 
- This code was built off June Tao Ching's GSC code and was modified to audit my content accordingly
    - Source: https://towardsdatascience.com/access-google-search-console-data-on-your-site-by-using-python-3c8c8079d6f8

### What does the code do?

- The following code pulls keyword data from GSC and crawl data from Screaming Frog.
- The purpose of running a crawl is to get the H-1s, H-2s and title tags for each URL.
- Then it filters for only the top 5 keywords by clicks per URL.
- Lastly, for each of the top 5 keywords, it calculates what percentage of the keyword's words appear in the title, H-1 and H-2.
- If you have a large site, you should prioritize according to traffic or revenue potential.

#### How to Analyze the Data?

- I've found that updating title tags can turn around traffic pretty quickly especially if my URL is already ranking on page 1 or 2. 
- If the main keyword is not found in my title, I can quickly make updates 
- This code will catch the keyword percentage in titles or H-1s regardless of word order (e.g. "cancer symptoms" or "symptoms cancer").
- You can then check titles that have less than a 75% keyword match (or whatever threshold you see fit) and update your titles accordingly.
- We pull the top 5 keywords by clicks instead of just the top 1-2, so we're covering all our bases

In [None]:
import os
import pickle
import re
import subprocess
from datetime import datetime, timedelta

import pandas as pd
from apiclient.discovery import build
from google_auth_oauthlib.flow import InstalledAppFlow

class keyword_check:
    """Audit whether a URL's top Search Console keywords appear in its title, H1 and H2.

    The audit combines two data sources:

    * keyword rows (page, query, clicks, impressions, position) pulled from the
      Google Search Console API, and
    * an ``internal_all.csv`` export produced by a headless Screaming Frog crawl.

    Args:
        website: Site URL; used both as the GSC ``siteUrl`` and as the crawl target.
        output_folder: Folder the Screaming Frog export is written to and read from.
        start_date: First day to query, formatted ``YYYY-MM-DD``.
        end_date: Last day to query (inclusive), formatted ``YYYY-MM-DD``.
        gsc_credentials: Path to the OAuth client-secrets file.
        pickled_credentials: Path (without the ``.pickle`` suffix) where the
            authorised credentials are cached between runs.
        sf_install_dir: Folder containing ``ScreamingFrogSEOSpiderCli.exe``.
    """

    # Placeholder defaults carried over from the original script; pass real
    # paths to the constructor (or edit these) before running.
    DEFAULT_GSC_CREDENTIALS = r'your credentials '
    DEFAULT_PICKLED_CREDENTIALS = r'c:\users\your_username\desktop\pickled_credential'
    DEFAULT_SF_INSTALL_DIR = r'C:\Program Files (x86)\Screaming Frog SEO Spider'

    OAUTH_SCOPE = ('https://www.googleapis.com/auth/webmasters.readonly', 'https://www.googleapis.com/auth/webmasters')

    # Page size requested from the Search Analytics API on every call.
    MAX_ROWS = 25000

    # Characters stripped (not replaced by a space) before text is split into words.
    _PUNCTUATION = re.compile(r'[|\-:,&]')

    def __init__(self, website, output_folder, start_date, end_date,
                 gsc_credentials=DEFAULT_GSC_CREDENTIALS,
                 pickled_credentials=DEFAULT_PICKLED_CREDENTIALS,
                 sf_install_dir=DEFAULT_SF_INSTALL_DIR):
        self.website = website
        self.start_date = start_date
        self.end_date = end_date
        self.output_folder = output_folder
        self.gsc_credentials = gsc_credentials
        self.pickled_credentials = pickled_credentials
        self.sf_install_dir = sf_install_dir

    def _load_credentials(self):
        """Return cached OAuth credentials, running the interactive flow if none exist."""
        pickle_path = self.pickled_credentials + ".pickle"
        try:
            # NOTE: unpickling executes arbitrary code; only point this at a
            # file that this script itself wrote.
            with open(pickle_path, "rb") as handle:
                return pickle.load(handle)
        except OSError:
            flow = InstalledAppFlow.from_client_secrets_file(self.gsc_credentials, scopes=self.OAUTH_SCOPE)
            # run_console() was removed in google-auth-oauthlib 1.0 (the OOB
            # flow was retired); fall back to the local-server flow there.
            if hasattr(flow, 'run_console'):
                credentials = flow.run_console()
            else:
                credentials = flow.run_local_server(port=0)
            with open(pickle_path, "wb") as handle:
                pickle.dump(credentials, handle)
            return credentials

    def gsc_kw(self):
        """Pull page/query rows from GSC for every day in the configured range.

        Returns:
            A DataFrame with one row per (Address, Keyword) pair holding summed
            Clicks and Impressions, the mean of the daily Average Position, and
            CTR recomputed as Clicks / Impressions.
        """
        # Connect to Search Console Service using the credentials
        webmasters_service = build('webmasters', 'v3', credentials=self._load_credentials())

        first_day = datetime.strptime(self.start_date, "%Y-%m-%d")
        last_day = datetime.strptime(self.end_date, "%Y-%m-%d")
        print('script start date:', first_day)

        output_rows = []
        day = first_day
        while day <= last_day:
            day_text = day.strftime("%Y-%m-%d")
            page_index = 0
            # Page through the day's results until the API stops returning rows.
            while True:
                request = {
                    'startDate': day_text,
                    'endDate': day_text,
                    'dimensions': ["page", 'query'],
                    "searchType": "Web",
                    'rowLimit': self.MAX_ROWS,
                    'startRow': page_index * self.MAX_ROWS,
                }
                response = webmasters_service.searchanalytics().query(siteUrl=self.website, body=request).execute()
                rows = response.get('rows') if response else None
                if not rows:
                    break
                for row in rows:
                    output_rows.append([
                        row['keys'][0],
                        row['keys'][1],
                        row['clicks'],
                        row['impressions'],
                        row['ctr'],
                        row['position'],
                    ])
                page_index += 1
            day += timedelta(days=1)
        print('script end date:', last_day)

        df = pd.DataFrame(output_rows, columns=['Address', 'Keyword', 'Clicks', 'Impressions', 'CTR', 'Average Position'])
        # Group on 'Keyword' (the column created above); the daily CTR is
        # dropped here and recomputed from the summed totals.
        df = df.groupby(['Address', 'Keyword']).agg({'Clicks': 'sum', 'Impressions': 'sum', 'Average Position': 'mean'}).reset_index()
        df['CTR'] = df['Clicks'] / df['Impressions']
        return df

    def screaming_frog_crawl(self):
        """Run a headless Screaming Frog crawl and export the "Internal:All" tab.

        Returns:
            The exit code of the Screaming Frog CLI process.
        """
        # Resolve against Python's working directory so the crawl writes to the
        # same folder keyword_check() later reads from.
        output_dir = os.path.abspath(self.output_folder)
        # Presumably the CLI expects the folder to exist already -- creating it is harmless.
        os.makedirs(output_dir, exist_ok=True)
        command = [
            os.path.join(self.sf_install_dir, 'ScreamingFrogSEOSpiderCli.exe'),
            '--crawl', self.website,
            '--headless',
            '--output-folder', output_dir,
            '--export-tabs', 'Internal:All',
        ]
        # An argument list (no shell) keeps paths containing spaces intact.
        completed = subprocess.run(command, cwd=self.sf_install_dir)
        return completed.returncode

    @classmethod
    def _tokenize(cls, text):
        """Return the set of lower-cased words in *text*; empty for non-strings (e.g. NaN)."""
        if not isinstance(text, str):
            return set()
        # str.split() with no argument also discards '\r\n' and repeated spaces.
        return set(cls._PUNCTUATION.sub('', text.lower()).split())

    @classmethod
    def _keyword_match_ratio(cls, text, keyword):
        """Return the fraction (0.0-1.0) of *keyword*'s words that occur in *text*.

        Word order is ignored. Returns NaN when *keyword* is missing or has no
        words, and 0.0 when *text* is missing.
        """
        keyword_words = cls._tokenize(keyword)
        if not keyword_words:
            return float('nan')
        return len(cls._tokenize(text) & keyword_words) / len(keyword_words)

    @classmethod
    def _build_report(cls, crawl, keyword_data, top_n=5):
        """Join indexable crawl rows with each URL's top keywords and score them.

        Args:
            crawl: Screaming Frog "Internal:All" export as a DataFrame.
            keyword_data: DataFrame with at least Address, Keyword and Clicks columns.
            top_n: Number of keywords (by clicks) kept per URL.
        """
        # Returns top keywords by clicks per URL
        top_keywords = keyword_data.sort_values(by='Clicks', ascending=False).groupby(['Address']).head(top_n)

        wanted = ['Address', 'Title 1', 'H1-1', 'H2-1', 'Status Code', 'Word Count']
        pages = crawl.loc[crawl['Indexability'] == 'Indexable', wanted]
        report = pages.merge(top_keywords, how='left', on='Address')

        scored_columns = (
            ('Title 1', 'KW Percentage in Title'),
            ('H1-1', 'KW Percentage in H1'),
            ('H2-1', 'KW Percentage in H2'),
        )
        for source, target in scored_columns:
            report[target] = [
                cls._keyword_match_ratio(text, keyword)
                for text, keyword in zip(report[source], report['Keyword'])
            ]
        return report

    def keyword_check(self):
        """Crawl the site, pull GSC keywords and return the scored audit DataFrame.

        Returns:
            One row per indexable URL and top-5 keyword, with the fraction of
            the keyword's words found in the title, H1 and H2.
        """
        self.screaming_frog_crawl()
        keyword_data = self.gsc_kw()
        crawl = pd.read_csv(os.path.join(os.path.abspath(self.output_folder), 'internal_all.csv'))
        return self._build_report(crawl, keyword_data)

In [None]:
### Run the Code Below and Audit accordingly

In [None]:
# Configure the audit: the site to crawl/query, where the Screaming Frog
# export is written, and the inclusive GSC date range (YYYY-MM-DD).
kw_check = keyword_check(
    website='https://yourwebsite.com',
    output_folder='your_output_folder',
    start_date='2022-10-01',
    end_date='2022-10-31',
)

In [None]:
# Run the crawl and the GSC pull, then keep the resulting audit table in `df`
# for inspection in the notebook.
df = kw_check.keyword_check()