<h1>Speed Up Your Site Launch, Redesign, or Migration QA with Python</h1>

<p>When it comes to fast-paced agency life, both time and efficiency are crucial to tackling day-to-day tasks, no matter how big or small. One of the most time-consuming tasks that can be done in the SEO world is quality assurance of pages or sites when they launch. Whether you are working with a small handful of pages or a full site launch, this is a breakdown of how to expedite the process of making sure your content is in the right place!</p>


In [1]:
# Required Libraries 
import re
import urllib
import requests
import pandas as pd
from bs4 import BeautifulSoup as bs
from tqdm.notebook import tqdm

In [None]:
# Required External Resources:
# Screaming Frog
# *Optional* Google PageSpeed Insights API Key (https://developers.google.com/speed/docs/insights/v5/get-started)

<h2>What we will be covering</h2>
<h3>Crawling and Indexing</h3>
<br>
        <li>HTTPS</li>
        <li>Canonical Tags and Noindex Tags</li>
        <li>robots.txt</li>
        <li>XML Sitemap</li>

<h3>Meta Data</h3>
<br>
    <li>Titles</li>
    <li>Descriptions</li>

<h3>Site Speed</h3>
<br>
    <li>PageSpeed Score</li>
    <li>First Contentful Paint</li>
    <li>Largest Contentful Paint</li>
    <li>Speed Index</li>
    <li>Time to Interactive</li>

<h3>(っ◔◡◔)っ ♥ Putting it all into an easy-to-digest report ♥</h3>


<h2>Let's talk about some of the functions we will be using</h2>

In [2]:
# Check for XML Sitemap
def sitemap_check(url):
    """Download a sitemap and return the text of every <loc> element in it.

    Args:
        url: Address of the sitemap (or sitemap index) to request.

    Returns:
        A list with the whitespace-stripped text of each <loc> tag found in
        the response body. When the server does not answer with HTTP 200,
        'No Sitemap Detected' is printed and an empty list is returned, so
        callers can safely take ``len()`` of, or iterate over, the result.
    """
    r = requests.get(url)
    if r.status_code != 200:
        print('No Sitemap Detected')
        # An empty list (rather than an implicit None) keeps callers that
        # measure or loop over the result from raising TypeError.
        return []

    sitemap = bs(r.content, 'lxml')
    # Pretty-printed sitemaps often wrap the URL in newlines/indentation;
    # strip it so the value can be requested directly.
    return [element.text.strip() for element in sitemap.find_all('loc')]

In [3]:
# Check for robots.txt file
def find_bot(domain):
    """Request a robots.txt address and return its body.

    Args:
        domain: URL that is requested as-is.

    Returns:
        The response text when the status code is 200, otherwise the string
        'No Robots.txt found'.
    """
    response = requests.get(domain)
    found = response.status_code == 200
    return response.text if found else 'No Robots.txt found'

In [4]:
# Check live page for HTTPs
def https_check(url):
    """Report whether a URL uses the HTTPS scheme.

    Only the scheme at the start of the URL is inspected. Searching for the
    substring 'https' anywhere would misreport addresses such as
    'http://example.com/https-guide' as secure.

    Args:
        url: The URL string to inspect.

    Returns:
        'HTTPS' when the URL starts with 'https://' (case-insensitive),
        otherwise 'HTTP'.
    """
    if url.strip().lower().startswith('https://'):
        return 'HTTPS'
    return 'HTTP'

In [5]:
# Check to see if noindex tags are present on a page
def no_index_check(url):
    """Look for a noindex directive in a page's robots meta tags.

    Args:
        url: A response object whose ``content`` attribute holds the page
            markup (despite the name, this is not a URL string).

    Returns:
        '<noindex> tag found' when any <meta name="robots"> tag carries a
        'noindex' directive (matched case-insensitively), otherwise
        '<noindex> tag not found'.
    """
    soup = bs(url.content, 'lxml')
    robots_tags = soup.find_all('meta', {'name': 'robots'})
    # A robots meta tag without a content attribute yields None from .get();
    # substitute '' so the join cannot raise TypeError.
    directives = ' '.join((r_tag.get('content') or '') for r_tag in robots_tags)
    if 'noindex' in directives.lower():
        return '<noindex> tag found'
    return '<noindex> tag not found'

In [6]:
# Check to see status of canonical tag, if it is there and if it is self referring or not
def canonical_check(url):
    """Classify the canonical link tag(s) of a fetched page.

    Args:
        url: A response object exposing ``content`` (page markup) and ``url``
            (the address the response was served from).

    Returns:
        'No canonical present' when no <link rel="canonical"> with a
        non-empty href exists, 'Self-referring canonical' when every canonical
        href equals ``url.url``, and 'Canonical found, not self-referring'
        otherwise.
    """
    soup = bs(url.content, 'lxml')
    # .get('href') is None for a canonical tag without an href; treat that the
    # same as an empty value instead of letting it break string handling.
    hrefs = [
        (c_tag.get('href') or '').strip()
        for c_tag in soup.find_all('link', {'rel': 'canonical'})
    ]
    hrefs = [href for href in hrefs if href]

    if not hrefs:
        return 'No canonical present'
    # Compare against the response passed in, not a variable that merely
    # happens to exist in the calling scope.
    if all(href == url.url for href in hrefs):
        return 'Self-referring canonical'
    return 'Canonical found, not self-referring'

In [7]:
def redirect_check(url, **kwargs):
    """Report whether a response was reached through at least one redirect.

    Args:
        url: A response object exposing a ``history`` sequence holding the
            intermediate responses that led to it.
        **kwargs: Accepted for call compatibility; not used.

    Returns:
        'Redirect present' when ``url.history`` is non-empty, otherwise
        'No redirect present'.
    """
    # Test the history directly rather than searching its string
    # representation for digits, which depends on how each entry prints.
    if url.history:
        return 'Redirect present'
    return 'No redirect present'

In [8]:
# The actual server call function
def speed_test_url(url, device, base_url=None):
    """Build the PageSpeed Insights request URL for one page.

    Args:
        url: Address of the page to audit. It is percent-encoded as a single
            query value, so pages that carry their own query string stay
            intact.
        device: Value sent as the ``strategy`` parameter (the notebook passes
            'desktop').
        base_url: Endpoint to call. Defaults to the module-level
            ``service_url``, which must be defined before the function is
            called without this argument.

    Returns:
        The endpoint followed by the encoded ``url`` and ``strategy`` query
        parameters.
    """
    # Local import: `import urllib` alone does not guarantee that the
    # `urllib.parse` submodule has been loaded.
    from urllib.parse import urlencode

    endpoint = service_url if base_url is None else base_url
    params = {
        'url': url,
        'strategy': device,
        #'key': key (API key recommended for larger sites)
        }
    query = urlencode(params, doseq=True)
    # Append the query explicitly. Decoding every '%3F' in the final string
    # would also un-escape question marks belonging to the audited URL.
    separator = '&' if '?' in endpoint else '?'
    return endpoint + separator + query

<h2>Crawling / Indexing</h2>

In [9]:
# Checking for presence of sitemap and robots.txt

# sitemap_check yields the <loc> entries of the sitemap. A failed request
# gives nothing usable (None or an empty list), so test truthiness instead of
# calling len() on a value that may not support it.
sitemap_present = sitemap_check('https://aclion.com/sitemap.xml')
if sitemap_present:
    print('Sitemap is present!')
else:
    print('Sitemap is NOT present!!!')

robots_present = find_bot('https://aclion.com/robots.txt')

# find_bot reports a failed request with a sentinel string, which is itself
# non-empty; compare against it explicitly so a missing file is not reported
# as present.
if robots_present and robots_present != 'No Robots.txt found':
    print('Robots.txt is present!')
    print(robots_present)
else:
    print('Robots is NOT present!!!')

Sitemap is present!
Robots.txt is present!

User-agent: *
Disallow: /wp-admin/
Allow: /wp-admin/admin-ajax.php



In [10]:
# QA for Crawling/Indexing properties

# Parse links from XML Sitemap.
# The top-level file is read as a sitemap index: every <loc> it lists is
# fetched as a child sitemap. `or []` covers a fetch that returns nothing
# usable, so one unreachable sitemap does not abort the whole run.
sitemap_list = [sitemap_check(i) or []
                for i in (sitemap_check('https://www.aclion.com/sitemap.xml') or [])]
frame = [link for child_links in sitemap_list for link in child_links]

# Build DataFrame from XML Sitemap Links
sitemap = pd.DataFrame(frame, columns=['Url'])

# Request every page once and record its crawl/index signals.
frame = []
for crawl in tqdm(sitemap.Url):
    r = requests.get(crawl)
    data = {
        'Url': r.url,
        'Http Status': https_check(r.url),
        'Status Code': r.status_code,
        'Redirect Status': redirect_check(r),
        'Noindex Tag': no_index_check(r),
        'Canonical Status': canonical_check(r)
    }
    frame.append(data)

c_and_i_data = pd.DataFrame(frame) # Store Raw Data

# Give an overview on page stats: for every column after 'Url', count how many
# pages fall under each distinct value of that column.
overview_parts = [c_and_i_data.groupby(count)['Url'].count()
                  for count in c_and_i_data.columns[1:]]
# pd.concat raises on an empty list, which happens when nothing was crawled.
c_and_i_overview = (pd.concat(overview_parts) if overview_parts
                    else pd.Series(dtype='int64', name='Url'))

HBox(children=(FloatProgress(value=0.0, max=286.0), HTML(value='')))




<h2>Meta Data</h2>

In [None]:
### Meta Data QA ###

# Requirements:
#   Screaming Frog export of site using export internal html

# Pass a dictionary to rename columns for ease of code
columns = {'Meta Description 1': 'metaDescription',
          'Meta Description 1 Length': 'descriptionLength',
          'Meta Description 1 Pixel Width': 'descriptionPixelWidth',
          'Indexability Status': 'indexabilityStatus',
          'Title 1': 'metaTitle',
          'Title 1 Length': 'titleLength',
          'Title 1 Pixel Width': 'titlePixelWidth'}

# Data Read-In & Formatting
df = pd.read_excel(r'internal_html_aclion.xlsx') # path to internal html SF pull
# Keep only rows whose 'Indexability' value does not contain 'Non'
# (presumably the non-indexable pages; confirm against the export).
# na=False treats a blank Indexability cell as "no match" so the boolean
# negation cannot fail on missing values; such rows are kept.
df = df[~df['Indexability'].str.contains('Non', na=False)]
df = df.rename(columns=columns)
# .copy() gives later cells an independent frame to add flag columns to.
df = df[['Address'] + list(columns.values())].copy()

In [None]:
## Check Meta Descriptions ##
# Each check stores a boolean flag column on `df` and a filtered snapshot of
# the flagged rows. Comparisons are vectorised instead of applied row by row.

# 160 characters or more (Description)
df['des_overOneSixty'] = df['descriptionLength'] >= 160
des_over = df[df['des_overOneSixty']]

# 70 characters or fewer (Description)
df['des_underSeventy'] = df['descriptionLength'] <= 70
thinDescription = df[df['des_underSeventy']]

# Missing Data (Description)
df['des_missingMeta'] = df['metaDescription'].isnull()
missingDescription = df[df['des_missingMeta']]

# Duplicate Data (Description)
# Count how often each description occurs (groupby leaves out missing values)
# and keep those used on more than one page.
df_des = df.groupby('metaDescription').size().to_frame('Occurance')
df_des = df_des[df_des['Occurance'] > 1]
duplicate = df_des.index.tolist()
# isin does a hashed lookup rather than scanning the list once per row.
df['duplicateDescription'] = df['metaDescription'].isin(duplicate)
dupDescription = df[df['duplicateDescription']]

In [None]:
## Check Meta Titles ##
# Each check stores a boolean flag column on `df` and a filtered snapshot of
# the flagged rows. Comparisons are vectorised instead of applied row by row.

# 60 characters or more (Title)
df['tle_overSixty'] = df['titleLength'] >= 60
tle_over = df[df['tle_overSixty']]

# 30 characters or fewer (Title)
df['tle_underThirty'] = df['titleLength'] <= 30
thinTitle = df[df['tle_underThirty']]

# Missing Data (Title)
df['tle_missingMeta'] = df['metaTitle'].isnull()
missingTitle = df[df['tle_missingMeta']]

# Duplicate Data (Title)
# Count how often each title occurs (groupby leaves out missing values) and
# keep those used on more than one page.
df_tle = df.groupby('metaTitle').size().to_frame('Occurance')
df_tle = df_tle[df_tle['Occurance'] > 1]
duplicate = df_tle.index.tolist()
# isin does a hashed lookup rather than scanning the list once per row.
df['duplicateTitle'] = df['metaTitle'].isin(duplicate)
dupTitle = df[df['duplicateTitle']]

In [None]:
# Collect the results #

# One row of counts: how many pages each title/description check flagged.
# The keys become the column headers of the overview frame.
meta_data_results = {'Meta Titles over character Count': len(tle_over),
                     'Thin Title': len(thinTitle),
                     'Missing Titles': len(missingTitle),
                     'Duplicate Titles': len(dupTitle),
                     'Description Over': len(des_over),
                     'Thin Description': len(thinDescription),
                     'Missing Description': len(missingDescription),
                     'Duplicate Description': len(dupDescription)}

# index=[0] is needed because every value is a scalar.
meta_data = pd.DataFrame(meta_data_results, index=[0])

<h2>Site Speed Checking</h2>

In [None]:
## Page Speed Testing ##

check = "captchaResult"
service_url = "https://www.googleapis.com/pagespeedonline/v5/runPagespeed/"
#key = "api key"
diagnostics = []
seo = []


def _audit_seconds(audits, audit_name):
    """Return an audit's displayValue as a float, dropping the trailing non-breaking-space + 's' unit."""
    return float(audits[audit_name]['displayValue'].replace('\xa0s', ''))


# Only the first 10 addresses are tested.
for crawl in tqdm(df['Address'][0:10].to_list()):
    # Call the PageSpeed Insights API
    call = requests.get(speed_test_url(url=crawl, device='desktop'))
    response = call.json()
    # test to see if you get a valid response
    if 'error' in response:
        # Say which page was skipped instead of dropping it silently.
        print(f"PageSpeed request failed for {crawl}; skipping: {response['error']}")
        continue

    # Look up the shared parts of the payload once.
    lighthouse = response['lighthouseResult']
    audits = lighthouse['audits']
    page_stats = audits['diagnostics']['details']['items'][0]
    metrics = audits['metrics']['details']['items'][0]

    # Parse out web metrics from API response
    web_diagnostics = {
        'Page': crawl,
        'Number of Resources Requested': page_stats['numRequests'],
        'Page Size (MB)': round(page_stats['totalByteWeight']/1e+6, 2),
        'Number of Scripts': page_stats['numScripts'],
        'Number of Stylesheets': page_stats['numStylesheets'],
        'Number of Fonts': page_stats['numFonts'],
        'DOM Size': audits['dom-size']['details']['items'][0]['value']}

    # Parse out speed metrics from API response
    seo_vitals = {
        'Page': crawl,
        # round(), not int(): 0.57 * 100 evaluates to 56.99999999999999 and
        # int() would truncate it to 56.
        'Google Score': round(lighthouse['categories']['performance']['score']*100),
        'FCP': _audit_seconds(audits, 'first-contentful-paint'),
        'LCP': _audit_seconds(audits, 'largest-contentful-paint'),
        'FID': metrics['maxPotentialFID'],
        'CLS': round(metrics['cumulativeLayoutShift'], 3),
        'Speed Index': _audit_seconds(audits, 'speed-index'),
        'Time to Interactive': _audit_seconds(audits, 'interactive')
    }

    diagnostics.append(web_diagnostics)
    seo.append(seo_vitals)

In [None]:
# Columns exported for the per-issue title and description sheets.
title_cols = ['Address', 'metaTitle', 'titleLength']
description_cols = ['Address', 'metaDescription', 'descriptionLength']

# (sheet name, flagged rows, columns to export), in workbook order.
issue_sheets = [
    ('Titles Over', tle_over, title_cols),
    ('Titles Under', thinTitle, title_cols),
    ('Missing Titles', missingTitle, title_cols),
    ('Duplicate Titles', dupTitle, title_cols),
    ('Description Over', des_over, description_cols),
    ('Description Under', thinDescription, description_cols),
    ('Missing Description', missingDescription, description_cols),
    ('Duplicate Description', dupDescription, description_cols),
]

# Write every result set into one workbook, one sheet per table.
with pd.ExcelWriter('site_launch_qa.xlsx') as writer:
    # The overview keeps its index, which holds the value each count is for.
    c_and_i_overview.to_excel(writer, sheet_name='Crawl&Index Overview')
    c_and_i_data.to_excel(writer, sheet_name='Crawl&Index Data', index=False)
    meta_data.to_excel(writer, sheet_name='Meta Data Overview', index=False)
    for sheet_name, flagged_rows, export_cols in issue_sheets:
        flagged_rows[export_cols].to_excel(writer, sheet_name=sheet_name, index=False)
    pd.DataFrame(seo).to_excel(writer, sheet_name='Site Speed', index=False)
    pd.DataFrame(diagnostics).to_excel(writer, sheet_name='Web Metrics', index=False)