# Ngram Analysis

### Initial Setup 
- This script uses the GSC API 
- If you don't have your GSC API Key, watch this video from Jean Chouinard first: https://www.youtube.com/watch?v=-uy4L4P1Ujs&t=4s
- This code was built off June Tao Ching's GSC code and was modified to audit my content accordingly
    - Source: https://towardsdatascience.com/access-google-search-console-data-on-your-site-by-using-python-3c8c8079d6f8

### What does the code do?

- The following code pulls Keyword Data from GSC and then runs an ngram analysis

### What is an Ngram Analysis?
- "An n-gram is a collection of n successive items in a text document that may include words, numbers, symbols, and punctuation." - https://www.mathworks.com/discovery/ngram.html

### How can Ngrams be used for SEO?
- You can use Ngrams to analyze keywords to find patterns and ideas for topic clusters / internal linking 
- Ngrams can be used to analyze your page titles, urls, keywords, H-1s, anchor text (internal and external) to find specific patterns 

#### Examples:
- Ngrams on GSC Keywords to find Topic Clusters / Internal Linking opportunities
- Ngrams on competitor Keyword data / titles / H-1s to find other keywords / topics to build out
- Ngrams on competitors' titles (e.g. from the Ahrefs "Best by links" report) to determine which pages are generating the most backlinks
- Ngrams on GSC Keywords / page titles to analyze the impact of an algorithm update 

In [None]:
from collections import defaultdict

import pickle
import pandas as pd 

from datetime import datetime, timedelta
from google_auth_oauthlib.flow import InstalledAppFlow
from apiclient.discovery import build

from collections import defaultdict


# GSC Class

class gsc_api:
    """Pull page/query performance data from the Google Search Console API.

    Args:
        website: Site URL of the Search Console property to query.
        start_date: First day to fetch, formatted ``YYYY-MM-DD``.
        end_date: Last day to fetch (inclusive), formatted ``YYYY-MM-DD``.
        gsc_credentials: Path to the OAuth client-secrets file. Defaults to
            the placeholder value the script has always shipped with, which
            must be replaced with a real path before running.
        pickled_credentials: Path (without the ``.pickle`` suffix) where the
            authorised credentials are cached between runs. Defaults to the
            placeholder value the script has always shipped with.
    """

    # Placeholder defaults: either edit these or pass your own paths to the
    # constructor.
    DEFAULT_GSC_CREDENTIALS = r'your credentials '
    DEFAULT_PICKLED_CREDENTIALS = r'c:\users\your_username\desktop\pickled_credential'

    def __init__(self, website, start_date, end_date,
                 gsc_credentials=None, pickled_credentials=None):
        self.website = website
        self.start_date = start_date
        self.end_date = end_date
        self.gsc_credentials = (
            self.DEFAULT_GSC_CREDENTIALS
            if gsc_credentials is None else gsc_credentials
        )
        self.pickled_credentials = (
            self.DEFAULT_PICKLED_CREDENTIALS
            if pickled_credentials is None else pickled_credentials
        )

    def _load_credentials(self, scopes):
        """Return cached OAuth credentials, running the OAuth flow if needed."""
        pickle_path = self.pickled_credentials + ".pickle"

        # SECURITY: unpickling executes arbitrary code. Only point this at a
        # cache file that this script wrote on a machine you trust.
        try:
            with open(pickle_path, "rb") as cache_file:
                return pickle.load(cache_file)
        except OSError:
            # No readable cache yet: fall through to the interactive flow.
            pass

        flow = InstalledAppFlow.from_client_secrets_file(
            self.gsc_credentials, scopes=scopes
        )
        if hasattr(flow, 'run_console'):
            credentials = flow.run_console()
        else:
            # Newer google-auth-oauthlib releases dropped run_console();
            # the local-server flow is the replacement.
            credentials = flow.run_local_server(port=0)

        with open(pickle_path, "wb") as cache_file:
            pickle.dump(credentials, cache_file)
        return credentials

    @staticmethod
    def _date_range(start_date, end_date, delta=timedelta(days=1)):
        """Yield each date from ``start_date`` through ``end_date`` inclusive."""
        current_date = start_date
        while current_date <= end_date:
            yield current_date
            current_date += delta

    def gsc_kw(self):
        """Fetch page/query rows for every day in the range and aggregate them.

        Issues one Search Analytics query per day (dimensions ``page`` and
        ``query``), paging through the results 25,000 rows at a time.

        Returns:
            pandas.DataFrame with one row per (Address, Keyword) pair and the
            columns ``Address``, ``Keyword``, ``Clicks``, ``Impressions``,
            ``Average Position`` and ``CTR``. Clicks and impressions are
            summed over the date range; ``Average Position`` is the plain
            (unweighted) mean of the daily positions; ``CTR`` is recomputed
            as summed clicks divided by summed impressions.

        Raises:
            ValueError: If ``start_date`` or ``end_date`` is not in
                ``YYYY-MM-DD`` format.
        """
        SITE_URL = self.website

        OAUTH_SCOPE = ('https://www.googleapis.com/auth/webmasters.readonly', 'https://www.googleapis.com/auth/webmasters')

        # Parse the dates first so a malformed date fails before any
        # authentication prompt is shown.
        start_date = datetime.strptime(self.start_date, "%Y-%m-%d")
        end_date = datetime.strptime(self.end_date, "%Y-%m-%d")

        credentials = self._load_credentials(OAUTH_SCOPE)

        # Connect to Search Console Service using the credentials
        webmasters_service = build('webmasters', 'v3', credentials=credentials)

        maxRows = 25000
        output_rows = []

        print('script start date:', start_date)

        for day in self._date_range(start_date, end_date):
            day_str = day.strftime("%Y-%m-%d")
            page_index = 0
            while True:
                request = {
                    'startDate': day_str,
                    'endDate': day_str,
                    'dimensions': ["page", 'query'],
                    "searchType": "Web",
                    'rowLimit': maxRows,
                    'startRow': page_index * maxRows,
                }

                response = webmasters_service.searchanalytics().query(
                    siteUrl=SITE_URL, body=request
                ).execute()

                # A missing or empty 'rows' entry means this day has no
                # further pages.
                rows = response.get('rows') if response else None
                if not rows:
                    break

                for row in rows:
                    page = row['keys'][0]
                    keyword = row['keys'][1]
                    output_rows.append([
                        page, keyword, row['clicks'], row['impressions'],
                        row['ctr'], row['position'],
                    ])
                page_index += 1

        print('script end date:', end_date)

        df = pd.DataFrame(output_rows, columns=['Address', 'Keyword', 'Clicks', 'Impressions', 'CTR', 'Average Position'])
        df = df.groupby(['Address', 'Keyword']).agg({'Clicks': 'sum', 'Impressions': 'sum', 'Average Position': 'mean'}).reset_index()
        # The daily CTR values cannot be averaged meaningfully, so CTR is
        # recomputed from the aggregated totals.
        df['CTR'] = df['Clicks'] / df['Impressions']
        return df

    
# Ngram Class
class n_gram:
    """Count word n-grams across the ``Keyword`` entries of a dataset.

    Args:
        data: Object indexable by ``'Keyword'`` (for example a pandas
            DataFrame) whose ``'Keyword'`` entry iterates over strings.
    """

    def __init__(self, data):
        self.data = data

    def generate_N_grams(self, text, ngram=1):
        """Split ``text`` on whitespace and return its word n-grams.

        Args:
            text: The string to tokenise.
            ngram: Number of consecutive words per n-gram.

        Returns:
            List of space-joined n-grams in order of appearance; empty when
            ``text`` has fewer than ``ngram`` words.
        """
        # Kept only so existing code that reads these attributes still works.
        self.text = text
        self.ngram = ngram

        # split() with no argument collapses runs of whitespace, so repeated,
        # leading or trailing spaces never produce empty "words".
        words = text.split()

        # Zipping the word list against itself shifted by 0..ngram-1 yields
        # every window of `ngram` consecutive words.
        windows = zip(*(words[offset:] for offset in range(ngram)))
        return [' '.join(window) for window in windows]

    def n_gram_function(self, s):
        """Count every ``s``-gram over all keywords.

        Args:
            s: Number of words per n-gram.

        Returns:
            pandas.DataFrame with column ``0`` holding the n-gram and column
            ``1`` its count, sorted by count descending (ties keep
            first-seen order).
        """
        counts = defaultdict(int)
        for text in self.data['Keyword']:
            for gram in self.generate_N_grams(text, s):
                counts[gram] += 1
        ranked = sorted(counts.items(), key=lambda item: item[1], reverse=True)
        return pd.DataFrame(ranked)

    def unigram(self):
        """Return the 1-gram frequency table."""
        return self.n_gram_function(1)

    def bigram(self):
        """Return the 2-gram frequency table."""
        return self.n_gram_function(2)

    def trigram(self):
        """Return the 3-gram frequency table."""
        return self.n_gram_function(3)

    def quadgram(self):
        """Return the 4-gram frequency table."""
        return self.n_gram_function(4)

    def quintgram(self):
        """Return the 5-gram frequency table."""
        return self.n_gram_function(5)

### the following code pulls data from the GSC API and then runs an Ngram analysis on your keywords

In [None]:
# Pull keyword data for the chosen property and date range, then build the
# n-gram analyser on top of it.
gsc_data = gsc_api('https://yourwebsite.com/','2022-10-01','2022-10-31')
df = gsc_data.gsc_kw()
ngram = n_gram(df)

# Trigram analysis of the GSC keywords
trigram = ngram.trigram()

# Bigram analysis of the GSC keywords (previously this called trigram() by
# mistake, so `bigram` held trigram counts)
bigram = ngram.bigram()