## Connecting to Google Analytics with reporting API

In [22]:
# Importing modules for API setup and data conversion

from apiclient.discovery import build
from oauth2client.service_account import ServiceAccountCredentials
import pandas as pd
import numpy as np
import json

pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', None)

In [2]:
# Google API params. Views = Academy only report

SCOPES = ['https://www.googleapis.com/auth/analytics.readonly']
KEY_FILE_LOCATION = 'keys/client_secrets.json'
VIEW_ID = 'ga:175337801'

### Func 1: Get data report from google analytics

In [3]:
def initialize_analyticsreporting():
    """Initializes an Analytics Reporting API V4 service object.
    Returns: An authorized Analytics Reporting API V4 service object."""

    credentials = ServiceAccountCredentials.from_json_keyfile_name(KEY_FILE_LOCATION, SCOPES)

    # Build the service object.
    analytics = build('analyticsreporting', 'v4', credentials=credentials)

    return analytics

### Func 2: Return report in json dict, plus metrics and dimensions used

In [4]:
def get_report(analytics, metrics, dimensions, start, end):
    """Queries the Analytics Reporting API V4.
    Args:
    analytics: An authorized Analytics Reporting API V4 service object.
    Returns:
    The Analytics Reporting API V4 response.
    Using date range 2020-11-09 to 2020-11-15 for testing purposes"""
    METS = [f'ga:{metric}' for metric in metrics]
    DIMS = [f'ga:{dimension}' for dimension in dimensions]
    
    return analytics.reports().batchGet(
        body={
            'reportRequests': [
                                {
                                    'viewId': VIEW_ID,
                                    'dateRanges': [{'startDate': start
                                                    , 'endDate': end}],
                                    'metrics': [{'expression': expression} for expression in METS],
                                    'orderBys': [{'fieldName': METS[0], 
                                                  'sortOrder': 'DESCENDING'}],
                                    'dimensions': [{'name': name} for name in DIMS]
                                }]
            }).execute(), METS, DIMS

### Func 3: Convert to pd DataFrame

In [5]:
def to_df(response, METS, DIMS):
    data_dict = {f"{i}": [] for i in DIMS + METS}
    
    for report in response.get('reports', []):
        rows = report.get('data', {}).get('rows', [])
        for row in rows:
            for i, key in enumerate(DIMS):
                data_dict[key].append(row.get('dimensions', [])[i])
            date_values = row.get('metrics', [])
            for values in date_values:
                all_values = values.get('values', [])
                for i, key in enumerate(METS):
                    data_dict[key].append(all_values[i])
                    
    df = pd.DataFrame(data=data_dict)
    df.columns = [col.split(':')[-1] for col in df.columns]
    
    return df

### Func 4: Final – putting it all together

In [6]:
def ga_to_df(metrics,dimensions,start,end):
    '''metrics = list, dimesnions = list,
    start = str, end = str'''
    analytics = initialize_analyticsreporting()
    response, METS, DIMS = get_report(analytics, metrics, dimensions, start, end)
    response_df = to_df(response, metrics, dimensions)
    
    return response_df

## Gettting general data for posts

In [7]:
# The first article, "Of trees and men", was published 29 September 2017.
# However Google Analytics only started tracking later, but better
# safe than sorry

df_posts = ga_to_df(metrics=['pageViews',
                             'avgTimeOnPage',
                             'avgSessionDuration', 
                             'sessions', 
                             'bounces'],
                    dimensions=['landingPagePath'],
                    start='2017-10-01',
                    end='today')

In [8]:
df_posts.head(10)

Unnamed: 0,landingPagePath,pageViews,avgTimeOnPage,avgSessionDuration,sessions,bounces
0,/academy/,28326,138.41571624768616,200.8415234476207,11579,5236
1,/academy/what-are-scope-1-2-3-emissions/,11511,948.9932279909708,128.42044784914555,10182,5247
2,/academy/is-it-too-late-for-our-planet/,7741,278.58076634109693,58.57550702028081,6410,5095
3,/academy/the-stakeholders-of-climate-change/,6314,488.442872687704,84.18572752548656,5395,4090
4,/academy/most-powerful-greenhouse-gas/,5959,316.65250965250965,30.366292960852785,5441,4816
5,/academy/sustainable-climate-change-organisation-partnership/,2537,156.66666666666666,114.4549795361528,1466,942
6,/academy/how-can-the-circular-economy-support-sustainable-development/,2046,642.5962264150943,96.83380123526108,1781,1342
7,/academy/climate-action-data-driven-approach/,1826,227.54918032786884,152.2632541133455,1094,801
8,/academy/ai-climate-change/,1555,719.61875,190.82672064777327,1235,543
9,/academy/the-benefits-of-monitoring-carbon-emissions-for-a-business/,1311,540.5324074074074,111.41917808219178,1095,700


## Defining a success measure

- Prepend 'https://plana.earth' to URLS
- Remove unwanted URLs comparing to permalinks (faster than eliminating urls by regex)
- Use calculation for avgSessionDuration (> 2 min) and bounce rate (< 30%) to determine success (1) or fail (0)
- Add success col (target) to dataframe to use in supervised learning

### 1. Prepending plana.earth

In [9]:
df_posts['landingPagePath'] = 'https://plana.earth' + df_posts['landingPagePath'].astype(str)

df_posts.head(10)

Unnamed: 0,landingPagePath,pageViews,avgTimeOnPage,avgSessionDuration,sessions,bounces
0,https://plana.earth/academy/,28326,138.41571624768616,200.8415234476207,11579,5236
1,https://plana.earth/academy/what-are-scope-1-2-3-emissions/,11511,948.9932279909708,128.42044784914555,10182,5247
2,https://plana.earth/academy/is-it-too-late-for-our-planet/,7741,278.58076634109693,58.57550702028081,6410,5095
3,https://plana.earth/academy/the-stakeholders-of-climate-change/,6314,488.442872687704,84.18572752548656,5395,4090
4,https://plana.earth/academy/most-powerful-greenhouse-gas/,5959,316.65250965250965,30.366292960852785,5441,4816
5,https://plana.earth/academy/sustainable-climate-change-organisation-partnership/,2537,156.66666666666666,114.4549795361528,1466,942
6,https://plana.earth/academy/how-can-the-circular-economy-support-sustainable-development/,2046,642.5962264150943,96.83380123526108,1781,1342
7,https://plana.earth/academy/climate-action-data-driven-approach/,1826,227.54918032786884,152.2632541133455,1094,801
8,https://plana.earth/academy/ai-climate-change/,1555,719.61875,190.82672064777327,1235,543
9,https://plana.earth/academy/the-benefits-of-monitoring-carbon-emissions-for-a-business/,1311,540.5324074074074,111.41917808219178,1095,700


### 2. Only keep URLs that are also present in the permalinks

In [10]:
permalinks = []

with open('../04_Data/academy_permalinks.txt', 'r') as filehandle:
    permalinks = [current_link.rstrip() for current_link in filehandle.readlines()]

In [11]:
permalinks

['https://plana.earth/academy/how-sustainable-is-your-office-christmas-party/',
 'https://plana.earth/academy/how-joe-biden-u-s-will-rejoin-paris-agreement/',
 'https://plana.earth/academy/playlist-for-the-planet-arab-world-music-for-the-vibrant-city-of-beirut/',
 'https://plana.earth/academy/how-sustainable-is-your-office-christmas-party/',
 'https://plana.earth/academy/how-joe-biden-u-s-will-rejoin-paris-agreement/',
 'https://plana.earth/academy/playlist-for-the-planet-arab-world-music-for-the-vibrant-city-of-beirut/',
 'https://plana.earth/academy/how-sustainable-is-your-office-christmas-party/',
 'https://plana.earth/academy/playlist-for-the-planet-disco-funk-music-love-disco-dance-playlist/',
 'https://plana.earth/academy/mitigating-climate-change-it-starts-with-better-ocean-data/',
 'https://plana.earth/academy/environmental-impact-2020-pandemic-covid19-nature-is-not-healing/',
 'https://plana.earth/academy/why-president-of-european-commission-has-spoken-about-plan-a/',
 'https:

In [17]:
posts_success = df_posts[df_posts.landingPagePath.isin(permalinks)]

In [18]:
posts_success.reset_index(inplace=True, drop=True)

In [20]:
posts_success.head(10)

Unnamed: 0,landingPagePath,pageViews,avgTimeOnPage,avgSessionDuration,sessions,bounces
0,https://plana.earth/academy/what-are-scope-1-2-3-emissions/,11511,948.9932279909708,128.42044784914555,10182,5247
1,https://plana.earth/academy/is-it-too-late-for-our-planet/,7741,278.58076634109693,58.57550702028081,6410,5095
2,https://plana.earth/academy/the-stakeholders-of-climate-change/,6314,488.442872687704,84.18572752548656,5395,4090
3,https://plana.earth/academy/most-powerful-greenhouse-gas/,5959,316.65250965250965,30.366292960852785,5441,4816
4,https://plana.earth/academy/sustainable-climate-change-organisation-partnership/,2537,156.66666666666666,114.4549795361528,1466,942
5,https://plana.earth/academy/how-can-the-circular-economy-support-sustainable-development/,2046,642.5962264150943,96.83380123526108,1781,1342
6,https://plana.earth/academy/climate-action-data-driven-approach/,1826,227.54918032786884,152.2632541133455,1094,801
7,https://plana.earth/academy/ai-climate-change/,1555,719.61875,190.82672064777327,1235,543
8,https://plana.earth/academy/the-benefits-of-monitoring-carbon-emissions-for-a-business/,1311,540.5324074074074,111.41917808219178,1095,700
9,https://plana.earth/academy/blockchain-versus-sustainability/,1078,219.3037974683544,37.80652173913043,920,836


In [26]:
posts_success[['avgSessionDuration', 'sessions', 'bounces']] = posts_success[['avgSessionDuration', 
                                                                              'sessions', 
                                                                              'bounces']].apply(pd.to_numeric)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


In [27]:
posts_success.dtypes

landingPagePath        object
pageViews              object
avgTimeOnPage          object
avgSessionDuration    float64
sessions                int64
bounces                 int64
dtype: object

### 3. Calculate success measure

Success (1) = avgSessionDuration < 120 AND bounce rate (bounces/sessions) < 30%

In [32]:
posts_success['success'] = np.where((posts_success['avgSessionDuration'] >= 120) & 
                                    (posts_success['bounces'] / posts_success['sessions'] <= 0.8), 1, 0 ).astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  posts_success['success'] = np.where((posts_success['avgSessionDuration'] >= 120) &


In [33]:
posts_success

Unnamed: 0,landingPagePath,pageViews,avgTimeOnPage,avgSessionDuration,sessions,bounces,success
0,https://plana.earth/academy/what-are-scope-1-2-3-emissions/,11511,948.9932279909708,128.420448,10182,5247,1
1,https://plana.earth/academy/is-it-too-late-for-our-planet/,7741,278.58076634109693,58.575507,6410,5095,0
2,https://plana.earth/academy/the-stakeholders-of-climate-change/,6314,488.442872687704,84.185728,5395,4090,0
3,https://plana.earth/academy/most-powerful-greenhouse-gas/,5959,316.65250965250965,30.366293,5441,4816,0
4,https://plana.earth/academy/sustainable-climate-change-organisation-partnership/,2537,156.66666666666666,114.45498,1466,942,0
5,https://plana.earth/academy/how-can-the-circular-economy-support-sustainable-development/,2046,642.5962264150943,96.833801,1781,1342,0
6,https://plana.earth/academy/climate-action-data-driven-approach/,1826,227.54918032786884,152.263254,1094,801,1
7,https://plana.earth/academy/ai-climate-change/,1555,719.61875,190.826721,1235,543,1
8,https://plana.earth/academy/the-benefits-of-monitoring-carbon-emissions-for-a-business/,1311,540.5324074074074,111.419178,1095,700,0
9,https://plana.earth/academy/blockchain-versus-sustainability/,1078,219.3037974683544,37.806522,920,836,0
