In [101]:
# recommended to use virtual env to install pip package
# see link official docs for installation reference: https://github.com/googleapis/python-analytics-data#installation
!pip install google-analytics-data

Defaulting to user installation because normal site-packages is not writeable


In [102]:
import datetime
import pandas as pd
import numpy as np

In [103]:
from google.analytics.data_v1beta import BetaAnalyticsDataClient
from google.analytics.data_v1beta.types import (
    DateRange,
    Dimension,
    Metric,
    RunReportRequest,
    OrderBy,
)

In [104]:
# modified from official google docs for GA4: 
# https://developers.google.com/analytics/devguides/reporting/data/v1/quickstart-client-libraries

def pull_from_ga_into_df(
    dimensions,
    metrics,
    order_by,
    property_id="353368209",
    start_date="7daysAgo",
    end_date="today",
):
    """Run a report on a Google Analytics 4 property and return it as a DataFrame.

    Args:
        dimensions: Iterable of dimension names to request.
        metrics: Iterable of metric names to request.
        order_by: Name to sort by, always in descending order. A name that
            appears only in ``metrics`` is sent as a metric ordering; any
            other name is sent as a dimension ordering. A falsy value
            (``None`` or ``""``) sends no ordering at all.
        property_id: GA4 property ID, without the ``properties/`` prefix.
        start_date: Start of the report's date range, passed to ``DateRange``.
        end_date: End of the report's date range, passed to ``DateRange``.

    Returns:
        pandas.DataFrame with one column per header in the response
        (dimension headers first, then metric headers) and one row per
        response row. Cell values are stored exactly as read from each
        cell's ``.value``; no dtype conversion is applied.
    """
    # Materialise the inputs so they can be both inspected and iterated,
    # even if the caller passed one-shot iterators.
    dimensions = list(dimensions)
    metrics = list(metrics)

    # Using a default constructor instructs the client to use the credentials
    # specified in GOOGLE_APPLICATION_CREDENTIALS environment variable.
    client = BetaAnalyticsDataClient()

    order_bys = []
    if order_by:
        if order_by in metrics and order_by not in dimensions:
            # Metric names must be ordered through MetricOrderBy, not DimensionOrderBy.
            ordering = OrderBy(
                desc=True,
                metric=OrderBy.MetricOrderBy(metric_name=order_by),
            )
        else:
            ordering = OrderBy(
                desc=True,
                dimension=OrderBy.DimensionOrderBy(dimension_name=order_by),
            )
        order_bys.append(ordering)

    request = RunReportRequest(
        property=f"properties/{property_id}",
        dimensions=[Dimension(name=d) for d in dimensions],
        metrics=[Metric(name=m) for m in metrics],
        date_ranges=[DateRange(start_date=start_date, end_date=end_date)],
        order_bys=order_bys,
    )
    response = client.run_report(request)

    # Column names come from the response headers, so every row's values
    # (dimensions first, then metrics) line up with them positionally.
    columns = [
        header.name
        for header in [*response.dimension_headers, *response.metric_headers]
    ]
    records = [
        [cell.value for cell in [*row.dimension_values, *row.metric_values]]
        for row in response.rows
    ]

    return pd.DataFrame(records, columns=columns)

In [105]:
# Report definitions keyed by report name; filled in by the cells below.
query_map = dict()

## Audience Overview

In [106]:
# Audience metrics broken down by the "date" dimension.
query_map["AudienceOverview"] = {
    "metrics": [
        "averageSessionDuration", "bounceRate", "newUsers",
        "sessionsPerUser", "screenPageViewsPerUser", "screenPageViewsPerSession",
        "screenPageViews", "sessions",
        "activeUsers",  # TODO: confirm this is the user metric we want from the API
    ],
    "dimensions": ["date"],
    "order_by": "date",
}

In [107]:
# The spec's keys ("dimensions", "metrics", "order_by") match the function's
# parameter names, so the spec is unpacked directly as keyword arguments.
audience_overview_df = pull_from_ga_into_df(**query_map["AudienceOverview"])
audience_overview_df

Unnamed: 0,date,averageSessionDuration,bounceRate,newUsers,sessionsPerUser,screenPageViewsPerUser,screenPageViewsPerSession,screenPageViews,sessions,activeUsers
0,20230308,434.5394681,1.0,6,1.25,3.375,2.7,27,10,8
1,20230307,356.8390975,0.3333333333333333,6,1.0,2.333333333333333,2.333333333333333,14,6,6
2,20230306,154.80616195238093,0.619047619047619,15,1.3125,1.75,1.3333333333333333,28,21,16
3,20230305,15.566998857142858,0.5714285714285714,4,1.75,2.0,1.1428571428571428,8,7,4
4,20230304,219.40252,0.3333333333333333,2,2.0,4.0,2.0,12,6,3
5,20230303,9.9476896,0.7,10,1.0,1.2,1.2,12,10,10
6,20230302,253.1425468,0.4,3,1.25,1.75,1.4,7,5,4


## Audience by Hour

In [108]:
# Same audience metrics as the overview report, broken down by "dateHour".
query_map["AudienceByHour"] = {
    "metrics": [
        "averageSessionDuration", "bounceRate", "newUsers",
        "sessionsPerUser", "screenPageViewsPerUser", "screenPageViewsPerSession",
        "screenPageViews", "sessions",
        "activeUsers",  # TODO: confirm this is the user metric we want from the API
    ],
    "dimensions": ["dateHour"],
    "order_by": "dateHour",
}

In [109]:
# The spec's keys ("dimensions", "metrics", "order_by") match the function's
# parameter names, so the spec is unpacked directly as keyword arguments.
audience_by_hour_df = pull_from_ga_into_df(**query_map["AudienceByHour"])
audience_by_hour_df

Unnamed: 0,dateHour,averageSessionDuration,bounceRate,newUsers,sessionsPerUser,screenPageViewsPerUser,screenPageViewsPerSession,screenPageViews,sessions,activeUsers
0,2023030819,623.6719155,1.0,0,1.0,1.0,1.0,2,2,2
1,2023030818,324.267862,1.0,1,1.0,5.0,5.0,5,1,1
2,2023030817,0.0,1.0,5,1.6666666666666667,1.6666666666666667,1.0,5,5,3
3,2023030814,0.0,1.0,0,1.0,1.0,1.0,1,1,1
4,2023030813,2773.782988,1.0,0,1.0,12.0,12.0,12,1,1
5,2023030808,0.0,1.0,0,1.0,2.0,2.0,2,1,1
6,2023030721,64.291115,0.0,1,1.0,2.0,2.0,2,1,1
7,2023030718,1578.958902,1.0,0,1.0,5.0,5.0,5,1,1
8,2023030717,363.112517,0.0,1,1.0,1.0,1.0,1,1,1
9,2023030716,81.674977,0.0,1,1.0,2.0,2.0,2,1,1


## Audience by Category

In [116]:
# User counts broken down by date, country, device category and language.
query_map["AudienceByCategory"] = {
    "metrics": [
        "newUsers",
        "activeUsers",  # TODO: confirm this is the user metric we want from the API
    ],
    # "pagePath" is deliberately left out: adding it makes the request incompatible.
    "dimensions": ["date", "country", "deviceCategory", "language"],
    "order_by": "date",
}

In [117]:
# The spec's keys ("dimensions", "metrics", "order_by") match the function's
# parameter names, so the spec is unpacked directly as keyword arguments.
audience_by_category_df = pull_from_ga_into_df(**query_map["AudienceByCategory"])
audience_by_category_df

Unnamed: 0,country,date,deviceCategory,language,newUsers,activeUsers
0,Philippines,20230308,mobile,English,2,2
1,Philippines,20230308,desktop,English,1,2
2,Sweden,20230308,mobile,English,1,0
3,United States,20230308,desktop,English,1,2
4,United States,20230308,mobile,English,1,0
5,Sweden,20230308,desktop,English,0,2
6,Philippines,20230307,mobile,English,3,3
7,Pakistan,20230307,desktop,English,2,2
8,Philippines,20230307,desktop,English,1,1
9,Philippines,20230306,desktop,English,5,5
