In [1]:
# Recommended: install pip packages inside a virtual environment.
# See the official installation docs: https://github.com/googleapis/python-analytics-data#installation
%pip install google-analytics-data

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [2]:
import datetime
import pandas as pd
import numpy as np

In [3]:
from google.analytics.data_v1beta import BetaAnalyticsDataClient
from google.analytics.data_v1beta.types import (
    DateRange,
    Dimension,
    Metric,
    RunReportRequest,
    OrderBy,
)

In [4]:
# modified from official google docs for GA4: 
# https://developers.google.com/analytics/devguides/reporting/data/v1/quickstart-client-libraries

def pull_from_ga_into_df(
    dimensions,
    metrics,
    order_by,
    property_id="353368209",
    start_date="30daysAgo",
    end_date="today",
    page_size=10000,
):
    """Run a report on a Google Analytics 4 property and return it as a DataFrame.

    Credentials are resolved by the default ``BetaAnalyticsDataClient``
    constructor, i.e. from the ``GOOGLE_APPLICATION_CREDENTIALS`` environment
    variable.

    Args:
        dimensions: Iterable of GA4 dimension API names (e.g. ``"date"``).
        metrics: Iterable of GA4 metric API names (e.g. ``"sessions"``).
        order_by: Name of the dimension or metric to sort by, descending.
            A name that appears in ``metrics`` (and not in ``dimensions``) is
            sent as a metric ordering; anything else as a dimension ordering.
        property_id: GA4 property ID to query.
        start_date: Start of the date range, in any form the API accepts
            (``"YYYY-MM-DD"``, ``"NdaysAgo"``, ``"yesterday"``, ``"today"``).
        end_date: End of the date range, same forms as ``start_date``.
        page_size: Rows requested per API call. Further pages are fetched
            until every row of the result has been read.

    Returns:
        pandas.DataFrame with one column per dimension followed by one column
        per metric, named after the response headers. Cell values are taken
        from each value's ``.value`` as-is; no type conversion is done here.

    Raises:
        ValueError: If ``page_size`` is smaller than 1.
    """
    if page_size < 1:
        raise ValueError("page_size must be a positive integer")

    # Using a default constructor instructs the client to use the credentials
    # specified in GOOGLE_APPLICATION_CREDENTIALS environment variable.
    client = BetaAnalyticsDataClient()

    # The API has separate ordering types for dimensions and metrics.
    if order_by in metrics and order_by not in dimensions:
        ordering = OrderBy(
            desc=True,
            metric=OrderBy.MetricOrderBy(metric_name=order_by),
        )
    else:
        ordering = OrderBy(
            desc=True,
            dimension=OrderBy.DimensionOrderBy(dimension_name=order_by),
        )

    columns = None
    records = []
    offset = 0

    # A single call returns at most `limit` rows, so keep requesting pages
    # until `row_count` (the size of the full result) has been reached.
    while True:
        request = RunReportRequest(
            property=f"properties/{property_id}",
            dimensions=[Dimension(name=d) for d in dimensions],
            metrics=[Metric(name=m) for m in metrics],
            date_ranges=[DateRange(start_date=start_date, end_date=end_date)],
            order_bys=[ordering],
            limit=page_size,
            offset=offset,
        )
        response = client.run_report(request)

        # Headers are identical on every page; read them once.
        if columns is None:
            columns = [h.name for h in response.dimension_headers]
            columns += [h.name for h in response.metric_headers]

        # Each record lists dimension values then metric values, which is the
        # same order as `columns`.
        for row in response.rows:
            records.append(
                [cell.value for cell in (*row.dimension_values, *row.metric_values)]
            )

        offset += len(response.rows)
        # Stop on an empty page as well, so a bad row_count cannot loop forever.
        if not response.rows or offset >= response.row_count:
            break

    return pd.DataFrame(records, columns=columns)

In [5]:
def cast_types(df, type_map):
    """Return a copy of ``df`` with columns cast to the dtypes in ``type_map``.

    Args:
        df: pandas.DataFrame to convert. It is not modified.
        type_map: Mapping of column name -> dtype (anything accepted by
            ``DataFrame.astype``, e.g. ``"int64"`` or ``"float64"``).
            Columns not named in the mapping keep their dtype. ``None`` or an
            empty mapping returns an unchanged copy.

    Returns:
        A new pandas.DataFrame with the requested columns converted.

    Raises:
        KeyError: If ``type_map`` names a column that is not in ``df``
            (raised by ``DataFrame.astype``).
        ValueError: If a value cannot be converted to the requested dtype
            (raised by ``DataFrame.astype``).
    """
    if not type_map:
        # Still hand back a new object so callers can rely on copy semantics.
        return df.copy()
    return df.astype(type_map)

In [6]:
# Report name -> query spec ("metrics", "dimensions", "order_by").
# Starts empty; the cells below add one entry per report.
query_map = dict()

In [7]:
# AudienceOverview spec: metrics broken down by date and hostName,
# ordered by date.
query_map["AudienceOverview"] = dict(
    metrics=[
        "averageSessionDuration",
        "bounceRate",
        "newUsers",
        "sessionsPerUser",
        "screenPageViewsPerSession",
        "screenPageViews",
        "sessions",
        "activeUsers",  # TODO: confirm this is the metric we want from the API
    ],
    dimensions=["date", "hostName"],
    order_by="date",
)

In [8]:
# The spec's keys (dimensions, metrics, order_by) are exactly the function's
# parameter names, so it can be unpacked straight into the call.
audience_overview_df = pull_from_ga_into_df(**query_map["AudienceOverview"])
audience_overview_df

DefaultCredentialsError: File /media/seanpe/Data/Work/UnicornConnect/avc_google_analytics/service-account.json
 was not found.

In [None]:
# Re-display the frame assigned in the previous cell.
audience_overview_df

In [None]:
# AudienceByHour spec: metrics broken down by date, dateHour and hostName,
# ordered by dateHour.
query_map["AudienceByHour"] = dict(
    metrics=[
        "averageSessionDuration",
        "bounceRate",
        "newUsers",
        "sessionsPerUser",
        "screenPageViewsPerSession",
        "screenPageViews",
        "sessions",
        "activeUsers",  # TODO: confirm this is the metric we want from the API
    ],
    dimensions=["date", "dateHour", "hostName"],
    order_by="dateHour",
)

In [None]:
# The spec's keys (dimensions, metrics, order_by) are exactly the function's
# parameter names, so it can be unpacked straight into the call.
audience_by_hour_df = pull_from_ga_into_df(**query_map["AudienceByHour"])
audience_by_hour_df

In [None]:
# AudienceByCategory spec: user counts broken down by date, country,
# deviceCategory and language, ordered by date.
query_map["AudienceByCategory"] = dict(
    metrics=[
        "newUsers",
        "activeUsers",  # TODO: confirm this is the metric we want from the API
    ],
    dimensions=[
        "date",
        "country",
        "deviceCategory",
        "language",
        # "pagePath" is left out on purpose: it makes the request incompatible.
    ],
    order_by="date",
)

In [None]:
# The spec's keys (dimensions, metrics, order_by) are exactly the function's
# parameter names, so it can be unpacked straight into the call.
audience_by_category_df = pull_from_ga_into_df(**query_map["AudienceByCategory"])
audience_by_category_df

In [None]:
# GaOrders spec: revenue and users broken down by month, date, transactionId
# and campaignName, ordered by campaignName.
query_map["GaOrders"] = dict(
    metrics=[
        "purchaseRevenue",
        "activeUsers",  # TODO: confirm this is the metric we want from the API
    ],
    dimensions=["month", "date", "transactionId", "campaignName"],
    order_by="campaignName",
)

In [None]:
# The spec's keys (dimensions, metrics, order_by) are exactly the function's
# parameter names, so it can be unpacked straight into the call.
ga_orders_df = pull_from_ga_into_df(**query_map["GaOrders"])
ga_orders_df

In [None]:
# PageTitleViews spec: page views broken down by date, host and page-level
# dimensions, ordered by date.
query_map["PageTitleViews"] = dict(
    metrics=[
        "screenPageViews",  # TODO: not sure this is the right metric either
    ],
    dimensions=[
        "date",
        "hostName",
        "landingPagePlusQueryString",
        "pagePath",  # pageLocation is a possible alternative
        "pageTitle",
        "percentScrolled",  # assumed to stand in for page depth -- TODO confirm
    ],
    order_by="date",
)

In [None]:
# The spec's keys (dimensions, metrics, order_by) are exactly the function's
# parameter names, so it can be unpacked straight into the call.
page_title_views_df = pull_from_ga_into_df(**query_map["PageTitleViews"])
page_title_views_df

In [None]:
# Acquisition spec: session, revenue and user metrics broken down by date,
# month, transactionId, adSourceName and hostName, ordered by date.
query_map["Acquisition"] = dict(
    metrics=[
        "averageSessionDuration",
        "bounceRate",
        "totalRevenue",
        "screenPageViewsPerSession",
        "sessions",
        "newUsers",
        "totalUsers",
    ],
    dimensions=[
        "date",
        # "campaignName" is currently disabled.
        "month",
        "transactionId",
        # "sourceMedium" is currently disabled.
        # Alternatives to adSourceName: firstUserGoogleAdsAdNetworkType,
        # googleAdsAdNetworkType, sessionGoogleAdsAdNetworkType.
        "adSourceName",
        "hostName",
    ],
    order_by="date",
)

In [None]:
# The spec's keys (dimensions, metrics, order_by) are exactly the function's
# parameter names, so it can be unpacked straight into the call.
acquisition_df = pull_from_ga_into_df(**query_map["Acquisition"])
acquisition_df