# GA4 Realtime Anomaly Detection

## Identification

The next cell creates the path in the environment variables to access to the service account credentials

In [55]:
import os
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = 'hackathon-analytics-pioneers.json'

The next cell aims only at testing the how-to written in the Google Analytics official documentation, to check that no error 403 is coming back.

In [56]:
import os
import pandas as pd
import itertools

property_id = "239560925"
starting_date = "8daysAgo"
ending_date = "yesterday"

from google.analytics.data_v1beta import BetaAnalyticsDataClient
from google.analytics.data_v1beta.types import (
    DateRange,
    Dimension,
    Metric,
    RunReportRequest,
)
client = BetaAnalyticsDataClient()

request_api = RunReportRequest(
    property=f"properties/{property_id}",
    dimensions=[
        Dimension(name="landingPagePlusQueryString")
        ],
        metrics=[
            Metric(name="sessions")
        ],
        date_ranges=[DateRange(start_date=starting_date, end_date=ending_date)],
    )
response = client.run_report(request_api)

Next, getting warm with Pandas Dataframe again:

In [23]:
def query_data(api_response):
    dimension_headers = [header.name for header in api_response.dimension_headers]
    metric_headers = [header.name for header in api_response.metric_headers]
    dimensions = []
    metrics = []
    for i in range(len(dimension_headers)):
        dimensions.append([row.dimension_values[i].value for row in api_response.rows])
    dimensions
    for i in range(len(metric_headers)):
        metrics.append([row.metric_values[i].value for row in api_response.rows])
    headers = dimension_headers, metric_headers
    headers = list(itertools.chain.from_iterable(headers))   
    data = dimensions, metrics
    data = list(itertools.chain.from_iterable(data))
    df = pd.DataFrame(data)
    df = df.transpose()
    df.columns = headers
    return df

query_data(response)

Unnamed: 0,landingPagePlusQueryString,sessions
0,/,94
1,(not set),20
2,/en,14
3,/en/about-us,10
4,/blog/chat-gpt-plugins,8
5,/blog/keywordrecherche-und-analyse,8
6,/blog/wieso-apache-nifi,4
7,/blog,3
8,/en/blog/why-adobe-analytics-campaign-tracking...,3
9,/en/blog/why-apache-nifi,3


Finally getting into the real stuff; playing with the Google Analytics RealTime API.

In [47]:
from google.analytics.data_v1beta import BetaAnalyticsDataClient
from google.analytics.data_v1beta.types import (
    Dimension,
    Metric,
    RunRealtimeReportRequest,
)


def run_sample():
    """Runs the sample."""
    # TODO(developer): Replace this variable with your Google Analytics 4
    #  property ID before running the sample.
    property_id = "239560925"
    print(run_realtime_report(property_id))


def run_realtime_report(property_id):
    """Runs a realtime report on a Google Analytics 4 property."""
    client = BetaAnalyticsDataClient()

    request = RunRealtimeReportRequest(
        property=f"properties/{property_id}",
        dimensions=[Dimension(name="eventName"), Dimension(name="minutesAgo")],
        metrics=[Metric(name="eventCount")],
    )
    return client.run_realtime_report(request)

data= { run_sample() }

dimension_headers {
  name: "eventName"
}
dimension_headers {
  name: "minutesAgo"
}
metric_headers {
  name: "eventCount"
  type_: TYPE_INTEGER
}
rows {
  dimension_values {
    value: "page_view"
  }
  dimension_values {
    value: "11"
  }
  metric_values {
    value: "2"
  }
}
rows {
  dimension_values {
    value: "first_visit"
  }
  dimension_values {
    value: "11"
  }
  metric_values {
    value: "1"
  }
}
rows {
  dimension_values {
    value: "session_start"
  }
  dimension_values {
    value: "11"
  }
  metric_values {
    value: "1"
  }
}
rows {
  dimension_values {
    value: "user_engagement"
  }
  dimension_values {
    value: "11"
  }
  metric_values {
    value: "1"
  }
}
row_count: 4
kind: "analyticsData#runRealtimeReport"



Now, let's structure the data into a table with "minutesAgo" and "event_name" as dimensions and the event count as metric.

In [23]:
import pandas as pd
from google.analytics.data_v1beta import BetaAnalyticsDataClient
from google.analytics.data_v1beta.types import (
    Dimension,
    Metric,
    RunRealtimeReportRequest,
)

def run_realtime_report(property_id):
    """Runs a realtime report on a Google Analytics 4 property."""
    client = BetaAnalyticsDataClient()

    request = RunRealtimeReportRequest(
        property=f"properties/{property_id}",
        dimensions=[Dimension(name="eventName"), Dimension(name="minutesAgo")],
        metrics=[Metric(name="eventCount")],
    )
    response = client.run_realtime_report(request)

    # Extract data from the response
    rows = []
    for row in response.rows:
        dimension_values = [value.value for value in row.dimension_values]
        metric_values = [value.value for value in row.metric_values]
        rows.append(dimension_values + metric_values)

    # Create headers
    dimension_headers = [header.name for header in response.dimension_headers]
    metric_headers = [header.name for header in response.metric_headers]
    headers = dimension_headers + metric_headers

    # Create DataFrame
    df = pd.DataFrame(rows, columns=headers)
    return df

def run_sample():
    """Runs the sample."""
    # TODO(developer): Replace this variable with your Google Analytics 4
    #  property ID before running the sample.
    property_id = "239560925"
    df = run_realtime_report(property_id)
    display(df)

# Run the sample
run_sample()

Unnamed: 0,eventName,minutesAgo,eventCount
0,page_view,2,11
1,page_view,17,3
2,page_view,16,2
3,first_visit,0,1
4,page_view,0,1
5,scroll,17,1
6,session_start,0,1
7,session_start,17,1


For the sake of better readibility, the events are now structured in 5 min slots so that, even without a time-series chart, one can see the evolution of the events.

In [24]:
import pandas as pd
print(pd.__version__)
from google.analytics.data_v1beta import BetaAnalyticsDataClient
from google.analytics.data_v1beta.types import (
    Dimension,
    Metric,
    RunRealtimeReportRequest,
)

def run_realtime_report(property_id):
    """Runs a realtime report on a Google Analytics 4 property."""
    client = BetaAnalyticsDataClient()

    request = RunRealtimeReportRequest(
        property=f"properties/{property_id}",
        dimensions=[Dimension(name="eventName"), Dimension(name="minutesAgo")],
        metrics=[Metric(name="eventCount")],
    )
    response = client.run_realtime_report(request)

    # Extract data from the response
    rows = []
    for row in response.rows:
        dimension_values = [value.value for value in row.dimension_values]
        metric_values = [value.value for value in row.metric_values]
        rows.append(dimension_values + metric_values)

    # Create headers
    dimension_headers = [header.name for header in response.dimension_headers]
    metric_headers = [header.name for header in response.metric_headers]
    headers = dimension_headers + metric_headers

    # Create DataFrame
    df = pd.DataFrame(rows, columns=headers)

    # Convert minutesAgo to integer for easier comparison
    df['minutesAgo'] = df['minutesAgo'].astype(int)
    df['eventCount'] = df['eventCount'].astype(int)

    # Initialize new columns for the event counts in each range
    ranges = ['0-5min', '5-10min', '10-15min']
    for time_range in ranges:
        df[time_range] = 0

    # Categorize events into the respective time ranges
    for index, row in df.iterrows():
        event_count = row['eventCount']
        minutes = row['minutesAgo']
        if minutes < 5:
            df.at[index, '0-5min'] += event_count
        elif 5 <= minutes < 10:
            df.at[index, '5-10min'] += event_count
        elif 10 <= minutes < 15:
            df.at[index, '10-15min'] += event_count

    # Reduce DataFrame to the relevant columns and rename 'eventName' to 'Events'
    final_df = df[['eventName', '0-5min', '5-10min', '10-15min']].rename(columns={'eventName': 'Events'})

    # Group by 'Events' and sum the counts for each time range
    final_df = final_df.groupby('Events')[['0-5min', '5-10min', '10-15min']].sum().reset_index()

    return final_df

def run_sample():
    """Runs the sample."""
    # TODO(developer): Replace this variable with your Google Analytics 4
    #  property ID before running the sample.
    property_id = "239560925"
    df = run_realtime_report(property_id)
    display(df)

# Run the sample
run_sample()

2.2.2


Unnamed: 0,Events,0-5min,5-10min,10-15min
0,first_visit,1,0,0
1,page_view,12,0,0
2,scroll,0,0,0
3,session_start,1,0,0


Since we want to find anomalies in the number of events, I thought that a standard deviation calculation would help us in that.
Thankfully, the pandas package enables us to calculate the standard pretty easily with the `.std()` method.
Thus, with some quickly AI-generated code, we are able to get the number of events that are **above** the upper standard deviation curve.
What about the lower standard deviation curve?

In [59]:
import pandas as pd
print(pd.__version__)
from google.analytics.data_v1beta import BetaAnalyticsDataClient
from google.analytics.data_v1beta.types import (
    Dimension,
    Metric,
    RunRealtimeReportRequest,
)

def run_realtime_report(property_id):
    """Runs a realtime report on a Google Analytics 4 property."""
    client = BetaAnalyticsDataClient()

    request = RunRealtimeReportRequest(
        property=f"properties/{property_id}",
        dimensions=[Dimension(name="eventName"), Dimension(name="minutesAgo")],
        metrics=[Metric(name="eventCount")],
    )
    response = client.run_realtime_report(request)

    # Extract data from the response
    rows = []
    for row in response.rows:
        dimension_values = [value.value for value in row.dimension_values]
        metric_values = [value.value for value in row.metric_values]
        rows.append(dimension_values + metric_values)

    # Create headers
    dimension_headers = [header.name for header in response.dimension_headers]
    metric_headers = [header.name for header in response.metric_headers]
    headers = dimension_headers + metric_headers

    # Create DataFrame
    df = pd.DataFrame(rows, columns=headers)

    # Convert minutesAgo to integer for easier comparison
    df['minutesAgo'] = df['minutesAgo'].astype(int)
    df['eventCount'] = df['eventCount'].astype(int)

    # Initialize new columns for the event counts in each range
    ranges = ['0-5min', '5-10min', '10-15min']
    for time_range in ranges:
        df[time_range] = 0

    # Categorize events into the respective time ranges
    for index, row in df.iterrows():
        event_count = row['eventCount']
        minutes = row['minutesAgo']
        if minutes < 5:
            df.at[index, '0-5min'] += event_count
        elif 5 <= minutes < 10:
            df.at[index, '5-10min'] += event_count
        elif 10 <= minutes < 15:
            df.at[index, '10-15min'] += event_count

    # Reduce DataFrame to the relevant columns and rename 'eventName' to 'Events'
    final_df = df[['eventName', '0-5min', '5-10min', '10-15min']].rename(columns={'eventName': 'Events'})

    # Group by 'Events' and sum the counts for each time range
    final_df = final_df.groupby('Events')[['0-5min', '5-10min', '10-15min']].sum().reset_index()

    # Calculate the standard deviation for the event counts in each time range
    std_dev_values = final_df[['0-5min', '5-10min', '10-15min']].std()

    # Initialize a dictionary to count events exceeding the standard deviation
    exceeding_events = {'Events': [], 'count_over_std_dev': []}

    # Count how many events are above the standard deviation
    for index, row in final_df.iterrows():
        exceed_count = sum(row[['0-5min', '5-10min', '10-15min']] > std_dev_values)
        exceeding_events['Events'].append(row['Events'])
        exceeding_events['count_over_std_dev'].append(exceed_count)

    # Create DataFrame for exceeding events count
    exceeding_events_df = pd.DataFrame(exceeding_events)

    # Return both DataFrames
    return final_df, exceeding_events_df

def run_sample():
    """Runs the sample."""
    # TODO(developer): Replace this variable with your Google Analytics 4
    #  property ID before running the sample.
    property_id = "239560925"
    final_df, exceeding_events_df = run_realtime_report(property_id)
    display(final_df)
    display(exceeding_events_df)

# Run the sample
run_sample()

2.2.2


Unnamed: 0,Events,0-5min,5-10min,10-15min
0,click,0,0,0
1,custom_click,0,0,0
2,first_visit,0,1,0
3,page_view,0,3,0
4,scroll,0,0,0
5,session_start,0,1,0
6,user_engagement,0,1,0


Unnamed: 0,Events,count_over_std_dev
0,click,0
1,custom_click,0
2,first_visit,0
3,page_view,1
4,scroll,0
5,session_start,0
6,user_engagement,0


Same method as for the event count above the upper standard deviation curve, the following code adds a further table to show the event count **below** the lower standard deviation count.

In [61]:
import pandas as pd
print(pd.__version__)
from google.analytics.data_v1beta import BetaAnalyticsDataClient
from google.analytics.data_v1beta.types import (
    Dimension,
    Metric,
    RunRealtimeReportRequest,
)

def run_realtime_report(property_id):
    """Runs a realtime report on a Google Analytics 4 property."""
    client = BetaAnalyticsDataClient()

    request = RunRealtimeReportRequest(
        property=f"properties/{property_id}",
        dimensions=[Dimension(name="eventName"), Dimension(name="minutesAgo")],
        metrics=[Metric(name="eventCount")],
    )
    response = client.run_realtime_report(request)

    # Extract data from the response
    rows = []
    for row in response.rows:
        dimension_values = [value.value for value in row.dimension_values]
        metric_values = [value.value for value in row.metric_values]
        rows.append(dimension_values + metric_values)

    # Create headers
    dimension_headers = [header.name for header in response.dimension_headers]
    metric_headers = [header.name for header in response.metric_headers]
    headers = dimension_headers + metric_headers

    # Create DataFrame
    df = pd.DataFrame(rows, columns=headers)

    # Convert minutesAgo to integer for easier comparison
    df['minutesAgo'] = df['minutesAgo'].astype(int)
    df['eventCount'] = df['eventCount'].astype(int)

    # Initialize new columns for the event counts in each range
    ranges = ['0-5min', '5-10min', '10-15min']
    for time_range in ranges:
        df[time_range] = 0

    # Categorize events into the respective time ranges
    for index, row in df.iterrows():
        event_count = row['eventCount']
        minutes = row['minutesAgo']
        if minutes < 5:
            df.at[index, '0-5min'] += event_count
        elif 5 <= minutes < 10:
            df.at[index, '5-10min'] += event_count
        elif 10 <= minutes < 15:
            df.at[index, '10-15min'] += event_count

    # Reduce DataFrame to the relevant columns and rename 'eventName' to 'Events'
    final_df = df[['eventName', '0-5min', '5-10min', '10-15min']].rename(columns={'eventName': 'Events'})

    # Group by 'Events' and sum the counts for each time range
    final_df = final_df.groupby('Events')[['0-5min', '5-10min', '10-15min']].sum().reset_index()

    # Calculate the standard deviation for the event counts in each time range
    std_dev_values = final_df[['0-5min', '5-10min', '10-15min']].std()

    # Initialize dictionaries to count events exceeding and below the standard deviation
    exceeding_events = {'Events': [], 'count_over_std_dev': []}
    below_events = {'Events': [], 'count_below_std_dev': []}

    # Count how many events are above and below the standard deviation
    for index, row in final_df.iterrows():
        exceed_count = sum(row[['0-5min', '5-10min', '10-15min']] > std_dev_values)
        below_count = sum(row[['0-5min', '5-10min', '10-15min']] < std_dev_values)
        exceeding_events['Events'].append(row['Events'])
        exceeding_events['count_over_std_dev'].append(exceed_count)
        below_events['Events'].append(row['Events'])
        below_events['count_below_std_dev'].append(below_count)

    # Create DataFrames for exceeding and below events count
    exceeding_events_df = pd.DataFrame(exceeding_events)
    below_events_df = pd.DataFrame(below_events)

    # Return all DataFrames
    return final_df, exceeding_events_df, below_events_df

def run_sample():
    """Runs the sample."""
    # TODO(developer): Replace this variable with your Google Analytics 4
    #  property ID before running the sample.
    property_id = "239560925"
    final_df, exceeding_events_df, below_events_df = run_realtime_report(property_id)
    display(final_df)
    display(exceeding_events_df)
    display(below_events_df)

# Run the sample
run_sample()

2.2.2


Unnamed: 0,Events,0-5min,5-10min,10-15min
0,click,0,0,0
1,custom_click,0,0,0
2,first_visit,0,0,1
3,page_view,0,1,2
4,scroll,0,0,0
5,session_start,0,0,1
6,user_engagement,0,1,0


Unnamed: 0,Events,count_over_std_dev
0,click,0
1,custom_click,0
2,first_visit,1
3,page_view,2
4,scroll,0
5,session_start,1
6,user_engagement,1


Unnamed: 0,Events,count_below_std_dev
0,click,2
1,custom_click,2
2,first_visit,1
3,page_view,0
4,scroll,2
5,session_start,1
6,user_engagement,1


Here follows another trial with larger time slots. Unfortunately, the event count was not realistic enough. So here further development time will be needed for these larger time slots. 

In [62]:
import pandas as pd
print(pd.__version__)
from google.analytics.data_v1beta import BetaAnalyticsDataClient
from google.analytics.data_v1beta.types import (
    Dimension,
    Metric,
    RunRealtimeReportRequest,
)

def run_realtime_report(property_id):
    """Runs a realtime report on a Google Analytics 4 property."""
    client = BetaAnalyticsDataClient()

    request = RunRealtimeReportRequest(
        property=f"properties/{property_id}",
        dimensions=[Dimension(name="eventName"), Dimension(name="minutesAgo")],
        metrics=[Metric(name="eventCount")],
    )
    response = client.run_realtime_report(request)

    # Extract data from the response
    rows = []
    for row in response.rows:
        dimension_values = [value.value for value in row.dimension_values]
        metric_values = [value.value for value in row.metric_values]
        rows.append(dimension_values + metric_values)

    # Create headers
    dimension_headers = [header.name for header in response.dimension_headers]
    metric_headers = [header.name for header in response.metric_headers]
    headers = dimension_headers + metric_headers

    # Create DataFrame
    df = pd.DataFrame(rows, columns=headers)

    # Convert minutesAgo to integer for easier comparison
    df['minutesAgo'] = df['minutesAgo'].astype(int)
    df['eventCount'] = df['eventCount'].astype(int)

    # Initialize new columns for the event counts in each range
    ranges = ['0-30min', '30-60min', '60-90min']
    for time_range in ranges:
        df[time_range] = 0

    # Categorize events into the respective time ranges
    for index, row in df.iterrows():
        event_count = row['eventCount']
        minutes = row['minutesAgo']
        if minutes < 5:
            df.at[index, '0-30min'] += event_count
        elif 5 <= minutes < 10:
            df.at[index, '30-60min'] += event_count
        elif 10 <= minutes < 15:
            df.at[index, '60-90min'] += event_count

    # Reduce DataFrame to the relevant columns and rename 'eventName' to 'Events'
    final_df = df[['eventName', '0-30min', '30-60min', '60-90min']].rename(columns={'eventName': 'Events'})

    # Group by 'Events' and sum the counts for each time range
    final_df = final_df.groupby('Events')[['0-30min', '30-60min', '60-90min']].sum().reset_index()

    # Calculate the standard deviation for the event counts in each time range
    std_dev_values = final_df[['0-30min', '30-60min', '60-90min']].std()

    # Initialize dictionaries to count events exceeding and below the standard deviation
    exceeding_events = {'Events': [], 'count_over_std_dev': []}
    below_events = {'Events': [], 'count_below_std_dev': []}

    # Count how many events are above and below the standard deviation
    for index, row in final_df.iterrows():
        exceed_count = sum(row[['0-30min', '30-60min', '60-90min']] > std_dev_values)
        below_count = sum(row[['0-30min', '30-60min', '60-90min']] < std_dev_values)
        exceeding_events['Events'].append(row['Events'])
        exceeding_events['count_over_std_dev'].append(exceed_count)
        below_events['Events'].append(row['Events'])
        below_events['count_below_std_dev'].append(below_count)

    # Create DataFrames for exceeding and below events count
    exceeding_events_df = pd.DataFrame(exceeding_events)
    below_events_df = pd.DataFrame(below_events)

    # Return all DataFrames
    return final_df, exceeding_events_df, below_events_df

def run_sample():
    """Runs the sample."""
    # TODO(developer): Replace this variable with your Google Analytics 4
    #  property ID before running the sample.
    property_id = "239560925"
    final_df, exceeding_events_df, below_events_df = run_realtime_report(property_id)
    display(final_df)
    display(exceeding_events_df)
    display(below_events_df)

# Run the sample
run_sample()

2.2.2


Unnamed: 0,Events,0-30min,30-60min,60-90min
0,click,0,0,0
1,custom_click,0,0,0
2,first_visit,0,0,1
3,page_view,0,0,3
4,scroll,0,0,0
5,session_start,0,0,1
6,user_engagement,0,1,0


Unnamed: 0,Events,count_over_std_dev
0,click,0
1,custom_click,0
2,first_visit,0
3,page_view,1
4,scroll,0
5,session_start,0
6,user_engagement,1


Unnamed: 0,Events,count_below_std_dev
0,click,2
1,custom_click,2
2,first_visit,2
3,page_view,1
4,scroll,2
5,session_start,2
6,user_engagement,1
