# Iterative labelling
I initially labelled GTM requests based on the presence of the string `/g/collect` in the request URL. Running the WebGraph classification pipeline flagged other, similar GTM tracking requests. I use this notebook to iteratively improve on my original labelling.

In [1]:
import pandas as pd

# Load the labelled requests; the later cells copy this frame and adjust its
# `label` column.
# NOTE(review): the frames displayed further down carry `Unnamed: 0.1` and
# `Unnamed: 0` columns, which suggests the CSV was written with its index more
# than once -- confirm, and consider whether those columns are needed.
df = pd.read_csv("labelled.csv")

## Only SGTM requests
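Here I remove the positive label from all generic GTM requests (those sent to Google's own domains), so that only SGTM requests remain labelled positive. No rows are dropped; only the `label` column changes.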

Results: Too few SGTM examples to train on - all SGTM requests were False Negatives

In [5]:
def is_google(df):
    """Flag rows whose request URL mentions a known Google Analytics domain.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string column ``name`` holding the request URL.

    Returns
    -------
    pandas.Series
        Boolean Series aligned with ``df``: True where ``name`` contains one
        of the listed domains as a literal substring, False otherwise
        (including rows where ``name`` is missing).
    """
    import re  # local import so the cell does not depend on the import cell

    google_domains = ['region1.google-analytics.com', 'region1.analytics.google.com', 'stats.g.doubleclick.net', 'www.google-analytics.com', 'analytics.google.com']
    # Escape each domain before building the alternation: an unescaped "."
    # matches any character, so e.g. "analytics-google.com" would otherwise
    # be treated as a Google domain.
    pattern = "|".join(re.escape(domain) for domain in google_domains)
    # na=False keeps the result strictly boolean, so callers can safely apply
    # `~` and `&` even when some rows have no URL.
    values = df["name"].str.contains(pattern, regex=True, na=False)
    return values

# Work on a copy so the labels in `df` stay untouched for the later cells.
df_sgtm = df.copy()

# A request stays positive only when it does not match the Google domain list
# checked by is_google AND it was already labelled positive.
not_google = ~is_google(df)
df_sgtm["label"] = not_google & df["label"]

# Show the rows that are still labelled positive.
df_sgtm.loc[df_sgtm["label"]]

Unnamed: 0.1,Unnamed: 0,visit_id,top_level_url,name,braveblock_label,label
6121,128,6.586055e+13,https://www.procore.com/en-gb,https://sgtm.procore.com/g/collect?v=2&tid=G-P...,False,True
14201,48,1.511883e+14,https://www.asu.edu/,https://sstm.asu.edu/g/collect?v=2&tid=G-TEHJR...,False,True
14307,66,1.532234e+14,https://www.invisalign.com/,https://collect.invisalign.com/g/collect?v=2&t...,False,True
16006,116,1.729025e+14,https://www.hubspot.com/,https://sgtm-amer.hubspot.com/g/collect?v=2&ti...,False,True
18782,136,1.899881e+14,https://asana.com/,https://t-antenna.asana.com/g/collect?v=2&tid=...,False,True
...,...,...,...,...,...,...
809165,85,8.792294e+15,https://www.centurylink.com/,https://ssgtm.centurylink.com/g/collect?v=2&ti...,False,True
817009,163,8.864015e+15,https://www.livesport.cz/,https://sgtm.livesport.cz/g/collect?v=2&tid=G-...,False,True
817516,128,8.865186e+15,https://www.mobile.de/,https://tagging.mobile.de/g/collect?v=2&tid=G-...,False,True
821417,106,8.921257e+15,https://www.topuniversities.com/,https://ssgtm.topuniversities.com/g/collect?v=...,False,True


In [7]:
# Save the SGTM-only labelling. to_csv is called with its defaults, so the row
# index is written to the file as an extra, unnamed first column.
df_sgtm.to_csv("labelled-sgtm.csv")
# Drop the reference to the relabelled copy; the cells below start again from `df`.
del df_sgtm

## Iterative labelling
### Iteration 1
Add positive labels to requests whose URL contains `/j/collect`.

In [3]:
# Rows whose request URL contains "/j/collect"; computed once and reused for
# both the relabelling and the preview below.
has_j_collect = df["name"].str.contains("/j/collect")

# Copy the original frame and mark those rows positive in addition to the
# rows that were already positive.
df_iter1 = df.copy()
df_iter1["label"] = df["label"] | has_j_collect
df_iter1.to_csv("labelled-iter1.csv")

# Preview the "/j/collect" rows with their updated labels.
df_iter1[has_j_collect]

Unnamed: 0.1,Unnamed: 0,visit_id,top_level_url,name,braveblock_label,label
227,12,1.855331e+12,https://www.hdtube.porn/,https://www.google-analytics.com/j/collect?v=1...,True,True
338,29,4.494398e+12,https://www.mass.gov/,https://www.google-analytics.com/j/collect?v=1...,True,True
709,158,9.195395e+12,https://nyahentai.re/,https://www.google-analytics.com/j/collect?v=1...,True,True
2295,72,3.713770e+13,https://javtiful.com/main,https://www.google-analytics.com/j/collect?v=1...,True,True
3642,157,5.105575e+13,https://corp.world.co.jp/,https://www.google-analytics.com/j/collect?v=1...,True,True
...,...,...,...,...,...,...
826907,56,8.961515e+15,https://www.rackspace.com/,https://www.google-analytics.com/j/collect?v=1...,True,True
826940,96,8.961515e+15,https://www.rackspace.com/,https://www.google-analytics.com/j/collect?v=1...,True,True
829975,13,8.996962e+15,http://livedns.co.uk/,https://www.google-analytics.com/j/collect?v=1...,True,True
830095,39,9.000427e+15,https://kubernetes.io/,https://www.google-analytics.com/j/collect?v=1...,True,True
