In [13]:
import sklearn


import pandas as pd
from os import path

#directory = "results-gtm"
#directory = "results-sgtm"
#directory = "results-iter1"
#directory = "results-sgtm-smote"
#directory = "../out-content/results"
directory = "results-iter2"

columns = ["label", "prediction", "url", "visit_id"]

# The ML model output files use a weird separator: " |$| ".
# Written as a raw string so the backslashes reach the regex engine unchanged;
# the non-raw form relied on invalid string escapes ("\ ", "\|", "\$"), which
# newer Python versions warn about.
field_separator = r"\ \|\$\|\ "

# I ran 10-fold cross validation so there are 10 different files for test data predictions.
# Each fold is read into its own frame and all folds are concatenated once at the
# end instead of re-copying the accumulated frame on every iteration.
fold_frames = []
for i in range(10):
    temp_df = pd.read_table(path.join(directory, f"tp_{i}"), header=None, sep=field_separator, names=columns, engine='python')
    fold_frames.append(temp_df)

# The per-file row index is kept as-is (no ignore_index), so index values repeat across folds.
df = pd.concat(fold_frames)


  return func(*args, **kwargs)


## False Positives analysis
We want to check whether our model can find other server-side tracker requests (in addition to GTM)

In [14]:
# False positives: rows the model flagged (prediction is True) although the
# ground-truth label is False.
false_positives = df[df["prediction"] & ~df["label"]]

# False positives where url contains "/j/collect" (that seems to be an alternative of "/g/collect")
#false_positives = df[df["prediction"] & ~df["label"] & df["url"].str.contains("/j/collect") ]

# `fn` was the original (misleading) name for this selection; it stays bound
# so that any code still referring to it keeps working.
fn = false_positives

false_positives.to_csv("temp_df.csv")
false_positives

Unnamed: 0,label,prediction,url,visit_id
1102,False,True,https://ml314.com/utsync.ashx?pub=&adv=&et=0&e...,65860553664439
1107,False,True,https://www.google.ee/ads/ga-audiences?v=1&t=s...,65860553664439
1111,False,True,https://www.google.ee/ads/ga-audiences?v=1&t=s...,65860553664439
1112,False,True,https://googleads.g.doubleclick.net/pagead/vie...,65860553664439
1113,False,True,https://googleads.g.doubleclick.net/pagead/vie...,65860553664439
...,...,...,...,...
67583,False,True,https://ingest.quantummetric.com/horizon/lumen...,8792294440948776
67584,False,True,https://ingest.quantummetric.com/horizon/lumen...,8792294440948776
67858,False,True,https://sb.scorecardresearch.com/b?c1=2&c2=690...,8889417237623677
67859,False,True,https://sb.scorecardresearch.com/b?c1=2&c2=690...,8889417237623677


## All Server-side GTM predictions

In [24]:
def is_google(df):
    """Flag rows whose ``url`` contains one of the hard-coded Google domains.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string column named ``url``.

    Returns
    -------
    pandas.Series
        Boolean Series aligned with ``df``: True where the URL contains any of
        the listed domains as a literal substring, False otherwise (including
        rows whose URL is missing).
    """
    import re  # local import: only needed to escape the domain literals below

    google_domains = ['region1.google-analytics.com', 'region1.analytics.google.com', 'stats.g.doubleclick.net', 'www.google-analytics.com', 'analytics.google.com']
    # Escape each domain so that "." matches a literal dot rather than any
    # character (unescaped, "www.google-analytics.com" would also match
    # e.g. "wwwXgoogle-analytics.com").
    pattern = "|".join(re.escape(domain) for domain in google_domains)
    # na=False keeps the result strictly boolean so callers can negate it with ~.
    values = df["url"].str.contains(pattern, regex=True, na=False)
    return values

# Keep requests whose URL contains "/g/collect" and that is_google() does not flag.
collect_mask = df["url"].str.contains('/g/collect')
non_google_mask = ~is_google(df)
df_sgtm = df[collect_mask & non_google_mask]
#df_sgtm.to_csv("temp_df.csv")

In [25]:
# Tally how many of the selected "/g/collect" requests the model predicted True vs. False.
df_sgtm.loc[:, "prediction"].value_counts()


False    161
True       3
Name: prediction, dtype: int64

## Aggregated statistics
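Classification reports (precision, recall, F1-score and support) computed from the WebGraph ML model outputs, one report per finished `results-*` directory.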

In [22]:
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
import os


def report_to_latex(report, table_name):
    """Parse the text of sklearn.metrics.classification_report into a LaTeX table.

    Code adapted from: https://gist.github.com/Lorenzoantonelli/40454798ae53386a1d5b9c8bb60664d5

    Parameters
    ----------
    report : str
        Plain-text report. After removing one leading and one trailing newline,
        the first two lines (column header and blank line) are skipped, the
        last three lines are treated as summary rows and the line before them
        as a separator.
    table_name : str
        Inserted verbatim into the table caption and label. No LaTeX escaping
        is applied to it or to the class labels.

    Returns
    -------
    str
        A ``table`` environment containing a five-column ``tabular``: one row
        per class, an empty spacer row, then the summary rows.
    """

    # startswith/endswith instead of report[0] / report[-1]: an empty report
    # must not raise IndexError.
    if report.startswith('\n'):
        report = report[1:]
    if report.endswith('\n'):
        report = report[:-1]

    lines = report.split('\n')

    header = ["\\begin{table}",
              f"\\caption{{Classification Report for {table_name}}}",
              f"\\label{{table:classification:{table_name}}}",
              "\\centering",
              "\\begin{tabular}{r r r r r}",
              "& Precision & Recall & F-score & Support",
              "\\\\"]

    body = []
    for line in lines[2:-4]:
        # Split off the four numeric columns from the right so that a class
        # label containing spaces stays in one cell instead of the whole row
        # being dropped for having more than five tokens.
        row = line.strip().rsplit(None, 4)
        if len(row) == 5:
            body.append(" & ".join(row) + "\\\\")

    # Empty row separating the per-class rows from the summary rows.
    body.append("\\\\")

    footer = []
    for line in lines[-3:]:
        row = line.split()
        if len(row) == 3:
            # Three tokens, e.g. "accuracy <score> <support>": the first two
            # numeric columns are left empty.
            footer.append("{} & & & {} & {}\\\\".format(*row))
        elif len(row) == 6:
            # Two-word heading (e.g. "macro avg") followed by four values.
            footer.append("{} {} & {} & {} & {} & {}\\\\".format(*row))

    footer.extend(["\\end{tabular}", "\\end{table}"])

    latex_table = '\n'.join(header + body + footer)

    return latex_table

# Collect every sub-directory of the working directory whose name starts with
# "results-". The scandir iterator is used as a context manager so it is closed
# as soon as the listing has been read.
with os.scandir(".") as entries:
    result_dirs = [entry.name for entry in entries if entry.is_dir() and entry.name.startswith("results-")]

columns = ["label", "prediction", "url", "visit_id"]

# The ML model output files use a weird separator: " |$| ".
# Written as a raw string so the backslashes reach the regex engine unchanged;
# the non-raw form relied on invalid string escapes, which newer Python
# versions warn about.
field_separator = r"\ \|\$\|\ "

df_dict = {}
for dirname in result_dirs:
    # Skip unfinished ML jobs
    if not os.path.isfile(os.path.join(dirname, "tp_9")):
        continue

    # I ran 10-fold cross validation so there are 10 different files for test data predictions.
    # Only the two columns needed for the report are loaded; the folds are
    # concatenated once instead of re-copying the accumulated frame per fold.
    fold_frames = [
        pd.read_table(os.path.join(dirname, f"tp_{i}"), header=None, sep=field_separator, names=columns, engine='python', usecols=["label", "prediction"])
        for i in range(10)
    ]

    # NOTE(review): this rebinds the notebook-level `df` to a two-column frame
    # (no "url" / "visit_id"). The name is kept for compatibility, but cells
    # that read df["url"] must not be run after this one without reloading.
    df = pd.concat(fold_frames)

    #df_dict[dirname] = df.copy()


    print("#######################################################################################################################################")
    print(dirname)
    results = classification_report(y_true=df["label"].to_numpy(), y_pred=df["prediction"].to_numpy())
    #print(report_to_latex(results, dirname))
    print(results)

#######################################################################################################################################
results-sgtm-smote
              precision    recall  f1-score   support

       False       1.00      1.00      1.00    678328
        True       0.52      0.16      0.24       164

    accuracy                           1.00    678492
   macro avg       0.76      0.58      0.62    678492
weighted avg       1.00      1.00      1.00    678492

#######################################################################################################################################
results-gtm
              precision    recall  f1-score   support

       False       0.99      1.00      1.00    765544
        True       0.59      0.32      0.42      6478

    accuracy                           0.99    772022
   macro avg       0.79      0.66      0.71    772022
weighted avg       0.99      0.99      0.99    772022

###########################################