# Preparation and Imports


In [25]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
import glob
import os
from common import *
import json
from datetime import datetime
from matplotlib import ticker
from matplotlib.colors import LogNorm, Normalize
import seaborn as sns



# Load and Prepare Data

## Website Lists

### Tranco


In [26]:
# Read the Tranco ranking (the file has no header row, so column names are
# supplied here) and keep only the first 100,000 entries.
tranco = pd.read_csv(
    'tranco_top-1m_N7QVW_2022-10-07.csv',
    header=None,
    names=['pagerank', 'domain'],
).head(100000)
tranco


Unnamed: 0,pagerank,domain
0,1,google.com
1,2,gtld-servers.net
2,3,youtube.com
3,4,facebook.com
4,5,microsoft.com
...,...,...
99995,99996,mrbasic.com
99996,99997,chartmogul.com
99997,99998,blogpeople.net
99998,99999,thevore.com


### Crawl Domains


In [27]:
# Single-column list of crawl domains; the file has no header row, so the
# column name is supplied explicitly.
crawlDomains = pd.read_csv('tranco_top-100k_N7QVW_2022-10-07_with_cmp',
                           header=None,
                           names=['domain'])
crawlDomains


Unnamed: 0,domain
0,google.com
1,facebook.com
2,netflix.com
3,twitter.com
4,youtube.com
...,...
11952,chinataiwan.org
11953,messybusyrental.com
11954,atterley.com
11955,thevore.com


## Status Reports

### Filter Crawl


In [28]:
# Attach the Tranco rank to every filter-crawl status row. The join keys are
# left implicit: pandas merges on the columns the two frames have in common.
filterCrawlStatus = pd.merge(
    pd.read_csv('crawl_results_filter/filterCrawlStatusReport.csv'),
    tranco,
)
filterCrawlStatus


Unnamed: 0,domain,status,pagerank
0,google.com,compatible CMP detected,1
1,gtld-servers.net,not reached,2
2,akamaiedge.net,not reached,6
3,facebook.com,compatible CMP detected,4
4,netflix.com,compatible CMP detected,7
...,...,...,...
99995,chartmogul.com,no CMP detected,99997
99996,sfmlab.com,no CMP detected,100000
99997,shou.edu.cn,no CMP detected,99984
99998,bart.nl,no CMP detected,99988


### Wrongly Tagged Domains

Some domains were wrongly tagged as "with CMP" by the filter crawl,
so they should not have been part of the measurement crawl.

In [29]:
# Domains the filter crawl wrongly tagged as having a CMP (column `pageUrl`).
wronglyTaggedDomains = pd.read_csv(
    'filterCrawlWronglyTaggedAsWithCmp.csv',
)
wronglyTaggedDomains

Unnamed: 0,pageUrl
0,qq.com
1,hp.com
2,flipkart.com
3,eventbrite.com
4,line.me
...,...
2705,xumk.cn
2706,toshiba.eu
2707,okx1.biz
2708,xfrb.com.cn


### Fingerprinting Crawl


In [30]:
# Prefix of the raw `cmp` value -> simplified CMP label. Values that match no
# prefix keep their raw name.
CMP_PREFIX_LABELS = {
    'trustarc': 'trustarc',
    'onetrust': 'onetrust',
    'sourcepoint': 'sourcepoint',
    'yahoo': 'yahoo',
    'Evidon': 'Evidon',
    'cookieinformation': 'cookieinformation',
    'quantcast': 'quantcast',
    'paypal': 'paypal',
    'google': 'google',
    'bbc_fc': 'funding choices',
}

# Raw crawler `error` value -> simplified status. Values not listed here are
# carried over unchanged.
ERROR_SIMPLIFICATION = {
    'Successful': 'successful',
    'argtype error': 'other crawler error',
    'unspecified error': 'other crawler error',
    'domain skipped': 'other crawler error',
}


def simplify_cmp(cmp):
    """Collapse CMP variant names onto one label per CMP.

    Parameters
    ----------
    cmp : pd.Series
        Raw CMP names; missing values are allowed.

    Returns
    -------
    pd.Series
        Copy of ``cmp`` in which every value starting with a key of
        ``CMP_PREFIX_LABELS`` is replaced by the mapped label. Missing values
        and unmatched names are returned unchanged.
    """
    simplified = cmp.copy()
    for prefix, label in CMP_PREFIX_LABELS.items():
        # na=False makes missing CMP names count as "no match" instead of
        # producing NaN entries in the boolean mask.
        simplified.loc[cmp.str.startswith(prefix, na=False)] = label
    return simplified


# Load the fingerprinting-crawl status report and attach the Tranco rank
# (merge joins on the columns both frames share).
statusReport = pd.read_json(
    'crawl_results_fingerprinting/statusReport.json.gz').merge(tranco)

statusReport['wronglyTagged'] = statusReport['domain'].isin(
    wronglyTaggedDomains['pageUrl'])

statusReport['start'] = pd.to_datetime(statusReport['start'])
statusReport['end'] = pd.to_datetime(statusReport['end'])

# Binary outcome derived from the crawler's `hasErrors` flag.
statusReport['status'] = 'error in fingerprinting crawl'
statusReport.loc[~statusReport['hasErrors'], 'status'] = 'successful'

# Finer-grained outcome: the crawler's error string with rare crawler errors
# folded into a single bucket.
statusReport['statusSimplified'] = statusReport['error'].replace(
    ERROR_SIMPLIFICATION)

statusReport['cmp_simplified'] = simplify_cmp(statusReport['cmp'])

# Last dot-separated label of the domain.
statusReport['tld'] = statusReport['domain'].str.split('.').str[-1]

statusReport


Unnamed: 0,domain,reachedDomain,start,end,hasErrors,error,originalFile,cmp,cmps,pagerank,wronglyTagged,status,statusSimplified,cmp_simplified,tld
0,yahoo.com,yahoo.com,2022-11-22 09:38:50.097000+00:00,2022-11-22 09:42:56.915000+00:00,False,Successful,results.json_00000000-00000009.json,yahoo_popup,"[yahoo_popup, yahoo_consent]",16,False,successful,successful,yahoo,com
1,google.com,google.com,2022-11-22 09:38:50.069000+00:00,2022-11-22 09:42:59.023000+00:00,False,Successful,results.json_00000000-00000009.json,google_popup,[google_popup],1,False,successful,successful,google,com
2,facebook.com,facebook.com,2022-11-22 09:38:50.095000+00:00,2022-11-22 09:42:59.675000+00:00,False,Successful,results.json_00000000-00000009.json,Facebook,[Facebook],4,False,successful,successful,Facebook,com
3,googlevideo.com,google.com,2022-11-22 09:38:50.099000+00:00,2022-11-22 09:43:18.030000+00:00,False,Successful,results.json_00000000-00000009.json,google_popup,[google_popup],36,False,successful,successful,google,com
4,cloudflare.com,cloudflare.com,2022-11-22 09:38:50.097000+00:00,2022-11-22 09:43:57.246000+00:00,False,Successful,results.json_00000000-00000009.json,onetrust_banner,"[onetrust_banner, onetrust_pcpanel]",14,False,successful,successful,onetrust,com
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11952,fdocuments.in,fdocuments.in,2022-12-05 07:46:58.361000+00:00,2022-12-05 07:53:54.555000+00:00,False,Successful,results.json_00012110-00012114.json,bbc_fc,[bbc_fc],96374,False,successful,successful,funding choices,in
11953,dimelo.com,,2022-12-05 07:46:58.364000+00:00,2022-12-05 07:56:30.764000+00:00,True,unspecified error,results.json_00012110-00012114.json,,,99618,False,error in fingerprinting crawl,other crawler error,,com
11954,srv00.com,srv00.com,2022-12-05 07:46:58.362000+00:00,2022-12-05 08:01:08.431000+00:00,True,no CMP,results.json_00012110-00012114.json,,,97483,False,error in fingerprinting crawl,no CMP,,com
11955,cgmagonline.com,,2022-12-05 07:46:58.363000+00:00,2022-12-05 08:06:21.590000+00:00,True,timeout error,results.json_00012110-00012114.json,,,97805,False,error in fingerprinting crawl,timeout error,,com


### Wrongly Tagged Insights and Fix

In [31]:
# Number of domains per crawler error, split by the wronglyTagged flag,
# with row/column totals.
statusReport.pivot_table(
    index='error',
    columns='wronglyTagged',
    values='domain',
    aggfunc='count',
    margins=True,
)

wronglyTagged,False,True,All
error,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Successful,5663,155,5818
argtype error,26,3,29
domain skipped,21,3,24
no CMP,1802,2161,3963
timeout error,788,39,827
unreachable,475,142,617
unspecified error,472,207,679
All,9247,2710,11957


In [32]:
# Wrongly tagged domains that did not crawl successfully get a dedicated label
# in both the raw and the simplified error column.
unintendedScan = statusReport['wronglyTagged'] & (statusReport['error'] != 'Successful')
statusReport.loc[unintendedScan, ['error', 'statusSimplified']] = 'scanned unintentionally'

In [33]:
# Number of domains per (current) error label, split by the wronglyTagged
# flag, with row/column totals.
statusReport.pivot_table(
    index='error',
    columns='wronglyTagged',
    values='domain',
    aggfunc='count',
    margins=True,
)

wronglyTagged,False,True,All
error,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Successful,5663.0,155.0,5818
argtype error,26.0,,26
domain skipped,21.0,,21
no CMP,1802.0,,1802
scanned unintentionally,,2555.0,2555
timeout error,788.0,,788
unreachable,475.0,,475
unspecified error,472.0,,472
All,9247.0,2710.0,11957


In [34]:
# Number of status-report rows whose hasErrors flag is not set.
succesfulDomains = len(statusReport.loc[~statusReport['hasErrors']])
succesfulDomains

5818

## Flows


In [35]:
flowReport = pd.read_json(
    'crawl_results_fingerprinting/summarizedFlowReport.json.gz'
).astype({'page': 'int'})

# Flows without a destination host get a placeholder so they can be
# categorised here and dropped further down.
missingDestination = flowReport['destinationHost'].isna()
flowReport.loc[missingDestination, 'destinationHost'] = 'UNKNOWN'

# A flow is first-party when it is sent to the host of the visited page.
sameHost = flowReport['pageHost'] == flowReport['destinationHost']

flowReport['destinationSimplified'] = flowReport['destinationHost'].mask(
    sameHost, 'first-party')
flowReport['isFirstParty'] = sameHost

flowReport['destinationCategory'] = 'third-party'
flowReport.loc[sameHost, 'destinationCategory'] = 'first-party'
flowReport.loc[flowReport['destinationHost'] == 'UNKNOWN',
               'destinationCategory'] = 'UNKNOWN'

flowReport['transmission'] = (
    flowReport['source'] + ':' + flowReport['destinationHost']
)

# Group raw sink names by prefix; sinks matching no prefix stay 'other'.
flowReport['sinkSimplified'] = 'other'
for sinkPrefix, sinkLabel in (
    ('XML', 'XML HTTP Request'),
    ('fetch', 'Fetch API'),
    ('img.', 'image source'),
    ('WebSocket', 'Web Socket'),
    ('iframe', 'iFrame source'),
    ('document.cookie', 'Cookie'),
    ('script', 'script source'),
    ('navigator.sendBeacon', 'sendBeacon()'),
):
    flowReport.loc[flowReport['sink'].str.startswith(sinkPrefix),
                   'sinkSimplified'] = sinkLabel


# Exclude flows to malformed destination URLs
# Amount of those is negligible (see output of postprocessing for details)
before = len(flowReport)
print(f'Lines before excluding malformed destinations: {before}')
flowReport = flowReport[flowReport['destinationHost'] != 'UNKNOWN']
print(f'Lines after  excluding malformed destinations: {len(flowReport)}')
print(f'Difference between before and after: {before-len(flowReport)}')


flowReport


Lines before excluding malformed destinations: 1800846
Lines after  excluding malformed destinations: 1800119
Difference between before and after: 727


Unnamed: 0,domain,consentMode,page,pageHost,frameHost,destinationHost,source,sourceHost,sink,sinkHost,destinationSimplified,isFirstParty,destinationCategory,transmission,sinkSimplified
0,yahoo.com,acceptAll,2,yahoo.com,yahoo.com,yahoo.com,Screen.width,yimg.com,navigator.sendBeacon(url),yimg.com,first-party,True,first-party,Screen.width:yahoo.com,sendBeacon()
1,yahoo.com,acceptAll,2,yahoo.com,yahoo.com,yahoo.com,Screen.height,yimg.com,navigator.sendBeacon(url),yimg.com,first-party,True,first-party,Screen.height:yahoo.com,sendBeacon()
2,yahoo.com,acceptAll,2,yahoo.com,yahoo.com,yahoo.com,Screen.availWidth,yimg.com,navigator.sendBeacon(url),yimg.com,first-party,True,first-party,Screen.availWidth:yahoo.com,sendBeacon()
3,yahoo.com,acceptAll,2,yahoo.com,yahoo.com,yahoo.com,Screen.availHeight,yimg.com,navigator.sendBeacon(url),yimg.com,first-party,True,first-party,Screen.availHeight:yahoo.com,sendBeacon()
4,yahoo.com,acceptAll,2,yahoo.com,yahoo.com,yahoo.com,Screen.width,yimg.com,navigator.sendBeacon(url),yimg.com,first-party,True,first-party,Screen.width:yahoo.com,sendBeacon()
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1800841,fdocuments.in,doNothing,3,fdocuments.in,fdocuments.in,google-analytics.com,Screen.height,google-analytics.com,XMLHttpRequest.open(url),google-analytics.com,google-analytics.com,False,third-party,Screen.height:google-analytics.com,XML HTTP Request
1800842,fdocuments.in,doNothing,4,fdocuments.in,fdocuments.in,google-analytics.com,Navigator.language,google-analytics.com,XMLHttpRequest.open(url),google-analytics.com,google-analytics.com,False,third-party,Navigator.language:google-analytics.com,XML HTTP Request
1800843,fdocuments.in,doNothing,4,fdocuments.in,fdocuments.in,google-analytics.com,Screen.colorDepth,google-analytics.com,XMLHttpRequest.open(url),google-analytics.com,google-analytics.com,False,third-party,Screen.colorDepth:google-analytics.com,XML HTTP Request
1800844,fdocuments.in,doNothing,4,fdocuments.in,fdocuments.in,google-analytics.com,Screen.width,google-analytics.com,XMLHttpRequest.open(url),google-analytics.com,google-analytics.com,False,third-party,Screen.width:google-analytics.com,XML HTTP Request


In [37]:
# Collect, for every (domain, consent mode, destination host) triple, the
# distinct sources that flowed to that destination, in order of first
# appearance.
attribute_combinations = (
    flowReport
    .groupby(['domain', 'consentMode', 'destinationHost'])['source']
    .apply(lambda sources: sources.unique().tolist())
    .reset_index(name='attribute_combinations')
)

attribute_combinations

Unnamed: 0,domain,consentMode,destinationHost,attribute_combinations
0,01net.com,acceptAll,01net.com,[Navigator.userAgent]
1,01net.com,acceptAll,criteo.com,"[Navigator.language, Screen.width, Screen.height]"
2,01net.com,acceptAll,doubleclick.net,"[Navigator.userAgent, Screen.width, Screen.hei..."
3,01net.com,acceptAll,google-analytics.com,"[Navigator.language, Screen.width, Screen.heig..."
4,01net.com,acceptAll,google.com,"[Navigator.language, Screen.width, Screen.height]"
...,...,...,...,...
72382,zzounds.com,acceptAll,zzounds.com,"[Screen.width, Screen.height, Navigator.userAg..."
72383,zzounds.com,doNothing,google-analytics.com,"[Navigator.language, Screen.colorDepth, Screen..."
72384,zzounds.com,doNothing,zzounds.com,"[Screen.width, Screen.height, Navigator.userAg..."
72385,zzounds.com,rejectAll,google-analytics.com,"[Navigator.language, Screen.colorDepth, Screen..."


In [40]:
# entropy.csv is a copy of sum_entrpy-unique_combi.csv.
Entropy = pd.read_csv('entropy.csv')

Entropy

Unnamed: 0,Vectors,joint_H,joint_H_n,anon_set
0,AudioContext.baseLatency,2.459223,0.290849,243.806268
1,AudioContext.outputLatency,0.000000,0.000000,0.000000
2,AudioDestinationNode.maxChannelCount,0.229761,0.056211,5033.882353
3,AudioNode.channelCount,0.000000,0.000000,0.000000
4,AudioNode.numberOfInputs,0.000000,0.000000,0.000000
...,...,...,...,...
15234,Navigator.maxTouchPoints|AudioContext.baseLate...,14.045911,0.916522,2.084474
15235,Navigator.userAgent|Navigator.doNotTrack|HTMLC...,13.127296,0.874133,2.580077
15236,Screen.width|Screen.height|Navigator.platform|...,11.354211,0.796308,4.366122
15237,Navigator.platform|Navigator.appName|Screen.wi...,13.966075,0.913919,2.148585


In [None]:
def _attribute_key(text, separator):
    """Return an order-insensitive key for a delimited attribute list.

    Parameters
    ----------
    text : str
        Attribute names joined by ``separator``.
    separator : str
        Delimiter between attribute names.

    Returns
    -------
    frozenset of str
        The whitespace-stripped attribute names.
    """
    return frozenset(part.strip() for part in text.split(separator))


# Flatten each list of attributes into a lowercase, comma-separated string
# ("['A', 'B']" -> "a, b"). All steps are no-ops on already-flattened strings,
# so re-running this cell is safe.
attribute_combinations['attribute_combinations'] = (
    attribute_combinations['attribute_combinations']
    .astype(str)
    .str.replace(r'[\[\]]', '', regex=True)
    .str.replace("'", '', regex=False)
    .str.lower()
)

# Lowercase the Entropy vectors as well so both sides compare case-insensitively.
Entropy['Vectors'] = Entropy['Vectors'].str.lower()

# Index the normalised joint entropy by attribute set once, instead of
# rescanning the whole Entropy frame for every row. setdefault keeps the first
# row for a given attribute set, which is the row a sequential scan would find.
entropy_by_attributes = {}
for vector, joint_h_n in zip(Entropy['Vectors'], Entropy['joint_H_n']):
    if isinstance(vector, str):  # skip missing vectors
        entropy_by_attributes.setdefault(_attribute_key(vector, '|'), joint_h_n)

# Look up the entropy of each combination regardless of attribute order;
# combinations without an Entropy entry yield None here.
attribute_combinations['entropy'] = attribute_combinations['attribute_combinations'].apply(
    lambda combination: entropy_by_attributes.get(_attribute_key(combination, ','))
)

# Fill the NaN values in the entropy column with 0
attribute_combinations['entropy'] = attribute_combinations['entropy'].fillna(0)



In [None]:

# Write the frame to CSV without the index column.
attribute_combinations.to_csv(
    'attribute_combinations_entropyJoint_consent_banner.csv',
    index=False,
)

In [None]:
# Display the attribute_combinations frame as the cell output.
attribute_combinations