In [2]:
import pandas as pd
import os
from sklearn.metrics import confusion_matrix
import json
from urllib.parse import urlparse, urljoin

In [3]:
#Read labels from folder
LABEL_FOLDER = "labels_run5"
df = pd.DataFrame()
fnames = os.listdir(LABEL_FOLDER)
for fname in fnames:
    fpath = os.path.join(LABEL_FOLDER, fname)
    df_file = pd.read_csv(fpath)
    df = df.append(df_file)

In [4]:
#Logging to pickle format
df.to_pickle("oldlabels.pkl")

# Label analysis

We perform a comparison of the varios methods used for labelling:

1. Setter scripts
2. Cookiepedia
3. Our method (combination)

First, we get some overall statistics on the dataset. Then, we remove null values and get their distributions.

In [5]:
print("Number of sites:", len(df['visit_id'].unique()))
df = df[df['name'].notnull()]
print("Number of cookies:", len(df))

Number of sites: 8072
Number of cookies: 130841


In [6]:
print("Non-null labels", len(df))
print("Non-null Setter labels:", len(df[df['setter_label'].notnull()]), len(df[df['setter_label'].notnull()])/len(df) * 100, "%")
print("Non-null Cookiepedia labels:", len(df[df['declared_label'].notnull()]), len(df[df['declared_label'].notnull()])/len(df) * 100, "%")

Non-null labels 130841
Non-null Setter labels: 125793 96.14188213174769 %
Non-null Cookiepedia labels: 2875 2.197323468943221 %


We then look at the label distribution for each method. 

In [7]:
#df['setter_label'] = df['setter_label'].apply(lambda x: str(x))
df = df[(df['setter'].notnull()) & (df['domain'].notnull())]

In [8]:
print("Setter label distribution")
print(df[df['setter_label'].notnull()]['setter_label'].value_counts())
print("\nSetter label distribution (%)")
print(df[df['setter_label'].notnull()]['setter_label'].value_counts()/len(df[df['setter_label'].notnull()])*100)
print("\nCookiepedia label distribution")
print(df[df['declared_label'].notnull()]['declared_label'].value_counts())
print("\nCookiepedia label distribution (%)")
print(df[df['declared_label'].notnull()]['declared_label'].value_counts()/len(df[df['declared_label'].notnull()])*100)
print("\nAll label distribution")
print(df['label'].value_counts())
print("\nAll label distribution (%)")
print(df['label'].value_counts()/len(df['label'])*100)

Setter label distribution
True     84419
False    29584
Name: setter_label, dtype: int64

Setter label distribution (%)
True     74.049806
False    25.950194
Name: setter_label, dtype: float64

Cookiepedia label distribution
3.0    967
2.0    886
0.0    484
1.0    407
Name: declared_label, dtype: int64

Cookiepedia label distribution (%)
3.0    35.240525
2.0    32.288630
0.0    17.638484
1.0    14.832362
Name: declared_label, dtype: float64

All label distribution
Unknown     83762
Negative    29274
Positive      967
Name: label, dtype: int64

All label distribution (%)
Unknown     73.473505
Negative    25.678272
Positive     0.848223
Name: label, dtype: float64


In [9]:
df_analyze = df[(df['setter_label'].notnull()) & (df['declared_label'].notnull())]
df_analyze['setter_label'] = df_analyze['setter_label'].apply(lambda x: str(x))
print("Number of non-null:", len(df_analyze))
print(df_analyze['label'].value_counts())
print(df_analyze['label'].value_counts()/len(df_analyze)*100)
print(df['label'].value_counts()/len(df)*100)
df_analyze['declared_label_process'] = df_analyze['declared_label'].apply(lambda x: 'True' if x==3 else 'False')
mat = confusion_matrix(df_analyze['setter_label'], df_analyze['declared_label_process'], labels=['True', 'False'])

Number of non-null: 2744
Unknown     1054
Positive     967
Negative     723
Name: label, dtype: int64
Unknown     38.411079
Positive    35.240525
Negative    26.348397
Name: label, dtype: float64
Unknown     73.473505
Negative    25.678272
Positive     0.848223
Name: label, dtype: float64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_analyze['setter_label'] = df_analyze['setter_label'].apply(lambda x: str(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_analyze['declared_label_process'] = df_analyze['declared_label'].apply(lambda x: 'True' if x==3 else 'False')


Distribution of predictions by both methods. 

- We see that ~43% of the labels have agreement with both methods. 
- Large fraction (~55%) is predicted true by setter but false by Cookiepedia (19% is actually unknown). So, we have self declared non-ads that would be marked as ad by setter method.
- A very small fraction (~1%) is predicted by setter as non-ATS, but Cookiepedia as ATS. This is an actual conflict for us since we rely on these labels for the final label. We opt to give precedence to the Cookiepedia label.

In [10]:
print("Both predicted ATS:", mat[0][0], mat[0][0]/len(df_analyze)*100)
print("Both predicted Non-ATS:", mat[1][1], mat[1][1]/len(df_analyze)*100)
unknown_cookiepedia = df_analyze[(df_analyze['setter_label'] == 'True')]
print("Setter predicted ATS, Cookiepedia predicted Non-ATS:", mat[0][1], mat[0][1]/len(df_analyze)*100, 
      "Of which Unknown category:", len(unknown_cookiepedia), len(unknown_cookiepedia)/len(df_analyze)*100)
print("Setter predicted Non-ATS, Cookiepedia predicted ATS:", mat[1][0], mat[1][0]/len(df_analyze)*100)

Both predicted ATS: 657 23.943148688046648
Both predicted Non-ATS: 723 26.348396501457728
Setter predicted ATS, Cookiepedia predicted Non-ATS: 1054 38.411078717201164 Of which Unknown category: 1711 62.35422740524781
Setter predicted Non-ATS, Cookiepedia predicted ATS: 310 11.29737609329446


In [11]:
def check_same_setter(df):
    
    cookiepedia_labels = df['declared_label'].unique()
    cookiepedia_labels = [str(x) for x in cookiepedia_labels]
    categories = "||".join(cookiepedia_labels)
    df_proc = pd.DataFrame({'num_labels' : [len(cookiepedia_labels)], 'categories' : [categories]})
    return df_proc

df_samesetter = df_analyze.groupby(['visit_id', 'setter'], as_index=False).apply(check_same_setter)

In [12]:
print("Same setter, multiple labels:", len(df_samesetter[df_samesetter['num_labels'] > 1]), 
      len(df_samesetter[df_samesetter['num_labels'] > 1])/len(df_samesetter) * 100)

Same setter, multiple labels: 176 14.285714285714285


In [111]:
df_samesetter[df_samesetter['num_labels'] == 2]['categories'].unique()

array(['2.0||3.0', '0.0||3.0', '3.0||1.0', '2.0||1.0', '2.0||0.0',
       '0.0||1.0', '3.0||2.0', '1.0||3.0', '1.0||2.0', '1.0||0.0',
       '3.0||0.0', '0.0||2.0'], dtype=object)

## Resolve conflicts

In [13]:
#check common scripts
#Input is common.json -- which you get from find_common_scripts.py
with open("common.json") as f:
    common_data = json.loads(f.read())

In [14]:
common_data_rev = {}
for k, v in common_data.items():
    for url in v:
        common_data_rev[url] = k

In [23]:
# Get the hash of the script's content
df['setter_hash'] = df['setter'].apply(lambda x: common_data_rev.get(x))

In [24]:
df_setterhash = df[df['setter_hash'].notnull()]

In [37]:
def check_conflict(df):
    
    labels = df['label'].unique()
    #df_common = pd.DataFrame(columns=['conflict', 'num', 'setter_hash', 'name'])
    data = ('NA', len(df), df['setter_hash'].iloc[0], df['split_name'].iloc[0])
    if len(labels) > 1:
        #print(len(df))
        if len(labels) == 3:
            tag = "PNU"
        else:
            if ('Positive' in labels) and ('Negative' in labels):
                tag = "PN"
            if ('Positive' in labels) and ('Unknown' in labels):
                tag = "PU"
            if ('Unknown' in labels) and ('Negative' in labels):
                tag = "NU" 
        #df_common = pd.DataFrame([[tag, len(df), df['setter_hash'].iloc[0], df['name'].iloc[0]]], 
        #                         columns=['conflict', 'num', 'setter_hash', 'name'])
        data = (tag, len(df), df['setter_hash'].iloc[0], df['split_name'].iloc[0])
    return data

#Get split name (just cookie name without domain)
# Check conflicts -- all cookies with same name and same setter hash should have one label
df_setterhash['split_name'] = df_setterhash['name'].apply(lambda x : x.split('|$$|')[0])
df_common = df_setterhash.groupby(['setter_hash', 'split_name']).apply(check_conflict).reset_index()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_setterhash['split_name'] = df_setterhash['name'].apply(lambda x : x.split('|$$|')[0])


In [38]:
col_data = df_common[0].tolist()
df_conflict = pd.DataFrame(col_data, columns=['conflict', 'num', 'setter_hash', 'split_name'])

In [39]:
df_conflict

Unnamed: 0,conflict,num,setter_hash,split_name
0,,1,0000e233d9267d6cd789c2a02da90577b0683152,_cc_aud
1,,1,0000e233d9267d6cd789c2a02da90577b0683152,_cc_cc
2,,1,0000e233d9267d6cd789c2a02da90577b0683152,_cc_id
3,,1,0000e233d9267d6cd789c2a02da90577b0683152,lotame_domain_check
4,,4,0002262b6660ce3ab3818580518392583bd9376f,cookietest
...,...,...,...,...
43071,NU,5,fff5c3eb7242f7a5881562864635c2e4080c7338,wickedfu_null
43072,,1,fffbaf93db0068475473c759caffb3d294adaec2,_shopify_y
43073,,1,fffbaf93db0068475473c759caffb3d294adaec2,_y
43074,,1,fffbaf93db0068475473c759caffb3d294adaec2,cart_currency


In [40]:
df_change = df_setterhash.merge(df_conflict, on=['split_name', 'setter_hash'])

In [41]:
def change_label(row):
    cur_label = row['label']
    conflict = row['conflict']
    if conflict == 'PU':
        return 'Positive'
    elif conflict == 'PN':
        return 'Positive'
    elif conflict == 'PNU':
        return 'Positive'
    elif conflict == 'NU':
        return 'Unknown'
    return cur_label

#Update labels based on conflict label
df_change['new_label'] = df_change.apply(change_label, axis=1)

In [42]:
df_change[df_change['label'] != df_change['new_label']]

Unnamed: 0.1,Unnamed: 0,visit_id,name,setter,top_level_domain,setter_domain,resource_type,setter_label,domain,declared_label,label,setter_hash,split_name,conflict,num,new_label
0,0,1.806962e+13,_chartbeat2|$$|infobae.com,https://static.chartbeat.com/js/chartbeat.js,infobae.com,chartbeat.com,script,True,infobae.com,,Unknown,6671bac67dd759b8a8e6e71fdfe05edda4555791,_chartbeat2,PU,78,Positive
1,0,1.754110e+15,_chartbeat2|$$|hobbyconsolas.com,https://static.chartbeat.com/js/chartbeat.js,hobbyconsolas.com,chartbeat.com,script,True,hobbyconsolas.com,,Unknown,6671bac67dd759b8a8e6e71fdfe05edda4555791,_chartbeat2,PU,78,Positive
2,0,4.381966e+15,_chartbeat2|$$|delivery.com,https://static.chartbeat.com/js/chartbeat.js,delivery.com,chartbeat.com,script,True,delivery.com,,Unknown,6671bac67dd759b8a8e6e71fdfe05edda4555791,_chartbeat2,PU,78,Positive
3,3,6.139050e+15,_chartbeat2|$$|eleconomistaamerica.co,https://static.chartbeat.com/js/chartbeat.js,eleconomistaamerica.co,chartbeat.com,script,True,eleconomistaamerica.co,,Unknown,6671bac67dd759b8a8e6e71fdfe05edda4555791,_chartbeat2,PU,78,Positive
4,0,6.334911e+15,_chartbeat2|$$|ritholtz.com,https://static.chartbeat.com/js/chartbeat.js,ritholtz.com,chartbeat.com,script,True,ritholtz.com,,Unknown,6671bac67dd759b8a8e6e71fdfe05edda4555791,_chartbeat2,PU,78,Positive
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
97493,47,8.033221e+15,localization|$$|flickr.com,https://www.flickr.com/services/developer,flickr.com,flickr.com,main_frame,False,flickr.com,1.0,Negative,4e657ca41aa206100c2264dd46920e0932f3bbbc,localization,PN,2,Positive
98505,3,9.291492e+14,ckbk|$$|resetdigital.co,https://meta.resetdigital.co/smart?px=1000131&...,resetdigital.co,resetdigital.co,image,False,resetdigital.co,,Negative,0f4e929dd5bb2564f7ab9c76338e04e292a42ace,ckbk,PN,2,Positive
103867,0,5.194194e+15,crumb|$$|squarespace.com,https://blackbird-crane-d63m.squarespace.com/,squarespace.com,squarespace.com,main_frame,False,squarespace.com,0.0,Negative,7b332a8afb7aa223d64a614f33990e1790ed1a4c,crumb,PN,4,Positive
103869,2,5.194194e+15,crumb|$$|squarespace.com,https://blackbird-crane-d63m.squarespace.com/,squarespace.com,squarespace.com,main_frame,False,squarespace.com,1.0,Negative,7b332a8afb7aa223d64a614f33990e1790ed1a4c,crumb,PN,4,Positive


In [43]:
#Get all entries without a setter hash
df_nullsh = df[~df['setter_hash'].notnull()]

In [44]:
df_change = df_change.drop(columns=['conflict', 'num'])

In [45]:
df_nullsh['new_label'] = df_nullsh['label'].apply(lambda x: x)
df_combined = pd.concat([df_change, df_nullsh])
df_combined = df_combined.drop(columns=['label'])
df_combined = df_combined.rename(columns={'new_label' : 'label'})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_nullsh['new_label'] = df_nullsh['label'].apply(lambda x: x)


In [46]:
df_combined.to_pickle("newlabels_test.pkl")