# Analyze Trackers

In [7]:
import glob
import pandas as pd 
from collections import Counter
import tldextract
import matplotlib.pyplot as plt


## 1375 trackers analysis

In [62]:
thirdparties = pd.read_csv(
    "../resource/labeled-thirdparties.csv",
    sep="\t",
    names=[
        "domain",
        "registration_org",
        "registration_country",
        "num_embeddings",
        "num_embeddings_javascript",
        "num_embeddings_iframe",
        "num_embeddings_image",
        "num_embeddings_link",
        "category",
        "company",
    ],
)

tracker_list = thirdparties["domain"].to_list()
tracker_list = list(map(lambda x: tldextract.extract(x).domain, tracker_list))


In [63]:
print("all third parties",len(thirdparties))
print("all third parties registration domain",len(set(tracker_list)))

all third parties 1375
all third parties registration domain 1285


### analysis special trackers

In [2]:
# sometimes trackers only occur in some special websites list
# edu_trackers = []
# control_trackers = []

# def get_trackers(row, trackers):
#     domain = row["3p-domain"]
#     url = row["url"]

#     if domain != domain:
#         return []
#     else:
#         trackers_list = domain.split(",")
#         trackers.extend(trackers_list)
#         return trackers_list

# def generate_all_3p(files, task_type, element_type):
#     for file in files:
#         df = pd.read_csv(file)
#         if task_type == "edu":
#             df["trackers"] = df.apply(get_trackers, trackers=edu_trackers, axis=1)
#         else:
#             df["trackers"] = df.apply(get_trackers, trackers=control_trackers, axis=1)
#     if task_type == "edu":
#         tracker_count = Counter(edu_trackers)
#     else:
#         tracker_count = Counter(control_trackers)
#     return tracker_count


In [3]:
# edu_files = glob.glob(f"../dataset_archive/edu_archive_ali_1375*.csv")
# tracker_count_edu = generate_all_3p(edu_files, "edu", element_type = "")
# control_files = glob.glob(f"../dataset_archive/control_archive_ali_1375*.csv")
# tracker_count_control = generate_all_3p(control_files, "control", element_type = "")



In [4]:
# print(len(tracker_count_edu))
# print(len(tracker_count_control))

# set_edu = set(tracker_count_edu)
# set_control = set(tracker_count_control)

# print(len(set_edu))
# print(len(set_control))
# print(len(set_edu & set_control))
# print(len(set_edu | set_control))
# print(len(set_edu - set_control))
# print(set_edu - set_control)
# print(len(set_control - set_edu))
# print(set_control - set_edu)

## whotrackers me

In [8]:
whotracksme = pd.read_csv("../resource/whotracksme_trackers.txt", names=["domain"])
thirdparties = whotracksme
tracker_list = thirdparties["domain"].to_list()
tracker_list = list(map(lambda x: tldextract.extract(x).domain, tracker_list))

In [9]:
print("all third parties",len(thirdparties))
print("all third parties registration domain",len(set(tracker_list)))

all third parties 1285
all third parties registration domain 1091


In [52]:
edu_trackers = []
control_trackers = []
def trackers_count(trackers, web_list):
    """calculate the trackers which occur in how many websites.

    Args:
        trackers (list): all the trackers
        web_list (df): websites in a specific year

    Returns:
        Counter: IDF of the trackers
    """
    trackers_count_dict = Counter()
    try:
        for t in trackers:
            for l in web_list:
                if l != l:
                    continue
                if t in l:
                    if t in trackers_count_dict:
                        trackers_count_dict[t] += 1
                    else:
                        trackers_count_dict[t] = 1
    except Exception as e:
        print(e)

    return trackers_count_dict


def get_trackers(row, trackers):
    domain = row["3p-domain"]
    url = row["url"]

    if domain != domain:
        return []
    else:
        trackers_list = domain.split(",")
        if "" in trackers_list:
            trackers_list.remove("")
        extract_domain = tldextract.extract(url).domain
        if extract_domain in trackers_list:
            trackers_list.remove(extract_domain)
        trackers.extend(trackers_list)
        return trackers_list

def generate_all_3p(files, task_type, element_type):
    for file in files:
        df = pd.read_csv(file)
        if task_type == "edu":
            df["trackers"] = df.apply(get_trackers, trackers=edu_trackers, axis=1)
        else:
            df["trackers"] = df.apply(get_trackers, trackers=control_trackers, axis=1)
    if task_type == "edu":
        tracker_count = Counter(edu_trackers)
    else:
        tracker_count = Counter(control_trackers)
    return tracker_count

In [67]:
edu_files = glob.glob(f"../dataset_archive/edu_archive_ali_exclude_all*.csv")
tracker_count_edu = generate_all_3p(edu_files, "edu", element_type = "exclude")
control_files = glob.glob(f"../dataset_archive/control_archive_ali_exclude_all*.csv")
tracker_count_control = generate_all_3p(control_files, "control", element_type = "exclude")

In [54]:
print(len(tracker_count_edu))
print(len(tracker_count_control))

28577
40222


In [55]:
set_edu = set(tracker_count_edu)
set_control = set(tracker_count_control)


In [70]:
print(len(set_edu))
print(len(set_control))
print(len(set_edu & set_control))
print(len(set_edu - set_control))

28577
40222
4904
23673


In [57]:
len(set_edu | set_control)

63895

In [60]:
tracker_count_edu.most_common(20)

[('google-analytics', 87747),
 ('googleapis', 33598),
 ('google', 31198),
 ('googletagmanager', 22393),
 ('w', 17965),
 ('facebook', 16684),
 ('youtube', 14451),
 ('cloudflare', 7914),
 ('twitter', 6725),
 ('jquery', 6314),
 ('newrelic', 6091),
 ('addthis', 5990),
 ('doubleclick', 5969),
 ('typekit', 5752),
 ('googleadservices', 4467),
 ('amazonaws', 4281),
 ('cloudfront', 4072),
 ('googlesyndication', 3667),
 ('wp', 3036),
 ('yandex', 2892)]

In [61]:
tracker_count_control.most_common(20)

[('google-analytics', 77409),
 ('googleapis', 28385),
 ('google', 25970),
 ('w', 24198),
 ('facebook', 18808),
 ('googletagmanager', 17137),
 ('youtube', 12453),
 ('googlesyndication', 9940),
 ('twitter', 7920),
 ('wp', 7437),
 ('gravatar', 5826),
 ('addthis', 5761),
 ('doubleclick', 5578),
 ('cloudfront', 5537),
 ('jquery', 5265),
 ('typekit', 5245),
 ('cloudflare', 5036),
 ('googleadservices', 4274),
 ('amazonaws', 3854),
 ('newrelic', 3519)]

In [24]:
tracker_count_edu_dict = dict(tracker_count_edu.most_common(100))
tracker_count_edu_dict
df_edu = pd.DataFrame({"tracker":tracker_count_edu.keys(),"count":tracker_count_edu.values()})
df_edu

Unnamed: 0,tracker,count
0,027art,10
1,cnzz,427
2,anquan,14
3,w,17965
4,googleapis,33598
...,...,...
28572,infogamy,1
28573,uradio,1
28574,infoprostir,1
28575,imgnly,1


In [25]:
plt.figure(figsize=(15,10))
df_edu.sort_values(ascending=False,by="count").plot.bar()
plt.xticks(rotation=50)
plt.xlabel("Country of Origin")
plt.ylabel("Number of Wines")
plt.show()

<Figure size 1080x720 with 0 Axes>

KeyboardInterrupt: 