# Analyse the development of trackers on edu websites

In [48]:
import pandas as pd 

import glob 

def frame_construct(time):
    """Load every CSV of one crawl into a single DataFrame.

    Parameters
    ----------
    time : str
        Suffix of the crawl directory; files are read from
        ``edu_trackers_<time>/*.csv`` relative to the working directory.

    Returns
    -------
    pandas.DataFrame
        All rows of all matching files stacked vertically with a fresh
        0..n-1 index.

    Raises
    ------
    ValueError
        If no CSV file matches the pattern.
    """
    pattern = 'edu_trackers_{}/*.csv'.format(time)
    # glob returns paths in arbitrary, filesystem-dependent order; sorting
    # makes the row order of the result reproducible between runs.
    filenames = sorted(glob.glob(pattern))
    if not filenames:
        # pd.concat would raise an opaque "No objects to concatenate";
        # keep the exception type but say which pattern matched nothing.
        raise ValueError("no CSV files match '{}'".format(pattern))

    li = [pd.read_csv(filename, index_col=None, header=0) for filename in filenames]
    return pd.concat(li, axis=0, ignore_index=True)

# Load the crawl stored under edu_trackers_202105 and show the number of
# non-null values in each column.
frame_2021 = frame_construct(time='202105')
print(frame_2021.count())


url         830
trackers    830
dtype: int64


In [54]:
# Load the historical crawls (directories edu_trackers_201705 ... 202005).
frame_2017, frame_2018, frame_2019, frame_2020 = (
    frame_construct(stamp) for stamp in ('201705', '201805', '201905', '202005')
)

# All crawls, oldest first.
frame_list = [frame_2017, frame_2018, frame_2019, frame_2020, frame_2021]

# Number of rows in each crawl.
len_list = list(map(len, frame_list))
print(len_list)

[929, 861, 849, 840, 830]


## 数量分析

In [55]:
def count_trackers(row):
    """Return how many trackers a comma-separated cell lists.

    Parameters
    ----------
    row : str or missing value
        One cell of the ``trackers`` column, e.g. ``"a.com,b.com"``.

    Returns
    -------
    int
        Number of non-blank comma-separated entries. Anything that is not a
        string (pandas reads an empty CSV cell as float NaN) counts as 0.
    """
    if not isinstance(row, str):
        # NaN / None: the site has no tracker string at all.
        return 0
    # Ignore blank tokens so "" is 0 trackers rather than 1.
    return sum(1 for name in row.split(',') if name.strip())

# Attach a per-site tracker count column (count_<year>) to every crawl.
for year, yearly_frame in (
    (2017, frame_2017),
    (2018, frame_2018),
    (2019, frame_2019),
    (2020, frame_2020),
    (2021, frame_2021),
):
    yearly_frame['count_{}'.format(year)] = yearly_frame['trackers'].apply(count_trackers)

# Total number of tracker entries per crawl, in frame_list order.
# NOTE(review): a later cell defines a function that is also called
# `trackers_count`; re-running cells out of order will clobber one with
# the other.
trackers_count = [
    crawl['trackers'].apply(count_trackers).sum()
    for crawl in frame_list
]
print(trackers_count)

[3878, 3336, 3151, 3183, 3116]


## 第三方库的标识

In [57]:
# Read the tab-separated table of labelled third-party domains; the column
# names are supplied here via `names`.
df_domain = pd.read_csv(
    "labeled-thirdparties.csv",
    sep='\t',
    names=[
        'domain',
        'registration_org',
        'registration_country',
        'num_embeddings',
        'num_embeddings_javascript',
        'num_embeddings_iframe',
        'num_embeddings_image',
        'num_embeddings_link',
        'category',
        'company',
    ],
)
print(df_domain.head())

                  domain registration_org registration_country  \
0   google-analytics.com      Google Inc.                   US   
1  googlesyndication.com      Google Inc.                   US   
2             google.com      Google Inc.                   US   
3         googleapis.com      Google Inc.                   US   
4           facebook.com   Facebook, Inc.                   US   

   num_embeddings  num_embeddings_javascript  num_embeddings_iframe  \
0        11571952                   11571778                      2   
1         4331482                    4306202                   3914   
2         4223810                    3130181                1051507   
3         3085323                    2601946                    733   
4         2146944                     344751                1683414   

   num_embeddings_image  num_embeddings_link     category   company  
0                  2596                  332    Analytics    Google  
1                 40399             

## 数据分析

### 获得每一个tracker的使用率并逆序排列



In [58]:
from collections import Counter

# Split each site's comma-separated tracker string into a list, stored in
# a tracker_list_<year> column of the matching crawl.
for year, yearly_frame in (
    (2021, frame_2021),
    (2020, frame_2020),
    (2019, frame_2019),
    (2018, frame_2018),
    (2017, frame_2017),
):
    yearly_frame['tracker_list_{}'.format(year)] = yearly_frame['trackers'].str.split(',')

# Every labelled tracker domain.
trackers = df_domain['domain'].tolist()

def trackers_count(trackers, web_list):
    """Count, for each tracker domain, how many sites list it.

    Parameters
    ----------
    trackers : iterable of str
        Domains to report on. The result keeps this order, which decides
        how ``Counter.most_common`` orders ties.
    web_list : iterable
        One entry per site: a collection of tracker domains. A plain string
        is treated as a comma-separated list; ``None`` / NaN entries are
        treated as sites without trackers.

    Returns
    -------
    collections.Counter
        ``domain -> number of sites whose list contains it``; domains that
        appear on no site are omitted.
    """
    # Single pass over the sites instead of rescanning every site list for
    # every domain. A site contributes at most 1 per domain (membership,
    # not multiplicity), hence the set().
    sites_per_domain = Counter()
    for site_trackers in web_list:
        if site_trackers is None or isinstance(site_trackers, float):
            # Missing cell (pandas NaN is a float): nothing to count.
            continue
        if isinstance(site_trackers, str):
            site_trackers = site_trackers.split(',')
        sites_per_domain.update(set(site_trackers))

    # Emit in `trackers` order and restrict to the requested domains.
    trackers_count_dict = Counter()
    for t in trackers:
        hits = sites_per_domain[t]  # Counter yields 0 for unknown keys
        if hits:
            trackers_count_dict[t] += hits

    return trackers_count_dict

# Per-crawl counters: labelled domain -> number of sites that list it.
trackers_2021 = trackers_count(trackers, frame_2021['tracker_list_2021'])
trackers_2020 = trackers_count(trackers, frame_2020['tracker_list_2020'])
trackers_2019 = trackers_count(trackers, frame_2019['tracker_list_2019'])
trackers_2018 = trackers_count(trackers, frame_2018['tracker_list_2018'])
trackers_2017 = trackers_count(trackers, frame_2017['tracker_list_2017'])

# The 100 most frequent domains of every crawl as (domain, count) pairs.
trackers_2021_most_100 = trackers_2021.most_common(100)
trackers_2020_most_100 = trackers_2020.most_common(100)
trackers_2019_most_100 = trackers_2019.most_common(100)
trackers_2018_most_100 = trackers_2018.most_common(100)
trackers_2017_most_100 = trackers_2017.most_common(100)



In [59]:
# Compute the share of sites that use each tracker.

def trackers_crawl_rate(trackers, list_len):
    """Turn (tracker, count) pairs into a table with a usage rate.

    Parameters
    ----------
    trackers : iterable of (str, int)
        Pairs such as those returned by ``Counter.most_common``.
    list_len : int
        Number of sites in the crawl; each count is divided by it.

    Returns
    -------
    pandas.DataFrame
        Columns ``tracker``, ``tracker_count`` and ``tracker_rate``, one row
        per input pair in input order. An empty input yields an empty frame
        with the same columns.

    Raises
    ------
    ZeroDivisionError
        If ``list_len`` is 0 while ``trackers`` is not empty.
    """
    print("len df is:{}".format(list_len))
    pairs = list(trackers)
    # Build the columns explicitly: unpacking zip(*pairs) into two names
    # fails when there are no pairs at all.
    names = [name for name, _ in pairs]
    counts = [count for _, count in pairs]
    rates = [count / list_len for count in counts]

    df = pd.DataFrame({"tracker": names, "tracker_count": counts, 'tracker_rate': rates})
    return df


# Build and show the rate table of every crawl, newest first. The number of
# rows of the crawl frame is the denominator of the rate.
df_2021 = trackers_crawl_rate(trackers_2021_most_100, list_len=frame_2021.shape[0])
print(df_2021)

df_2020 = trackers_crawl_rate(trackers_2020_most_100, list_len=frame_2020.shape[0])
print(df_2020)
# Optional export, currently disabled:
# df_2021.to_csv("trackers_2021.csv",index = None)

df_2019 = trackers_crawl_rate(trackers_2019_most_100, list_len=frame_2019.shape[0])
print(df_2019)

df_2018 = trackers_crawl_rate(trackers_2018_most_100, list_len=frame_2018.shape[0])
print(df_2018)

df_2017 = trackers_crawl_rate(trackers_2017_most_100, list_len=frame_2017.shape[0])
print(df_2017)
# Optional export, currently disabled:
# df_2019.to_csv("trackers_2019_csv",index = None)


len df is:830
                  tracker  tracker_count  tracker_rate
0            facebook.com            453      0.545783
1             twitter.com            432      0.520482
2             youtube.com            282      0.339759
3              google.com            266      0.320482
4           instagram.com            200      0.240964
5            linkedin.com            198      0.238554
6          googleapis.com            107      0.128916
7          cloudflare.com             75      0.090361
8   googlesyndication.com             64      0.077108
9     creativecommons.org             51      0.061446
10             jquery.com             50      0.060241
11            addthis.com             49      0.059036
12         cloudfront.net             49      0.059036
13          pinterest.com             49      0.059036
14              apple.com             41      0.049398
15        doubleclick.net             34      0.040964
16                nih.gov             27      0.032

In [62]:
### Total up how often the trackers of each category occur

def categorical_trackers(group, trackers_year):
    """Sum the per-domain counts of one group of domains.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows of one category; only its ``domain`` column is read.
    trackers_year : mapping
        ``domain -> count``, indexed once per domain in ``group``.

    Returns
    -------
    int
        Sum of ``trackers_year[domain]`` over the group's domains.
    """
    return sum(trackers_year[domain] for domain in group['domain'])
# return pd.Series({'tracker_list':group['domain'].to_list()})

# Print, for every crawl from 2021 back to 2017, the summed tracker counts
# of each category of labelled domains.
domains_by_category = df_domain[['category', 'domain']].groupby('category')
for yearly_counter in (trackers_2021, trackers_2020, trackers_2019, trackers_2018, trackers_2017):
    print(domains_by_category.apply(categorical_trackers, yearly_counter).reset_index())

                 category     0
0                       #  1651
1             Advertising  1083
2               Analytics    61
3   Analytics,Advertising     2
4                  Beacon    13
5                 Captcha     0
6         ContentDelivery   146
7           DomainParking     0
8          EmailMarketing     3
9             FormBuilder     0
10                Hosting     0
11            ImageHoster    21
12          LinkShortener    30
13                Malware     0
14                Payment     0
15                Toolbar     0
16                Weather     0
17            WebServices     0
18         WebsiteBuilder     2
19                 Widget   104
                 category     0
0                       #  1657
1             Advertising  1128
2               Analytics    61
3   Analytics,Advertising     1
4                  Beacon    17
5                 Captcha     1
6         ContentDelivery   127
7           DomainParking     0
8          EmailMarketing     1
9       

In [22]:
# Compute the glocality of each website.

def glocality_cal(l, trackers_count):
    """Return ``len(l) * sum(1 / trackers_count[t] for t in l)``.

    Parameters
    ----------
    l : sequence of str
        Tracker domains found on one site.
    trackers_count : mapping
        ``domain -> count``. Only lookups via ``[...]`` are used, so a
        ``collections.Counter`` (which yields 0 for unknown keys) works.

    Returns
    -------
    float
        The score; ``0`` for an empty list. Trackers whose recorded count
        is not positive contribute nothing to the sum but still count
        towards ``len(l)``.
    """
    T_w = len(l)
    idf_sim = 0
    for t in l:
        seen = trackers_count[t]
        # A Counter returns 0 for a domain it never saw; dividing by that
        # would abort the whole .apply with ZeroDivisionError.
        if seen > 0:
            idf_sim += 1 / seen
    return T_w * idf_sim


# Score every site's 2021 tracker list against the 2021 counter.
# NOTE(review): df_merge is not created in any cell visible here - confirm
# it is defined before this cell runs on a fresh kernel.
df_merge['glocality_2021'] = df_merge['tracker_list_2021'].apply(
    lambda site_trackers: glocality_cal(site_trackers, trackers_count=trackers_2021)
)
print(df_merge['glocality_2021'])


0       2.796898
1       0.162243
2       7.080357
3       3.078159
4       0.156015
5       0.151157
6       1.000000
7       0.685885
8       0.333333
9       1.044314
10      0.028571
11      1.609576
12      3.207634
13      6.194691
14      6.886840
15      0.333333
16     22.958795
17      4.000000
18      0.162243
19      0.028571
20      0.015152
21      1.000000
22     68.103929
23      8.976405
24     16.914364
25      3.518042
26      0.136072
27      0.035714
28      0.685885
29      0.086443
         ...    
381     4.207234
382     2.202169
383     2.621801
384     3.434790
385     0.572145
386     0.675357
387    12.648948
388     1.071429
389     0.016641
390     3.422630
391     0.052637
392     9.354610
393     0.296834
394     0.016641
395     0.866795
396     1.005414
397     0.006711
398     0.183811
399    18.210693
400     0.151157
401     0.037037
402     2.092091
403     0.015152
404     0.045095
405     0.274145
406     0.986629
407     0.310127
408     2.1381