In [1]:
import pandas as pd 

import glob 

def frame_construct(time):

    filenames = glob.glob('global_file_list_alexa_{}/*.csv'.format(time))
    li = []
    for filename in filenames:
        df = pd.read_csv(filename,index_col=None,header = 0)
        li.append(df)

    frame = pd.concat(li,axis = 0,ignore_index=True)
    return frame

frame_202105 = frame_construct("202105")
frame_202105['tracker_list'] = frame_202105.trackers.str.split(',')




In [2]:
# 第三库标识

import numpy as np
df_domain_third_party = pd.read_csv("labeled-thirdparties.csv",sep = '\t', names = ['domain','registration_org','registration_country','num_embeddings','num_embeddings_javascript','num_embeddings_iframe','num_embeddings_image','num_embeddings_link','category','company'])

df_domain = pd.read_csv("trackers_domain.csv")

df_domain = df_domain.merge(df_domain_third_party,how = "left",on = "domain")[['domain',"registration_country","category"]]
print(df_domain.head())

from collections import Counter
trackers = df_domain['domain'].to_list()

# 计算glocality

def trackers_count(trackers,web_list):
    trackers_count_dict = Counter()
    try:
        for t in trackers:
            for e,l in enumerate(web_list):
                if t in l:
                    if t in trackers_count_dict:
                        trackers_count_dict[t] += 1
                    else:
                        trackers_count_dict[t] = 1
    except Exception as e:
        print(e)
    return trackers_count_dict

def get_trackers_count(frame):
    """
    获得每个frame中trackers的字典
        :param frame: 
    """
    frame['trackers_list'] = frame['trackers'].str.split(',')

    trackers_count_dict = trackers_count(trackers, frame['trackers_list'])

    return trackers_count_dict
# 获得世界级别的tracker count

def get_glocality(frame):
    tracker_count = get_trackers_count(frame)
    def glocality_cal(l, tracker_count):
        T_w = len(l) # set of
        idf_sim = sum([1 / (tracker_count[t]) for t in l])
        return 1 / T_w * idf_sim
    def glocality_index_cal(frame):
        value = []
        for _,row in frame.iterrows():
            url = row['url']
            trackers_list = row['tracker_list']
            v = glocality_cal(trackers_list, tracker_count)
            value.append(v)
        return np.mean(value)

    return glocality_index_cal(frame)



               domain registration_country   category
0  123contactform.com                   RO     Widget
1             2o7.net                   US  Analytics
2           4stats.de                    #     Beacon
3               4u.pl                    #  Analytics
4               51.la                   CN  Analytics


## 获得每个国家的domain list

获得国家的domain list之后我们重新计算每个国家的glocality

In [3]:
filename = glob.glob("/Users/zhansu/program/code/phd_privacy_lost/webXray/page_lists/*.txt")

countries = []
glocalities = []
for file in filename:
    country_code = file.split("/")[-1].split(".")[0].split('_')[-1]
    web_list = open(file).read().split("\n")
    web_list = list(map(lambda x:x.split("//")[-1],web_list))
    # print(web_list)
    frame = frame_202105[frame_202105['url'].isin(web_list)]
    if len(frame) > 0:
        countries.append(country_code)
        glocalities.append(get_glocality(frame))

df_glocality = pd.DataFrame({"country":countries,"glocality":glocalities})
df_glocality.columns = ['alpha_2',"glocality"]
print(df_glocality.sort_values(by = ['glocality'],ascending =False))
    # print(web_list)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._set_item(key, value)


   alpha_2  glocality
14      RW   0.073052
85      BR   0.072118
84      BS   0.065000
3       KR   0.064534
78      ZW   0.062742
..     ...        ...
82      NP   0.029388
93      KZ   0.024174
9       GR   0.023896
26      RU   0.022735
51      BY   0.022433

[116 rows x 2 columns]


In [4]:
import pycountry

def get_three_iso_alpha(row):
    a2 = row['alpha_2']
    c = pycountry.countries.get(alpha_2 = a2.upper())
    return c.alpha_3

def get_name(row):
    a2 = row['alpha_2']
    c = pycountry.countries.get(alpha_2 = a2.upper())
    return c.name
# print(len(pycountry.countries))

df_glocality['alpha_3'] = df_glocality.apply(get_three_iso_alpha,axis = 1)
df_glocality['name'] = df_glocality.apply(get_name,axis = 1)
df_glocality = df_glocality.sort_values(by = ['glocality'],ascending=False)
print(df_glocality.head())
df_glocality.to_csv("glocality_country.csv",index = None)


   alpha_2  glocality alpha_3                name
14      RW   0.073052     RWA              Rwanda
85      BR   0.072118     BRA              Brazil
84      BS   0.065000     BHS             Bahamas
3       KR   0.064534     KOR  Korea, Republic of
78      ZW   0.062742     ZWE            Zimbabwe


In [11]:
import pandas as pd
df_iii = pd.read_csv("iii_world.csv")

df_merge = df_iii.merge(df_glocality, how = "inner", on  = "alpha_2")
df_merge.to_csv("glocality_country_merge.csv",index=None)

In [14]:
import plotly.express as px

fig = px.line(x = df_merge['rank'],y = df_merge['score'])

fig.show()

fig_2 = px.line(x = df_merge['rank'],y =  np.log(1 / df_merge['glocality']))

fig_2.show()