# Notebook for the Evolution of trackers.

In [None]:
# compute the mean change of different trackers. 
import pandas as pd 
import numpy as np 

import matplotlib.pyplot as plt
# Tab-separated tracker-count tables: control (plotted as "non-edu") and edu.
df_control = pd.read_csv(
    "../dataset_archive/frame_control_count_who.csv",
    sep='\t',
)
df_edu = pd.read_csv(
    "../dataset_archive/frame_edu_count_who.csv",
    sep='\t',
)

# Font sizes shared by every figure in this notebook.
SMALL_SIZE = 12
MEDIUM_SIZE = 10
BIGGER_SIZE = 12

# Apply all matplotlib font defaults in one go.
plt.rcParams.update({
    'font.size': SMALL_SIZE,           # default text size
    'axes.titlesize': SMALL_SIZE,      # axes title
    'axes.labelsize': MEDIUM_SIZE,     # x and y axis labels
    'xtick.labelsize': SMALL_SIZE,     # x tick labels
    'ytick.labelsize': SMALL_SIZE,     # y tick labels
    'legend.fontsize': SMALL_SIZE,     # legend entries
    'figure.titlesize': BIGGER_SIZE,   # figure title
})


In [None]:
# Summary statistics of each count table; the 'mean', 'max' and 'std' rows are plotted below.
df_edu_des, df_control_des = df_edu.describe(), df_control.describe()
df_edu_des  # rendered as the cell output

In [None]:
# Show the summary statistics of the control (non-edu) table.
df_control_des

In [None]:
# df_edu_des['10_years_increased'] = (df_edu_des['tracker_2021'] - df_edu_des['tracker_2012']) / df_edu_des['tracker_2012']
# df_edu_des

In [None]:
# df_control_des['10_years_increased'] = (df_control_des['tracker_2021'] - df_control_des['tracker_2012']) / df_control_des['tracker_2012']
# df_control_des

# 2018 decrease

In [None]:
# df_edu_des['18_decreased'] = (df_edu_des['tracker_2021'] - df_edu_des['tracker_2018']) / df_edu_des['tracker_2018']
# df_edu_des

In [None]:
# df_control_des['18_decreased'] = (df_control_des['tracker_2021'] - df_control_des['tracker_2018']) / df_control_des['tracker_2018']
# df_control_des

In [None]:
# Mean number of trackers per year, edu vs. non-edu.

# Years on the x axis; the max/std plotting cells below reuse this list.
Year = [*range(2012, 2022)]

for _stats, _color, _marker, _label in (
    (df_edu_des, 'red', 'o', 'edu'),
    (df_control_des, 'blue', '+', "non-edu"),
):
    plt.plot(Year, _stats.loc['mean'], color=_color, marker=_marker, label=_label)

plt.ylabel('Average number of trackers', fontsize=14)
# Dotted vertical marker at 2018 (labelled 'GDPR rules' below).
plt.axvline(x=2018, color='green', linestyle=':')
plt.yticks(np.arange(0, 2.8, step=0.3))
plt.legend()

# Boxed text labels: (text, anchor point, box colour). The numeric labels are hard-coded.
for _text, _anchor, _box_color in (
    ('2.338', (2018, 2.55), 'blue'),
    ('2.443', (2020, 2.6), 'red'),
    ('GDPR rules', (2018, 0.3), 'firebrick'),
):
    plt.annotate(
        _text,
        xy=_anchor,
        bbox=dict(boxstyle='square', fc=_box_color),
        arrowprops=dict(facecolor='steelblue', shrink=0.01),
        fontsize=10,
        color='white',
    )

plt.grid(alpha=0.5)
plt.savefig("../images/section4/avg_number.png", dpi=200)
plt.show()


In [None]:
# Maximum number of trackers per year, edu vs. non-edu (uses `Year` from the average plot cell).
for _stats, _color, _marker, _label in (
    (df_edu_des, 'red', 'o', 'edu'),
    (df_control_des, 'blue', '+', "non-edu"),
):
    plt.plot(Year, _stats.loc['max'], color=_color, marker=_marker, label=_label)

plt.xlabel('Year', fontsize=14)
plt.ylabel('Max number of trackers', fontsize=14)
plt.legend()
plt.savefig("../images/section4/max_number.png", dpi=200)

In [None]:
# Standard deviation of the number of trackers per year, edu vs. non-edu
# (uses `Year` from the average plot cell).
plt.plot(Year, df_edu_des.loc['std'], color='red', marker='o', label='edu')
plt.plot(Year, df_control_des.loc['std'], color='blue', marker='+', label="non-edu")
plt.xlabel('Year', fontsize=14)
plt.ylabel('std of trackers number', fontsize=14)
plt.legend()
# Save under its own file name: this cell used to write to max_number.png,
# silently overwriting the figure produced by the max plot.
plt.savefig("../images/section4/std_number.png", dpi=200)

## Analyze the development of tracker rates

In [None]:
# Tracker rate tables (a 'trackers' column plus one column per year, '2012'..'2021');
# missing values are replaced by 0.
df_rate_edu = pd.read_csv("../dataset_archive/df_rate_merge_edu_whoexclude.csv").fillna(0)
df_rate_control = pd.read_csv("../dataset_archive/df_rate_merge_base_whoexclude.csv").fillna(0)

# A cell only renders its last expression, so the intermediate looks were never
# shown. Use display() (a builtin inside Jupyter/IPython) to render all three.
display(df_rate_edu.head())

# Spot check: the cloudflare row in each table.
display(df_rate_edu.loc[df_rate_edu.trackers == 'cloudflare'])
display(df_rate_control.loc[df_rate_control.trackers == "cloudflare"])

In [None]:
# Collect the ten highest-rate edu trackers of every year, then show the distinct names.
tracker_list = []
for year in range(2012, 2022):
    # Sort once per year and reuse the result for both the printout and the
    # accumulated list (the identical sort used to be computed twice).
    list_e = df_rate_edu.sort_values(by=str(year), ascending=False).head(10)['trackers'].to_list()
    print(list_e)
    tracker_list.extend(list_e)

set(tracker_list)

In [None]:
# get the top 10 for each year


In [None]:
# Years on the x axis of the tracker-rate line plots below.
years = [*range(2012, 2022)]
years

# Heat map of the tracker number evolution

In [None]:
import seaborn as sns

# Despite its name, this list holds the year column labels ('2012' .. '2021').
tracker_names_list = [str(year_col) for year_col in range(2012, 2022)]
print(tracker_names_list)

# Heat map of the first ten rows of the edu rate table, one column per year.
df_rate_edu_draw = df_rate_edu.head(10)
plt.figure(figsize=(12, 10), dpi=200)
sns.heatmap(
    df_rate_edu_draw[tracker_names_list],
    xticklabels=range(2012, 2022),
    yticklabels=df_rate_edu_draw.trackers,
    cmap='BrBG',
    center=0,
    annot=True,
)

# Larger tick labels on both axes.
plt.xticks(fontsize=25)
plt.yticks(fontsize=25)
plt.show()

In [None]:
# Yearly rate of five selected trackers on edu websites.
for trackers in ('google-analytics', 'google', 'facebook', 'twitter', 'linkedin'):
    _row_mask = df_rate_edu.trackers == trackers
    # Flatten the matching row(s) into a plain list; entries 1..10 are plotted
    # against the ten years (entry 0 is presumably the tracker name - confirm).
    rate_edu_l = df_rate_edu.loc[_row_mask].to_numpy().ravel().tolist()
    plt.plot(
        years,
        rate_edu_l[1:11],
        marker='o',
        label=trackers,
        linestyle=':',
        linewidth=2,
    )

plt.xlabel('Year', fontsize=14)
plt.ylabel('tracker rate in edu websites', fontsize=14)
plt.yticks(np.arange(0, 0.7, step=0.1))
plt.legend()
plt.savefig("../images/section4/tracker_rate_change_edu.png", dpi=200)

In [None]:
# Yearly rate of five selected trackers on non-edu (control) websites.
for trackers in ['google-analytics', 'google', 'facebook', 'twitter', 'linkedin']:
    # Flatten the matching row into a list; entries 1..10 are plotted against the ten years.
    rate_control_l = df_rate_control.loc[df_rate_control.trackers == trackers].values.flatten().tolist()
    plt.plot(years, rate_control_l[1:11], marker='+', label=trackers, linestyle=':', linewidth=2)
plt.xlabel('Year', fontsize=14)
plt.ylabel('tracker rate in non-edu websites', fontsize=14)
plt.yticks(np.arange(0, 0.7, step=0.1))
# Add the legend BEFORE saving: it used to be created after savefig, so the
# PNG on disk had no legend (unlike the matching edu figure).
plt.legend()
plt.savefig("../images/section4/tracker_rate_change_non-edu.png", dpi=200)

In [None]:

# Line colour per tracker; also used by the google / google-analytics comparison cell.
# NOTE(review): 'linkedin' and 'google-analytics' share 'green' - fine as long as
# they are never drawn on the same figure.
color_map = {"facebook": "red", "twitter": "blue", "linkedin": "green", "google": "orange", "google-analytics": "green"}

# Edu (solid) vs. non-edu (dotted) usage rate of facebook and twitter.
for trackers in ['facebook', 'twitter']:
    rate_edu_l = df_rate_edu.loc[df_rate_edu.trackers == trackers].values.flatten().tolist()
    rate_control_l = df_rate_control.loc[df_rate_control.trackers == trackers].values.flatten().tolist()
    # Slice exactly the ten year columns for BOTH series. The control series used
    # an open-ended [1:], which yields 11 values once the '10_year_change' column
    # has been added further down, so re-running this cell failed.
    plt.plot(years, rate_edu_l[1:11], marker='o', label=f'{trackers}(edu)', linewidth=2, color=color_map[trackers])
    plt.plot(years, rate_control_l[1:11], marker='o', label=f'{trackers}(non-edu)', linestyle=':', linewidth=2, color=color_map[trackers])
plt.ylabel('Usage rate of trackers', fontsize=14)
plt.yticks(np.arange(0, 0.3, step=0.1))
plt.legend()
plt.grid(alpha=0.5)
plt.savefig("../images/section4/usage_rate_trackers_comparision.png", dpi=600)

In [None]:
# Edu (solid) vs. non-edu (dotted) usage rate of google and google-analytics.
# Colours come from `color_map`, defined in the facebook/twitter comparison cell.
for trackers in ['google', 'google-analytics']:
    rate_edu_l = df_rate_edu.loc[df_rate_edu.trackers == trackers].values.flatten().tolist()
    rate_control_l = df_rate_control.loc[df_rate_control.trackers == trackers].values.flatten().tolist()
    # Slice exactly the ten year columns. The open-ended [1:] used before yields
    # 11 values once the '10_year_change' column has been added further down,
    # so re-running this cell failed with a length mismatch against `years`.
    plt.plot(years, rate_edu_l[1:11], marker='o', label=f'{trackers}(edu)', linewidth=2, color=color_map[trackers])
    plt.plot(years, rate_control_l[1:11], marker='o', label=f'{trackers}(non-edu)', linestyle=':', linewidth=2, color=color_map[trackers])
plt.ylabel('Usage rate of trackers', fontsize=14)
plt.yticks(np.arange(0, 0.6, step=0.1))
plt.legend()
plt.grid(alpha=0.5)
plt.savefig("../images/section4/usage_rate_trackers_comparision_2.png", dpi=600)

In [None]:
# First ten rows of the edu rate table.
df_rate_edu.head(10)

In [None]:
# First ten rows of the non-edu (control) rate table.
df_rate_control.head(10)

# plot the increase rate in the last 10 years

In [None]:

# df_rate_edu['10_year_change'] = (df_rate_edu['2021'] - df_rate_edu['2012']) / df_rate_edu['2012']
# df_rate_control['10_year_change'] = (df_rate_control['2021'] - df_rate_control['2012']) / df_rate_control['2012']

# # creat a dictionary
# edu_k_v = dict(zip(df_rate_edu['trackers'].to_list(),df_rate_edu['10_year_change'].to_list()))
# control_k_v = dict(zip(df_rate_control['trackers'].to_list(), df_rate_control['10_year_change'].to_list()))

# fig, ax = plt.subplots(figsize=(16,10), facecolor='white', dpi= 80)


# x = df_rate_edu['trackers'][:10].to_list()
# y_edu = [edu_k_v[key] for key in x]
# y_control = [control_k_v[key]for key in x]

# bar_width = 0.35
# # ax.rc("font", size="25")

# rects1 = ax.barh(x, y_edu, label="educational websites", color="red")
# rects2 = ax.barh(x, y_control , label="non-educationl websites", color="blue", left = y_edu)
# plt.gca().xaxis.set_major_formatter(ticker.PercentFormatter(xmax=1, decimals=1))

# ax.bar_label(rects1, padding=3)
# ax.bar_label(rects2, padding=3)


# # plt.xticks(rotation=45)
# ax.legend(fontsize = 20)
# # plt.savefig(
# #     "images/{}_rate_{}{}.png".format(draw_type, tracker_type, element_type),bbox_inches='tight', dpi=200
# # )

# ax.grid(alpha=0.5)
# plt.savefig("../images/section4/usage_rate_trackers_top_10.png",dpi = 600)
# plt.show()


In [None]:
# Relative change of every tracker's rate from 2012 to 2021,
# (rate_2021 - rate_2012) / rate_2012, added as a new column to both tables.
# NOTE(review): both tables were loaded with fillna(0), so a 2012 rate of 0
# produces inf (or NaN for 0/0) here - confirm this is acceptable downstream.
for _rates in (df_rate_edu, df_rate_control):
    _rates['10_year_change'] = (_rates['2021'] - _rates['2012']) / _rates['2012']

df_rate_edu[['trackers', '2012', '2021', '10_year_change']].head(30)

In [None]:
# Lookup tables: tracker name -> ten-year relative change, one per group.
edu_k_v = dict(zip(df_rate_edu['trackers'], df_rate_edu['10_year_change']))
control_k_v = dict(zip(df_rate_control['trackers'], df_rate_control['10_year_change']))

# Trackers in the first ten rows of the edu table.
labels = df_rate_edu['trackers'].head(10).tolist()

# Edu changes ordered from smallest to largest (stable sort on the value).
y_edu = dict(
    sorted(
        ((name, edu_k_v[name]) for name in labels),
        key=lambda pair: pair[1],
    )
)
# Non-edu changes for the same trackers, kept in `labels` order.
y_control = {name: control_k_v[name] for name in labels}
y_edu


In [None]:
# Show the non-edu ten-year changes for the selected trackers.
y_control

In [None]:
# Bar order follows `y_edu`: tracker names, their edu changes, and the
# non-edu changes looked up in the same order.
labels, y_edu_draw = y_edu.keys(), y_edu.values()
y_control_draw = [control_k_v[tracker] for tracker in labels]
print(labels)

In [None]:
# Print the edu values that will be drawn as bars.
print(y_edu_draw)

In [None]:
# Print the non-edu values that will be drawn as bars.
print(y_control_draw)

In [None]:

from matplotlib import ticker

import numpy as np 



# y_edu_labels = [f"{100*round(edu_k_v[key],2)}%" for key in labels]
# y_control_labels = [f"{100*round(control_k_v[key],2)}%" for key in labels]
# y_control_labels[1] = '29.0%'


# Grouped horizontal bar chart of the ten-year relative change:
# one edu bar and one non-edu bar per tracker in `labels`.
x = np.arange(len(labels))  # the label locations
width = 0.4  # the width of the bars

fig, ax = plt.subplots(figsize=(25, 12), facecolor='white', dpi=200)
# The two bars of a tracker sit half a bar width either side of its position.
rects1 = ax.barh(x - width/2, y_edu_draw, width, label='educational websites', color="red", linewidth=2, edgecolor='black')
rects2 = ax.barh(x + width/2, y_control_draw, width, label='non-educational websites', color="blue", linewidth=2, edgecolor='black')

# Values are fractions (1.0 == 100%), so format the x axis as percentages.
ax.xaxis.set_major_formatter(ticker.PercentFormatter(xmax=1, decimals=1))

ax.set_yticks(x, labels, fontsize=40)
# Set the x tick label size on this Axes directly. The previous
# plt.rc('xtick', labelsize=40) ran after the Axes had been created and changed
# the global default, so the size depended on how often the cell had been run
# and leaked into every later figure.
ax.tick_params(axis='x', labelsize=40)

ax.legend(fontsize=40)
fig.tight_layout()

plt.show()

In [None]:
# Ten-year change next to the 2012 and 2021 rates, first ten edu rows.
df_rate_edu[['trackers', '10_year_change', "2012", "2021"]].head(10)


In [None]:
# Ten-year change next to the 2012 and 2021 rates, first twenty non-edu rows.
df_rate_control[['trackers', '10_year_change', "2012", "2021"]].head(20)