In [1]:
import sys
sys.path.append('../..')

import pandas as pd
from pandarallel import pandarallel

from helpers.utils import (
    clean_dataset,
    get_ps1_or_host
)

In [2]:
file_path = '../../output/data_raw/2024-01-24_SF_attr.pkl'

df = pd.read_pickle(file_path)

In [3]:
pandarallel.initialize()

INFO: Pandarallel will run on 16 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [4]:
# selected-only pages

file_path = "../../helpers/intersection"  

with open(file_path, "r") as file:
    intersection = file.read().splitlines()


In [5]:
# clean dataset
df = clean_dataset(df, intersection)

In [6]:
df['final_host'] = df['final_url'].parallel_apply(lambda x: get_ps1_or_host(x))

In [7]:
df.referrerpolicy.value_counts()

referrerpolicy
no-referrer                        5361
no-referrer-when-downgrade         4226
unsafe-url                         3487
origin                             3078
strict-origin-when-cross-origin    1326
same-origin                         733
origin-when-cross-origin            174
strict-origin                        23
noreferrer                            3
Name: count, dtype: int64

In [8]:
all_el_attrs = df[(df.rel == 'noreferrer')
                       | (df.referrerpolicy.notnull()) ]

In [9]:
all_el_attrs.final_host.nunique()

3670

## referrerpolicy

In [10]:
refpol_df = df.drop_duplicates(subset=['final_host', 'referrerpolicy', 'tagName'])

In [11]:
refpol_df.tagName.value_counts()

tagName
script    28243
link      27761
a         27415
img       27211
iframe    18750
area        222
Name: count, dtype: int64

In [12]:
# Websites with referrerpolicy
refpol_df[refpol_df.referrerpolicy.notna()].final_host.nunique()

2475

In [13]:
# For Table 8

refpol_counts = refpol_df.groupby(['tagName', 'referrerpolicy']).size().reset_index(name='counts')
print(refpol_counts)

   tagName                   referrerpolicy  counts
0        a                      no-referrer       4
1        a       no-referrer-when-downgrade      21
2        a                           origin      96
3        a                      same-origin       1
4        a  strict-origin-when-cross-origin       1
5        a                       unsafe-url       2
6   iframe                      no-referrer      55
7   iframe       no-referrer-when-downgrade     318
8   iframe                       noreferrer       1
9   iframe                           origin     125
10  iframe         origin-when-cross-origin       2
11  iframe                      same-origin       4
12  iframe                    strict-origin       5
13  iframe  strict-origin-when-cross-origin      10
14  iframe                       unsafe-url     537
15     img                      no-referrer      35
16     img       no-referrer-when-downgrade      48
17     img                           origin     104
18     img  

In [14]:
# Check for iframes
refpol_iframe_df = refpol_df[refpol_df.tagName == 'iframe']

In [15]:
refpol_iframe_df

Unnamed: 0,init_url,final_url,failed_visit,href,src,rel,referrerpolicy,title,text,frameUrl,tagName,isFrame,type,crossorigin,as_attr,integrity,final_host
133,https://hippocampus.org/#1_7,https://hippocampus.org/#1_7,False,,https://www.google.com/recaptcha/api2/anchor?a...,,,reCAPTCHA,,https://hippocampus.org/#1_7,iframe,False,,,,,hippocampus.org
473,https://acom.us/industries/agricultural,https://acom.us/industries/agricultural/,False,,about:blank,,,This iframe contains the logic required to han...,,https://acom.us/industries/agricultural/,iframe,False,,,,,acom.us
757,https://www.motoworld.vn/protections,https://www.motoworld.vn/protections,False,,about:blank,,,chat widget,,https://www.motoworld.vn/protections,iframe,False,,,,,motoworld.vn
1014,https://www.flashrouters.com/blog,https://blog.flashrouters.com/,False,,about:blank,,,Lucky Orange,,https://blog.flashrouters.com/,iframe,False,,,,,flashrouters.com
1176,https://www.elkon.net/products/bespoke-and-pre...,https://www.elkon.net/products/bespoke-and-pre...,False,,https://www.google.com/recaptcha/api2/anchor?a...,,,reCAPTCHA,,https://www.elkon.net/products/bespoke-and-pre...,iframe,False,,,,,elkon.net
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39147871,https://www.npu.cz/cs/pamatkova-pece,https://www.npu.cz/cs/pamatkova-pece,False,,https://w.soundcloud.com/player/?url=https%3A/...,,,,,https://www.npu.cz/cs/pamatkova-pece,iframe,False,,,,,npu.cz
39212671,http://sggs.ac.in/home/page/aicte-idea-lab-sgg...,http://sggs.ac.in/home/page/aicte-idea-lab-sgg...,False,,https://www.youtube.com/embed/5jX2OVxExGI,,,YouTube video player,,http://sggs.ac.in/home/page/aicte-idea-lab-sgg...,iframe,False,,,,,sggs.ac.in
39226924,https://www.juanmerodio.com/asesoria-web3-bloc...,https://www.juanmerodio.com/asesoria-web3-bloc...,False,,about:blank,,,"Emprendedor, Empresario… La Web3 es una realidad",,https://www.juanmerodio.com/asesoria-web3-bloc...,iframe,False,,,,,juanmerodio.com
39237797,http://komatsuzawa.co.jp/access/access.html,http://komatsuzawa.co.jp/access/access.html,False,,http://maps.google.co.jp/maps?f=q&source=s_q&h...,,,,,http://komatsuzawa.co.jp/access/access.html,iframe,False,,,,,komatsuzawa.co.jp


In [16]:
# refpol_iframe_df = refpol_iframe_df[refpol_iframe_df.src.str.startswith('http')]

In [17]:
refpol_iframe_unsafe = refpol_iframe_df[refpol_iframe_df.referrerpolicy == 'unsafe-url']
refpol_iframe_unsafe['src_host'] = refpol_iframe_unsafe['src'].parallel_apply(lambda x: get_ps1_or_host(x))
refpol_iframe_unsafe.final_host.nunique()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  refpol_iframe_unsafe['src_host'] = refpol_iframe_unsafe['src'].parallel_apply(lambda x: get_ps1_or_host(x))


537

In [18]:
iframe_unique_unsafe = refpol_iframe_unsafe.groupby('src_host')['final_host'].nunique().reset_index()
iframe_unique_unsafe

Unnamed: 0,src_host,final_host
0,,8
1,33across.com,113
2,coupang.com,2
3,kollus.com,1
4,nperf.com,1
5,pubmine.com,6
6,rubiconproject.com,405
7,videoscdn.online,1


In [19]:
refpol_iframe_nrwd = refpol_iframe_df[refpol_iframe_df.referrerpolicy == 'no-referrer-when-downgrade']
refpol_iframe_nrwd['src_host'] = refpol_iframe_nrwd['src'].parallel_apply(lambda x: get_ps1_or_host(x))
refpol_iframe_nrwd.final_host.nunique()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  refpol_iframe_nrwd['src_host'] = refpol_iframe_nrwd['src'].parallel_apply(lambda x: get_ps1_or_host(x))


318

In [20]:
iframe_unique_nrwd = refpol_iframe_nrwd.groupby('src_host')['final_host'].nunique().reset_index()
iframe_unique_nrwd

Unnamed: 0,src_host,final_host
0,,10
1,adriver.ru,1
2,autodriven.com,2
3,creativecdn.com,13
4,dailymotion.com,9
5,data,4
6,directly.com,1
7,enquirytracker.net,1
8,getinform.net,1
9,glassix.com,3


## rel

In [21]:
rel_df = df[df.rel == 'noreferrer']
rel_df.final_host.nunique()

1345

In [22]:
# Get rel link attribute
rel_df = rel_df.drop_duplicates(subset=['final_host', 'tagName'])
rel_df.shape

(1352, 17)

In [23]:
# For Table 7
rel_df[(rel_df.rel == "noreferrer" )].tagName.value_counts()

tagName
a         1342
img          7
area         2
script       1
Name: count, dtype: int64