# Circumvention

Check RP circumvention on three vectors: referrer (`ref`), request URL (`url`), and POST data (`post`).

In [1]:
import sys
sys.path.append('..')

import pandas as pd
from pandarallel import pandarallel
import os

from helpers.utils import (
    check_urlparse,
    check_circumvention,
    validate_referrer,
    create_summary,
    mark_fp_rows,
    clean_column_detail
)

In [2]:
import re

# Input selection: exactly one `folder_path` should be active at a time.
#folder_path = '../output/data_leaks/2025-02-08_noAct_leaks.parquet'
# folder_path = '../output/data_leaks/2024-01-24_SF_leaks.parquet'
# folder_path = '../output/data_leaks/2024-01-24_SG_leaks.parquet'
folder_path = '../output/data_leaks/2024-01-24_AMS_leaks.parquet'

# parquet_file = './output/2025-02-08_noAct_extended_leaks_raw.parquet'
# parquet_file = './output/2025-02-08_optIn_extended_leaks_raw.parquet'
# parquet_file = './output/2025-02-08_optOut_extended_leaks_raw.parquet'

# The "<YYYY-MM-DD>_<type>" prefix of the file name is reused in the last cell
# to name the output file. Validate it before loading anything, so a badly
# named input fails here with a clear message instead of as a NameError after
# all the processing has already run.
match = re.search(r'(\d{4}-\d{2}-\d{2})_([^_]+)', folder_path)
if match is None:
    raise ValueError(
        "Cannot extract '<YYYY-MM-DD>_<type>' from input path: " + repr(folder_path)
    )
extract_date, extract_type = match.groups()

df = pd.read_parquet(folder_path)

print("Date:", extract_date)
print("Type:", extract_type)

Date: 2024-01-24
Type: AMS


In [3]:
# Initialise pandarallel before the `df.parallel_apply(...)` calls in the cells below.
pandarallel.initialize()

INFO: Pandarallel will run on 16 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [4]:
# (rows, columns) of the loaded table, before the cleaning step.
df.shape

(6433459, 22)

In [5]:
# Cleaning the data: keep a row only if
#   * the visit did not fail,
#   * the request URL starts with 'http',
#   * the final URL is not the "about:blank" placeholder.
# `== False` is kept on purpose (element-wise comparison; missing values do not
# match). `na=False` makes a missing req_url count as "does not start with
# 'http'" instead of producing an NA mask entry, which boolean indexing rejects.
visit_succeeded = df['failed_visit'] == False  # noqa: E712
is_http_request = df['req_url'].str.startswith('http', na=False)
has_real_final_url = df['final_url'] != "about:blank"

# One combined mask avoids materialising two intermediate filtered copies.
df = df[visit_succeeded & is_http_request & has_real_final_url]

In [6]:
# (rows, columns) remaining after the cleaning step.
df.shape

(6336437, 22)

In [7]:
# Referrer vector, step 1: run check_urlparse on every row and store the eight
# values it returns as separate `ref_*` URL-component columns.
ref_component_cols = [
    'ref_final_found', 'ref_full', 'ref_netloc', 'ref_hostname',
    'ref_path', 'ref_params', 'ref_query', 'ref_fragments',
]
df[ref_component_cols] = df.parallel_apply(
    lambda record: check_urlparse(record, "ref"),
    axis=1,
    result_type="expand",
)


In [8]:
# Referrer vector, step 2: run the circumvention check row by row; the two
# returned values become the `ref_flag` and `ref_frag_found` columns.
ref_flag_cols = ['ref_flag', 'ref_frag_found']
df[ref_flag_cols] = df.parallel_apply(
    lambda record: check_circumvention(record, "ref"),
    axis=1,
    result_type='expand',
)

In [9]:
# Referrer vector, step 3: handle false positives via validate_referrer.
# NOTE(review): the url/post vectors use mark_fp_rows(df, <vector>) for this step;
# whether validate_referrer drops rows or only flags them is defined in
# helpers.utils — confirm before comparing counts across vectors.
df = validate_referrer(df)

In [10]:
# Referrer vector, step 4: build the per-row summary column.
df['ref_summary'] = df.parallel_apply(
    lambda record: create_summary(record, 'ref'),
    axis=1,
)

In [11]:
# ---- Request URL vector ----

# Step 1: run check_urlparse on every row and store the eight values it
# returns as separate `url_*` URL-component columns.
url_component_cols = [
    'url_final_found', 'url_full', 'url_netloc', 'url_hostname',
    'url_path', 'url_params', 'url_query', 'url_fragments',
]
df[url_component_cols] = df.parallel_apply(
    lambda record: check_urlparse(record, "url"),
    axis=1,
    result_type="expand",
)

# Step 2: circumvention check; results go to `url_flag` / `url_frag_found`.
url_flag_cols = ['url_flag', 'url_frag_found']
df[url_flag_cols] = df.parallel_apply(
    lambda record: check_circumvention(record, "url"),
    axis=1,
    result_type='expand',
)

# Step 3: handle false positives for this vector.
df = mark_fp_rows(df, "url")

# Step 4: build the per-row summary column.
df['url_summary'] = df.parallel_apply(
    lambda record: create_summary(record, 'url'),
    axis=1,
)


In [12]:
# ---- Post vector ----

# Step 1: run check_urlparse on every row and store the eight values it
# returns as separate `post_*` URL-component columns.
post_component_cols = [
    'post_final_found', 'post_full', 'post_netloc', 'post_hostname',
    'post_path', 'post_params', 'post_query', 'post_fragments',
]
df[post_component_cols] = df.parallel_apply(
    lambda record: check_urlparse(record, "post"),
    axis=1,
    result_type="expand",
)

# Step 2: circumvention check; results go to `post_flag` / `post_frag_found`.
post_flag_cols = ['post_flag', 'post_frag_found']
df[post_flag_cols] = df.parallel_apply(
    lambda record: check_circumvention(record, "post"),
    axis=1,
    result_type='expand',
)

# Step 3: handle false positives for this vector.
df = mark_fp_rows(df, "post")

# Step 4: build the per-row summary column.
df['post_summary'] = df.parallel_apply(
    lambda record: create_summary(record, 'post'),
    axis=1,
)


In [13]:
# df = clean_column_detail(df)

In [14]:
# Create the output directory if needed; exist_ok=True keeps re-runs from failing
# when it already exists.
os.makedirs("../output/circum", exist_ok=True)

In [15]:
# Write the annotated table as "<date>_<type>_circum.parquet", reusing the date
# and type extracted from the input file name.
df.to_parquet(f"../output/circum/{extract_date}_{extract_type}_circum.parquet")