In [1]:
import sys
sys.path.append('..')

import os
import re
import glob
import ujson as json
import pandas as pd
from os.path import basename
from pandarallel import pandarallel

from helpers.utils import (
    get_response_referrer_policy, 
    is_failed_visit,
    get_ps1_or_host,
    match_entity,
    is_same_entity
)


## Open Folder and extract JSON file

In [2]:
# Glob pattern selecting the crawl result files to process.
# Un-comment exactly one folder_path to switch to another crawl.
# folder_path = '/data/referrer-policy/Small_scale_crawl/2025-02-08_noAct/*.json'
# folder_path = '/data/referrer-policy/SF/2024-01-24_inner_collector_SF/*.json'
# folder_path = '/data/referrer-policy/AMS/2024-01-24_inner_collector_AMS/*.json'
folder_path = '/data/referrer-policy/SG/2024-01-24_inner_collector_SG/*.json'

# The crawl folder is named "<YYYY-MM-DD>_<type>"; capture both parts.
pattern_type = r'(\d{4}-\d{2}-\d{2})_([^/]+)'
match_type = re.search(pattern_type, folder_path)

if match_type is None:
    # Fail fast: extract_date / extract_type name the parquet export later on.
    # Continuing here would either hit a NameError at export time or, worse,
    # silently reuse stale values left in the kernel by a previous run.
    raise ValueError(
        "No match found: cannot parse '<YYYY-MM-DD>_<type>' from " + repr(folder_path)
    )

extract_date, extract_type = match_type.groups()
# Drop the collector prefix so that e.g. "inner_collector_SG" becomes "SG";
# str.replace leaves the value untouched when the prefix is absent.
extract_type = extract_type.replace("inner_collector_", "")
print("Date:", extract_date)
print("Type:", extract_type)


Date: 2024-01-24
Type: SG


In [3]:
# Column order of the extracted table: one row per captured request.
REQUEST_COLUMNS = [
    "init_url",
    "final_url",
    "failed_visit",
    "req_url",
    "req_type",
    "req_method",
    "size",
    "status",
    "ref_pol",
    "referrer",
    "post_data",
    "response_ref_policy",
    "initiators",
]

extracted_list = []

for json_path in glob.glob(folder_path):
    json_name = basename(json_path)

    try:
        # utf-8-sig strips a leading byte order mark (BOM) if the file has one
        with open(json_path, encoding='utf-8-sig') as file:
            results = json.load(file)
    except Exception as e:
        print("ERROR: Cannot load the json", json_name, e)
        continue

    try:
        init_url = results["initialUrl"]
        final_url = results["finalUrl"]
    except (KeyError, TypeError) as e:
        print("ERROR: Cannot find the url", json_name, e)
        continue

    # Reported separately from the URL lookup: failures here (e.g. a missing
    # 'status' field) are not URL problems and used to be logged as such.
    try:
        failed_visit = is_failed_visit(results)
    except Exception as e:
        print("ERROR: Cannot determine the visit status", json_name, e)
        continue

    # A result file without a request list is skipped instead of aborting the
    # whole extraction run.
    try:
        results_data = results["data"]
        page_requests = results_data["requests"]
    except (KeyError, TypeError) as e:
        print("ERROR: Cannot find the requests", json_name, e)
        continue

    for req in page_requests:
        # 'url' and 'type' are required; every other field is optional and
        # becomes None when absent.
        req_url = req["url"]
        req_type = req["type"].lower()
        # Treat a null/absent header map as empty so .get() cannot fail.
        request_headers = req.get("requestHeaders") or {}

        # Values must stay in REQUEST_COLUMNS order.
        extracted_list.append([
            init_url,
            final_url,
            failed_visit,
            req_url,
            req_type,
            req.get("method"),
            req.get("size"),
            req.get("status"),
            req.get("reqReferrerPolicy"),
            request_headers.get("referer"),
            req.get("postData"),
            get_response_referrer_policy(req),
            req.get("initiators"),
        ])

extractedDF = pd.DataFrame(extracted_list, columns=REQUEST_COLUMNS)
extractedDF.head()


ERROR: Cannot find the url sklepanwen.pl_1226.json 'status'
ERROR: Cannot find the url transparentdata.pl_2e89.json 'status'
ERROR: Cannot find the url www.bestinjurylawyerusa.com_74ce.json 'status'
ERROR: Cannot find the url transparentdata.pl_f889.json 'status'
ERROR: Cannot find the url transparentdata.pl_b63c.json 'status'
ERROR: Cannot find the url nombradas.report.cl_114d.json 'status'


Unnamed: 0,init_url,final_url,failed_visit,req_url,req_type,req_method,size,status,ref_pol,referrer,post_data,response_ref_policy,initiators
0,https://hippocampus.org/#1_7,https://hippocampus.org/#1_7,False,https://hippocampus.org/,document,GET,21821.0,200.0,strict-origin-when-cross-origin,,,,[]
1,https://hippocampus.org/#1_7,https://hippocampus.org/#1_7,False,https://hippocampus.org/HippoCampus/style/hipp...,stylesheet,GET,939.0,200.0,strict-origin-when-cross-origin,https://hippocampus.org/,,,[https://hippocampus.org/]
2,https://hippocampus.org/#1_7,https://hippocampus.org/#1_7,False,https://hippocampus.org/HippoCampus/style/twoP...,stylesheet,GET,1126.0,200.0,strict-origin-when-cross-origin,https://hippocampus.org/,,,[https://hippocampus.org/]
3,https://hippocampus.org/#1_7,https://hippocampus.org/#1_7,False,https://hippocampus.org/HippoCampus/style/welc...,stylesheet,GET,781.0,200.0,strict-origin-when-cross-origin,https://hippocampus.org/,,,[https://hippocampus.org/]
4,https://hippocampus.org/#1_7,https://hippocampus.org/#1_7,False,https://hippocampus.org/HippoCampus/style/coll...,stylesheet,GET,778.0,200.0,strict-origin-when-cross-origin,https://hippocampus.org/,,,[https://hippocampus.org/]


In [4]:
# Make sure the folder for the raw parquet export exists.
# os.makedirs also creates the parent "../output" when needed, and
# exist_ok=True makes re-running this cell safe.
os.makedirs("../output/data_raw", exist_ok=True)

In [5]:
# Preview the first rows of the extracted request table before the host/entity
# columns are derived from it.
extractedDF.head()

Unnamed: 0,init_url,final_url,failed_visit,req_url,req_type,req_method,size,status,ref_pol,referrer,post_data,response_ref_policy,initiators
0,https://hippocampus.org/#1_7,https://hippocampus.org/#1_7,False,https://hippocampus.org/,document,GET,21821.0,200.0,strict-origin-when-cross-origin,,,,[]
1,https://hippocampus.org/#1_7,https://hippocampus.org/#1_7,False,https://hippocampus.org/HippoCampus/style/hipp...,stylesheet,GET,939.0,200.0,strict-origin-when-cross-origin,https://hippocampus.org/,,,[https://hippocampus.org/]
2,https://hippocampus.org/#1_7,https://hippocampus.org/#1_7,False,https://hippocampus.org/HippoCampus/style/twoP...,stylesheet,GET,1126.0,200.0,strict-origin-when-cross-origin,https://hippocampus.org/,,,[https://hippocampus.org/]
3,https://hippocampus.org/#1_7,https://hippocampus.org/#1_7,False,https://hippocampus.org/HippoCampus/style/welc...,stylesheet,GET,781.0,200.0,strict-origin-when-cross-origin,https://hippocampus.org/,,,[https://hippocampus.org/]
4,https://hippocampus.org/#1_7,https://hippocampus.org/#1_7,False,https://hippocampus.org/HippoCampus/style/coll...,stylesheet,GET,778.0,200.0,strict-origin-when-cross-origin,https://hippocampus.org/,,,[https://hippocampus.org/]


In [6]:
# Initialise pandarallel; this must run before any of the parallel_apply calls
# used further down.
pandarallel.initialize()

INFO: Pandarallel will run on 16 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [7]:
# Load the domain map that is handed to match_entity as its lookup table.
# NOTE(review): presumably maps domains to their owning entity; confirm the
# schema against helpers/domain_map.json.
df_entity = pd.read_json('../helpers/domain_map.json')

In [8]:
# Derive host/entity columns for the visited page (final_url) and for each
# request (req_url), then flag whether both resolve to the same entity.
# Each helper reads only one or two columns, so it is applied to that
# Series / sub-frame instead of to whole rows: the workers then receive only
# the data they need rather than every column of the table.
# NOTE(review): helper semantics live in helpers.utils; presumably
# get_ps1_or_host maps a URL to its PS+1 (or host) and match_entity looks that
# host up in df_entity. Confirm there.
extractedDF['final_host'] = extractedDF['final_url'].parallel_apply(
    lambda url: get_ps1_or_host(url))
extractedDF['final_entity'] = extractedDF['final_host'].parallel_apply(
    lambda host: match_entity(df_entity, host))
extractedDF['req_host'] = extractedDF['req_url'].parallel_apply(
    lambda url: get_ps1_or_host(url))
extractedDF['req_entity'] = extractedDF['req_host'].parallel_apply(
    lambda host: match_entity(df_entity, host))
extractedDF['is_same_entity_finReq'] = extractedDF[['final_entity', 'req_entity']].parallel_apply(
    lambda row: is_same_entity(row['final_entity'], row['req_entity']), axis=1)

In [9]:
# Persist the enriched request table as parquet; the file name carries the
# crawl date and crawl type parsed from folder_path.
raw_parquet_path = f"../output/data_raw/{extract_date}_{extract_type}_raw.parquet"
extractedDF.to_parquet(raw_parquet_path)