In [1]:
# Re-importing the necessary libraries
import sys
sys.path.append('..')

import ujson as json
import pandas as pd
import glob
from os.path import basename
import re
from helpers.utils import is_failed_visit

In [2]:
def get_all_reqs(path):
    """
    Extracts request information from the JSON files in the given path.
    
    Args:
    path (str): The path to the JSON files containing the request data.
    
    Returns:
    reqDF (pd.DataFrame): A DataFrame containing the request information.
    """

    attr_list = []
    meta_list = []
    # dir_path = os.path.dirname(path)

    for json_path in glob.glob(path):
        json_name = basename(json_path)

        try:
            with open(json_path) as file:
                results = json.load(file)
        except Exception as e:
            print("ERROR: Cannot load the json", json_name, e)
            continue

        # Get visit metadata and check whether it should be processed
        try:
            init_url = results["initialUrl"]
            final_url = results["finalUrl"]

            failed_visit = is_failed_visit(results)
        except Exception as err:
            continue
        try:
            linkAttributes = results["data"]["elementAttributes"]
            for link in linkAttributes["elmAttrs"]:
                # print("--site_domain--", site_domain)
                href = link['href']
                src = link['src']
                rel = link['rel']
                referrerpolicy = link['linkReferrerPolicy']
                title = link['title']
                text = link['text']
                frameUrl = link['frameUrl']
                tagName = link['tagName']
                isFrame = link['inFrame']
                type = link['type']
                crossorigin = link['crossorigin']
                as_attr = link['as']
                integrity = link['integrity']
                link_details = (init_url, final_url ,failed_visit ,href, src, rel, referrerpolicy, 
                                title, text, frameUrl, tagName, isFrame, type, 
                                crossorigin, as_attr, integrity)
                attr_list.append(link_details)
        except Exception as err:
            print('Error elemAttrb', init_url, err)
        try:
            for meta in linkAttributes["metaRP"]:
                referrer_policy = meta["metaReferrerContent"]
                # print(referrer_policy)
                frameUrl = meta['frameUrl']
                inFrame = meta['inFrame']
                pageURL = meta['pageUrl']
                # print (pageURL)
                metadata_details = (init_url, final_url ,failed_visit ,referrer_policy, frameUrl, inFrame, pageURL)
                meta_list.append(metadata_details)
        except Exception as err:
            print('Error meta', init_url, err)
        
            
    
    attr_df = pd.DataFrame(attr_list, columns=[
        'init_url', 'final_url','failed_visit', 'href', 'src', 'rel', 
        'referrerpolicy', 'title', 'text', 
        'frameUrl', 'tagName', 'isFrame', 'type', 'crossorigin', 'as_attr', 'integrity'])
    # attrDF.to_json(saved_req_file_path.replace('requests', 'linkAttrs'), orient='records')
    
    meta_df = pd.DataFrame(meta_list, columns=[
        'init_url', 'final_url' ,'failed_visit' ,'referrer_policy', 'frameUrl', 'inFrame', 'pageURL'])


    return attr_df, meta_df


In [3]:
# folder path
# folder_path = '/data/referrer-policy/Small_scale_crawl/2025-02-08_noAct/*.json'
folder_path = '/data/referrer-policy/SF/2024-01-24_inner_collector_SF/*.json'


pattern_type = r'(\d{4}-\d{2}-\d{2})_([^/]+)'
match_type = re.search(pattern_type, folder_path)

if match_type:
    extract_date, extract_type = match_type.groups()
    if "inner_collector" in extract_type:
        extract_type = extract_type.replace("inner_collector_", "")
    print("Date:", extract_date)
    print("Type:", extract_type)
else:
    print("No match found.")


Date: 2024-01-24
Type: SF


In [4]:
df_attr, df_meta = get_all_reqs(folder_path)

Error elemAttrb https://www.vaned.com/index.cfm/fa/ce/fa2/home 'text'
Error elemAttrb http://rss.infodsi.com/ 'text'
Error elemAttrb https://www.villarddelans-correnconenvercors.com/#tabs-65a9c8a9de58d 'text'
Error elemAttrb https://clasificadosonline.com/empleos 'text'
Error elemAttrb https://masterbuilders.com.au/campaigns 'text'
Error elemAttrb https://skateboardshopsunabe.com/?mode=cate&cbid=139613&csid=2&sort=n 'text'
Error elemAttrb https://study.wearefamilyfoundation.org/ 'text'
Error elemAttrb https://apogee.net/solutions/low-and-moderate-income 'text'
Error elemAttrb https://www.myresortnetwork.com/search-vacation-rentals/index.asp 'text'
Error elemAttrb https://www.tourisme-saintomer.com/#pagesection-0 'text'
Error elemAttrb https://www.green-acres.gr/%ce%b1%ce%ba%ce%af%ce%bd%ce%b7%cf%84%ce%b1/%ce%b9%ce%bf%ce%bd%ce%b9%ce%b1-%ce%bd%ce%b7%cf%83%ce%b9%ce%b1 'text'
Error elemAttrb https://www.simplyscratch.com/feed 'text'
Error elemAttrb https://stxaviersschool.com/origin 'text'


In [5]:
df_attr.head()

Unnamed: 0,init_url,final_url,failed_visit,href,src,rel,referrerpolicy,title,text,frameUrl,tagName,isFrame,type,crossorigin,as_attr,integrity
0,https://hippocampus.org/#1_7,https://hippocampus.org/#1_7,False,/HippoCampus/style/hippoUniversal.css;jsession...,,stylesheet,,,,https://hippocampus.org/#1_7,link,False,text/css,,,
1,https://hippocampus.org/#1_7,https://hippocampus.org/#1_7,False,/HippoCampus/style/twoPanelNavigator.css;jsess...,,stylesheet,,,,https://hippocampus.org/#1_7,link,False,text/css,,,
2,https://hippocampus.org/#1_7,https://hippocampus.org/#1_7,False,/HippoCampus/style/welcomeBox.css;jsessionid=5...,,stylesheet,,,,https://hippocampus.org/#1_7,link,False,text/css,,,
3,https://hippocampus.org/#1_7,https://hippocampus.org/#1_7,False,/HippoCampus/style/collectionsMenu.css;jsessio...,,stylesheet,,,,https://hippocampus.org/#1_7,link,False,text/css,,,
4,https://hippocampus.org/#1_7,https://hippocampus.org/#1_7,False,/HippoCampus/style/standardPopup.css;jsessioni...,,stylesheet,,,,https://hippocampus.org/#1_7,link,False,text/css,,,


In [6]:
df_meta

Unnamed: 0,init_url,final_url,failed_visit,referrer_policy,frameUrl,inFrame,pageURL
0,https://souffle.life/manga/ohayou-oyasumi-mata...,https://souffle.life/manga/ohayou-oyasumi-mata...,False,origin-when-crossorigin,https://www.facebook.com/v3.2/plugins/share_bu...,True,https://souffle.life/manga/ohayou-oyasumi-mata...
1,https://souffle.life/manga/ohayou-oyasumi-mata...,https://souffle.life/manga/ohayou-oyasumi-mata...,False,origin-when-crossorigin,https://www.facebook.com/v3.2/plugins/share_bu...,True,https://souffle.life/manga/ohayou-oyasumi-mata...
2,https://www.1800tequila.com/our-story,https://www.1800tequila.com/our-story/,False,origin,https://pay.google.com/gp/p/ui/payframe?origin...,True,https://www.1800tequila.com/our-story/
3,https://xxxboard.net/member.php?30-flash&s=bfa...,https://xxxboard.net/member.php?30-flash&s=bfa...,False,unsafe-url,https://xxxboard.net/member.php?30-flash&s=bfa...,False,https://xxxboard.net/member.php?30-flash&s=bfa...
4,https://uiowa.edu/academics/areas-study,https://uiowa.edu/academics/areas-study,False,no-referrer-when-downgrade,https://uiowa.edu/academics/areas-study,False,https://uiowa.edu/academics/areas-study
...,...,...,...,...,...,...,...
16219,https://www.gourmetgiftbaskets.com/corporate-g...,https://www.gourmetgiftbaskets.com/corporate-g...,False,origin,https://www.google.com/shopping/customerreview...,True,https://www.gourmetgiftbaskets.com/corporate-g...
16220,https://www.techfinehelp.in/2022/05/instagram-...,https://www.techfinehelp.in/2022/05/instagram-...,False,origin,https://www.blogger.com/comment/frame/67039747...,True,https://www.techfinehelp.in/2022/05/instagram-...
16221,http://povarixa.ru/world-cook.html,http://povarixa.ru/world-cook.html,False,origin,http://povarixa.ru/world-cook.html,False,http://povarixa.ru/world-cook.html
16222,https://www.ruralheritage.com/new_rh_website/c...,https://www.ruralheritage.com/new_rh_website/c...,False,origin-when-crossorigin,https://www.facebook.com/plugins/like.php?href...,True,https://www.ruralheritage.com/new_rh_website/c...


In [7]:
df_attr.to_pickle("../output/data_raw/"+ extract_date + '_' + extract_type +'_attr.pkl')

In [8]:
df_meta.to_parquet("../output/data_raw/"+ extract_date + '_' + extract_type +'_meta.parquet')