In [None]:
import os
import json
from pathlib import Path
import csv

In [26]:
from pathlib import Path
from transformers import MarkupLMFeatureExtractor
import csv


def load_website_labels(dataset_dir, website_name, attributes):
    """Read the ground-truth annotations of one website, keyed by page id.

    For every attribute, the tab-separated file
    ``<dataset_dir>/camera_labels/camera-<website_name>-<attribute>.txt`` is
    parsed. The first two rows of each file are skipped (treated as headers).
    In every remaining row, column 0 is used as the page id and column 2 as
    the annotated value.

    Args:
        dataset_dir: Root directory of the dataset (str or path-like).
        website_name: Website identifier used in the label file names.
        attributes: Iterable of attribute names to load.

    Returns:
        dict: page id -> {attribute: annotated value}. A page only gets an
        entry for an attribute if its row in that file has at least three
        columns.

    Raises:
        OSError: If a label file cannot be opened.
    """
    labels = {}
    labels_path = Path(dataset_dir) / "camera_labels"

    for attribute in attributes:
        filepath = labels_path / f"camera-{website_name}-{attribute}.txt"
        with open(filepath, newline='') as csvfile:
            reader = csv.reader(csvfile, delimiter='\t', quotechar='|')
            # Skip the two header rows without loading the whole file in memory.
            next(reader, None)
            next(reader, None)
            for row in reader:
                # Blank or truncated lines have no page id / value columns;
                # skip them instead of failing with an IndexError.
                if len(row) < 3:
                    continue
                labels.setdefault(row[0], {})[attribute] = row[2]
    return labels

def load_website_data(dataset_dir, website_name, page_index):
    """Return the raw HTML text of a single page of a website.

    The page is read from
    ``<dataset_dir>/camera_pages/camera-<website_name>/<page_index>.htm``
    in text mode with the platform's default encoding.

    Args:
        dataset_dir: Root directory of the dataset (str or path-like).
        website_name: Website identifier used in the directory name.
        page_index: Page identifier used as the file stem.

    Returns:
        str: The full contents of the HTML file.
    """
    site_dir = Path(dataset_dir) / "camera_pages" / f"camera-{website_name}"
    page_file = site_dir / f"{page_index}.htm"
    return page_file.read_text()


def load_camera_dataset(dataset_dir):
    """Build a node-labelled dataset for every website under ``camera_pages``.

    Each sub-directory of ``<dataset_dir>/camera_pages`` is treated as one
    website; its name minus the first seven characters (the ``camera-``
    prefix used by the loaders) is the website name. For every annotated page
    the HTML is passed through ``MarkupLMFeatureExtractor`` and each extracted
    node receives the id of the first attribute (model, price, manufacturer,
    in that order) whose annotation equals the node text exactly, or the id
    of ``"other"`` when none matches.

    Args:
        dataset_dir: Root directory holding ``camera_pages`` and
            ``camera_labels``.

    Returns:
        dict: website name -> list of per-page dicts with keys ``'nodes'``
        and ``'xpaths'`` (taken from the extractor output) and
        ``'node_labels'`` (a one-element list holding the list of label ids).
    """
    attributes = ["model", "price", "manufacturer"]
    id2label = {0: "model", 1: "price", 2: "manufacturer", 3: "other"}
    label2id = {name: idx for idx, name in id2label.items()}
    pages_root = Path(dataset_dir) / "camera_pages"
    website_names = [entry.name[7:] for entry in pages_root.iterdir() if entry.is_dir()]
    feature_extractor = MarkupLMFeatureExtractor()

    dataset = {}
    for website in website_names:
        print(website)
        true_labels = load_website_labels(dataset_dir, website, attributes)

        data = []
        for page_index, annotations in true_labels.items():
            raw_html = load_website_data(dataset_dir, website, page_index)
            encoding = feature_extractor(raw_html)

            # First attribute (in priority order) whose annotation matches the
            # node text wins; the lookup is lazy, so later attributes are only
            # consulted when earlier ones did not match.
            page_labels = [
                label2id[
                    next(
                        (name for name in attributes if node_text == annotations[name]),
                        "other",
                    )
                ]
                for node_text in encoding['nodes'][0]
            ]

            data.append({
                'nodes': encoding['nodes'],
                'xpaths': encoding['xpaths'],
                'node_labels': [page_labels]
            })

        dataset[website] = data
    return dataset

In [27]:
# Build the labelled dataset for every website found under the dataset root.
dataset = load_camera_dataset(dataset_dir="/home/savkin/vera/to_server/camera_dataset")

buy
beachaudio
amazon
pcnation
thenerds
onsale
compsource
ecost
jr
newegg


In [28]:
# Inspect the first page record stored for the "buy" website.
dataset["buy"][0]

{'nodes': [['document.domain = "buy.com";',
   '//<![CDATA[\n\tif ((/[?&]ic=1/i).test(window.location.search))\n\t\tdocument.cookie = "classicsticky=1; path=/; domain=buy.com;";\n\n\tif (!(/classicsticky=1/).test(document.cookie)) {\n\t\tvar uaCheck = navigator.userAgent.toLowerCase();\n\t\tvar pgCheck = location.href.toLowerCase();\n\n\t\tif ((uaCheck.indexOf("iphone") > -1) || (uaCheck.indexOf("ipod") > -1) || (uaCheck.indexOf("android") > -1) || (uaCheck.indexOf("blackberry") > -1) || (uaCheck.indexOf("midp") > -1) || (uaCheck.indexOf("cldc") > -1) || (uaCheck.indexOf("windows ce") > -1)) {\n\t\t\tif (pgCheck.indexOf("usersearchresults") > -1 || pgCheck.indexOf("/umerch/") > -1) {\n\t\t\t\tif (pgCheck.indexOf("qu=") > -1) {\n\t\t\t\t    window.location = \'http://mobile.buy.com/ibuy/SearchProducts.aspx?pg=0&s=\';\n\t\t\t\t} else {\n\t\t\t\t\twindow.location = \'http://mobile.buy.com/ibuy/\';\n\t\t\t\t}\n\t\t\t} else if (pgCheck.indexOf("/prod/") > -1 || pgCheck.indexOf("product.asp"

In [29]:
import json

# Serialize the whole `dataset` dict to a JSON file, pretty-printed with a
# 4-space indent.
with open("/home/savkin/vera/camera_dataset.json", "w") as file:
    json.dump(dataset, file, indent=4)

In [30]:
# Read the saved file back. The parsed value is not bound to a name, so this
# cell only checks that the file can be opened and parsed as JSON.
with open("/home/savkin/vera/camera_dataset.json", "r") as file:
    json.load(file)

In [42]:
import numpy as np 

def sample_websites(dataset, k, seed):
    """Split the dataset into train/validation pages by sampling whole websites.

    ``k`` websites are drawn without replacement; all pages of the drawn
    websites go to the training list and all pages of the remaining websites
    go to the validation list. Pages keep the website order of ``dataset``.

    Args:
        dataset: Mapping of website name -> list of page records.
        k: Number of websites to use for training.
        seed: Seed for the random generator; the same seed always yields the
            same split. ``None`` draws fresh entropy on every call.

    Returns:
        tuple: ``(train, valid)`` lists of page records.

    Raises:
        ValueError: If ``k`` exceeds the number of websites (raised by numpy).
    """
    all_websites = list(dataset.keys())

    # A generator seeded per call makes the split reproducible and leaves
    # numpy's global random state untouched.
    rng = np.random.default_rng(seed)
    # Sample positions rather than the names themselves so the keys are never
    # coerced into numpy scalar types.
    chosen = rng.choice(len(all_websites), size=k, replace=False)
    training_websites = {all_websites[i] for i in chosen}

    train = []
    valid = []
    for website in all_websites:
        if website in training_websites:
            train.extend(dataset[website])
        else:
            valid.extend(dataset[website])

    return train, valid

In [43]:
# import json

# for seed in range(10):
#     for k in [1,2,3,4,5]:
#         train, valid = sample_websites(dataset, k=k, seed=seed)
        
#         save_path = Path("camera_dataset") / f"k_{k}" /f"seed_{seed}"
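#         # TODO: unfinished sketch - create save_path's parent directories and decide what to dump.
#         with open(save_path, "w") as file:  # "w": the original "s" is not a valid open() mode
#             json.dump(..., file)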

valid
valid
valid
valid
train
valid
valid
valid
valid
valid
309 4949
train
valid
valid
train
valid
valid
valid
valid
valid
valid
734 4524
valid
valid
train
train
valid
valid
valid
valid
train
valid
2368 2890
valid
train
train
valid
valid
valid
valid
valid
train
train
2601 2657
train
valid
train
valid
train
train
valid
train
valid
valid
3760 1498
