In [1]:
import glob
import re
from collections import Counter
import json
from subprocess import Popen, PIPE
import os
import plotly.graph_objects as go

In [None]:
# zlint_result_path = "/mnt/6TB/dockerMLcerts/attached_dir/zlintResults"

# zlint_results = {}
# for f in reversed(glob.glob(zlint_result_path+"/*")):
    
# #     if "zmapOriginalCerts" not in f:
# #         continue
    
#     zlint_results[f] = {}

#     print(f)
        
#     for ff in glob.glob(f + "/*"):
#         zlint_results[f][ff] = []
        
#         with open(ff, "r") as fileO:
#             for l in fileO.readlines():
#                 for x in l.split("time="):
#                     if x.startswith('{"raw":'):
#                         zlint_results[f][ff].append(x)   


import glob
import os
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm
import time

# Root directory holding one sub-directory of zlint output per dataset.
zlint_result_path = "/mnt/6TB/dockerMLcerts/attached_dir/zlintResults"

# Filled in later: dataset directory -> {result file path -> list of raw lines}.
zlint_results = {}

# Every entry directly under the root, in the reverse of glob's listing order.
top_level_dirs = glob.glob(zlint_result_path + "/*")[::-1]

def process_file(file_path):
    """Extract the JSON-looking fragments from one zlint result file.

    Each line of the file is split on the literal text ``time=``; every
    fragment that begins with ``{"raw":`` is kept verbatim (including any
    trailing whitespace or newline).

    Args:
        file_path: Path of the file to read.

    Returns:
        A ``(file_path, fragments)`` tuple. If reading fails, the failure is
        printed and whatever fragments were gathered so far are returned.
    """
    fragments = []
    try:
        with open(file_path, "r") as handle:
            for raw_line in handle:
                fragments.extend(
                    piece
                    for piece in raw_line.split("time=")
                    if piece.startswith('{"raw":')
                )
    except Exception as e:
        # Best effort: report the problem and keep going with partial data.
        print(f"Failed to process {file_path}: {e}")
    return file_path, fragments

def process_dir(f):
    """Run ``process_file`` over every entry directly inside directory ``f``.

    The files are handled by a thread pool, with a tqdm progress bar labelled
    with the directory's base name.

    Args:
        f: Path of the directory whose entries should be processed.

    Returns:
        A ``(f, per_file)`` tuple where ``per_file`` maps each file path to the
        list returned for it by ``process_file``.
    """
    paths = glob.glob(f + "/*")
    per_file = {}
    with ThreadPoolExecutor() as pool:
        pending = {pool.submit(process_file, path): path for path in paths}
        progress = tqdm(as_completed(pending), total=len(paths), desc=os.path.basename(f))
        for finished in progress:
            path, fragments = finished.result()
            per_file[path] = fragments
    return f, per_file

# Process all dataset directories concurrently. Each finished directory is
# stored in zlint_results under its path, in completion order.
with ThreadPoolExecutor() as executor:
    futures = {}
    for f in top_level_dirs:
        futures[executor.submit(process_dir, f)] = f
    for future in tqdm(as_completed(futures), total=len(top_level_dirs), desc="Overall Progress"):
        f, results = future.result()
        zlint_results.update({f: results})

Overall Progress:   0%|                                                              | 0/4 [00:00<?, ?it/s]
transcert-30k:   0%|                                                             | 0/30000 [00:00<?, ?it/s][A
transcert-30k:   4%|█▋                                             | 1064/30000 [00:00<00:02, 10531.02it/s][A
transcert-30k:   7%|███▍                                            | 2118/30000 [00:00<00:03, 7035.78it/s][A
transcert-30k:  10%|████▌                                           | 2887/30000 [00:00<00:04, 6690.46it/s][A
transcert-30k:  12%|█████▋                                          | 3588/30000 [00:00<00:04, 6422.41it/s][A
transcert-30k:  14%|██████▊                                         | 4247/30000 [00:00<00:04, 6374.94it/s][A
transcert-30k:  16%|███████▊                                        | 4895/30000 [00:00<00:04, 6144.54it/s][A
transcert-30k:  18%|████████▊                                       | 5515/30000 [00:00<00:04, 6109.41it/s][A
tran

In [3]:
# One line per dataset: its directory path and the number of result files loaded.
for ds, files_in_dataset in zlint_results.items():
    print(ds, len(files_in_dataset))

/mnt/6TB/dockerMLcerts/attached_dir/zlintResults/transcert-30k 30000
/mnt/6TB/dockerMLcerts/attached_dir/zlintResults/zmapOriginalCerts 100000
/mnt/6TB/dockerMLcerts/attached_dir/zlintResults/zmap-data-1024-3-0.0002lr-0.1dropout-epoch3-step300000T1.5 591466
/mnt/6TB/dockerMLcerts/attached_dir/zlintResults/frankencerts-8M 8000000


In [6]:
import collections
import json
import random

# For each dataset, collect the names of all zlint lints whose result was
# 'error' in the sampled result line of at least one file.
errorsPresent = {}
for dataset in zlint_results:
    # Distinct "number of result lines per file" values seen in this dataset
    # (printed below as a diagnostic).
    cert_info_lines = set()
    print(dataset, len(zlint_results[dataset]))
    errorsPresent[dataset] = set()

    for flines in zlint_results[dataset]:
        file_lines = zlint_results[dataset][flines]
        cert_info_lines.add(len(file_lines))

        # Files with no result line contribute nothing.
        if not file_lines:
            continue

        ### Frankencerts might have a longer chain, need to do a fair analysis
        # Hence exactly one result line is drawn uniformly at random per file.
        line = random.choice(file_lines)

        try:
            cert_result = json.loads(line)
        except json.JSONDecodeError as e:
            # Skip this file. Without the `continue`, the code below would
            # read the previously parsed result (or fail with NameError on
            # the very first file) and misattribute its lints.
            print("ERROR", flines, e)
            continue

        zlint_report = cert_result['zlint']
        if zlint_report['errors_present']:
            for lint, outcome in zlint_report['lints'].items():
                if outcome['result'] == 'error':
                    errorsPresent[dataset].add(lint)
    print(cert_info_lines)

/mnt/6TB/dockerMLcerts/attached_dir/zlintResults/transcert-30k 30000
{0, 1, 2, 3, 4}
/mnt/6TB/dockerMLcerts/attached_dir/zlintResults/zmapOriginalCerts 100000
{0, 1}
/mnt/6TB/dockerMLcerts/attached_dir/zlintResults/zmap-data-1024-3-0.0002lr-0.1dropout-epoch3-step300000T1.5 591466
{0, 1}
/mnt/6TB/dockerMLcerts/attached_dir/zlintResults/frankencerts-8M 8000000
{0, 1, 2}


In [None]:
from matplotlib_venn import venn3
from matplotlib_venn import venn2
import matplotlib.pyplot as plt

# Set the DPI before creating the figure: a figure takes its default dpi from
# rcParams at creation time, so setting it afterwards does not affect it.
plt.rcParams['figure.dpi'] = 300
plt.figure(figsize=(5,5))

# Number of distinct error lints per dataset.
for d in errorsPresent:
    print(d, len(set(errorsPresent[d])))

errorSetML = set(errorsPresent[zlint_result_path + "/zmap-data-1024-3-0.0002lr-0.1dropout-epoch3-step300000T1.5"])
errorSetFranken = set(errorsPresent[zlint_result_path + "/frankencerts-8M"])
errorSetTran = set(errorsPresent[zlint_result_path + "/transcert-30k"])

# Derive the counts in the labels from the sets themselves so the legend can
# never disagree with the plotted data (the sets depend on a random sample).
venn3(
    [errorSetML, errorSetFranken, errorSetTran],
    (
        f'MLcerts (errors = {len(errorSetML)})',
        f'Frankencerts (errors = {len(errorSetFranken)})',
        f'Transcert (errors = {len(errorSetTran)})',
    ),
)
#venn2([errorSetML, errorSetFranken], ('MLcerts (146)', 'Frankencerts (118)'))

print(errorSetFranken)
print(errorSetTran)

#plt.title('Unique zlint errors triggered by different datasets')
# Make sure the output directory exists; savefig does not create it.
os.makedirs('./Figures', exist_ok=True)
plt.savefig('./Figures/zlintErrorsVenn.pdf', bbox_inches='tight')
#plt.show()

In [None]:
# Number of error lints seen for the ML-generated certs but not for Frankencerts.
print(len(errorSetML - errorSetFranken))