In [1]:
import os, sys
import json
import re
import time
import pandas as pd
from multiprocessing import Pool
from tqdm import tqdm

import tfidf_searching
from clean_text import cleaned_text

In [3]:
with open('/home/chentianyu/dependency/dataset/train_reduced.json', 'r') as f:
    trains = json.load(f)
with open('/home/chentianyu/dependency/dataset/test_reduced.json', 'r') as f:
    tests = json.load(f)

In [4]:
oskg_folder = '/home/chentianyu/dependency/oskg/'
node_path = os.path.join(oskg_folder, 'oskg_node2os_20231120163318.csv')
edge_path = os.path.join(oskg_folder, 'oskg_edges_20231120163318.csv')
pros_path = os.path.join(oskg_folder, 'oskg_pros_20231120163318.csv')

nodes = pd.read_csv(node_path, delimiter='\t', header=None)
edges = pd.read_csv(edge_path, delimiter='\t', header=None)
pros = pd.read_csv(pros_path, delimiter='\t')

  pros = pd.read_csv(pros_path, delimiter='\t')


In [144]:
pros = pros[pros.apply(lambda x: type(x['name']) == str, axis=1)]
pros_names = set([name.lower() for name in pros.name.to_list() if type(name) == str])

In [129]:
def get_c_artifact(lib):
    complete_name = ':'.join(lib.split(':')[1:])
    artifact = complete_name.split('/')[-1]
    return artifact.lower()

In [33]:
len(trains), len(tests)

(2062, 230)

In [130]:
vulns = trains + tests
vuln_labels = [get_c_artifact(vuln['label']) for vuln in vulns]

In [145]:
sum([lib in pros_names for lib in vuln_labels[:-230]]),\
sum([lib in pros_names for lib in vuln_labels[-230:]]), len(vulns)

(1302, 142, 2292)

In [131]:
pros.columns

Index(['index_id', 'type', 'pkgKey', 'pkgId', 'name', 'arch', 'version',
       'epoch', 'release', 'summary', 'description', 'url', 'time_file',
       'time_build', 'rpm_license', 'rpm_vendor', 'rpm_group', 'rpm_buildhost',
       'rpm_sourcerpm', 'rpm_header_start', 'rpm_header_end', 'rpm_packager',
       'size_package', 'size_installed', 'size_archive', 'location_href',
       'location_base', 'checksum_type', 'src_name', 'src_version'],
      dtype='object')

In [123]:
name_weight = 4
pros_corpus = pros.drop_duplicates('name')[['name', 'summary']]
pros_corpus.columns = ['object', 'token']
pros_corpus.object = pros_corpus.object.apply(lambda x: x.lower())
pros_corpus.token = pros_corpus.token.apply(lambda x: x if type(x) == str else ' ')
pros_mapping = pros_corpus.set_index('object').to_dict()
pros_corpus.token = pros_corpus.apply(\
                    lambda x: f"{x['object'] * name_weight} {x['token']}", axis=1)

pros_corpus.token = pros_corpus.token.apply(lambda x: cleaned_text(x))
pros_corpus.token = pros_corpus.token.apply(lambda x: ' '.join(x))

  1%|▏         | 30/2292 [21:21<26:50:03, 42.71s/it]


In [153]:
def recall(vuln, search_result, k=128):
    artifact = get_c_artifact(vuln['raw_label'])
    return artifact in search_result[:k]

In [93]:
search_engine = tfidf_searching.TfidfSearching(pros_corpus, 512, 2)

In [117]:
def fun(vuln):
    search_engine = tfidf_searching.TfidfSearching(pros_corpus, 1024, 2)
    return search_engine.search_topk_objects(cleaned_text(vuln['desc']), [])

In [None]:
with Pool(processes=64) as pool:
    tf_idf_res = list(tqdm(pool.imap(fun, vulns)))

In [154]:
for k in [128, 256, 512, 1024]:
    recalls = [recall(vuln, res, k) for vuln, res in zip(vulns, tf_idf_res)\
               if get_c_artifact(vuln['raw_label']) in pros_names]
    print(f'k={k}:', sum(recalls), len(recalls))

k=128: 399 1444
k=256: 482 1444
k=512: 588 1444
k=1024: 686 1444


In [135]:
for vuln, res in zip(vulns, tf_idf_res):
    vuln['top_k'] = [{'lib_name': lib, 'website_description':\
                      pros_mapping['token'][lib]} for lib in res]
    vuln['raw_label'] = vuln['label']
    vuln['labels'] = get_c_artifact(vuln['label'])
    del vuln['label']

In [148]:
for idx, vuln in enumerate(vulns):
    vuln['labels'] = [vuln['labels']]

In [149]:
output_dir = '/home/chentianyu/dependency/inputs/'
train_path = os.path.join(output_dir, 'train.json')
valid_path = os.path.join(output_dir, 'valid.json')
test_path = os.path.join(output_dir, 'test.json')

new_train = [vuln for vuln, label in zip(trains, vuln_labels[:-230]) if label in pros_names]
new_test = [vuln for vuln, label in zip(tests, vuln_labels[-230:]) if label in pros_names]

with open(train_path, 'w') as f:
    json.dump(new_train, f)

with open(valid_path, 'w') as f:
    json.dump(new_test, f)

with open(test_path, 'w') as f:
    json.dump(new_test, f)

In [143]:
len(new_test)

136

In [89]:
'ntp' in search_engine.search_topk_objects(cleaned_text(trains[0]['desc']), [])

True