In [None]:
import os
import csv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json

In [None]:
# Load the raw training split. The file is read with '#' as the field separator.
# NOTE: "YOUR_TRAIN_PATH" is a placeholder and must be replaced with a real file path.
train_path = "YOUR_TRAIN_PATH"
train_raw = pd.read_csv(train_path, sep='#')

In [None]:
# Load the raw validation and test splits with the same '#' separator as the training split.
# NOTE: both path strings are placeholders and must be replaced with real file paths.
val_path = "YOUR_VAL_PATH"
val_raw = pd.read_csv(val_path, sep='#')
test_path = "YOUR_TEST_PATH"
test_raw = pd.read_csv(test_path, sep='#')

In [None]:
def get_code_elements(cpg):
    """Group the nodes of a serialized code property graph by their label.

    Args:
        cpg: Serialized graph text. Everything before the first ``--====--``
            separator is parsed as JSON and iterated as a sequence of nodes.
            Values whose string form is 100 characters or shorter are treated
            as "no graph" and produce empty groups.

    Returns:
        dict with the keys ``'IDENTIFIER'``, ``'CALL'`` and
        ``'CONTROL_STRUCTURE'``, each mapping to the list of nodes whose
        ``'_label'`` equals that key, in their original order. Nodes with any
        other label, or without a ``'_label'`` entry, are ignored.
    """
    tracked_labels = ('IDENTIFIER', 'CALL', 'CONTROL_STRUCTURE')
    elements_map = {name: [] for name in tracked_labels}

    # Guard clause: short payloads carry no usable graph.
    if len(str(cpg)) <= 100:
        return elements_map

    node_json = cpg.split("--====--")[0]
    for node in json.loads(node_json):
        if '_label' not in node:
            continue
        bucket = elements_map.get(str(node['_label']))
        if bucket is not None:
            bucket.append(node)
    return elements_map

# Smoke-test get_code_elements on a single test record (index label 6 of the 'cpg' column).
cpg = test_raw['cpg'][6]
mp = get_code_elements(cpg)

In [None]:
def construct_analyzed_data(df):
    """Build the analysis frame: source text, label columns and grouped CPG nodes.

    Args:
        df: Frame providing a ``'text'`` column, a ``'cpg'`` column and the
            columns ``dos, nan, info, overflow, priv, mem, exec, bypass``.

    Returns:
        A new DataFrame with ``'code'`` (copied from ``df['text']``), the eight
        columns listed above, and ``'elements'`` holding the result of
        ``get_code_elements`` for each row's ``'cpg'`` value.
    """
    types = ['dos', 'nan', 'info', 'overflow', 'priv', 'mem', 'exec', 'bypass']

    result = pd.DataFrame(columns=[])
    result['code'] = df['text']
    for t in types:
        result[t] = df[t]

    # Iterate the column values directly instead of df['cpg'][i] with
    # i in range(len(df)): that form is a *label* lookup and fails (or picks
    # the wrong row) as soon as the index is not 0..n-1, e.g. after filtering.
    result['elements'] = [get_code_elements(cpg) for cpg in df['cpg']]
    return result

In [None]:
def preprocess(df):
    """Normalize column names and drop the columns the analysis does not use.

    Steps:
      1. Rename ``func_before -> text``, ``+info -> info``, ``+priv -> priv``
         and ``mem. -> mem`` (names that are absent are ignored).
      2. Drop every column of ``removed_cols`` that is present; the name of
         each absent column is printed, as before.
      3. Drop the ``'code'`` column.

    Args:
        df: Raw frame as loaded from CSV. It is not modified.

    Returns:
        A new DataFrame with the renamed and reduced column set.

    Raises:
        KeyError: if ``df`` has no ``'code'`` column (unchanged behavior).
    """
    # 'func_before' is not listed: it has already been renamed to 'text' by the
    # time columns are dropped, so it can never be present at that point.
    # 'vul_func_with_fix' used to be listed twice; one entry is enough.
    removed_cols = ['CVE Page', 'Summary', 'Vulnerability Classification',
                    'codeLink', 'commit_id', 'commit_message', 'del_lines', 'file_name',
                    'vul_func_with_fix', 'flaw_line', 'flaw_line_index', 'processed_func',
                    'sql', 'r.spl.', 'dir.', 'trav.', 'http', 'xss', 'corr.']

    # rename() silently skips missing keys, so no try/except is needed here.
    df = df.rename(columns={'func_before': 'text', '+info': 'info',
                            '+priv': 'priv', 'mem.': 'mem'})

    present = set(df.columns)
    for c in removed_cols:
        if c not in present:
            # Same diagnostic as before, without a bare `except:` that would
            # also hide unrelated errors.
            print(c)
    df = df.drop(columns=[c for c in removed_cols if c in present])

    df = df.drop('code', axis=1)
    return df

In [None]:
# Apply the same column clean-up to all three splits.
train_df = preprocess(train_raw)
test_df = preprocess(test_raw)
val_df = preprocess(val_raw)
# Every column after the first two is taken as a label column.
# NOTE(review): LABEL_COLUMNS is rebound later from raw_preds.columns, so its value
# depends on which cell ran last; confirm the first two columns are the non-label ones.
LABEL_COLUMNS = train_df.columns.tolist()[2:]

In [None]:
# Analysis frame (code, label columns, grouped CPG nodes) for the test split.
test = construct_analyzed_data(test_df)

In [None]:
# Analysis frame (code, label columns, grouped CPG nodes) for the training split.
train = construct_analyzed_data(train_df)

# Tokenizer

In [None]:
import re

def code_tokenizer(identifier):
    """Split an identifier into a set of lower-case alphabetic sub-tokens.

    The identifier is first cut on runs of ``, - ! ? : _ ~`` and spaces; parts
    that are purely digits are skipped. Each remaining part is split at
    camel-case / upper-case boundaries, lower-cased and stripped of every
    non-letter character.

    Args:
        identifier: Identifier text, e.g. ``'file_system_indexer'``.

    Returns:
        set of str. Pieces that contain no letters at all (e.g. ``'1.5'``) are
        dropped instead of contributing an empty-string token.
    """
    subtokens = set()
    # Raw strings: "\-" in a normal literal is an invalid escape sequence.
    for part in re.split(r"[, \-!?:_~]+", identifier):
        if not part or part.isdigit():
            continue
        # Insert a space before upper-case runs, then before Capitalized words.
        spaced = re.sub(r'([A-Z][a-z]+)', r' \1', re.sub(r'([A-Z]+)', r' \1', part))
        for piece in spaced.split():
            token = re.sub(r"[^a-zA-Z]+", "", piece.lower())
            if token:  # an empty token carries no information
                subtokens.add(token)
    return subtokens

# NOTE(review): no code visible in this notebook reads GLOBAL_SEP, and the same
# value is assigned again in the IDENTIFIERS section.
GLOBAL_SEP = "---"
def code_call_tokenizer(call):
    """Reduce a (possibly qualified) call name to its final, normalized segment.

    The name is split on ``. > : - <`` and only the last segment is kept, with
    underscores removed and letters lower-cased.

    Args:
        call: Call name, e.g. ``'obj->get_value'`` or ``'<operator>.assignment'``.

    Returns:
        A set with at most one token. The set is empty when the last segment
        contains a space or is empty (e.g. the name ends with a separator).
    """
    subtokens = set()
    # re.split always returns at least one element, so [-1] is safe.
    # Raw-string character class: the old '\.|\>|...' literal relied on
    # invalid escape sequences.
    funct = re.split(r'[.>:\-<]', call)[-1]
    if " " not in funct:
        token = funct.replace("_", "").lower()
        if token:  # do not emit an empty-string token
            subtokens.add(token)
    return subtokens
def code_raw_tokenizer(call):
    """Tokenize a raw code snippet into lower-case sub-tokens.

    Literal ``\\t`` and ``\\n`` two-character sequences are removed, the text
    is split on ``. > : - < ( ) " = ] [ + ,`` and spaces, and every fragment is
    further split at camel-case / upper-case boundaries.

    Args:
        call: Raw code text.

    Returns:
        set of lower-cased sub-tokens (digits and other characters that are
        not delimiters are kept as part of the tokens).
    """
    subtokens = set()
    call = call.replace("\\t", "").replace("\\n", "")
    # One raw-string character class replaces the old alternation, which was
    # written with invalid escape sequences in a normal string literal.
    for code in re.split(r'[.>:\-<()"=\]\[+, ]', call):
        spaced = re.sub(r'([A-Z][a-z]+)', r' \1', re.sub(r'([A-Z]+)', r' \1', code))
        # str.split() already discards surrounding whitespace and empty pieces.
        subtokens.update(item.lower() for item in spaced.split())
    return subtokens

def code_raw_control_tokenizer(con):
    """Tokenize the code of a control structure into lower-case sub-tokens.

    Literal ``\\t`` and ``\\n`` two-character sequences are removed, the text
    is split on ``. > : - < ( ) " = ] [ + , ! & |`` and spaces, and every
    fragment is further split at camel-case / upper-case boundaries.

    Args:
        con: Raw code text of the control structure, e.g. ``'if(a&&b)'``.

    Returns:
        set of lower-cased sub-tokens.
    """
    subtokens = set()
    con = con.replace("\\t", "").replace("\\n", "")
    # '&', '!' and '|' are treated as delimiters here. They used to be deleted
    # from the text *before* the split that lists them as delimiters, which
    # glued unspaced operands together ('a&&b' became the single token 'ab').
    for code in re.split(r'[.>:\-<()"=\]\[+,!&| ]', con):
        spaced = re.sub(r'([A-Z][a-z]+)', r' \1', re.sub(r'([A-Z]+)', r' \1', code))
        subtokens.update(item.lower() for item in spaced.split())
    return subtokens

In [None]:
# Quick manual check of the identifier tokenizer on a snake_case name.
identifier = 'file_system_indexer'
code_tokenizer(identifier)

In [None]:
def get_label(indx):
    """Translate a class position (0-6) into its label column name.

    Returns ``None`` for any other value.
    """
    names_by_position = {
        0: 'dos',
        1: 'info',
        2: 'overflow',
        3: 'priv',
        4: 'mem',
        5: 'exec',
        6: 'bypass',
    }
    return names_by_position.get(indx)

# PREPARE

In [None]:
def extractIdentifiers(idMap):
    """Collect the sub-tokens of every identifier node's ``'name'``.

    Args:
        idMap: Iterable of nodes, each indexable by ``'name'``.

    Returns:
        set with the union of ``code_tokenizer(str(node['name']))`` over all nodes.
    """
    return {
        token
        for node in idMap
        for token in code_tokenizer(str(node['name']))
    }

def extractAllIdentifiers(df):
    """Tokenize the identifiers of every row and build the overall vocabulary.

    Side effect: adds (or overwrites) the ``'ids'`` column of ``df`` with the
    per-row sub-token sets.

    Args:
        df: Frame with an ``'elements'`` column whose values map
            ``'IDENTIFIER'`` to a list of nodes.

    Returns:
        set with the union of all per-row sub-token sets.
    """
    # Walk the column values directly: df['elements'][i] with i from
    # range(len(df)) is a label lookup and breaks on any non 0..n-1 index.
    id_col = [extractIdentifiers(elements['IDENTIFIER']) for elements in df['elements']]
    df['ids'] = id_col
    return set().union(*id_col)

def extractCalls(idMap):
    """Collect the normalized call token of every call node's ``'name'``.

    Args:
        idMap: Iterable of nodes, each indexable by ``'name'``.

    Returns:
        set with the union of ``code_call_tokenizer(str(node['name']))`` over all nodes.
    """
    return {
        token
        for node in idMap
        for token in code_call_tokenizer(str(node['name']))
    }

def extractAllCalls(df):
    """Tokenize the calls of every row and build the overall call vocabulary.

    Side effect: adds (or overwrites) the ``'calls'`` column of ``df`` with the
    per-row token sets.

    Args:
        df: Frame with an ``'elements'`` column whose values map ``'CALL'`` to
            a list of nodes.

    Returns:
        set with the union of all per-row token sets.
    """
    # Walk the column values directly: df['elements'][i] with i from
    # range(len(df)) is a label lookup and breaks on any non 0..n-1 index.
    call_col = [extractCalls(elements['CALL']) for elements in df['elements']]
    df['calls'] = call_col
    return set().union(*call_col)

def extractControls(idMap):
    """Collect the sub-tokens of every control-structure node's ``'code'``.

    Args:
        idMap: Iterable of nodes, each indexable by ``'code'``.

    Returns:
        set with the union of ``code_raw_control_tokenizer(str(node['code']))``
        over all nodes.
    """
    return {
        token
        for node in idMap
        for token in code_raw_control_tokenizer(str(node['code']))
    }

def extractAllControls(df):
    """Tokenize the control structures of every row and build the vocabulary.

    Side effect: adds (or overwrites) the ``'controls'`` column of ``df`` with
    the per-row token sets.

    Args:
        df: Frame with an ``'elements'`` column whose values map
            ``'CONTROL_STRUCTURE'`` to a list of nodes.

    Returns:
        set with the union of all per-row token sets.
    """
    # Walk the column values directly: df['elements'][i] with i from
    # range(len(df)) is a label lookup and breaks on any non 0..n-1 index.
    control_col = [extractControls(elements['CONTROL_STRUCTURE']) for elements in df['elements']]
    df['controls'] = control_col
    return set().union(*control_col)


In [None]:
# Build the token vocabularies per split. Each extractAll* call also adds the
# matching per-row column ('ids' / 'calls' / 'controls') to the frame it is given.
#
# The TRAIN vocabularies are built from `train`. Previously every call received
# `test` and `train` was then overwritten with `test`, so the "training"
# statistics were computed on the very rows they were later evaluated on.
ALL_TRAIN_IDENTIFIERS = extractAllIdentifiers(train)
ALL_TEST_IDENTIFIERS = extractAllIdentifiers(test)
ALL_TRAIN_CALLS = extractAllCalls(train)
ALL_TEST_CALLS = extractAllCalls(test)
ALL_TRAIN_CONTROLS = extractAllControls(train)
ALL_TEST_CONTROLS = extractAllControls(test)

In [None]:
# ALL_TRAIN_IDENTIFIERS = ALL_TEST_IDENTIFIERS
# ALL_TRAIN_CALLS = ALL_TEST_CALLS
# ALL_TRAIN_CONTROLS = ALL_TEST_CONTROLS

In [None]:
# Ratio cut-off read by the *_stat functions below: a token is kept when its ratio is > FREQ_THRESHOLD.
# NOTE: YOUR_THRESHOLD is a placeholder that is not defined anywhere visible in this notebook;
# replace it with a number before running.
FREQ_THRESHOLD = YOUR_THRESHOLD

# IDENTIFIERS

In [None]:
# NOTE(review): same value as the GLOBAL_SEP assigned in the Tokenizer section;
# no code visible in this notebook reads it.
GLOBAL_SEP = "---"

def computeFreq(df, label, ALL_IDENTIFIERS):
    """Percentage of the records of one class that contain each identifier token.

    Args:
        df: Frame with a 0/1 column named ``label`` and an ``'ids'`` column
            holding the token collection of each row.
        label: Name of the class column; rows with value 1 belong to the class.
        ALL_IDENTIFIERS: Vocabulary of tokens to report on.

    Returns:
        dict mapping every token of ``ALL_IDENTIFIERS`` to
        ``100 * (class rows containing the token) / (class rows)``.
        All values are ``0.0`` when the class has no rows.
    """
    items = df.loc[df[label] == 1]
    id_freq = {iden: 0 for iden in ALL_IDENTIFIERS}

    # One pass over the rows; set() ensures a row counts at most once per token.
    for ids in items['ids']:
        for iden in id_freq.keys() & set(ids):
            id_freq[iden] += 1

    # Normalize by the number of class rows. The previous denominator was
    # len(id_freq), i.e. the vocabulary size, which is the same for every
    # class and therefore left the values proportional to raw counts.
    n_items = len(items)
    return {
        iden: (100 * count / n_items if n_items else 0.0)
        for iden, count in id_freq.items()
    }

#target vs. others
def freq_stat(freqs, target, ALL_IDENTIFIERS, threshold=None):
    """Rank identifier tokens that are over-represented in the target class.

    For each token the target-class frequency is divided by the highest
    frequency among all other classes. A token seen in the target class but
    in no other class gets the sentinel ratio 1000.

    Args:
        freqs: Sequence of per-class dicts (token -> frequency).
        target: Position in ``freqs`` of the class of interest.
        ALL_IDENTIFIERS: Tokens to evaluate; each must be a key of every dict.
        threshold: Minimum ratio (exclusive) for a token to be reported.
            Defaults to the notebook-level ``FREQ_THRESHOLD``.

    Returns:
        list of ``(token, ratio)`` tuples with ``ratio > threshold``, sorted by
        ratio in descending order.
    """
    if threshold is None:
        threshold = FREQ_THRESHOLD

    target_id_freq = freqs[target]
    target_vs_others = {}
    for iden in ALL_IDENTIFIERS:
        rel_freq_target = target_id_freq[iden]
        # The leading 0 is the floor used when there are no other classes.
        rel_freq_others = max([0] + [f[iden] for i, f in enumerate(freqs) if i != target])

        if rel_freq_others != 0:
            ratio = rel_freq_target / rel_freq_others
        elif rel_freq_target != 0:
            ratio = 1000  # exclusive to the target class
        else:
            ratio = 0

        if ratio > threshold:
            target_vs_others[iden] = ratio
    return sorted(target_vs_others.items(), key=lambda x: x[1], reverse=True)

def infreq_stat(freqs, target, ALL_IDENTIFIERS, threshold=None):
    """Rank identifier tokens that are under-represented in the target class.

    For each token the lowest frequency among all other classes (capped at
    10000) is divided by the target-class frequency. A token absent from the
    target class but present in every other class gets the sentinel ratio
    1000. Tokens missing from at least one other class get ratio 0.

    Args:
        freqs: Sequence of per-class dicts (token -> frequency).
        target: Position in ``freqs`` of the class of interest.
        ALL_IDENTIFIERS: Tokens to evaluate; each must be a key of every dict.
        threshold: Minimum ratio (exclusive) for a token to be reported.
            Defaults to the notebook-level ``FREQ_THRESHOLD``.

    Returns:
        list of ``(token, ratio)`` tuples with ``ratio > threshold``, sorted by
        ratio in descending order.
    """
    if threshold is None:
        threshold = FREQ_THRESHOLD

    target_id_freq = freqs[target]
    target_vs_others = {}
    for iden in ALL_IDENTIFIERS:
        rel_freq_target = target_id_freq[iden]
        # 10000 is the starting value of the minimum search (kept as a cap).
        rel_freq_others = min([10000] + [f[iden] for i, f in enumerate(freqs) if i != target])

        if rel_freq_others != 0 and rel_freq_target != 0:
            ratio = rel_freq_others / rel_freq_target
        elif rel_freq_others != 0:
            ratio = 1000  # never seen in the target class, seen in all others
        else:
            ratio = 0

        if ratio > threshold:
            target_vs_others[iden] = ratio
    return sorted(target_vs_others.items(), key=lambda x: x[1], reverse=True)

# Call

In [None]:
def computeFreqCall(df, label, ALL_CALLS):
    """Percentage of the records of one class that contain each call token.

    Args:
        df: Frame with a 0/1 column named ``label`` and a ``'calls'`` column
            holding the token collection of each row.
        label: Name of the class column; rows with value 1 belong to the class.
        ALL_CALLS: Vocabulary of call tokens to report on.

    Returns:
        dict mapping every token of ``ALL_CALLS`` to
        ``100 * (class rows containing the token) / (class rows)``.
        All values are ``0.0`` when the class has no rows.
    """
    items = df.loc[df[label] == 1]
    id_freq = {iden: 0 for iden in ALL_CALLS}

    # One pass over the rows; set() ensures a row counts at most once per token.
    for calls in items['calls']:
        for iden in id_freq.keys() & set(calls):
            id_freq[iden] += 1

    # Normalize by the number of class rows, not by the vocabulary size
    # (len(id_freq)), which is identical for every class.
    n_items = len(items)
    return {
        iden: (100 * count / n_items if n_items else 0.0)
        for iden, count in id_freq.items()
    }

def freq_call_stat(freqs, target, ALL_CALLS, threshold=None):
    """Rank call tokens that are over-represented in the target class.

    For each token the target-class frequency is divided by the highest
    frequency among all other classes. A token seen in the target class but
    in no other class gets the sentinel ratio 1000.

    Args:
        freqs: Sequence of per-class dicts (call token -> frequency).
        target: Position in ``freqs`` of the class of interest.
        ALL_CALLS: Tokens to evaluate; each must be a key of every dict.
        threshold: Minimum ratio (exclusive) for a token to be reported.
            Defaults to the notebook-level ``FREQ_THRESHOLD``.

    Returns:
        list of ``(token, ratio)`` tuples with ``ratio > threshold``, sorted by
        ratio in descending order.
    """
    if threshold is None:
        threshold = FREQ_THRESHOLD

    target_call_freq = freqs[target]
    target_vs_others = {}
    for call in ALL_CALLS:
        rel_freq_target = target_call_freq[call]
        # The leading 0 is the floor used when there are no other classes.
        rel_freq_others = max([0] + [f[call] for i, f in enumerate(freqs) if i != target])

        if rel_freq_others != 0:
            ratio = rel_freq_target / rel_freq_others
        elif rel_freq_target != 0:
            ratio = 1000  # exclusive to the target class
        else:
            ratio = 0

        if ratio > threshold:
            target_vs_others[call] = ratio
    return sorted(target_vs_others.items(), key=lambda x: x[1], reverse=True)

def infreq_call_stat(freqs, target, ALL_CALLS, threshold=None):
    """Rank call tokens that are under-represented in the target class.

    For each token the lowest frequency among all other classes (capped at
    1000) is divided by the target-class frequency. A token absent from the
    target class but present in every other class gets the sentinel ratio
    1000. Tokens missing from at least one other class get ratio 0.

    Args:
        freqs: Sequence of per-class dicts (call token -> frequency).
        target: Position in ``freqs`` of the class of interest.
        ALL_CALLS: Tokens to evaluate; each must be a key of every dict.
        threshold: Minimum ratio (exclusive) for a token to be reported.
            Defaults to the notebook-level ``FREQ_THRESHOLD``.

    Returns:
        list of ``(token, ratio)`` tuples with ``ratio > threshold``, sorted by
        ratio in descending order.
    """
    if threshold is None:
        threshold = FREQ_THRESHOLD

    target_call_freq = freqs[target]
    target_vs_others = {}
    for call in ALL_CALLS:
        rel_freq_target = target_call_freq[call]
        # 1000 is the starting value of the minimum search (kept as a cap).
        rel_freq_others = min([1000] + [f[call] for i, f in enumerate(freqs) if i != target])

        if rel_freq_others != 0 and rel_freq_target != 0:
            ratio = rel_freq_others / rel_freq_target
        elif rel_freq_others != 0:
            ratio = 1000  # never seen in the target class, seen in all others
        else:
            ratio = 0

        if ratio > threshold:
            target_vs_others[call] = ratio
    return sorted(target_vs_others.items(), key=lambda x: x[1], reverse=True)

# Control

In [None]:
def computeFreqControl(df, label, ALL_CONTROLS):
    """Percentage of the records of one class that contain each control token.

    Args:
        df: Frame with a 0/1 column named ``label`` and a ``'controls'``
            column holding the token collection of each row.
        label: Name of the class column; rows with value 1 belong to the class.
        ALL_CONTROLS: Vocabulary of control-structure tokens to report on.

    Returns:
        dict mapping every token of ``ALL_CONTROLS`` to
        ``100 * (class rows containing the token) / (class rows)``.
        All values are ``0.0`` when the class has no rows.
    """
    items = df.loc[df[label] == 1]
    id_freq = {iden: 0 for iden in ALL_CONTROLS}

    # One pass over the rows; set() ensures a row counts at most once per token.
    for controls in items['controls']:
        for iden in id_freq.keys() & set(controls):
            id_freq[iden] += 1

    # Normalize by the number of class rows, not by the vocabulary size
    # (len(id_freq)), which is identical for every class.
    n_items = len(items)
    return {
        iden: (100 * count / n_items if n_items else 0.0)
        for iden, count in id_freq.items()
    }

def freq_control_stat(freqs, target, ALL_CONTROLS, threshold=None):
    """Rank control-structure tokens that are over-represented in the target class.

    For each token the target-class frequency is divided by the highest
    frequency among all other classes. A token seen in the target class but
    in no other class gets the sentinel ratio 1000.

    Args:
        freqs: Sequence of per-class dicts (control token -> frequency).
        target: Position in ``freqs`` of the class of interest.
        ALL_CONTROLS: Tokens to evaluate; each must be a key of every dict.
        threshold: Minimum ratio (exclusive) for a token to be reported.
            Defaults to the notebook-level ``FREQ_THRESHOLD``.

    Returns:
        list of ``(token, ratio)`` tuples with ``ratio > threshold``, sorted by
        ratio in descending order.
    """
    if threshold is None:
        threshold = FREQ_THRESHOLD

    target_con_freq = freqs[target]
    target_vs_others = {}
    for con in ALL_CONTROLS:
        rel_freq_target = target_con_freq[con]
        # The leading 0 is the floor used when there are no other classes.
        rel_freq_others = max([0] + [f[con] for i, f in enumerate(freqs) if i != target])

        if rel_freq_others != 0:
            ratio = rel_freq_target / rel_freq_others
        elif rel_freq_target != 0:
            ratio = 1000  # exclusive to the target class
        else:
            ratio = 0

        if ratio > threshold:
            target_vs_others[con] = ratio
    return sorted(target_vs_others.items(), key=lambda x: x[1], reverse=True)

def infreq_control_stat(freqs, target, ALL_CONTROLS, threshold=None):
    """Rank control-structure tokens that are under-represented in the target class.

    For each token the lowest frequency among all other classes (capped at
    1000) is divided by the target-class frequency. A token absent from the
    target class but present in every other class gets the sentinel ratio
    1000. Tokens missing from at least one other class get ratio 0.

    Args:
        freqs: Sequence of per-class dicts (control token -> frequency).
        target: Position in ``freqs`` of the class of interest.
        ALL_CONTROLS: Tokens to evaluate; each must be a key of every dict.
        threshold: Minimum ratio (exclusive) for a token to be reported.
            Defaults to the notebook-level ``FREQ_THRESHOLD``.

    Returns:
        list of ``(token, ratio)`` tuples with ``ratio > threshold``, sorted by
        ratio in descending order.
    """
    if threshold is None:
        threshold = FREQ_THRESHOLD

    target_con_freq = freqs[target]
    target_vs_others = {}
    for con in ALL_CONTROLS:
        rel_freq_target = target_con_freq[con]
        # The minimum search must start from a large value (1000, as in
        # infreq_call_stat). It used to start from 0, so no frequency could
        # ever be smaller and the function never reported any token.
        rel_freq_others = min([1000] + [f[con] for i, f in enumerate(freqs) if i != target])

        if rel_freq_others != 0 and rel_freq_target != 0:
            ratio = rel_freq_others / rel_freq_target
        elif rel_freq_others != 0:
            ratio = 1000  # never seen in the target class, seen in all others
        else:
            ratio = 0

        if ratio > threshold:
            target_vs_others[con] = ratio
    return sorted(target_vs_others.items(), key=lambda x: x[1], reverse=True)

In [None]:
# Identifier frequency table of every class, in get_label() order.
data = train
freqs_train = [
    computeFreq(data, class_label, ALL_TRAIN_IDENTIFIERS)
    for class_label in ('dos', 'info', 'overflow', 'priv', 'mem', 'exec', 'bypass')
]
# Keep the individual per-class names available as well.
(dos_id_freq_train, info_id_freq_train, overflow_id_freq_train, priv_id_freq_train,
 mem_id_freq_train, exec_id_freq_train, bypass_id_freq_train) = freqs_train


In [None]:
# Spot check for one class (position 2): count the test rows of that class that contain
# at least one identifier sub-token reported by freq_stat for the class.
indx = 2
label = get_label(indx)
sub_tokens_map = freq_stat(freqs_train, indx, ALL_TRAIN_IDENTIFIERS)
sub_tokens_train = set()
data = test
# freq_stat returns (token, ratio) pairs; keep only the tokens.
for record in sub_tokens_map:
    sub_tokens_train.add(record[0])

test_tokens = data.loc[data[label] == 1]
counter = 0
for ids in test_tokens['ids']:
    intersec = sub_tokens_train.intersection(ids)
    if len(intersec) > 0:
        counter = counter+1
# NOTE: counter is computed but not displayed by this cell.

In [None]:
# sub_tokens_map

In [None]:
def extract_distinguishing_tokens(freqs, indx, all_calls):
    """Return just the tokens that ``freq_stat`` reports for class ``indx``.

    Args:
        freqs: Sequence of per-class dicts (token -> frequency).
        indx: Position in ``freqs`` of the class of interest.
        all_calls: Tokens to evaluate (passed through to ``freq_stat``).

    Returns:
        set of tokens, i.e. the first element of every ``(token, ratio)`` pair.
    """
    ranked = freq_stat(freqs, indx, all_calls)
    return {entry[0] for entry in ranked}

In [None]:
# distinguishing_tokens_dos = extract_distinguishing_tokens(freqs_train, 0, ALL_TRAIN_IDENTIFIERS)
# distinguishing_tokens_info = extract_distinguishing_tokens(freqs_train, 1, ALL_TRAIN_IDENTIFIERS)
# distinguishing_tokens_overflow = extract_distinguishing_tokens(freqs_train, 2, ALL_TRAIN_IDENTIFIERS)
# distinguishing_tokens_priv = extract_distinguishing_tokens(freqs_train, 3, ALL_TRAIN_IDENTIFIERS)
# distinguishing_tokens_mem = extract_distinguishing_tokens(freqs_train, 4, ALL_TRAIN_IDENTIFIERS)
# distinguishing_tokens_exec = extract_distinguishing_tokens(freqs_train, 5, ALL_TRAIN_IDENTIFIERS)
# distinguishing_tokens_bypass = extract_distinguishing_tokens(freqs_train, 6, ALL_TRAIN_IDENTIFIERS)

In [None]:
# len(distinguishing_tokens_exec.intersection(distinguishing_tokens_bypass))

In [None]:
# len(distinguishing_tokens_dos)

# FUNCTION CALLS

In [None]:
# Call frequency table of every class, in get_label() order.
data = train
freqs_call_train = [
    computeFreqCall(data, class_label, ALL_TRAIN_CALLS)
    for class_label in ('dos', 'info', 'overflow', 'priv', 'mem', 'exec', 'bypass')
]
# Keep the individual per-class names available as well.
(dos_call_freq_train, info_call_freq_train, overflow_call_freq_train, priv_call_freq_train,
 mem_call_freq_train, exec_call_freq_train, bypass_call_freq_train) = freqs_call_train

# CONTROL

In [None]:
# Control-structure frequency table of every class, in get_label() order.
data = train
freqs_con_train = [
    computeFreqControl(data, class_label, ALL_TRAIN_CONTROLS)
    for class_label in ('dos', 'info', 'overflow', 'priv', 'mem', 'exec', 'bypass')
]
# Keep the individual per-class names available as well.
(dos_con_freq_train, info_con_freq_train, overflow_con_freq_train, priv_con_freq_train,
 mem_con_freq_train, exec_con_freq_train, bypass_con_freq_train) = freqs_con_train

# TESTING

In [None]:
# Coverage check for one class (position 6): fraction of the test rows of that class that
# share at least one frequent call, identifier or control token with the class.
indx = 6
label = get_label(indx)
sub_tokens_call_map = freq_call_stat(freqs_call_train, indx, ALL_TRAIN_CALLS)
sub_tokens_id_map = freq_stat(freqs_train, indx, ALL_TRAIN_IDENTIFIERS)
sub_tokens_con_map = freq_control_stat(freqs_con_train, indx, ALL_TRAIN_CONTROLS)
sub_tokens_call_train = set()
sub_tokens_id_train = set()
sub_tokens_con_train = set()

data = test
# Each *_stat result is a list of (token, ratio) pairs; keep only the tokens.
for record in sub_tokens_call_map:
    sub_tokens_call_train.add(record[0])

for record in sub_tokens_id_map:
    sub_tokens_id_train.add(record[0])

for record in sub_tokens_con_map:
    sub_tokens_con_train.add(record[0])

test_tokens = data.loc[data[label] == 1]
counter = 0
# iterrows() yields (index, row) tuples, hence row[1] for the row itself.
for row in test_tokens.iterrows():
    intersec_call = sub_tokens_call_train.intersection(row[1]['calls'])
    intersec_id = sub_tokens_id_train.intersection(row[1]['ids'])
    intersec_con = sub_tokens_con_train.intersection(row[1]['controls'])
    # `len(intersec_id)` is used as a truth value, i.e. it means "> 0" like its neighbours.
    if len(intersec_call) > 0 or len(intersec_id) or len(intersec_con) > 0:
        counter = counter+1
# NOTE: raises ZeroDivisionError when no test row carries this label.
print(counter/len(test_tokens))

In [None]:
def get_label2(indx):
    """Translate a class position (0-6) into its label column name.

    Same mapping as ``get_label``; returns ``None`` for any other value.
    """
    names_by_position = {
        0: 'dos',
        1: 'info',
        2: 'overflow',
        3: 'priv',
        4: 'mem',
        5: 'exec',
        6: 'bypass',
    }
    return names_by_position.get(indx)
# Build the feature table: one column per class, six rows per column.
# Row layout (read positionally elsewhere in the notebook):
#   0 frequent identifiers, 1 frequent calls, 2 frequent controls,
#   3 infrequent identifiers, 4 infrequent calls, 5 infrequent controls.
ft_df = pd.DataFrame()
for indx in range(7):
    label = get_label2(indx)
    # Tokens over-represented in this class.
    sub_tokens_call_map = freq_call_stat(freqs_call_train, indx, ALL_TRAIN_CALLS)
    sub_tokens_id_map = freq_stat(freqs_train, indx, ALL_TRAIN_IDENTIFIERS)
    sub_tokens_con_map = freq_control_stat(freqs_con_train, indx, ALL_TRAIN_CONTROLS)
    sub_tokens_call_train = set()
    sub_tokens_id_train = set()
    sub_tokens_con_train = set()
    # Each *_stat result is a list of (token, ratio) pairs; keep only the tokens.
    for record in sub_tokens_call_map:
        sub_tokens_call_train.add(record[0])
    for record in sub_tokens_id_map:
        sub_tokens_id_train.add(record[0])
    for record in sub_tokens_con_map:
        sub_tokens_con_train.add(record[0])
        
    # Tokens under-represented in this class.
    infeq_sub_tokens_call_map = infreq_call_stat(freqs_call_train, indx, ALL_TRAIN_CALLS)
    infeq_sub_tokens_id_map = infreq_stat(freqs_train, indx, ALL_TRAIN_IDENTIFIERS)
    infeq_sub_tokens_con_map = infreq_control_stat(freqs_con_train, indx, ALL_TRAIN_CONTROLS)
    infeq_sub_tokens_call_train = set()
    infeq_sub_tokens_id_train = set()
    infeq_sub_tokens_con_train = set()
    for record in infeq_sub_tokens_call_map:
        infeq_sub_tokens_call_train.add(record[0])
    for record in infeq_sub_tokens_id_map:
        infeq_sub_tokens_id_train.add(record[0])
    for record in infeq_sub_tokens_con_map:
        infeq_sub_tokens_con_train.add(record[0])
    
    ft_df[label] = [sub_tokens_id_train,sub_tokens_call_train,sub_tokens_con_train, 
                    infeq_sub_tokens_id_train, infeq_sub_tokens_call_train, infeq_sub_tokens_con_train]

In [None]:
# freq_stat(freqs_train, 1, ALL_TRAIN_IDENTIFIERS)

In [None]:
# Inspect row 0 (the frequent identifier sub-tokens) of the 'info' column.
ft_df['info'][0]

In [None]:
# ft_df.to_csv('features.csv',index=False, mode='a', header=True)

# TESTING

In [None]:
# Load the prediction table and the label table used for scoring.
# NOTE(review): presumably raw_preds holds per-label model scores and labels the 0/1 ground truth,
# both with the same columns and row order — confirm against the files.
raw_preds = pd.read_csv('raw_preds.csv')
labels = pd.read_csv('labels.csv')

In [None]:
# Rebind LABEL_COLUMNS to the prediction file's columns; the metric functions and real() below read this global.
LABEL_COLUMNS = raw_preds.columns.tolist()

In [None]:
def hamming_score(preds, lbs, label_columns=None):
    """Mean per-row ratio |predicted AND true| / |predicted OR true|.

    Args:
        preds: Frame of 0/1 predictions, one column per label.
        lbs: Frame of 0/1 ground-truth values with the same columns and index.
        label_columns: Columns to score. Defaults to the notebook-level
            ``LABEL_COLUMNS``; pass it explicitly to avoid depending on which
            cell last rebound that global.

    Returns:
        float: average over all rows of ``preds``. A row with no predicted and
        no true label contributes 0.

    Raises:
        ZeroDivisionError: if ``preds`` has no rows.
    """
    if label_columns is None:
        label_columns = LABEL_COLUMNS

    sum_ratio = 0.0
    for i in preds.index:
        upper = 0  # labels set in both frames
        lower = 0  # labels set in at least one frame
        for lb in label_columns:
            votes = preds[lb][i] + lbs[lb][i]
            if votes == 2:
                upper += 1
                lower += 1
            elif votes == 1:
                lower += 1
        if lower != 0:
            sum_ratio += upper / lower
    return sum_ratio / len(preds)

def accuracy(preds, lbs, label_columns=None):
    """Mean per-row fraction of labels whose prediction equals the ground truth.

    Args:
        preds: Frame of predictions, one column per label.
        lbs: Frame of ground-truth values with the same columns and index.
        label_columns: Columns to score. Defaults to the notebook-level
            ``LABEL_COLUMNS``; pass it explicitly to avoid depending on which
            cell last rebound that global.

    Returns:
        float: average over all rows of ``preds`` (0 per row when
        ``label_columns`` is empty).

    Raises:
        ZeroDivisionError: if ``preds`` has no rows.
    """
    if label_columns is None:
        label_columns = LABEL_COLUMNS

    n_labels = len(label_columns)
    sum_ratio = 0.0
    for i in preds.index:
        upper = sum(1 for lb in label_columns if preds[lb][i] == lbs[lb][i])
        # Computed once per row; it used to be recomputed after every label.
        if n_labels:
            sum_ratio += upper / n_labels
    return sum_ratio / len(preds)

def exact_match(preds, lbs, label_columns=None):
    """Fraction of rows for which every label prediction equals the ground truth.

    Args:
        preds: Frame of predictions, one column per label.
        lbs: Frame of ground-truth values with the same columns and index.
        label_columns: Columns to score. Defaults to the notebook-level
            ``LABEL_COLUMNS``; pass it explicitly to avoid depending on which
            cell last rebound that global.

    Returns:
        float in [0, 1].

    Raises:
        ZeroDivisionError: if ``preds`` has no rows.
    """
    if label_columns is None:
        label_columns = LABEL_COLUMNS

    sum_ratio = 0.0
    for i in preds.index:
        if all(preds[lb][i] == lbs[lb][i] for lb in label_columns):
            sum_ratio += 1
    return sum_ratio / len(preds)

In [None]:
# Display the frequency-ratio cut-off currently in effect.
FREQ_THRESHOLD

In [None]:
def real(preds, df, threshold=0.3):
    """Binarize ``preds`` and post-correct it with the token rules in ``ft_df``.

    Step 1 replaces every column named in the global ``LABEL_COLUMNS`` by 0/1
    values (``value >= threshold`` becomes 1), keeping the column dtype.

    Step 2 visits every row and every label that has a column in the global
    ``ft_df`` and applies two rules, in this order:

    * promote 0 -> 1 when the row shares a token with BOTH the frequent
      identifiers (``ft_df[lb][0]``) and the frequent calls (``ft_df[lb][1]``);
      the frequent controls (row 2) are not part of this rule;
    * demote 1 -> 0 when the row shares a token with ALL of the infrequent
      identifiers, calls and controls (``ft_df[lb][3]``, ``[4]``, ``[5]``).

    Finally the number of changes and the share of them that agree with the
    global ``labels`` frame are printed (``nan`` when nothing was changed).

    Args:
        preds: Frame of scores, modified in place.
        df: Frame with ``'ids'``, ``'calls'`` and ``'controls'`` columns,
            sharing the index of ``preds``.
        threshold: Binarization cut-off.

    Returns:
        None.
    """
    COUNTER = 0
    CORRECT = 0

    # Whole-column assignment instead of the chained `preds[lb][i] = ...`,
    # which is not guaranteed to write back into `preds` (and never does under
    # pandas Copy-on-Write).
    for lb in LABEL_COLUMNS:
        preds[lb] = (preds[lb] >= threshold).astype(preds[lb].dtype)

    # De-duplicate while keeping column order; only labels with rules matter.
    rule_labels = [lb for lb in dict.fromkeys(LABEL_COLUMNS) if lb in ft_df]

    for index in preds.index:
        for lb in rule_labels:
            row_ids = df.at[index, 'ids']
            row_calls = df.at[index, 'calls']
            row_controls = df.at[index, 'controls']
            rules = ft_df[lb]

            # Rule 1: promote when frequent identifiers AND calls are present.
            if (preds.at[index, lb] == 0
                    and not rules[0].isdisjoint(row_ids)
                    and not rules[1].isdisjoint(row_calls)):
                preds.at[index, lb] = 1
                COUNTER += 1
                if labels.at[index, lb] == 1:
                    CORRECT += 1

            # Rule 2: demote when infrequent ids, calls AND controls are all
            # present. It also sees values just promoted by rule 1.
            if (preds.at[index, lb] == 1
                    and not rules[4].isdisjoint(row_calls)
                    and not rules[3].isdisjoint(row_ids)
                    and not rules[5].isdisjoint(row_controls)):
                preds.at[index, lb] = 0
                COUNTER += 1
                if labels.at[index, lb] == 0:
                    CORRECT += 1

    print(COUNTER)
    # Avoid ZeroDivisionError when no rule fired.
    print(CORRECT/COUNTER if COUNTER else float('nan'))
#CodeBERT
# NOTE(review): 'VIT_OUTPUT' looks like a placeholder for the prediction CSV path, and YOUR_THRESHOLD
# is not defined anywhere visible in this notebook — both must be replaced before running.
preds = pd.read_csv('VIT_OUTPUT')

THRESHOLD = YOUR_THRESHOLD
# real() returns nothing; it rewrites the values of preds, so the metrics below score the
# binarized and rule-corrected predictions.
real(preds, test, THRESHOLD)
print("exact_match: "+str(exact_match(preds, labels)))
print("hamming_score: "+ str(hamming_score(preds, labels)))
print("accuracy: "+str(accuracy(preds, labels)))


In [None]:
# NOTE: multilabel_confusion_matrix is imported but not used in this cell.
from sklearn.metrics import classification_report, multilabel_confusion_matrix
# preds = pd.read_csv('raw_preds.csv')
# NOTE(review): real() was already applied to preds in the previous cell; this call runs the same
# post-processing again on the already-binarized frame and prints its counters a second time.
real(preds, test, THRESHOLD)
# Per-label precision/recall/F1; zero_division=0 reports 0 for labels with no predicted or true samples.
y_pred = preds.to_numpy()
y_true = labels.to_numpy()
print(classification_report(
  y_true, 
  y_pred, 
  target_names=LABEL_COLUMNS, 
  zero_division=0
))