In [None]:
import pandas as pd
import hashlib
import json

# Sample-count threshold; compared against per-class counts further down
cutoff = 1000

# Read the three input tables (same order as listed): the CVEfixes samples,
# the MSR samples, and the classpath mapping.
cvefixes, msr, classpathmap = (
    pd.read_csv(csv_path)
    for csv_path in (
        "~/TransVulDet/datasets_/CVEfixes_new.csv",
        "~/TransVulDet/datasets_/MSR.csv",
        "~/classpaths.csv",
    )
)


In [None]:

# Digest helper: lets identical code snippets be recognised by their hash
def _sha1_hexdigest(source_text):
    """Return the hexadecimal SHA-1 digest of *source_text*."""
    return hashlib.sha1(source_text.encode()).hexdigest()


# For each dataset, store the digest of the code and then discard the code
# itself, so uniqueness is judged on the digest plus the remaining columns.
for frame in (cvefixes, msr):
    frame['code.digest'] = frame['code'].apply(_sha1_hexdigest)
    frame.drop(columns=['code'], inplace=True)

# MSR additionally carries a row-number column that must not take part in
# duplicate detection
msr.drop(columns=['Unnamed: 0'], inplace=True)

# Stack both datasets and keep one copy of every fully identical row
alltbl = pd.concat([cvefixes, msr])
alltbl = alltbl.drop_duplicates()

# One row per cwe_id with the number of remaining samples in 'count'
cwecounts = (
    alltbl
    .groupby('cwe_id')
    .size()
    .reset_index(name='count')
)

# Persist the per-CWE counts
cwecounts.to_csv("cwecounts.csv", index=False)

# Read the CWE hierarchy description from disk
# (presumably a mapping of parent CWE id -> child CWE ids; confirm against the file)
with open("cwe_hierarchy.json", "r") as json_file:
    cwe_hierarchy = json.load(json_file)

# Define a function to split and reassign CWE IDs based on the hierarchy
def split_and_reassign_cwe_ids(cweiter, hierarchy):
    """Run one pass of the hierarchy split / roll-up over a CWE count table.

    For every ``(cwe_id, children)`` entry of *hierarchy*:

    * if there are at least two children and the summed ``count`` of the
      child rows reaches the module-level ``cutoff``, ``splitclass`` is
      called once per child;
    * otherwise the summed ``count`` of the child rows is added to the
      ``count`` of the row(s) whose ``cwe_id`` equals the parent.

    Args:
        cweiter: DataFrame with at least the columns ``cwe_id`` and
            ``count``. It is not modified; the pass works on a copy.
        hierarchy: Mapping of a CWE id to a sized collection of child ids
            (presumably lists loaded from the hierarchy JSON; confirm).

    Returns:
        A new DataFrame holding the table after this pass.

    NOTE(review): the roll-up branch adds the children's counts to the parent
    on every call, so calling this repeatedly keeps increasing parent counts;
    a caller that loops until the table stops changing may never terminate.
    NOTE(review): ``splitclass`` and ``cutoff`` are looked up at module level;
    ``splitclass`` is not defined in the visible part of the file.
    """
    # Work on a private copy: the roll-up below writes through .loc, which
    # would otherwise silently modify the caller's frame.
    cweiter = cweiter.copy()

    # Iterate through the hierarchy
    for cwe_id, children in hierarchy.items():
        # Select the child rows once; both branches need their total count
        child_rows = cweiter['cwe_id'].isin(children)
        child_total = cweiter.loc[child_rows, 'count'].sum()

        # Split when there are at least two children and their total count
        # reaches the cutoff
        if len(children) >= 2 and child_total >= cutoff:
            for child in children:
                # Split assignedclass for each child
                cweiter = splitclass(cweiter, child)
        else:
            # Roll the children's total count up into the parent node
            cweiter.loc[cweiter['cwe_id'] == cwe_id, 'count'] += child_total

    return cweiter

# Seed the working table from the per-CWE counts; every row starts out in
# the same class, "10000"
cweiter = cwecounts.copy()
cweiter['assignedclass'] = "10000"

# Re-run the hierarchy pass until a pass returns a table equal to its input.
# NOTE(review): the roll-up branch of split_and_reassign_cwe_ids adds child
# counts to the parent on every pass, so this loop may never reach a fixed
# point when such a parent row exists -- confirm the intended algorithm.
converged = False
while not converged:
    previous_cweiter = cweiter.copy()
    cweiter = split_and_reassign_cwe_ids(cweiter, cwe_hierarchy)
    converged = previous_cweiter.equals(cweiter)

# Distinct (assignedclass, count) pairs
aclass_stats = cweiter[['assignedclass', 'count']].drop_duplicates()

# Distinct (cwe_id, assignedclass) pairs: the old id -> new class mapping
cwechanges = cweiter[['cwe_id', 'assignedclass']].drop_duplicates()

# Attach the class counts to the mapping and keep only rows whose count
# reaches the cutoff
cwechanges = cwechanges.merge(aclass_stats, on='assignedclass', how='inner')
reaches_cutoff = cwechanges['count'] >= cutoff
cwechanges = cwechanges.loc[reaches_cutoff]

# Persist the old -> new mapping
cwechanges.to_csv("cwechanges.csv", index=False)

# Report the class statistics, largest count first
print(aclass_stats.sort_values('count', ascending=False))

# Report how many aclass_stats rows reach the cutoff
stats_at_cutoff = aclass_stats['count'] >= cutoff
print(int(stats_at_cutoff.sum()))

# Fraction of the total count that sits in rows below the cutoff
below_cutoff = cweiter['count'] < cutoff
dropped_total = cweiter.loc[below_cutoff, 'count'].sum()
overall_total = cweiter['count'].sum()
percent_samples_dropping = dropped_total / overall_total
print(percent_samples_dropping)
