In [4]:
import pandas as pd
import random
from sklearn.model_selection import train_test_split
import json
from collections import Counter

In [3]:
# Load the raw package records. The file is decoded as UTF-8 explicitly so the
# result does not depend on the platform's default locale encoding.
# NOTE(review): the directory name 'datset' looks like a typo for 'dataset' —
# confirm against the repository layout before changing the path.
with open('datset/haveMD_302.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

df = pd.DataFrame(data)

# Policy categories this notebook keeps and later draws labels from.
categories_of_interest = {"Generic policy", "Reporting mechanism", "Scope of practice", "User guideline"}

# Filter rows where `SecurityPolicy_content_category` contains any of the categories of interest.
# `.copy()` gives an independent frame, so adding columns later does not touch `df`.
df_filtered = df[df['SecurityPolicy_content_category'].apply(
    lambda x: any(category in categories_of_interest for category in x)
)].copy()


def assign_random_category(categories, valid_categories, rng=None):
    """Pick one label from ``categories`` that also appears in ``valid_categories``.

    Args:
        categories: Iterable of hashable category labels for one row.
        valid_categories: Iterable of labels that are eligible to be chosen.
        rng: Optional object exposing ``choice(seq)`` (for example a
            ``random.Random`` instance). When omitted, the module-level
            ``random`` generator is used.

    Returns:
        One of the labels shared by both inputs, or ``None`` when they do not
        overlap.
    """
    # Sort the overlap before choosing: iteration order of a set of strings can
    # differ between interpreter runs (hash randomisation), so an unsorted list
    # would make the pick irreproducible even with a seeded generator.
    # `key=str` keeps the sort from failing on labels of mixed types.
    candidates = sorted(set(categories) & set(valid_categories), key=str)
    if not candidates:
        return None
    chooser = random if rng is None else rng
    return chooser.choice(candidates)

# Seed the standard-library generator before drawing labels: `train_test_split`
# is already pinned through `random_state`, but the `random`-based choice made
# inside `assign_random_category` would otherwise draw a different sequence on
# every run.
random.seed(42)

# Give every row a single label to stratify on (a row can list several categories).
df_filtered['assigned_category'] = df_filtered['SecurityPolicy_content_category'].apply(
    lambda x: assign_random_category(x, categories_of_interest))

# Drop rows for which no label could be assigned.
df_filtered = df_filtered[df_filtered['assigned_category'].notnull()]

# stratified train-test split 70-30
train_df, temp_df = train_test_split(
    df_filtered, test_size=0.3, stratify=df_filtered['assigned_category'], random_state=42
)

# split the 30% holdout 2:1 into validation and test sets (20% / 10% of the filtered rows)
validation_df, test_df = train_test_split(
    temp_df, test_size=(1/3), stratify=temp_df['assigned_category'], random_state=42
)

# `assigned_category` only served as the stratification key; remove it from each split.
train_df = train_df.drop(columns=['assigned_category'])
validation_df = validation_df.drop(columns=['assigned_category'])
test_df = test_df.drop(columns=['assigned_category'])

def save_json_without_escape(filepath, dataframe):
    """Write ``dataframe`` to ``filepath`` as a JSON array of row records.

    Non-ASCII characters are written as-is (``ensure_ascii=False``) instead of
    being turned into ASCII escape sequences, and the output is indented by
    four spaces.

    Args:
        filepath: Destination path; the file is opened in write mode.
        dataframe: Object whose ``to_dict(orient='records')`` yields the rows
            to serialise.
    """
    # Unescaped non-ASCII text needs an encoding that can represent it, so pin
    # UTF-8 instead of relying on the platform's locale default.
    with open(filepath, 'w', encoding='utf-8') as f:
        json.dump(dataframe.to_dict(orient='records'), f, ensure_ascii=False, indent=4)

# Persist each split under its own file name, in train / validation / test order.
for out_path, split_frame in (
    ('train.json', train_df),
    ('validation.json', validation_df),
    ('test.json', test_df),
):
    save_json_without_escape(out_path, split_frame)


In [5]:
def read_json(filepath):
    """Load the JSON document at ``filepath`` into a pandas DataFrame.

    Args:
        filepath: Path to a JSON file whose parsed content ``pd.DataFrame``
            can consume (for example a list of row objects).

    Returns:
        A DataFrame built from the parsed JSON.
    """
    # Decode as UTF-8 explicitly rather than with the platform's locale default.
    with open(filepath, 'r', encoding='utf-8') as f:
        return pd.DataFrame(json.load(f))

# Reload the three splits from the JSON files on disk.
train_df = read_json('train.json')
validation_df = read_json('validation.json')
test_df = read_json('test.json')

datasets = dict(Train=train_df, Validation=validation_df, Test=test_df)

# Report the size and per-category tallies of each split.
for dataset_name, dataset in datasets.items():
    total_packages = len(dataset)
    print(f"\n{dataset_name} Dataset: Total Packages = {total_packages}")

    # Walk every row's category list and tally each label it contains.
    category_counts = Counter()
    for row_categories in dataset['SecurityPolicy_content_category']:
        for label in row_categories:
            category_counts[label] += 1

    print(f"Category Counts in {dataset_name} Dataset:")
    for category, count in category_counts.items():
        print(f"{category}: {count}")


Train Dataset: Total Packages = 196
Category Counts in Train Dataset:
Reporting mechanism: 195
Generic policy: 148
Scope of practice: 87
User guideline: 84
History of vulnerability: 4
Information on maintainer: 3
Projects practice: 15
Additional information: 7

Validation Dataset: Total Packages = 56
Category Counts in Validation Dataset:
Reporting mechanism: 58
Generic policy: 43
Scope of practice: 26
User guideline: 17
History of vulnerability: 2
Projects practice: 4

Test Dataset: Total Packages = 28
Category Counts in Test Dataset:
Reporting mechanism: 29
Generic policy: 19
User guideline: 12
Scope of practice: 13
History of vulnerability: 2
Information on maintainer: 1
Projects practice: 1
