In [32]:
# pip install charmap

In [33]:
import pandas as pd
import json
from sklearn.model_selection import train_test_split

In [34]:
def save_json_without_escape(filepath, dataframe):
    with open(filepath, 'w', encoding="utf-8") as f:
        json.dump(dataframe.to_dict(orient='records'), f, ensure_ascii=False, indent=4)
        
# Load your dataset
with open('dataset/haveMD_302.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

In [35]:

df = pd.DataFrame(data)

# Categories of interest
categories_of_interest = {"Generic policy", "Reporting mechanism", "Scope of practice", "User guideline"}

splits = {}
for category in categories_of_interest:
    df[f'has_{category.replace(" ", "_").lower()}'] = df['SecurityPolicy_content_category'].apply(
        lambda x: category in x if isinstance(x, list) else False
    )
    
    X = df.drop(f'has_{category.replace(" ", "_").lower()}', axis=1)
    y = df[f'has_{category.replace(" ", "_").lower()}']
    
    # Stratified train-test split 70-30
    train_df, temp_df = train_test_split(df, test_size=0.3, stratify=y, random_state=42)
    
    # Split temp_df into validation and test sets 20-10
    validation_df, test_df = train_test_split(temp_df, test_size=(1/3), stratify=temp_df[f'has_{category.replace(" ", "_").lower()}'], random_state=42)
    
    splits[category] = {"train": train_df, "val": validation_df, "test": test_df}
    
    # Print the counts and distributions for verification
    print(f"Category: {category}\n")
    print("Train dataset:")
    print(train_df[f'has_{category.replace(" ", "_").lower()}'].value_counts(normalize=True))
    print()
    
    print("Validation dataset:")
    print(validation_df[f'has_{category.replace(" ", "_").lower()}'].value_counts(normalize=True))
    print()
    
    print("Test dataset:")
    print(test_df[f'has_{category.replace(" ", "_").lower()}'].value_counts(normalize=True))
    print("-" * 40)

# Save processed data
for category, datasets in splits.items():
    for split_name, dataset in datasets.items():
        save_json_without_escape(f"set/{category.replace(' ', '_')}_{split_name}.json", dataset)

print("Data preprocessing complete. JSON files saved.")


Category: Generic policy

Total repositories in dataset: 302
Train dataset:
has_generic_policy
True     0.687204
False    0.312796
Name: proportion, dtype: float64
Total projects in Training set: 211

Validation dataset:
has_generic_policy
True     0.683333
False    0.316667
Name: proportion, dtype: float64
Total projects in Validation set: 60

Test dataset:
has_generic_policy
True     0.677419
False    0.322581
Name: proportion, dtype: float64
Total projects in Test set: 31
----------------------------------------
Category: Scope of practice

Total repositories in dataset: 302
Train dataset:
has_scope_of_practice
False    0.587678
True     0.412322
Name: proportion, dtype: float64
Total projects in Training set: 211

Validation dataset:
has_scope_of_practice
False    0.583333
True     0.416667
Name: proportion, dtype: float64
Total projects in Validation set: 60

Test dataset:
has_scope_of_practice
False    0.580645
True     0.419355
Name: proportion, dtype: float64
Total projects in 

In [37]:
from collections import Counter

def read_json(filepath):
    with open(filepath, 'r', encoding='utf-8') as f:
        return pd.DataFrame(json.load(f))

categories_of_interest = {"Generic policy", "Reporting mechanism", "Scope of practice", "User guideline"}

# Binary encoding for each category
for category in categories_of_interest:
    train_df = read_json(f'set/{category.replace(' ', '_')}_train.json')
    validation_df = read_json(f'set/{category.replace(' ', '_')}_val.json')
    test_df = read_json(f'set/{category.replace(' ', '_')}_test.json')

datasets = {"Train": train_df, "Validation": validation_df, "Test": test_df}

# Process each set
for dataset_name, dataset in datasets.items():
    # Total number of packages
    total_packages = len(dataset)
    print(f"\n{dataset_name} Dataset: Total Packages = {total_packages}")
    
    # Count packages for each category
    category_counts = Counter(
        category for categories in dataset['SecurityPolicy_content_category'] for category in categories
    )
    print(f"Category Counts in {dataset_name} Dataset:")
    for category, count in category_counts.items():

        print(f"{category}: {count}")
    


Train Dataset: Total Packages = 211
Category Counts in Train Dataset:
Generic policy: 145
Scope of practice: 93
Reporting mechanism: 202
User guideline: 87
Projects practice: 16
History of vulnerability: 8
Information on maintainer: 4
Additional information: 4

Validation Dataset: Total Packages = 60
Category Counts in Validation Dataset:
Generic policy: 44
Scope of practice: 22
Reporting mechanism: 57
Additional information: 2
User guideline: 21
Projects practice: 3

Test Dataset: Total Packages = 31
Category Counts in Test Dataset:
Generic policy: 21
User guideline: 5
Reporting mechanism: 23
Scope of practice: 11
Projects practice: 1
Additional information: 1
