In [17]:
from pathlib import Path
import pandas as pd
import os
import random

In [18]:
# Directory that is scanned for the `.java` source files of the corpus.
# Relative path: it is resolved against the kernel's working directory.
DATASET_PATH = Path("../datasets/fire14-source-code-training-dataset/java")
# Ground-truth qrel file; it is parsed as one pair of file names per line and
# every listed pair is labelled as plagiarism.
PAIRS_PATH = Path("../datasets/fire14-source-code-training-dataset/SOCO14-java.qrel")

In [19]:
def load_java_files(java_directory):
    """Collect the names of the Java sources found directly in a directory.

    Args:
        java_directory: Path of the directory to list (not recursive).

    Returns:
        A set with the name (no directory part) of every entry whose name
        ends with ``.java``. The suffix match is case-sensitive.
    """
    java_names = set()
    for entry_name in os.listdir(java_directory):
        if entry_name.endswith('.java'):
            java_names.add(entry_name)
    return java_names

def load_plagiarized_pairs(qrel_filepath):
    """Read the plagiarism ground truth from a qrel file.

    Every non-blank line must contain exactly two whitespace-separated file
    names. Blank or whitespace-only lines (for example a trailing empty line)
    are ignored.

    Args:
        qrel_filepath: Path of the qrel file to read.

    Returns:
        A set of 2-tuples of file names. Each tuple is sorted, so a pair is
        stored once regardless of the order in which it was written.

    Raises:
        ValueError: If a non-blank line does not hold exactly two fields.
    """
    plagiarized_pairs = set()
    with open(qrel_filepath, 'r') as f:
        for line_number, line in enumerate(f, start=1):
            fields = line.split()
            if not fields:
                # Tolerate blank lines instead of failing on tuple unpacking.
                continue
            if len(fields) != 2:
                raise ValueError(
                    f"{qrel_filepath}:{line_number}: expected 2 file names, "
                    f"found {len(fields)}: {line.strip()!r}"
                )
            # Canonical (sorted) order makes membership tests order-independent.
            plagiarized_pairs.add(tuple(sorted(fields)))
    return plagiarized_pairs

def create_dataset(java_directory, qrel_filepath, negative_ratio=5, seed=None):
    """Build a labelled table of file pairs for plagiarism detection.

    Every plagiarized pair whose two files both exist in ``java_directory``
    becomes one positive row (``plagio == 1``). For each positive row, up to
    ``negative_ratio`` negative rows (``plagio == 0``) are added, pairing the
    first file of the positive pair with randomly chosen files that are not
    listed as plagiarized with either member of that pair.

    Negatives are drawn without replacement and are never repeated across the
    whole dataset, so the result contains no duplicated negative pair. When
    fewer than ``negative_ratio`` eligible files exist, all of them are used.

    Args:
        java_directory: Directory holding the ``.java`` files.
        qrel_filepath: qrel file listing the plagiarized pairs.
        negative_ratio: Maximum number of negatives per positive pair.
            Values <= 0 produce no negatives.
        seed: Optional seed for a private random generator. With the default
            ``None`` the global ``random`` module is used, so a prior
            ``random.seed(...)`` call still controls the sampling.

    Returns:
        A ``pandas.DataFrame`` with columns ``id1``, ``id2`` and ``plagio``.
    """
    files = load_java_files(java_directory)
    plagiarized_pairs = load_plagiarized_pairs(qrel_filepath)

    rng = random if seed is None else random.Random(seed)

    # Sets iterate in a hash-dependent order; sorting makes the output depend
    # only on the inputs and the RNG state.
    all_files = sorted(files)

    # file name -> every file it is listed as plagiarized with.
    partners = {}
    for left, right in plagiarized_pairs:
        partners.setdefault(left, set()).add(right)
        partners.setdefault(right, set()).add(left)

    data = []
    used_negatives = set()  # canonical (sorted) negative pairs already emitted

    for file1, file2 in sorted(plagiarized_pairs):
        if file1 not in files or file2 not in files:
            # The qrel references a file that is missing from the directory.
            continue
        data.append((file1, file2, 1))

        # Scans every file per positive pair; fine for a corpus of this size.
        excluded = partners[file1] | partners[file2] | {file1, file2}
        candidates = [
            name for name in all_files
            if name not in excluded
            and tuple(sorted((file1, name))) not in used_negatives
        ]
        rng.shuffle(candidates)

        negatives_added = 0
        for name in candidates:
            if negatives_added >= negative_ratio:
                break
            used_negatives.add(tuple(sorted((file1, name))))
            data.append((file1, name, 0))
            negatives_added += 1

    return pd.DataFrame(data, columns=["id1", "id2", "plagio"])

In [20]:
# One seed drives both the global `random` module (which create_dataset draws
# negatives from by default) and the pandas shuffle, so a re-run does not
# reshuffle the label file arbitrarily.
SEED = 42
OUTPUT_PATH = Path('../labels/fire14-labels.csv')

random.seed(SEED)
dataset = create_dataset(DATASET_PATH, PAIRS_PATH, negative_ratio=5)
# Shuffle the rows so a positive pair is not always followed by its negatives.
dataset = dataset.sample(frac=1, random_state=SEED).reset_index(drop=True)

# to_csv does not create missing directories; make sure the target exists.
OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)
dataset.to_csv(OUTPUT_PATH, index=False)
print(f"Dataset creado con {len(dataset)} pares.")

Dataset creado con 504 pares.
