Load Real Resumes

In [3]:
import json
import os

def extract_text_recursive(data):
    """
    Flatten an arbitrarily nested JSON-like value into one string.

    Dict values and list items are visited in order and their flattened
    text is joined with single spaces. A string is returned unchanged.
    Any other value (numbers, booleans, None) is rendered with ``str()``,
    so a JSON ``null`` contributes the literal text "None".
    """
    if isinstance(data, str):
        return data

    if isinstance(data, dict):
        children = data.values()
    elif isinstance(data, list):
        children = data
    else:
        # Leaf that is not text: keep it by stringifying it.
        return str(data)

    return " ".join(map(extract_text_recursive, children))

def load_real_resumes_from_json(folder_path):
    """
    Load every ``.json`` file in ``folder_path`` as a human-written resume.

    Args:
        folder_path: Directory to scan (sub-directories are not traversed).

    Returns:
        A list of ``{"text": <flattened resume text>, "label": 0}`` dicts,
        one per successfully parsed file, ordered by file name.

    Files that are not valid JSON are reported on stdout and skipped.
    """
    real_resumes = []
    # os.listdir() returns entries in arbitrary order; sort them so the
    # resulting dataset order is the same on every run.
    for file_name in sorted(os.listdir(folder_path)):
        if not file_name.endswith(".json"):  # Only process JSON files
            continue
        file_path = os.path.join(folder_path, file_name)
        with open(file_path, "r", encoding="utf-8") as file:
            try:
                data = json.load(file)
            except json.JSONDecodeError:
                print(f"Error decoding JSON in file {file_path}. Skipping this file.")
                continue
        # Combine all values of the parsed document into a single text field
        resume_text = extract_text_recursive(data)
        real_resumes.append({"text": resume_text, "label": 0})  # Label 0 for human-written
    return real_resumes

# Directory holding the human-written resume JSON files
REAL_RESUMES_DIR = "D:/Lusak.tech/Dataset"

real_resumes = load_real_resumes_from_json(REAL_RESUMES_DIR)
print(f"Loaded {len(real_resumes)} real resumes.")

Loaded 26 real resumes.


Load Fake Resumes

In [6]:
import os
import json


def extract_text_from_dict(data):
    """
    Collect every string found in a nested dict/list structure.

    Dict values and list items are walked depth-first in their original
    order. Only ``str`` leaves are kept; any other leaf (numbers, None,
    booleans, ...) is ignored. Returns the strings as a flat list.
    """
    collected = []
    # Explicit stack instead of recursion; children are pushed in reverse
    # so that they are popped (and therefore visited) in original order.
    pending = [data]
    while pending:
        node = pending.pop()
        if isinstance(node, str):
            collected.append(node)
        elif isinstance(node, dict):
            pending.extend(reversed(list(node.values())))
        elif isinstance(node, list):
            pending.extend(reversed(node))
    return collected


def load_fake_resumes_from_json(folder_path):
    """
    Load every ``.json`` file in ``folder_path`` as an AI-generated resume.

    Args:
        folder_path: Directory to scan (sub-directories are not traversed).

    Returns:
        A list of ``{"text": <space-joined resume strings>, "label": 1}``
        dicts, one per successfully parsed file, ordered by file name.

    Files that are not valid JSON are reported on stdout and skipped, the
    same way the real-resume loader treats them, so a single corrupt file
    no longer aborts the whole load.
    """
    fake_resumes = []
    # os.listdir() returns entries in arbitrary order; sort them so the
    # resulting dataset order is the same on every run.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith(".json"):  # Process JSON files only
            continue
        file_path = os.path.join(folder_path, filename)
        try:
            with open(file_path, "r", encoding="utf-8") as file:
                data = json.load(file)
        except json.JSONDecodeError:
            print(f"Error decoding JSON in file {file_path}. Skipping this file.")
            continue
        # Extract all text content from the JSON structure
        resume_text = " ".join(extract_text_from_dict(data))
        fake_resumes.append({"text": resume_text, "label": 1})  # Label 1 for AI-generated
    return fake_resumes


# Directory holding the generated (fake) resume JSON files
FAKE_RESUMES_DIR = "D:/Lusak.tech/jsonresume-fake-master/jsonresume-fake-master/resumes"

fake_resumes = load_fake_resumes_from_json(FAKE_RESUMES_DIR)
print(f"Loaded {len(fake_resumes)} fake resumes.")


Loaded 339 fake resumes.


Combine Datasets

In [8]:
import random

# Fixed seed so the shuffled order is the same on every run
SHUFFLE_SEED = 42

# Concatenate into a new list; real_resumes and fake_resumes are left untouched
combined_resumes = real_resumes + fake_resumes
print(f"Total resumes: {len(combined_resumes)}")

# Shuffle in place with a dedicated, seeded RNG so the two labels are
# interleaved reproducibly without touching the global `random` state.
random.Random(SHUFFLE_SEED).shuffle(combined_resumes)

Total resumes: 365


Save Combined Dataset


In [9]:
import pandas as pd

def save_combined_dataset(resumes: list, output_file: str):
    """
    Write the resume records to a UTF-8 encoded CSV file.

    Args:
        resumes: Records to store; passed directly to ``pd.DataFrame``.
        output_file: Destination path of the CSV file.

    The DataFrame index is not written. A confirmation message is printed
    once the file has been saved.
    """
    pd.DataFrame(resumes).to_csv(output_file, encoding="utf-8", index=False)
    print(f"Combined dataset saved to {output_file}")

# Destination file for the shuffled, labelled dataset
OUTPUT_CSV = "combined_resumes.csv"

save_combined_dataset(combined_resumes, OUTPUT_CSV)

Combined dataset saved to combined_resumes.csv
