In [None]:
import random

import pandas as pd

from faker import Faker

In [32]:
# Define the data generator function
def generate_mock_data(schema, num_records, title, seed=None):
    """
    Generate mock tabular data from a schema and write it to a CSV file.

    The file is written to the current working directory as
    ``{title}_mock_data.csv`` and a confirmation line is printed.

    Args:
        schema (dict): Maps each column name to a field definition, a dict
            with a required "type" key and an optional "options" dict.
            Supported types: int, decimal, varchar, date, email, phone, city,
            country, name, job, enum. Recognised options are "min"/"max" for
            int and decimal (defaults 0 and 100), "length" for varchar
            (default 10) and "options" (the list of allowed values) for enum.
        num_records (int): Number of rows to generate.
        title (str): Prefix of the output file name.
        seed (int, optional): When given, a private ``random.Random(seed)``
            and the Faker instance are both seeded so that repeated calls
            draw the same values, and the module-level ``random`` state is
            left untouched. When None (the default) values are drawn from the
            module-level ``random`` state.
            NOTE(review): "date" values come from Faker's
            ``date_this_decade()``, which presumably depends on today's date,
            so seeded output may still differ between days - confirm.

    Returns:
        None

    Raises:
        KeyError: If a field definition has no "type" key.
        ValueError: If a field has an unsupported type, or an enum field has
            no values to choose from.
    """
    output_file = f"{title}_mock_data.csv"
    fake = Faker()

    # Without a seed keep using the shared module-level generator; with a seed
    # use a private one so the caller's global random state is not disturbed.
    if seed is None:
        rng = random
    else:
        rng = random.Random(seed)
        fake.seed_instance(seed)

    # Supported field generators; each one receives the field's "options" dict
    generators = {
        # Integer field generator with optional min and max range
        "int": lambda x: rng.randint(x.get("min", 0), x.get("max", 100)),
        # Decimal field generator with optional min and max range
        "decimal": lambda x: round(
            rng.uniform(x.get("min", 0), x.get("max", 100)), 2
        ),
        # Variable-length text field generator
        "varchar": lambda x: fake.text(max_nb_chars=x.get("length", 10)),
        # Date field generator within the current decade
        "date": lambda _: fake.date_this_decade().strftime("%Y-%m-%d"),
        # Email address generator
        "email": lambda _: fake.email(),
        # Phone number generator
        "phone": lambda _: fake.phone_number(),
        # City name generator
        "city": lambda _: fake.city(),
        # Country name generator
        "country": lambda _: fake.country(),
        # Person name generator
        "name": lambda _: fake.name(),
        # Job name generator
        "job": lambda _: fake.job(),
        # Enum field generator using provided options
        "enum": lambda x: rng.choice(x.get("options", [])),
    }

    # Validate the whole schema before generating anything, so a bad field is
    # reported with its name instead of failing part-way through.
    plan = []
    for field, props in schema.items():
        field_type = props["type"]
        if field_type not in generators:
            raise ValueError(f"Unsupported field type: {props['type']}")
        # Treat a missing or explicit-None "options" entry as "no options".
        options = props.get("options") or {}
        if field_type == "enum" and not options.get("options"):
            raise ValueError(
                f"Enum field '{field}' requires a non-empty 'options' list"
            )
        plan.append((field, generators[field_type], options))

    # Generate one column of values per field
    data = {
        field: [generator(options) for _ in range(num_records)]
        for field, generator, options in plan
    }

    # Create DataFrame and export to CSV
    df = pd.DataFrame(data)
    df.to_csv(output_file, index=False, quotechar='"')
    print(f"Mock data successfully exported to file: {output_file}")

In [33]:
# Option lists shared by several schemas. They are defined once so the
# schemas cannot drift apart when a value is added or corrected.
_CITY_OPTIONS = (
    "NYC",
    "Tokyo",
    "London",
    "Paris",
    "Sydney",
    "Dubai",
    "Shanghai",
    "Mumbai",
    "LA",
    "Singapore",
    "Rome",
    "Istanbul",
    "Barcelona",
    "Berlin",
    "Rio de Janeiro",
)
_COUNTRY_OPTIONS = (
    "US",
    "Japan",
    "UK",
    "France",
    "Australia",
    "UAE",
    "China",
    "India",
    "Singapore",
    "Italy",
    "Turkey",
    "Spain",
    "Germany",
    "Brazil",
)
_DEPARTMENT_OPTIONS = (
    "IT",
    "HR",
    "Sales",
    "Marketing",
    "Finance",
    "Operations",
)
_STATUS_OPTIONS = ("Applied", "Interviewing", "Hired", "Rejected")


def _enum_field(options):
    """Return an enum field definition that picks from a copy of ``options``."""
    # A fresh list per call keeps the schemas free of shared mutable state.
    return {"type": "enum", "options": {"options": list(options)}}


# Example schema: all candidate, job and application columns in one flat table
schema = {
    "CandidateID": {"type": "int", "options": {"min": 1, "max": 434000}},
    "Name": {"type": "name"},
    "Email": {"type": "email"},
    "Phone": {"type": "phone"},
    "City": _enum_field(_CITY_OPTIONS),
    "Country": _enum_field(_COUNTRY_OPTIONS),
    "ExperienceYears": {"type": "int", "options": {"min": 0, "max": 20}},
    "AppliedDate": {"type": "date"},
    "JobID": {"type": "int", "options": {"min": 1, "max": 500}},
    "Title": {"type": "job"},
    "Department": _enum_field(_DEPARTMENT_OPTIONS),
    "Location": {"type": "city"},
    "SalaryRangeMin": {"type": "int", "options": {"min": 50000, "max": 100000}},
    "SalaryRangeMax": {"type": "int", "options": {"min": 100000, "max": 200000}},
    "PostedDate": {"type": "date"},
    "ApplicationID": {"type": "int", "options": {"min": 1, "max": 1000}},
    "Status": _enum_field(_STATUS_OPTIONS),
    "Score": {"type": "decimal", "options": {"min": 30, "max": 100}},
}

candidates_schema = {
    # Candidates Table
    "CandidateID": {"type": "int", "options": {"min": 1, "max": 434000}},
    "Name": {"type": "name"},
    "Email": {"type": "email"},
    "Phone": {"type": "phone"},
    "City": _enum_field(_CITY_OPTIONS),
    "Country": _enum_field(_COUNTRY_OPTIONS),
    "ExperienceYears": {"type": "decimal", "options": {"min": 0, "max": 20}},
    "AppliedDate": {"type": "date"},
}

jobs_schema = {
    # Jobs Table
    "JobID": {"type": "int", "options": {"min": 1, "max": 5000}},
    "Title": {"type": "job"},
    "Department": _enum_field(_DEPARTMENT_OPTIONS),
    "City": _enum_field(_CITY_OPTIONS),
    "SalaryRangeMin": {"type": "int", "options": {"min": 50000, "max": 100000}},
    "SalaryRangeMax": {"type": "int", "options": {"min": 100000, "max": 200000}},
    "PostedDate": {"type": "date"},
}

applications_schema = {
    # Applications Table
    "ApplicationID": {"type": "int", "options": {"min": 1, "max": 3000}},
    # NOTE(review): this range (1-1000) differs from the CandidateID range in
    # candidates_schema (1-434000), so most generated values will presumably
    # not match a candidate row - confirm the intended ranges.
    "CandidateID": {
        "type": "int",
        "options": {"min": 1, "max": 1000},
    },  # Links to Candidates.CandidateID
    # NOTE(review): this range (1-3500) differs from the JobID range in
    # jobs_schema (1-5000) - confirm the intended ranges.
    "JobID": {"type": "int", "options": {"min": 1, "max": 3500}},  # Links to Jobs.JobID
    "Status": _enum_field(_STATUS_OPTIONS),
    "Score": {"type": "decimal", "options": {"min": 30, "max": 100}},
}

In [34]:
# Usage example: write one 1000-row CSV per table.
# NOTE(review): "candiates" looks like a misspelling of "candidates". It is
# kept unchanged because it determines the output file name, which other code
# may read - confirm before renaming.
for table_schema, table_title in (
    (candidates_schema, "candiates"),
    (jobs_schema, "jobs"),
    (applications_schema, "applications"),
):
    generate_mock_data(table_schema, 1000, table_title)

Mock data successfully exported to file: candiates_mock_data.csv
Mock data successfully exported to file: jobs_mock_data.csv
Mock data successfully exported to file: applications_mock_data.csv
