<a href="https://colab.research.google.com/github/shezankazi/linkedin_stuff/blob/main/create_mock_candidate_company_job_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Install & Import Dependencies

In [None]:
!pip install google-generativeai google-cloud-bigquery google-ai-generativelanguage

In [None]:
import json
import uuid
import google.generativeai as genai
from google.ai.generativelanguage_v1beta.types import content
from google.cloud import bigquery


Configure the Gemini API

In [None]:
# Configure the Gemini API key.
# Replace the placeholder below with a real key before running this cell.
# NOTE(review): prefer loading the key from a secret store or environment
# variable so a real key is never committed with the notebook.
genai.configure(api_key="[YOUR API KEY FOR GEMINI]")


Define Data Generation Functions

In [None]:
def generate_companies(num_companies=50, filename='companies.json'):
    """
    Generate synthetic companies using a Gemini model and save them to a JSON file.

    One chat message is sent per requested company. Every successfully parsed
    record is appended to the result list and the whole list is rewritten to
    ``filename`` immediately, so partial progress survives an interrupted run.
    A request that fails (API error, invalid JSON, write error) is reported
    and skipped; it does not abort the remaining requests.

    Args:
        num_companies: Number of generation requests to send.
        filename: Path of the JSON file that receives the list of records.

    Returns:
        The list of successfully generated company dicts. Each dict carries an
        added integer "id" equal to the 1-based request number, so ids may
        have gaps when individual requests fail.
    """
    # Define the response schema for companies.
    company_schema = content.Schema(
        type=content.Type.OBJECT,
        required=[
            "name", "website", "city", "street", "postal_code", "country",
            "industry", "company_description", "fte_size", "revenue_size"
        ],
        properties={
            "name": content.Schema(type=content.Type.STRING),
            "website": content.Schema(type=content.Type.STRING),
            "city": content.Schema(type=content.Type.STRING),
            "street": content.Schema(type=content.Type.STRING),
            "postal_code": content.Schema(type=content.Type.STRING),
            "country": content.Schema(type=content.Type.STRING),
            "industry": content.Schema(type=content.Type.STRING),
            "company_description": content.Schema(type=content.Type.STRING),
            "fte_size": content.Schema(type=content.Type.STRING),
            "revenue_size": content.Schema(type=content.Type.STRING),
        }
    )

    company_generation_config = {
        "temperature": 1,
        "top_p": 0.95,
        "top_k": 40,
        "max_output_tokens": 8192,
        "response_schema": company_schema,
        "response_mime_type": "application/json",
    }

    company_system_instruction = (
        "You are a data analyst working for Randstad. You need to create synthetic datasets for GDPR compliant UAT testing.\n\n"
        "Generate a synthetic company record for one of the 50 biggest companies in the logistics sector in the Netherlands.\n\n"
        "Schema:\n"
        "  id: INTEGER <-- primary key\n"
        "  name: STRING <-- company legal name\n"
        "  website: STRING\n"
        "  city: STRING <-- head office address\n"
        "  street: STRING\n"
        "  postal_code: STRING\n"
        "  country: STRING\n"
        "  industry: STRING <-- main industry they operate in\n"
        "  company_description: STRING <-- detailed description (at least 100 words)\n"
        "  fte_size: STRING <-- number of employees\n"
        "  revenue_size: STRING <-- annual revenues\n\n"
        "Ensure that the company is located in the Netherlands and is in the logistics sector."
    )

    # Create the model instance for companies.
    company_model = genai.GenerativeModel(
        model_name="gemini-2.0-flash",
        generation_config=company_generation_config,
        system_instruction=company_system_instruction,
    )

    # Start a chat session seeded with one example exchange showing the
    # expected (empty) record layout.
    company_chat_session = company_model.start_chat(
        history=[
            {"role": "user", "parts": ["Create a synthetic company record in the logistics sector in the Netherlands according to the provided schema."]},
            {"role": "model", "parts": ["`json\n{\n  \"name\": \"\",\n  \"website\": \"\",\n  \"city\": \"\",\n  \"street\": \"\",\n  \"postal_code\": \"\",\n  \"country\": \"\",\n  \"industry\": \"\",\n  \"company_description\": \"\",\n  \"fte_size\": \"\",\n  \"revenue_size\": \"\"\n}`"]}
        ]
    )

    companies = []
    print(f"Generating {num_companies} companies...")
    for number in range(1, num_companies + 1):
        prompt_text = f"Generate synthetic data for a large logistics company in the Netherlands. This is company number {number}."
        try:
            # The API call lives inside the try so that a single failed
            # request is reported and skipped instead of aborting the run.
            response = company_chat_session.send_message(prompt_text)
            company_record = json.loads(response.text)
            company_record["id"] = number  # Sequential primary key.
            companies.append(company_record)
            # Save to file after each generation (checkpoint of all records so far).
            with open(filename, 'w', encoding='utf-8') as jsonfile:
                json.dump(companies, jsonfile, indent=4, ensure_ascii=False)
            print(f"Generated and saved company {number}")
        except json.JSONDecodeError as e:
            # Only json.loads raises this, so `response` is bound here.
            print(f"Error decoding JSON for company {number}: {e}\nResponse: {response.text}")
        except Exception as e:
            print(f"Unexpected error during company {number}: {e}")

    # Report what was actually produced, not what was requested.
    print(f"Finished generating {len(companies)} of {num_companies} companies. Data saved to '{filename}'.")
    return companies


def generate_jobs(companies, filename='jobs.json', jobs_per_category=10):
    """
    Generate synthetic job postings for each company and save them to a JSON file.

    For each company, ``jobs_per_category`` logistics and ``jobs_per_category``
    non-logistics job postings are requested (10 + 10 by default). Every
    successfully parsed posting is appended to the result list and the whole
    list is rewritten to ``filename`` immediately. A request that fails (API
    error, invalid JSON, write error) is reported and skipped.

    Args:
        companies: Iterable of company dicts; each dict's "id" value is stored
            in the "company" field of the jobs generated for it.
        filename: Path of the JSON file that receives the list of job records.
        jobs_per_category: Number of postings requested per category
            (logistics / non-logistics) for each company.

    Returns:
        The list of generated job dicts, each with an added sequential "id"
        (1-based, no gaps) and a "company" field.
    """
    # Define the response schema for job postings.
    job_schema = content.Schema(
        type=content.Type.OBJECT,
        required=["title", "requirements", "tasks", "intro", "benefits", "others"],
        properties={
            "title": content.Schema(type=content.Type.STRING),
            "requirements": content.Schema(type=content.Type.STRING),
            "tasks": content.Schema(type=content.Type.STRING),
            "intro": content.Schema(type=content.Type.STRING),
            "benefits": content.Schema(type=content.Type.STRING),
            "others": content.Schema(type=content.Type.STRING),
        }
    )

    job_generation_config = {
        "temperature": 1,
        "top_p": 0.95,
        "top_k": 40,
        "max_output_tokens": 8192,
        "response_schema": job_schema,
        "response_mime_type": "application/json",
    }

    job_system_instruction = (
        "You are a data analyst working for Randstad. You need to create synthetic job postings for GDPR compliant UAT testing.\n\n"
        "Generate a synthetic job posting according to the schema below:\n"
        "  id: INTEGER <-- primary key\n"
        "  company: INTEGER <-- foreign key to company table (company id)\n"
        "  title: STRING <-- job title\n"
        "  requirements: STRING <-- job requirements\n"
        "  tasks: STRING <-- job tasks\n"
        "  intro: STRING <-- introductory text\n"
        "  benefits: STRING\n"
        "  others: STRING <-- additional information\n\n"
        "Ensure the job posting is realistic and detailed. For logistics roles, focus on operational, warehouse, and transportation tasks; for non-logistics roles, focus on corporate, administrative, or specialized business tasks. The location is in the Netherlands."
    )

    # Create the model instance for jobs.
    job_model = genai.GenerativeModel(
        model_name="gemini-2.0-flash",
        generation_config=job_generation_config,
        system_instruction=job_system_instruction,
    )

    # Example exchange used to seed every chat session.
    seed_history = [
        {"role": "user", "parts": ["Create a synthetic job posting according to the provided schema for a company based in the Netherlands."]},
        {"role": "model", "parts": ["`json\n{\n  \"title\": \"\",\n  \"requirements\": \"\",\n  \"tasks\": \"\",\n  \"intro\": \"\",\n  \"benefits\": \"\",\n  \"others\": \"\"\n}`"]}
    ]

    # (label used in prompts/log lines, role description, focus sentence)
    job_categories = (
        (
            "logistics",
            "logistics role",
            "Focus on operational, warehouse, and transportation-related tasks.",
        ),
        (
            "non-logistics",
            "non-logistics role (e.g., HR, Finance, Marketing, IT)",
            "Focus on corporate, administrative, or specialized business tasks.",
        ),
    )

    jobs = []
    print("Generating jobs for each company...")
    for company in companies:
        company_id = company.get("id")
        # A fresh session per company keeps the chat history from growing
        # across the whole run; postings for one company still share context.
        job_chat_session = job_model.start_chat(history=list(seed_history))

        for label, role, focus in job_categories:
            for number in range(1, jobs_per_category + 1):
                prompt_text = (
                    f"Generate a synthetic job posting for a {role} at a logistics company in the Netherlands. "
                    f"This is {label} job number {number} for company id {company_id}. {focus}"
                )
                try:
                    # The API call lives inside the try so that one failed
                    # request is reported and skipped instead of aborting.
                    response = job_chat_session.send_message(prompt_text)
                    job_record = json.loads(response.text)
                    # Ids are assigned only to stored records, so they stay gap-free.
                    job_record["id"] = len(jobs) + 1
                    job_record["company"] = company_id
                    jobs.append(job_record)
                    # Checkpoint all jobs generated so far.
                    with open(filename, 'w', encoding='utf-8') as jsonfile:
                        json.dump(jobs, jsonfile, indent=4, ensure_ascii=False)
                    print(f"Generated and saved {label} job {number} for company {company_id}")
                except json.JSONDecodeError as e:
                    # Only json.loads raises this, so `response` is bound here.
                    print(f"Error decoding JSON for {label} job {number} (company {company_id}): {e}\nResponse: {response.text}")
                except Exception as e:
                    print(f"Unexpected error for {label} job {number} (company {company_id}): {e}")

    print(f"Finished generating jobs. Total jobs: {len(jobs)}. Data saved to '{filename}'.")
    return jobs


def generate_candidate_profiles(num_profiles=20, filename='candidate_profiles.json'):
    """
    Generate synthetic candidate profiles using a Gemini model and save them to a JSON file.
    The candidate schema has been updated to use new key names.
    """
    candidate_generation_config = {
        "temperature": 1,
        "top_p": 0.95,
        "top_k": 40,
        "max_output_tokens": 8192,
        "response_schema": content.Schema(
            type=content.Type.OBJECT,
            required=[
                "name__givenName", "name__familyName", "name__prefix",
                "address__postalCode", "address__city", "address__street_number",
                "address__street", "phone", "email", "current_role", "cv__raw"
            ],
            properties={
                "name__givenName": content.Schema(type=content.Type.STRING),
                "name__familyName": content.Schema(type=content.Type.STRING),
                "name__prefix": content.Schema(type=content.Type.STRING),
                "address__postalCode": content.Schema(type=content.Type.STRING),



Define BigQuery Upload Functions

In [None]:
# Update these constants with your project and dataset IDs.
# PROJECT_ID is passed to bigquery.Client in upload_data_to_bigquery();
# DATASET_ID is the dataset part of every table ID built by get_table_ids().
PROJECT_ID = "[YOUR PROJECT ID]"
DATASET_ID = "[YOUR DATASET ID]"

def get_table_ids(client):
    """Map each logical table name to its fully-qualified BigQuery table ID.

    The project comes from ``client.project`` and the dataset from the
    module-level ``DATASET_ID``; every table name gets a ``__mock`` suffix.

    Returns:
        Dict with the keys "companies", "jobs" and "candidates".
    """
    dataset_prefix = f"{client.project}.{DATASET_ID}"
    table_ids = {}
    for logical_name in ("companies", "jobs", "candidates"):
        table_ids[logical_name] = f"{dataset_prefix}.{logical_name}__mock"
    return table_ids

def get_schemas():
    """Build the BigQuery schemas for the three mock tables.

    Returns:
        A ``(companies_schema, jobs_schema, candidates_schema)`` tuple, each
        element being a list of ``bigquery.SchemaField`` objects.
    """
    def build(column_specs):
        # Each spec is a (column name, BigQuery type, mode) triple.
        return [
            bigquery.SchemaField(column, column_type, mode=column_mode)
            for column, column_type, column_mode in column_specs
        ]

    def strings(columns, column_mode):
        # Shorthand for a run of STRING columns sharing one mode.
        return [(column, "STRING", column_mode) for column in columns]

    company_specs = [
        ("id", "INTEGER", "REQUIRED"),
        ("name", "STRING", "REQUIRED"),
    ] + strings(
        (
            "website", "city", "street", "postal_code", "country",
            "industry", "company_description", "fte_size", "revenue_size",
        ),
        "NULLABLE",
    )

    job_specs = [
        ("id", "INTEGER", "REQUIRED"),
        ("company", "INTEGER", "REQUIRED"),
    ] + strings(
        ("title", "requirements", "tasks", "intro", "benefits", "others"),
        "NULLABLE",
    )

    # Every candidate column is a required string.
    candidate_specs = strings(
        (
            "id", "cv__raw", "email", "phone", "current_role",
            "name__givenName", "name__familyName", "name__prefix",
            "address__postalCode", "address__city",
            "address__street_number", "address__street",
        ),
        "REQUIRED",
    )

    return build(company_specs), build(job_specs), build(candidate_specs)

def create_table_if_not_exists(client, table_id, schema):
    """Create a table with the specified schema if it doesn't exist.

    Args:
        client: BigQuery client used for the lookup and the creation.
        table_id: Fully-qualified ID of the table.
        schema: List of ``bigquery.SchemaField`` used when the table is created.

    Raises:
        Any error from ``client.get_table`` other than "not found" (for
        example permission or connectivity errors) propagates to the caller,
        as does any error from ``client.create_table``.
    """
    # Imported locally so this function brings its own dependency into scope.
    from google.cloud.exceptions import NotFound

    try:
        client.get_table(table_id)
    except NotFound:
        # Only a missing table should trigger creation; other failures must
        # surface instead of being mistaken for "table does not exist".
        table = bigquery.Table(table_id, schema=schema)
        client.create_table(table)
        print(f"Created table {table_id}.")
    else:
        print(f"Table {table_id} already exists.")

def insert_rows(client, table_id, rows, label):
    """Insert rows into a BigQuery table and print the result.

    Args:
        client: BigQuery client providing ``insert_rows_json``.
        table_id: Fully-qualified ID of the destination table.
        rows: JSON-serialisable row dicts to insert.
        label: Human-readable name of the data set, used in the printed messages.
    """
    if not rows:
        # Nothing to send: skip the API call rather than issuing an empty
        # streaming-insert request.
        print(f"No {label} to insert; skipping.")
        return

    errors = client.insert_rows_json(table_id, rows)
    if errors:
        print(f"Errors inserting {label}:", errors)
    else:
        print(f"Inserted {label} successfully.")

def load_json(file_path):
    """Read the UTF-8 encoded file at ``file_path`` and return its parsed JSON content."""
    with open(file_path, "r", encoding="utf-8") as handle:
        raw_text = handle.read()
    return json.loads(raw_text)

def transform_candidates(data):
    """Project raw candidate dicts onto the candidates table columns.

    Each output row gets a freshly generated UUID4 string as "id"; every other
    column is copied from the input dict and is ``None`` when the key is absent.
    Keys that are not table columns are dropped.
    """
    copied_columns = (
        "cv__raw",
        "email",
        "phone",
        "current_role",
        "name__givenName",
        "name__familyName",
        "name__prefix",
        "address__postalCode",
        "address__city",
        "address__street_number",
        "address__street",
    )
    rows = []
    for record in data:
        row = {"id": str(uuid.uuid4())}
        for column in copied_columns:
            row[column] = record.get(column)
        rows.append(row)
    return rows

def transform_companies(data):
    """Project raw company dicts onto the companies table columns.

    Columns missing from an input dict become ``None``; keys that are not
    table columns are dropped.
    """
    columns = (
        "id",
        "name",
        "website",
        "city",
        "street",
        "postal_code",
        "country",
        "industry",
        "company_description",
        "fte_size",
        "revenue_size",
    )
    rows = []
    for record in data:
        rows.append({column: record.get(column) for column in columns})
    return rows

def transform_jobs(data):
    """Project raw job dicts onto the jobs table columns.

    Columns missing from an input dict become ``None``; keys that are not
    table columns are dropped.
    """
    columns = (
        "id",
        "company",
        "title",
        "requirements",
        "tasks",
        "intro",
        "benefits",
        "others",
    )
    rows = []
    for record in data:
        rows.append({column: record.get(column) for column in columns})
    return rows

def upload_data_to_bigquery():
    """Ensure the three mock tables exist, then load the generated JSON files into them.

    Tables are checked/created in the order companies, jobs, candidates.
    Data is then read from disk, transformed and inserted in the order
    candidates, companies, jobs.
    """
    bq_client = bigquery.Client(project=PROJECT_ID)
    ids_by_name = get_table_ids(bq_client)

    # get_schemas() returns the schemas in (companies, jobs, candidates) order.
    schemas_by_name = dict(zip(("companies", "jobs", "candidates"), get_schemas()))
    for name, table_schema in schemas_by_name.items():
        create_table_if_not_exists(bq_client, ids_by_name[name], table_schema)

    # (table name / log label, source file, row transformer)
    uploads = (
        ("candidates", "candidate_profiles.json", transform_candidates),
        ("companies", "companies.json", transform_companies),
        ("jobs", "jobs.json", transform_jobs),
    )
    for name, source_file, to_rows in uploads:
        raw_records = load_json(source_file)
        table_rows = to_rows(raw_records)
        insert_rows(bq_client, ids_by_name[name], table_rows, name)


Execute the Workflow

In [None]:
# Generate synthetic data. With their default arguments the generators write
# companies.json, jobs.json and candidate_profiles.json respectively.
companies = generate_companies()
# Jobs are generated per company, so the company list (with ids) is passed in.
generate_jobs(companies)
generate_candidate_profiles()

# Upload the generated data to BigQuery (reads the three JSON files from disk).
upload_data_to_bigquery()
