In [3]:
import pandas as pd
import numpy as np
from faker import Faker
import os

# Shared Faker instance (US English locale) used below to produce
# names, cities and state abbreviations.
fake = Faker(locale='en_US')

# --- Configuration ---
NUM_ROWS = 10000      # number of synthetic records to generate
NUM_FEATURES = 20     # number of columns the dataset is meant to have
OUTPUT_FILE = r'C:\DATA\data_4.2.csv' # Using a raw string for the path

# Create the output directory if it doesn't exist.
# os.path.dirname() returns '' when the path has no directory component for
# the current platform (e.g. a bare file name, or this backslash-separated
# path on a POSIX system). os.makedirs('') raises FileNotFoundError, so the
# empty case is skipped and the file is written relative to the working
# directory instead.
output_dir = os.path.dirname(OUTPUT_FILE)
if output_dir and not os.path.exists(output_dir):
    print(f"Directory '{output_dir}' does not exist. Creating it now.")
    # exist_ok=True tolerates the directory being created by someone else
    # between the existence check above and this call.
    os.makedirs(output_dir, exist_ok=True)

# --- Define Features and Data Generation Logic ---

# Column names of the generated dataset, in output order. Most are specific
# attributes of a person; the last one is a generic placeholder so the list
# reaches 20 entries.
feature_names = [
    # identity
    'person_id',
    'first_name',
    'last_name',
    # demographics and physique
    'gender',
    'age',
    'job',
    'height_cm',
    'weight_kg',
    # location
    'city',
    'state',
    'country',
    # career and household
    'salary',
    'years_experience',
    'education_level',
    'marital_status',
    'number_of_children',
    # preferences and ownership
    'favorite_color',
    'has_pet',
    'car_ownership',
    # generic filler
    'feature_20',
]

# Value pools that the categorical columns are sampled from.
genders = ['Male', 'Female', 'Non-binary']
jobs = [
    'Engineer',
    'Teacher',
    'Doctor',
    'Programmer',
    'Artist',
    'Scientist',
    'Manager',
    'Journalist',
    'Librarian',
    'Chef',
    'Pilot',
]
education_levels = ['High School', 'Bachelor', 'Master', 'PhD']
marital_statuses = ['Single', 'Married', 'Divorced', 'Widowed']
colors = [
    'Red', 'Blue', 'Green', 'Yellow',
    'Black', 'White', 'Purple', 'Orange',
]
boolean_choices = [True, False]

# Generate data for each feature.
# Every entry yields NUM_ROWS values. No random seed is set, so each run
# produces a different dataset. The upper bound of np.random.randint is
# exclusive (e.g. ages fall in 18..69).
data = {
    'person_id': range(1, NUM_ROWS + 1),  # sequential ids starting at 1
    'first_name': [fake.first_name() for _ in range(NUM_ROWS)],
    'last_name': [fake.last_name() for _ in range(NUM_ROWS)],
    'gender': np.random.choice(genders, NUM_ROWS),
    'age': np.random.randint(18, 70, NUM_ROWS),
    'job': np.random.choice(jobs, NUM_ROWS),
    # Round to the nearest integer before casting: a bare .astype(int)
    # truncates toward zero, which pulls the stored mean about 0.5 below the
    # intended one.
    'height_cm': np.rint(np.random.normal(175, 10, NUM_ROWS)).astype(int), # Mean 175cm, std dev 10cm
    'weight_kg': np.rint(np.random.normal(75, 15, NUM_ROWS)).astype(int), # Mean 75kg, std dev 15kg
    'city': [fake.city() for _ in range(NUM_ROWS)],
    'state': [fake.state_abbr() for _ in range(NUM_ROWS)],
    'country': ['USA'] * NUM_ROWS, # Assuming all people are in the USA for simplicity
    'salary': np.random.randint(30000, 150000, NUM_ROWS),
    # NOTE(review): drawn independently of 'age', so combinations such as
    # age 18 with 39 years of experience can occur -- confirm this is
    # acceptable for consumers of the dataset.
    'years_experience': np.random.randint(0, 40, NUM_ROWS),
    'education_level': np.random.choice(education_levels, NUM_ROWS),
    'marital_status': np.random.choice(marital_statuses, NUM_ROWS),
    'number_of_children': np.random.randint(0, 5, NUM_ROWS),
    'favorite_color': np.random.choice(colors, NUM_ROWS),
    'has_pet': np.random.choice(boolean_choices, NUM_ROWS),
    'car_ownership': np.random.choice(boolean_choices, NUM_ROWS),
    'feature_20': np.random.rand(NUM_ROWS) # A generic random float feature
}

# Create the DataFrame
df = pd.DataFrame(data)

# Ensure the column order matches the feature_names list. This also raises
# KeyError if a listed feature has no generated column.
df = df[feature_names]

# Write the dataset to OUTPUT_FILE as CSV without the index column.
# Any failure is reported on stdout instead of being propagated.
try:
    df.to_csv(OUTPUT_FILE, index=False)
    success_message = f"Successfully generated and saved dataset to '{OUTPUT_FILE}'"
    print(success_message)
except Exception as err:
    failure_message = f"An error occurred: {err}"
    print(failure_message)


Successfully generated and saved dataset to 'C:\DATA\data_4.2.csv'
