In [7]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

# (column name, probability the value is forced to 0, fraction of reported income)
# Order matters: random numbers are drawn in exactly this sequence.
_INCOME_FEATURES = (
    ("Interest_Income", 0.2, 0.1),
    ("Business_Income", 0.3, 0.2),
    ("Capital_Gains", 0.4, 0.05),
    ("Other_Income", 0.25, 0.15),
)
_EXPENSE_FEATURES = (
    ("Educational_Expenses", 0.2, 0.08),
    ("Healthcare_Costs", 0.15, 0.07),
    ("Lifestyle_Expenditure", 0.2, 0.12),
    ("Other_Expenses", 0.3, 0.1),
    ("Bank_Debited", 0.1, 0.05),
    ("Credit_Card_Debited", 0.05, 0.03),
)


def _sparse_fraction(rng, base, zero_probability, fraction):
    """Return ``round(base * fraction)`` with entries zeroed with probability ``zero_probability``."""
    zero_mask = rng.rand(len(base)) < zero_probability
    return np.round(np.where(zero_mask, 0, base * fraction))


def generate_diverse_dataset(num_entries, seed=None):
    """Build a synthetic personal-income DataFrame with sparsity, NaNs and outliers.

    Parameters
    ----------
    num_entries : int
        Number of rows to generate.
    seed : int or None, optional
        When given, all random draws come from a private
        ``np.random.RandomState(seed)`` so the result is reproducible and the
        global NumPy random state is left untouched.  When ``None`` (default)
        the global ``np.random`` state is used, exactly as before, so callers
        that rely on ``np.random.seed(...)`` keep getting the same data.

    Returns
    -------
    pandas.DataFrame
        One row per entry with identifier columns (``Name``, ``PAN_Card``,
        ``Aadhar_Card``, ``Bank_Account_No``), demographic columns (``Age``,
        ``Occupation``, ``Marital_Status``, ``Children (Yes/No)``),
        ``Reported_Income``, ten derived financial columns and
        ``Actual_Income``.

    Notes
    -----
    * 0.5% of the values in each derived financial column are replaced by NaN;
      ``Actual_Income`` is NaN for every row that has at least one such NaN.
    * 0.5% of the rows get their ``Reported_Income`` multiplied by 2-5 *after*
      the derived columns were computed, so those columns are based on the
      pre-outlier income while ``Actual_Income`` uses the inflated value.
    """
    # np.random exposes the same randint/choice/rand API as a RandomState, so the
    # default path consumes the global stream in the same order as always.
    rng = np.random if seed is None else np.random.RandomState(seed)

    # Sequential identifiers, 1-based and zero-padded to at least four digits
    row_numbers = range(1, num_entries + 1)
    names = [f"Name{i:04d}" for i in row_numbers]
    pan_card_numbers = [f"PAN{i:04d}X" for i in row_numbers]
    aadhar_card_numbers = [f"Aadhar{i:04d}" for i in row_numbers]
    bank_account_numbers = [f"BankAcc{i:04d}" for i in row_numbers]

    # Demographics (upper bound of randint is exclusive, hence 101)
    ages = rng.randint(20, 101, num_entries)
    occupations = rng.choice(["Salaried", "Self-employed", "Business"], num_entries)
    marital_statuses = rng.choice(["Single", "Married"], num_entries)
    children = rng.choice(["No", "Yes"], num_entries)

    # Reported income between 10,000 and 2,000,000 inclusive
    reported_income = rng.randint(10000, 2000001, num_entries)

    # Derived financial features: a fixed share of reported income, randomly zeroed
    features = {
        column: _sparse_fraction(rng, reported_income, zero_probability, fraction)
        for column, zero_probability, fraction in _INCOME_FEATURES + _EXPENSE_FEATURES
    }

    # Introduce NaN values independently in each derived column
    nan_percentage = 0.005
    nan_count = int(num_entries * nan_percentage)
    for values in features.values():
        nan_indices = rng.choice(num_entries, nan_count, replace=False)
        values[nan_indices] = np.nan

    # Add outliers to reported income only
    outliers_percentage = 0.005
    outliers_count = int(num_entries * outliers_percentage)
    outlier_indices = rng.choice(num_entries, outliers_count, replace=False)
    for idx in outlier_indices:
        reported_income[idx] *= rng.choice([2, 3, 4, 5])

    # Actual income = reported income + other inflows - outflows (NaN propagates)
    inflows = sum(features[column] for column, _, _ in _INCOME_FEATURES)
    outflows = sum(features[column] for column, _, _ in _EXPENSE_FEATURES)
    actual_income = reported_income + inflows - outflows

    # Assemble the frame; dict insertion order fixes the column order
    dataset = pd.DataFrame({
        "Name": names,
        "Age": ages,
        "Occupation": occupations,
        "PAN_Card": pan_card_numbers,
        "Aadhar_Card": aadhar_card_numbers,
        "Bank_Account_No": bank_account_numbers,
        "Marital_Status": marital_statuses,
        "Children (Yes/No)": children,
        "Reported_Income": reported_income,
        **features,
        "Actual_Income": actual_income,
    })

    return dataset

# Build the synthetic dataset (10,000 rows)
diverse_dataset = generate_diverse_dataset(num_entries=10_000)

# Preview the first five rows as plain text
print(diverse_dataset.head(n=5))


       Name  Age     Occupation  PAN_Card Aadhar_Card Bank_Account_No  \
0  Name0001   56       Salaried  PAN0001X  Aadhar0001     BankAcc0001   
1  Name0002   23       Business  PAN0002X  Aadhar0002     BankAcc0002   
2  Name0003   39  Self-employed  PAN0003X  Aadhar0003     BankAcc0003   
3  Name0004   70       Business  PAN0004X  Aadhar0004     BankAcc0004   
4  Name0005   58  Self-employed  PAN0005X  Aadhar0005     BankAcc0005   

  Marital_Status Children (Yes/No)  Reported_Income  Interest_Income  \
0         Single               Yes          1595865              0.0   
1        Married                No          1837136         183714.0   
2         Single                No          1448397         144840.0   
3         Single                No           896310          89631.0   
4        Married               Yes           837210          83721.0   

   Business_Income  Capital_Gains  Other_Income  Educational_Expenses  \
0         319173.0        79793.0      239380.0        

In [8]:
# Persist the dataset; the row index is written as the first (unnamed) CSV column
diverse_dataset.to_csv(path_or_buf="income_regression_dataset3.csv", index=True)