In [11]:
import pandas as pd
import numpy as np

# Seed NumPy's global random state so the generated dataset is reproducible.
np.random.seed(seed=42)

# Number of synthetic records; every column generated below has this length.
num_records = 5001

# NOTE: all draws below consume the same seeded global random stream, so the
# order of the random calls is part of the output. Do not reorder them.

# --- Customer attributes ---
customer_ids = np.arange(num_records) + 101  # sequential IDs starting at 101
ages = np.random.randint(low=20, high=65, size=num_records)  # high is exclusive
genders = np.random.choice(['Male', 'Female'], size=num_records)
employment_statuses = np.random.choice(
    ['Employed', 'Student', 'Unemployed'], size=num_records
)
incomes = np.random.randint(low=20000, high=100000, size=num_records)
credit_histories = np.random.choice(
    ['Excellent', 'Good', 'Limited', 'Poor'], size=num_records
)

# --- Loan attributes ---
loan_ids = np.arange(num_records) + 201  # sequential IDs starting at 201
loan_amounts = np.random.randint(low=5000, high=30000, size=num_records)
# Uniform draws between 4 and 10, rounded to one decimal place.
interest_rates = np.random.uniform(low=4, high=10, size=num_records).round(1)
loan_terms = np.random.choice(
    ['24 months', '36 months', '48 months'], size=num_records
)
payment_histories = np.random.choice(
    ['On-time', 'Missed', 'Defaulted'], size=num_records
)
defaults = np.random.choice(['Yes', 'No'], size=num_records)

# --- Year column: each record is assigned one of three years ---
years = np.random.choice([2020, 2021, 2022], size=num_records)

# Calculate DefaultRate: for each year, the fraction of that year's records
# whose Default flag is 'Yes', written back onto every record of that year.
default_rates = np.zeros(years.shape)
# Take the years from the data itself instead of repeating a literal list:
# a year added to the sampling step is then picked up automatically (rather
# than silently keeping a rate of 0.0), and every year returned by np.unique
# has at least one record, so the division below can never be 0 / 0.
for year in np.unique(years):
    mask = years == year  # boolean selector for this year's records
    total_records = np.sum(mask)
    total_defaults = np.sum(defaults[mask] == 'Yes')
    default_rate = total_defaults / total_records
    default_rates[mask] = default_rate

# Assemble the dataset: pair each output column name with its generated
# array. The order of the pairs is the column order of the DataFrame and of
# the CSV file.
column_pairs = [
    ('CustomerID', customer_ids),
    ('Age', ages),
    ('Gender', genders),
    ('EmploymentStatus', employment_statuses),
    ('Income', incomes),
    ('CreditHistory', credit_histories),
    ('LoanID', loan_ids),
    ('LoanAmount', loan_amounts),
    ('InterestRate', interest_rates),
    ('LoanTerm', loan_terms),
    ('PaymentHistory', payment_histories),
    ('Default', defaults),
    ('Year', years),
    ('DefaultRate', default_rates),
]
data = dict(column_pairs)

df = pd.DataFrame(data=data)

# Write the DataFrame to a CSV file in the working directory; index=False
# keeps the row index out of the file.
csv_filename = 'credit_transaction_data.csv'
df.to_csv(path_or_buf=csv_filename, index=False)

# Confirm where the dataset was written.
print(f'Dataset saved as {csv_filename}')


Dataset saved as credit_transaction_data.csv
