In [None]:
# Stretch the notebook's `.container` element to the full width of the browser window
from IPython.display import HTML, display

_full_width_css = "<style>.container { width:100% !important; }</style>"
display(HTML(_full_width_css))

In [None]:
import os
import pandas as pd
import importlib
import json
import sys 

# Resolve the folders this notebook needs relative to the current working
# directory, so nothing is tied to one machine's absolute paths.
# Assumed layout (cwd is the folder holding this notebook) - TODO confirm:
#   <root>/1_Fake_Data_Generation/<notebook folder>   <- cwd
#   <root>/1_Fake_Data_Generation/data
#   <root>/functions
# NOTE(review): `code_dir` and `project_dir` point at different levels in the two
# branches below; only `data_path` and `functions_path` are meant to agree.
if sys.version_info[:3] < (3, 4):
    # Interpreters without pathlib: derive everything from os.getcwd().
    working_dir = os.getcwd()
    code_dir = os.path.dirname(working_dir)
    project_dir = os.path.dirname(code_dir)
    data_path = os.path.join(code_dir, "data")
    functions_path = os.path.join(project_dir, "functions")
else:
    from pathlib import Path
    # Parent of the working directory.
    current_directory = os.path.dirname(Path.cwd())
    # Two levels above the working directory.
    code_dir = os.path.dirname(current_directory)
    project_dir = os.path.join(code_dir, "1_Fake_Data_Generation")
    # Joined component by component so the separator is right on every OS.
    data_path = os.path.join(project_dir, "data")
    functions_path = os.path.join(code_dir, "functions")

# Echo the resolved locations so a wrong working directory is spotted early.
print(code_dir)
print(project_dir)
print(data_path)
print(functions_path)

In [None]:
# Put the shared functions folder first on the import path so modules stored
# there can be imported (presumably fake_data_generator - confirm).
import sys

# Remove an earlier entry before inserting: re-running this cell then does not
# stack another duplicate on sys.path, and the folder still ends up at index 0.
if functions_path in sys.path:
    sys.path.remove(functions_path)
sys.path.insert(0, functions_path)

In [None]:
import fake_data_generator as fake_data

# CUSTOMIZABLE STEP: Initialize the variables

In [None]:
# String. File name of the CSV that the export step at the end of the notebook writes.
table_name: str = "fake_data.csv"

# Produce fake data

In [None]:
# Generate the synthetic dataset. Each setting is described on the comment
# line(s) directly above the argument it belongs to.
df = fake_data.fake_data_generator(
    # String. 'n' produces a numeric target; any other string produces a binary target.
    target_type='b',
    # Float in [0, 1]. Binary targets only: share of 1s in the target variable.
    # Can be left out when the target is numeric.
    target_1_probability=0.20,
    # Integer. Number of records in the output dataset.
    sample_size=100000,
    # Integer. Number of predictors in the output dataset.
    predictors=100,
    # Integer. Number of informative predictors.
    # NOTE(review): the earlier note called this a 0-1 percentage, yet a count
    # is passed here - confirm against fake_data_generator.
    n_informative_predictors=40,
    # Integer. Applies when target_type='b'. Number of redundant features,
    # generated as random linear combinations of the informative features.
    n_redundant=15,
    # Integer. Applies when target_type='b'. Number of duplicated features,
    # drawn randomly from the informative and the redundant features.
    n_repeated=5,
    # Float. Applies when target_type='n'. Bias term (the constant of the
    # underlying linear model); 0 means no bias. bias_var=0 together with
    # noise_var=0 means perfect correlation, e.g. R^2/Gini = 1.
    bias_var=0.01,
    # Float. Applies when target_type='n'. Standard deviation of the gaussian
    # noise applied to the output; 0 means no noise.
    noise_var=0.01,
    # Float. Applies when target_type='b'. Fraction of samples whose class is
    # assigned randomly; larger values add label noise and make the
    # classification task harder.
    flip_y=0.05,
    # Float. Applies when target_type='b'. Class separation: the factor
    # multiplying the hypercube size. Larger values spread the classes out and
    # make the classification task easier.
    class_sep=0.8,
    # String. Weight variable: '1' returns a vector of 1s, 'random' returns a
    # vector of random weights.
    weight_var='random',
    # Integer. Seed, so that the fake dataset is reproducible.
    seed=1,
)

In [None]:
# Sanity check: (number of rows, number of columns) of the generated dataset
df.shape

In [None]:
# Preview the first rows of the generated dataset
df.head()

# Create the solution inputs: lists of numeric and categorical candidate features

In [None]:
# Numeric candidates are selected by column-name prefix; the categorical
# candidates are listed explicitly.
# NOTE(review): both lists use the column names df has at this point, i.e.
# before df.columns is reassigned further down - confirm that is intended.
numeric_candidates = [
    name
    for name in df.columns
    if name.startswith(('random_var', 'num', 'outlier'))
]
character_candidates = ['cat_3', 'cat_5', 'cat_20', 'cat_200']

In [None]:
# Print both candidate lists as JSON arrays, one per line
print(
    json.dumps(numeric_candidates),
    json.dumps(character_candidates),
    sep='\n',
)

# Rename features

In [None]:
# Rename every column of df by position: the i-th name below replaces the i-th
# existing column label, so the order of this list must match the column order
# of df. pandas raises a ValueError if the number of names differs from the
# number of columns.
df.columns = [
# Random_vars (100 names, grouped by theme below)
# NOTE(review): presumably these replace the generator's `random_var*` columns - confirm.
# Credit History & Loan Information
"Credit_score",
"open_credit_accounts_cnt",
"Length_credit_history",
"credit_inquiries_6_months_cnt",
"late_payments_30+_days_cnt",
"late payments_90+_days_cnt",
"credit_accounts_ever_delinquent_cnt",
"charged-off accounts_cnt",
"bankruptcies_cnt",
"foreclosures_cnt",
"settled accounts_cnt",
"current installment loans_cnt",
"revolving credit accounts_cnt",
"accounts in collections_cnt",
"Total outstanding loan balance",
"Total available credit limit",
"Total utilized credit (credit usage %)",
"Number of credit card accounts",
"Number of personal loans",
# Debt & Financial Obligations
"Debt-to-income (DTI) ratio",
"Monthly housing expenses (rent/mortgage)",
"Monthly utility bills",
"Monthly transportation costs",
"Monthly education expenses",
"Monthly healthcare expenses",
"Monthly childcare expenses",
"Monthly discretionary spending",
"Number of dependents in household",
"Total number of outstanding loans",
"Average monthly loan repayment amount",
"Number of overdraft fees in last year",
"Number of bounced checks in last year",
"Credit card balances",
"Maximum credit limit utilization in the last year",
"Loan-to-value (LTV) ratio for mortgages",
"Number of credit cards near limit",
"Number of installment loans closed",
"Number of revolving accounts closed",
"Average credit utilization over the last 12 months",
# Banking & Transactional Behavior
"Number of active bank accounts",
"Number of savings accounts",
"Number of checking accounts",
"Number of bounced checks in the last 12 months",
"Average balance in checking account",
"Average balance in savings account",
"Frequency of ATM withdrawals",
"Total monthly bank deposits",
"Total monthly bank withdrawals",
"Number of direct deposits per month",
"Number of cash deposits per month",
"Number of online transactions per month",
"Number of international transactions",
"Number of wire transfers",
"Frequency of late bill payments",
"Number of auto-debits declined",
"Percentage of salary deposited in a bank",
"Amount deposited in a bank",
"Number of peer-to-peer (P2P) transactions",
"Number of transactions flagged as suspicious",
# Behavioral & Lifestyle Indicators
"Number of times salary was delayed in the last year",
"Number of high-value purchases (e.g. $500+)",
"Monthly spending on luxury goods",
"Number of gambling-related transactions",
"Number of transactions for alcohol/tobacco",
"Number of online subscriptions (Netflix, Spotify, etc.)",
"Number of travel-related purchases per year",
"Number of payday loan applications",
"Number of transactions at pawn shops",
"Number of revolving credit accounts",
"Number of unpaid parking tickets or fines",
"Frequency of ATM withdrawals in different cities",
"Percentage of income spent on dining out",
"Number of gym or club memberships",
"Number of current installment loans",
"Number of loan refinancing applications",
"Number of legal disputes (divorce, lawsuits)",
"Number of late tax filings",
"Number of vehicle repossessions",
"Number of luxury car lease applications",
# Macroeconomic & External Factors
"Local unemployment rate",
"Inflation rate at the time of loan application",
"Stock market performance",
"Housing market trend in applicants area",
"Average credit risk of applicants region",
"Industry-specific job stability index",
"National economic growth rate (GDP)",
"Average interest rate at the time of loan application",
"Local crime rate in applicants area",
"Average salary growth trend in applicants industry",
"Bank lending policy changes",
"Number of layoffs in applicants industry",
"Number of open credit accounts ever",
"Risk classification of applicants employer",
"Political stability index of applicants country",
"Average credit delinquency rate in applicants city",
"Cost of living index in applicants region",
# Personal & Demographic Information
"dependents_cnt",
"Years_current_job",
"Home_value",
"Years_current_residence",
################################################################################
# Cat_vars (4 names)
# NOTE(review): presumably the replacements for cat_3, cat_5, cat_20 and cat_200,
# in that order - confirm.
"Residence_type",
"Marital_status",
"Occupation",
"ZIP_code",
################################################################################
# Num missing (4 names)
# NOTE(review): presumably the generator's `num*` columns - confirm.
"Years_current_industry",
"Company bankruptcy risk (if self-employed)",
"Number of natural disasters in the last year (for affected regions)",
"Country credit rating (if international applicant)",
################################################################################
# Outlier vars (2 names)
# NOTE(review): presumably the generator's `outlier*` columns - confirm.
"Monthly_income",
"Annual_income", 
################################################################################
# Remaining vars (4 names): weight, sample flag, target and amount columns
'weight_variable', 
'sample variable', 
'target', 
'amount']

In [None]:
# Inspect the column labels currently set on df
df.columns

In [None]:
# Show the complete list of df's column names, one per row.
# The row limit is lifted only inside this block: option_context restores the
# previous setting on exit, so later cells keep pandas' normal truncation
# instead of inheriting a session-wide `display.max_rows = None`.
# `pd` and `display` come from the import cells at the top of the notebook.
with pd.option_context('display.max_rows', None):
    display(pd.DataFrame(df.columns))

# Export the dataset to a CSV file

In [None]:
# Write the dataset to <data_path>/output/<table_name> as a comma-separated
# file without the index column.
output_dir = os.path.join(data_path, 'output')
# Create the folder when it is missing; otherwise to_csv fails on a checkout
# where the output directory does not exist yet.
os.makedirs(output_dir, exist_ok=True)
df.to_csv(os.path.join(output_dir, table_name), sep=',', index=False)