In [None]:
from utils.simulation.sim_utils import RandomSimulator
from utils.core.save_manager import SaveUtils
from utils.data_io import load_data
import os
from utils.viz.general_viz import Visualisation
from utils.eda.correlation import CorrelationAnalyzer
import pandas as pd
import numpy as np
from utils.simulation.decomposition import Decomposer
from utils.simulation.covariance_matrix import CovarianceMatrix
from utils.simulation.monte_carlo_simulator import MonteCarloSimulator
from utils.simulation.transformation import DistributionTransformer

🧪 Simulating Unscaled Normally Distributed Data with Specified Skewness and Kurtosis

In [None]:
# Names of the ten features to simulate.
custom_column_names = [
    "Interest_Rate",
    "Loan_Amount",
    "Employment_Length",
    "Age",
    "Income",
    "Credit_Score",
    "Debt_To_Income",
    "Home_Ownership",
    "Purpose",
    "Region",
]

# 1000 simulations per feature, requesting skewness 0 and kurtosis 3.
simulator = RandomSimulator(
    num_simulations=1000,
    column_names=custom_column_names,
)
df = simulator.simulate_normal(
    num_variables=10,
    target_skew=0,
    target_kurt=3,
)

# save_util is created once here and reused by the cells below.
save_util = SaveUtils()
normal_csv_path = os.path.join(os.getcwd(), "data/output/simulated_normal.csv")
save_util.save_dataframe_to_csv(df, normal_csv_path, overwrite=True)

🧪 Simulate scaled normals (e.g., volatilities)

In [None]:
# Standard deviations / volatilities handed to the simulator as its parameters.
params = [0.2, 0.15, 0.3]

simulator = RandomSimulator(parameters=params)
df = simulator.simulate_normal(
    target_skew=0,
    target_kurt=3,
)
print(df)

📊 Simulating Poisson Data from Excel Input
- Reads input data from an Excel file
- Initializes a Poisson simulator
- Simulates Poisson-distributed values
- Saves the simulated data to an Excel file

In [None]:
# Read the 'Class' and 'Lambda' columns from the 'Lambda Calculation' sheet.
poisson_input_path = os.path.join(os.getcwd(), "data/input/Poisson Simulation.xlsx")
df_input = load_data(
    source_type='excel',
    input_path=poisson_input_path,
    sheet_name='Lambda Calculation',
    usecols=['Class', 'Lambda'],
)

# One lambda per class; output columns are named Class_<n>.
parameters = df_input["Lambda"].values
column_names = [f"Class_{int(c)}" for c in df_input["Class"].values]

simulator = RandomSimulator(
    parameters=parameters,
    column_names=column_names,
    num_simulations=10000,
)
sim = simulator.simulate_poisson()

# Store the simulated counts as integers.
df_poisson = sim.round().astype(int)

poisson_output_path = os.path.join(os.getcwd(), "data/output/simulated_poisson.xlsx")
save_util.save_dataframe_to_excel(
    df_poisson,
    poisson_output_path,
    sheet_name="simulated values",
    overwrite=True,
)


#### 🔗 2.5 Examine Variable Correlations

This section calculates and displays correlations between different types of variables in the `main_df`.

- **`num_method` (str)**: Defines the method for calculating correlation between numerical variables. Allowed values are:
    - `'pearson'` *(default)*: Standard Pearson linear correlation coefficient.
    - `'spearman'`: Spearman's rank correlation coefficient (for monotonic relationships).
    - `'kendall'`: Kendall's tau correlation coefficient (for ordinal or non-normally distributed data).

- **`cat_method` (str)**: Defines the method for calculating association between categorical variables. Allowed values are:
    - `'cramers_v'` *(default)*: Cramer's V (measures association between nominal categorical variables).
    - `'mutual_info'`: Mutual Information (measures the statistical dependence between two random variables).

- **`cat_num_method` (str)**: Defines the method for calculating association between categorical and numerical variables. Allowed values are:
    - `'correlation_ratio'` *(default)*: Correlation Ratio (Eta squared, measures variance explained).
    - `'f_test'`: F-statistic from ANOVA (assesses the difference in means across categories).
    - `'mutual_info'`: Mutual Information (measures the statistical dependence). 
    - `'kruskal'`: Non-parametric alternative to ANOVA. Compares the distributions of a continuous variable across categories. A good choice when your numerical variables are not normally distributed.
    - `'target_spearman'`: Replaces each category with the mean of the target variable (e.g. default rate), then computes the correlation with numerical features. Captures ordinal structure or monotonic trends across groups.

In [None]:
# Load the raw returns CSV used by the correlation and covariance cells below.
returns_csv_path = os.path.join(os.getcwd(), "data/input/returns_raw.csv")
main_df = load_data(source_type='csv', input_path=returns_csv_path)

In [None]:
# Association measure to use for each pairing of variable types.
correlation_methods = {
    "num_method": "pearson",
    "cat_method": "cramers_v",
    "cat_num_method": "correlation_ratio",
}

analyzer = CorrelationAnalyzer(main_df)
corr_df, corr_matrix = analyzer.correlation_matrix(
    **correlation_methods,
    return_matrix=True,
)

Visualisation.plot_heatmap_matrix(corr_matrix, title="Correlation Matrix")

CovarianceMatrix
----------------
A utility class for calculating the covariance matrix of asset returns,
optionally annualized. Intended for use in financial simulations such as
Monte Carlo modeling.

In [None]:
# Build the covariance matrix from main_df (the returns loaded from returns_raw.csv).
cov_calc = CovarianceMatrix(main_df)
cov_matrix = cov_calc.get_matrix()
# print(cov_matrix)

#### Cholesky Decomposition and Visualisation

We perform a Cholesky decomposition on the covariance matrix to obtain a lower triangular matrix.  
This decomposition is useful for simulations and for generating correlated random variables.  
The resulting matrix can then be visualised as a heatmap for easier interpretation.

In [None]:
# Cholesky-decompose cov_matrix (the covariance matrix from CovarianceMatrix.get_matrix()).
Decomposition_df = Decomposer.cholesky_decomposition(cov_matrix)
# Optional heatmap of the decomposition; uncomment to display it.
# Visualisation.plot_heatmap_matrix(Decomposition_df, title="Cholesky Decomposition Matrix")


##### 📊 Monte Carlo Simulation Using Rubinstein's Approach

In [None]:
# Input switch: replace the file name below with
# "data/input/Monte Carlo Multivariable.csv" to run the multivariable case.
monte_carlo_input_path = os.path.join(os.getcwd(), "data/input/Monte Carlo Univariable.csv")
main_df = load_data(source_type='csv', input_path=monte_carlo_input_path)

sim = MonteCarloSimulator(main_df, num_simulations=10000)
sim.run_simulation()

# NOTE: the name says "multivariate", but the univariable input is loaded above.
multivariate_MC_simulation = sim.get_final_simulated_values()
raw_normal_simulation = sim.get_raw_simulations()

# Further outputs, disabled for now. Enable them and add them to mc_outputs to save:
# covariance_matrix = sim.get_covariance_matrix()  # -> "data/output/MonteCarlo cov_matrix.csv"
# cholesky_matrix = sim.get_cholesky_matrix()      # -> "data/output/MonteCarlo cholesky_matrix.csv"

# Each simulated frame paired with the CSV it is written to.
mc_outputs = (
    (multivariate_MC_simulation, "data/output/MonteCarlo final_sim_u.csv"),
    (raw_normal_simulation, "data/output/MonteCarlo random_sim_u.csv"),
)
for mc_frame, mc_relative_path in mc_outputs:
    save_util.save_dataframe_to_csv(
        mc_frame,
        os.path.join(os.getcwd(), mc_relative_path),
        overwrite=True,
    )

Reading from a JSON Source

In [None]:
from utils.data_io import load_data
# JSON text holding a list of rows; the first row looks like the column header
# (presumably load_data treats it as such — confirm against load_data).
api_data = '[["name", "age"], ["Alice", 30], ["Bob", 25]]'
df = load_data(source_type='json', json_source=api_data)
print(df)

Fitting **Beta** distributions and then simulating random numbers based on the result

In [None]:
from utils.eda.statistical import StatisticalAnalysis
from utils.simulation.sim_utils import RandomSimulator
import pandas as pd
import numpy as np


# Step 1: Build a sample frame of 100 uniform random draws from [0, 1).
np.random.seed(42)  # fixed seed so the sample is reproducible
sample_rates = np.random.rand(100)
df = pd.DataFrame({'default_rate': sample_rates})

# Step 2: Wrap the sample in StatisticalAnalysis.
stat = StatisticalAnalysis(df)

# Step 3: Fit 'beta', the only candidate in distribution_list.
distribution_results = stat.fit_best_distribution(
    ['default_rate'],
    method='sumsquare_error',
    common_distributions=False,
    distribution_list=['beta'],
    timeout=300,
)

print("\nReturned Results:", distribution_results)

# Step 4: Simulate from the fitted beta, passing parameters as (a, b, loc, scale).
params = distribution_results['default_rate']['parameters']
beta_parameters = tuple(params[key] for key in ('a', 'b', 'loc', 'scale'))
simulator = RandomSimulator(
    parameters=[beta_parameters],
    num_simulations=10000,
    column_names=['default_rate_sim'],
)
df_sim = simulator.simulate_beta()
print(df_sim.head())

A sample for LDA report 
How to use log-normal and gamma simulations in action

In [None]:
from utils.eda.statistical import StatisticalAnalysis
from utils.simulation.sim_utils import RandomSimulator
import pandas as pd
import numpy as np

# Step 1: Create a sample frame of 100 integers drawn from 100..1000 inclusive.
np.random.seed(42)
df = pd.DataFrame({
    'default_rate': np.random.randint(100, 1001, size=100)
})

# Step 2: Initialize StatisticalAnalysis
stat = StatisticalAnalysis(df)

# Step 3: Fit only 'lognorm' (log-normal) and 'gamma'
distribution_results = stat.fit_best_distribution(
    ['default_rate'],
    method='sumsquare_error',
    common_distributions=False,
    distribution_list=['lognorm', 'gamma'],
    timeout=300
)

print("\nReturned Results:", distribution_results)

# Step 4: Loop through results and simulate using the winning distribution.
# For each supported distribution: the fitted-parameter keys, in the positional
# order handed to RandomSimulator, and the simulator method that draws from it.
# Selecting parameters by key avoids relying on the fit result's dict order.
simulation_recipes = {
    'lognorm': (('s', 'loc', 'scale'), 'simulate_lognormal'),
    'gamma': (('a', 'loc', 'scale'), 'simulate_gamma'),
}

for col, res in distribution_results.items():
    best_dist = res['best_distribution']   # e.g., 'lognorm' or 'gamma'
    params = res['parameters']

    if best_dist not in simulation_recipes:
        raise ValueError(f"Unsupported distribution: {best_dist}")
    param_keys, method_name = simulation_recipes[best_dist]

    # Build the simulator once, with explicitly ordered parameters.
    ordered_params = tuple(params[key] for key in param_keys)
    simulator = RandomSimulator(
        parameters=[ordered_params],
        num_simulations=10000,
        column_names=[f"{col}_sim"]
    )
    df_sim = getattr(simulator, method_name)()

    print(f"\nSimulation for {col} ({best_dist}):")
    print(df_sim.head())

### Gaussian Copula with Monte Carlo Simulation Sample
- 1. Simulate random initial values with the given distributions
- 2. Fit the best of the given distributions to them
- 3. Calculate the CDF to map them into (0, 1)
- 4. Convert them to standard normal
- 5. Calculate the correlation
- 6. Calculate the Cholesky decomposition
- 7. Simulate random numbers from the standard normal distribution
- 8. Multiply the simulated data by the Cholesky matrix
- 9. Calculate the normal CDF
- 10. Convert the result to their real distributions

##### 1.simulate random initial values with given distributions


In [None]:
# Simulation plan: (variable name, distribution family, target mean), in column order.
variable_plan = [
    ("A1", "lognormal", 1500000),
    ("A2", "lognormal", 900000),
    ("A3", "normal", 600000),
    ("A4", "normal", 3000000),
    ("A5", "gamma", 5000000),
    ("A6", "gamma", 200000),
    ("A7", "lognormal", 300000),
    ("A8", "gamma", 1700000),
    ("A9", "normal", 2800000),
    ("A10", "lognormal", 500000),
]

# Same mapping shape the simulation loop reads: name -> {"dist": ..., "mean": ...}.
variables = {
    name: {"dist": family, "mean": target_mean}
    for name, family, target_mean in variable_plan
}

def build_parameters(dist: str, mean: float) -> np.ndarray:
    """Return the simulator parameter row for one variable with a target mean.

    Args:
        dist: Distribution family: "normal", "lognormal" or "gamma".
        mean: Target mean of the simulated variable.

    Returns:
        A 2-D array with a single row:
        [[mean, std]] for "normal",
        [[s, loc, scale]] for "lognormal",
        [[shape, loc, scale]] for "gamma".

    Raises:
        ValueError: If ``dist`` is not one of the supported families.
    """
    if dist == "normal":
        std = 0.2 * mean  # standard deviation fixed at 20% of the mean
        return np.array([[mean, std]])  # (mean, std)

    if dist == "lognormal":
        s = 0.4
        loc = 0
        # With the (s, loc, scale) parameterisation of scipy.stats.lognorm the
        # mean is loc + scale * exp(s**2 / 2), so divide by that factor to hit
        # the requested mean (the previous constant 1.2 undershot it by ~10%).
        scale = (mean - loc) / np.exp(s ** 2 / 2)
        return np.array([[s, loc, scale]])

    if dist == "gamma":
        shape = 2
        loc = 0
        # Gamma mean is loc + shape * scale.
        scale = (mean - loc) / shape
        return np.array([[shape, loc, scale]])

    raise ValueError(f"Unsupported distribution type: {dist}")

# Simulator method to call for each distribution family.
simulate_method_by_dist = {
    "normal": "simulate_normal",
    "lognormal": "simulate_lognormal",
    "gamma": "simulate_gamma",
}

# Simulate every variable on its own (186 draws each) and collect the columns.
results = {}

for var_name, specs in variables.items():
    dist = specs["dist"]
    mean = specs["mean"]

    params = build_parameters(dist, mean)

    sim = RandomSimulator(
        parameters=params,
        num_simulations=186,
        column_names=[var_name],
        decorrelate=False,
    )

    method_name = simulate_method_by_dist.get(dist)
    if method_name is None:
        raise ValueError(f"Unsupported distribution type: {dist}")
    df = getattr(sim, method_name)()

    results[var_name] = df[var_name]

# One column per variable, side by side.
final_df = pd.concat(results, axis=1)

print(final_df.head())

simulated_default_path = os.path.join(os.getcwd(), "data/output/LAR_simulated_Default.xlsx")
save_util.save_dataframe_to_excel(
    final_df,
    simulated_default_path,
    sheet_name="simulated values",
    overwrite=True,
)


##### 2.fit best given distributions
We only use these distributions: 'norm', 'lognorm', 'gamma', 'expon', 'weibull_min', 'beta', 't', 'pareto'

In [None]:
# Step 1: Load the 'historical values' sheet of the LAR workbook.
lar_workbook_path = os.path.join(os.getcwd(), "data/input/LAR_simulated_Default.xlsx")
main_df = load_data(
    source_type='excel',
    input_path=lar_workbook_path,
    sheet_name='historical values',
)

# Step 2: Every column is treated as a feature.
features = main_df.columns.to_list()

# Step 3: Wrap the data in StatisticalAnalysis.
stat = StatisticalAnalysis(main_df)

# Step 4: Fit only the candidate distributions listed here.
candidate_distributions = [
    'norm', 'lognorm', 'gamma', 'expon', 'weibull_min', 'beta', 't', 'pareto',
]
distribution_results = stat.fit_best_distribution(
    features,
    method='sumsquare_error',
    common_distributions=False,
    distribution_list=candidate_distributions,
    timeout=300,
)

print("\nReturned Results:", distribution_results)

# Step 5: Flatten each result to {'distribution': name, <param>: value, ...}
#         so that every feature becomes one DataFrame column.
formatted = {}
for var, info in distribution_results.items():
    params = info['parameters']
    formatted[var] = {'distribution': info['best_distribution'], **params}

distribution_df = pd.DataFrame(formatted)

print(distribution_df)

# Save the table to the "Features Distributions" sheet of the same workbook.
save_util.save_dataframe_to_excel(
    distribution_df,
    lar_workbook_path,
    sheet_name="Features Distributions",
    overwrite=False,
    index=True,
)

##### 3.calculate CDF to map them between (0,1)

In [None]:
# Evaluate each feature's fitted distribution CDF on the historical data.
stat = StatisticalAnalysis(main_df)
cdf_df = stat.compute_cdf(distribution_results)

print(cdf_df.head())

# Save the CDF values to the "cdf" sheet of the LAR workbook.
cdf_workbook_path = os.path.join(os.getcwd(), "data/input/LAR_simulated_Default.xlsx")
save_util.save_dataframe_to_excel(
    cdf_df,
    cdf_workbook_path,
    sheet_name="cdf",
    overwrite=False,
    index=True,
)

##### 4.convert CDF to Standard Normal

In [None]:
from utils.simulation.transformation import DistributionTransformer

# Map the CDF values into standard normal space.
z_df = DistributionTransformer.to_standard_normal(cdf_df)

# Save the result to the "Z normal" sheet of the LAR workbook.
z_workbook_path = os.path.join(os.getcwd(), "data/input/LAR_simulated_Default.xlsx")
save_util.save_dataframe_to_excel(
    z_df,
    z_workbook_path,
    sheet_name="Z normal",
    overwrite=False,
    index=True,
)

##### 5.calculate correlation
Using Correlation Matrix with Pearson method

In [None]:
# Pearson correlation of the standard-normal scores.
z_corr_method = "pearson"
analyzer = CorrelationAnalyzer(z_df)
z_corr_df, z_corr_matrix = analyzer.correlation_matrix(
    num_method=z_corr_method,
    return_matrix=True,
)

Visualisation.plot_heatmap_matrix(z_corr_matrix, title="Correlation Matrix")

# Pivot view of the same correlations; this is what the Cholesky step consumes.
z_correlation_pivot = analyzer.show_correlation_pivot(num_method=z_corr_method)
print(z_correlation_pivot)

correlation_workbook_path = os.path.join(os.getcwd(), "data/input/LAR_simulated_Default.xlsx")
save_util.save_dataframe_to_excel(
    z_correlation_pivot,
    correlation_workbook_path,
    sheet_name="Correlation",
    overwrite=False,
    index=True,
)

##### 6.Calculate Cholesky Decomposition

In [None]:
# Cholesky-decompose the correlation pivot of the z-scores.
z_Cholesky_df = Decomposer.cholesky_decomposition(z_correlation_pivot)

print(z_Cholesky_df)

# Save the decomposition to the "Cholesky" sheet of the LAR workbook.
cholesky_workbook_path = os.path.join(os.getcwd(), "data/input/LAR_simulated_Default.xlsx")
save_util.save_dataframe_to_excel(
    z_Cholesky_df,
    cholesky_workbook_path,
    sheet_name="Cholesky",
    overwrite=False,
    index=True,
)

##### 7.Simulating Random numbers from standard normal distribution


In [None]:
# Step 1: Use the columns of main_df as feature names.
feature_names = main_df.columns.tolist()

# Step 2: Configure the simulator for 10000 draws with decorrelation enabled.
RandomNormalSimulator = RandomSimulator(
    num_simulations=10000,
    column_names=feature_names,
    decorrelate=True,
)

# Step 3: Draw one normal column per feature with loc 0, scale 1,
#         target skewness 0 and target kurtosis 3.
normal_draw_settings = {
    "loc": 0,
    "scale": 1,
    "target_skew": 0,
    "target_kurt": 3,
}
raw_simulated_data = RandomNormalSimulator.simulate_normal(
    num_variables=len(feature_names),
    **normal_draw_settings,
)

# Save the draws to the "raw_simulations" sheet of the LAR workbook.
raw_workbook_path = os.path.join(os.getcwd(), "data/input/LAR_simulated_Default.xlsx")
save_util.save_dataframe_to_excel(
    raw_simulated_data,
    raw_workbook_path,
    sheet_name="raw_simulations",
    overwrite=False,
    index=True,
)

print(raw_simulated_data.head())

##### 8.Multiplying simulated data by Cholesky matrix to create correlation between them

In [None]:
# Step 1: Put the Cholesky matrix and the simulated columns in main_df's feature order.
feature_order = main_df.columns
cholesky_matrix = z_Cholesky_df.loc[feature_order, feature_order]
simulated_data = raw_simulated_data[feature_order]

# Step 2: Introduce the correlations. Each row of draws is multiplied by the
#         transposed Cholesky matrix (rows are observations, columns are features).
draw_values = simulated_data.to_numpy()
cholesky_values = cholesky_matrix.to_numpy()
correlated_data = draw_values @ cholesky_values.T

# Step 3: Re-attach the feature names.
correlated_df = pd.DataFrame(correlated_data, columns=feature_order)

# Step 4: Show the first rows and save to the "Correlated_simulations" sheet.
print(correlated_df.head())

correlated_workbook_path = os.path.join(os.getcwd(), "data/input/LAR_simulated_Default.xlsx")
save_util.save_dataframe_to_excel(
    correlated_df,
    correlated_workbook_path,
    sheet_name="Correlated_simulations",
    overwrite=False,
    index=True,
)

##### 9.Calculating Normal CDF

In [None]:
# Step 1: The correlated draws come from Cholesky-transformed standard normals,
#         so no distribution is re-fitted here: every feature is assumed to
#         follow N(0, 1) and gets its own "norm" entry with loc 0 and scale 1.
fitted_results = {}
for feature in correlated_df.columns:
    fitted_results[feature] = {
        "best_distribution": "norm",
        "parameters": {"loc": 0, "scale": 1},
    }

# Step 2: Evaluate the standard normal CDF for every feature.
normal_cdf_analysis = StatisticalAnalysis(correlated_df)
cdf_simulated_df = normal_cdf_analysis.compute_cdf(fitted_results)

# Step 3: Show a sample and save to the "cdf_simulated_data" sheet.
print(cdf_simulated_df.head())

cdf_simulated_workbook_path = os.path.join(os.getcwd(), "data/input/LAR_simulated_Default.xlsx")
save_util.save_dataframe_to_excel(
    cdf_simulated_df,
    cdf_simulated_workbook_path,
    sheet_name="cdf_simulated_data",
    overwrite=False,
    index=True,
)


##### 10.converting the result to their real distribution

In [None]:
# Step 1: Reshape 'distribution_results' into the flat per-feature layout passed to
#         from_standard_normal_to_real(): {'distribution': name, <param>: value, ...}.
fitted_params = {}
for feature, info in distribution_results.items():
    fitted_params[feature] = {
        "distribution": info["best_distribution"],
        **info["parameters"],
    }

# Step 2: Map the simulated values back to each feature's fitted distribution.
# NOTE(review): the keyword is named z_df but receives the CDF values from the
# previous step, not z-scores. Confirm that from_standard_normal_to_real()
# expects probabilities in (0, 1) rather than standard normal (Φ⁻¹-space) inputs.
real_values_df = DistributionTransformer.from_standard_normal_to_real(
    z_df=cdf_simulated_df,
    fitted_params=fitted_params,
)

# Step 3: Show a sample and save to the "real_values" sheet.
print(real_values_df.head())

real_values_workbook_path = os.path.join(os.getcwd(), "data/input/LAR_simulated_Default.xlsx")
save_util.save_dataframe_to_excel(
    real_values_df,
    real_values_workbook_path,
    sheet_name="real_values",
    overwrite=False,
    index=True,
)