In [1]:
import pickle
import os
from src import config
from src import blob

In [2]:
# Load the BMK 2018 table, using a local pickle as a cache of the Azure blob.
# `data` is always bound when this cell finishes: if the cache is missing or
# cannot be read, the CSV is downloaded again and the cache is rewritten.
data = None

if os.path.exists("./data/bmk2018.pkl"):
    try:
        with open('./data/bmk2018.pkl', 'rb') as pkl:
            # NOTE: pickle.load can execute arbitrary code; only load cache
            # files that were written by this notebook.
            data = pickle.load(pkl)
    except Exception as e:
        # An empty/truncated file (EOFError) or an incompatible pickle must not
        # leave `data` undefined for the following cells; rebuild the cache.
        print(f"Could not read cached pickle ({e!r}); downloading the data again.")
        data = None

if data is None:
    data = blob.load_csv_from_azure_storage(blob_name='BMK_2018.csv')

    # exist_ok avoids a separate existence check for the cache directory.
    os.makedirs("./data", exist_ok=True)
    with open('./data/bmk2018.pkl', 'wb') as pkl:
        pickle.dump(data, pkl, protocol=pickle.HIGHEST_PROTOCOL)
    

### Only use selected columns

`FormActualKey` is treated as the table's primary key.

In [3]:
# Columns kept for synthesis: the two key columns, the sixteen BMK_S* columns
# and the five BMK_D* columns.
bmk_columns = [
    "ESI_Key",
    "FormActualKey",
    # BMK_S01 .. BMK_S16
    "BMK_S01_Strategic", "BMK_S02_Quick", "BMK_S03_Decisive", "BMK_S04_Change",
    "BMK_S05_Leading", "BMK_S06_Confront", "BMK_S07_Participative", "BMK_S08_Build",
    "BMK_S09_Compassion", "BMK_S10_Putting", "BMK_S11_Respect", "BMK_S12_Taking",
    "BMK_S13_Composure", "BMK_S14_Balance", "BMK_S15_SelfAware", "BMK_S16_Career",
    # BMK_D01 .. BMK_D05
    "BMK_D01_Interper", "BMK_D02_DiffBuild", "BMK_D03_DiffChange",
    "BMK_D04_Failure", "BMK_D05_Narrow",
]

real_data = data[bmk_columns]
real_data

Unnamed: 0,ESI_Key,FormActualKey,BMK_S01_Strategic,BMK_S02_Quick,BMK_S03_Decisive,BMK_S04_Change,BMK_S05_Leading,BMK_S06_Confront,BMK_S07_Participative,BMK_S08_Build,...,BMK_S12_Taking,BMK_S13_Composure,BMK_S14_Balance,BMK_S15_SelfAware,BMK_S16_Career,BMK_D01_Interper,BMK_D02_DiffBuild,BMK_D03_DiffChange,BMK_D04_Failure,BMK_D05_Narrow
0,4147942,231390,4.375000,4.333333,4.000000,4.555556,4.307692,4.833333,4.222222,3.857143,...,5.00,4.00,2.666667,4.00,4.285714,1.875,1.571429,1.8,1.666667,1.800000
1,7365679,13135095,4.125000,4.666667,4.000000,3.555556,3.923077,3.500000,3.555556,3.714286,...,3.80,4.25,4.000000,4.50,3.571429,1.750,1.571429,1.4,1.333333,1.600000
2,7365689,13135152,3.625000,3.333333,3.666667,3.555556,4.076923,3.000000,3.777778,3.166667,...,4.25,3.25,4.000000,3.75,3.428571,2.250,2.428571,2.2,3.500000,1.666667
3,7365676,13135216,5.000000,5.000000,5.000000,4.777778,4.846154,4.666667,5.000000,5.000000,...,5.00,4.75,3.666667,4.75,5.000000,1.000,1.000000,1.0,1.166667,1.600000
4,7365673,13135229,3.375000,3.666667,3.666667,4.000000,3.538462,2.500000,4.111111,4.142857,...,3.60,4.00,3.000000,3.75,2.571429,1.000,1.571429,1.4,1.333333,1.200000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
86518,7667147,15190209,4.714286,4.500000,4.666667,4.777778,4.846154,5.000000,4.888889,4.857143,...,5.00,5.00,5.000000,4.75,4.857143,1.000,1.000000,1.0,1.000000,1.000000
86519,7667147,15190210,4.000000,,4.000000,,4.100000,4.000000,4.666667,4.250000,...,,4.25,5.000000,5.00,,1.000,1.142857,1.0,1.000000,1.000000
86520,7667147,15190211,3.750000,4.000000,3.333333,3.666667,3.461538,3.666667,3.777778,4.142857,...,3.80,4.25,4.666667,4.00,4.571429,1.250,2.142857,1.8,2.166667,2.000000
86521,7667147,15190213,4.250000,4.333333,4.333333,3.888889,4.153846,3.500000,3.666667,4.428571,...,3.80,4.25,5.000000,4.00,4.000000,1.000,1.285714,1.0,1.000000,1.600000


#### Identify distribution of columns

In [4]:
from src.distribution import test_col_distributions

# Positional slice 3:5 picks the columns at positions 3 and 4 of real_data,
# i.e. BMK_S02_Quick and BMK_S03_Decisive.
# NOTE(review): the recorded output of this cell is
# "AttributeError: module 'scipy.stats.distributions' has no attribute 'Gaussian'",
# which presumably originates inside src.distribution -- confirm and fix there.
distribution_sample = real_data.iloc[:, 3:5]
results = test_col_distributions(data=distribution_sample)
results

AttributeError: module 'scipy.stats.distributions' has no attribute 'Gaussian'

#### Create metadata

In [None]:
from sdv.metadata import SingleTableMetadata

In [None]:
# Start from an empty single-table metadata object; it is populated from
# real_data in the next cell.
metadata = SingleTableMetadata()

In [None]:
# Infer the column types from the selected columns.
metadata.detect_from_dataframe(real_data)

# The notebook treats FormActualKey as the primary key, so declare it
# explicitly: mark the column as an id and register it as the primary key
# instead of leaving it to whatever type detection inferred.
metadata.update_column(column_name="FormActualKey", sdtype="id")
metadata.set_primary_key(column_name="FormActualKey")

# Show the resulting metadata for inspection.
data_dict = metadata.to_dict()
data_dict

In [None]:
# Check that real_data is consistent with the metadata before training.
metadata.validate_data(data=real_data)

### 1. Train synthesizer
#### Using Gaussian-Copula Synthesizer

In [None]:
from sdv.single_table import GaussianCopulaSynthesizer

In [None]:
## step 1: create the synthesizer
synthesizer = GaussianCopulaSynthesizer(
    metadata,
    enforce_min_max_values=True,
    # The keyword is `enforce_rounding` (as used for the CTGAN synthesizer
    # further down); the misspelled `enforcing_rounding` is not a constructor
    # argument.
    enforce_rounding=False,
    # No per-column overrides yet: every numerical column uses
    # default_distribution.
    numerical_distributions={},
    default_distribution='norm'
)

## step 2: Train the synthesizer
synthesizer.fit(real_data)

In [None]:
## step 3: Generate synthetic data
# Draw 100 synthetic rows from the fitted Gaussian-copula synthesizer.
synthetic_data = synthesizer.sample(num_rows=100)

In [None]:
# Display the Gaussian-copula sample.
synthetic_data

### 2. Train Synthesizer
#### Using CTGAN

In [None]:
from sdv.single_table import CTGANSynthesizer

In [None]:
import torch
# Report whether PyTorch can see a CUDA device; the CTGAN synthesizer below is
# created with cuda=True.
torch.cuda.is_available()

In [None]:
# Train the CTGAN synthesizer once and cache it on disk; later runs reload the
# saved model instead of training again.
ctgan_model_path = "./data/bmk2018_ctgan.pkl"

if not os.path.exists(ctgan_model_path):
    synthesizer = CTGANSynthesizer(
        metadata,
        epochs=500,
        enforce_rounding=True,
        enforce_min_max_values=True,
        verbose=True,
        cuda=True,
    )
    synthesizer.fit(real_data)
    synthesizer.save(filepath=ctgan_model_path)
else:
    synthesizer = CTGANSynthesizer.load(filepath=ctgan_model_path)
    
    

In [None]:
# Inspect the loss values recorded for the CTGAN synthesizer.
synthesizer.get_loss_values()

In [None]:
# Draw 500 synthetic rows from the CTGAN synthesizer.
ctgan_data = synthesizer.sample(num_rows=500)

In [None]:
# Display the CTGAN sample.
ctgan_data

In [None]:
from sdv.evaluation.single_table import run_diagnostic

# Run the SDV diagnostic on the CTGAN sample against the real table.
diagnostic = run_diagnostic(real_data=real_data, synthetic_data=ctgan_data, metadata=metadata)

In [None]:
from sdv.evaluation.single_table import evaluate_quality

# Quality report for the CTGAN sample, computed against the real table.
quality_report = evaluate_quality(
    real_data=real_data,
    synthetic_data=ctgan_data,
    metadata=metadata,
)

In [None]:
# Per-column breakdown of the 'Column Shapes' property of the quality report.
quality_report.get_details('Column Shapes')

#### Visualization

In [None]:
from sdv.evaluation.single_table import get_column_plot

# Compare the real and CTGAN distributions of one column as a distribution plot.
# Use `real_data` (the column-selected frame the metadata was detected from),
# not the raw `data` frame, so the plot is based on the same table as the
# diagnostic and quality reports.
fig = get_column_plot(
    real_data=real_data,
    synthetic_data=ctgan_data,
    metadata=metadata,
    plot_type='distplot',
    column_name='BMK_D02_DiffBuild'
)

fig.show()

In [None]:
from sdv.evaluation.single_table import get_column_plot

# Same comparison as a bar plot.
# Use `real_data` (the column-selected frame the metadata was detected from),
# not the raw `data` frame, so the plot is based on the same table as the
# diagnostic and quality reports.
fig = get_column_plot(
    real_data=real_data,
    synthetic_data=ctgan_data,
    metadata=metadata,
    plot_type='bar',
    column_name='BMK_D02_DiffBuild'
)

fig.show()