In [1]:
import pickle
import os
from src import config
from src import blob

In [2]:
# Load the BMK 2018 table, using a local pickle as a cache so the CSV is only
# downloaded from Azure storage when no usable cache exists.
cache_path = "./data/bmk2018.pkl"

data = None
if os.path.exists(cache_path):
    # NOTE(security): pickle.load can execute arbitrary code. Only load cache
    # files that this notebook wrote itself.
    try:
        with open(cache_path, 'rb') as pkl:
            data = pickle.load(pkl)
    except (EOFError, pickle.UnpicklingError) as e:
        # An empty, truncated or corrupt cache is recoverable: report it and
        # fall through to a fresh download instead of leaving `data` undefined.
        # Any other error (e.g. permissions) propagates so the failure is
        # visible here rather than as a NameError in a later cell.
        print(f"Error: cache file {cache_path} is unreadable ({e!r}); re-downloading from Azure storage.")

if data is None:
    data = blob.load_csv_from_azure_storage(blob_name='BMK_2018.csv')

    # makedirs(exist_ok=True) avoids the check-then-create race of os.mkdir.
    os.makedirs(os.path.dirname(cache_path), exist_ok=True)
    with open(cache_path, 'wb') as pkl:
        pickle.dump(data, pkl, protocol=pickle.HIGHEST_PROTOCOL)
    

### Only use selected columns

`FormActualKey` is treated as the primary key of the table.

In [3]:
# Identifier columns kept alongside the scores.
key_columns = ["ESI_Key", "FormActualKey"]

# The sixteen BMK_S* score columns.
s_columns = [
    "BMK_S01_Strategic",
    "BMK_S02_Quick",
    "BMK_S03_Decisive",
    "BMK_S04_Change",
    "BMK_S05_Leading",
    "BMK_S06_Confront",
    "BMK_S07_Participative",
    "BMK_S08_Build",
    "BMK_S09_Compassion",
    "BMK_S10_Putting",
    "BMK_S11_Respect",
    "BMK_S12_Taking",
    "BMK_S13_Composure",
    "BMK_S14_Balance",
    "BMK_S15_SelfAware",
    "BMK_S16_Career",
]

# The five BMK_D* score columns.
d_columns = [
    "BMK_D01_Interper",
    "BMK_D02_DiffBuild",
    "BMK_D03_DiffChange",
    "BMK_D04_Failure",
    "BMK_D05_Narrow",
]

# Keep only the selected columns, in the same order as listed above.
data = data[key_columns + s_columns + d_columns]
data

Unnamed: 0,ESI_Key,FormActualKey,BMK_S01_Strategic,BMK_S02_Quick,BMK_S03_Decisive,BMK_S04_Change,BMK_S05_Leading,BMK_S06_Confront,BMK_S07_Participative,BMK_S08_Build,...,BMK_S12_Taking,BMK_S13_Composure,BMK_S14_Balance,BMK_S15_SelfAware,BMK_S16_Career,BMK_D01_Interper,BMK_D02_DiffBuild,BMK_D03_DiffChange,BMK_D04_Failure,BMK_D05_Narrow
0,4147942,231390,4.375000,4.333333,4.000000,4.555556,4.307692,4.833333,4.222222,3.857143,...,5.00,4.00,2.666667,4.00,4.285714,1.875,1.571429,1.8,1.666667,1.800000
1,7365679,13135095,4.125000,4.666667,4.000000,3.555556,3.923077,3.500000,3.555556,3.714286,...,3.80,4.25,4.000000,4.50,3.571429,1.750,1.571429,1.4,1.333333,1.600000
2,7365689,13135152,3.625000,3.333333,3.666667,3.555556,4.076923,3.000000,3.777778,3.166667,...,4.25,3.25,4.000000,3.75,3.428571,2.250,2.428571,2.2,3.500000,1.666667
3,7365676,13135216,5.000000,5.000000,5.000000,4.777778,4.846154,4.666667,5.000000,5.000000,...,5.00,4.75,3.666667,4.75,5.000000,1.000,1.000000,1.0,1.166667,1.600000
4,7365673,13135229,3.375000,3.666667,3.666667,4.000000,3.538462,2.500000,4.111111,4.142857,...,3.60,4.00,3.000000,3.75,2.571429,1.000,1.571429,1.4,1.333333,1.200000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
86518,7667147,15190209,4.714286,4.500000,4.666667,4.777778,4.846154,5.000000,4.888889,4.857143,...,5.00,5.00,5.000000,4.75,4.857143,1.000,1.000000,1.0,1.000000,1.000000
86519,7667147,15190210,4.000000,,4.000000,,4.100000,4.000000,4.666667,4.250000,...,,4.25,5.000000,5.00,,1.000,1.142857,1.0,1.000000,1.000000
86520,7667147,15190211,3.750000,4.000000,3.333333,3.666667,3.461538,3.666667,3.777778,4.142857,...,3.80,4.25,4.666667,4.00,4.571429,1.250,2.142857,1.8,2.166667,2.000000
86521,7667147,15190213,4.250000,4.333333,4.333333,3.888889,4.153846,3.500000,3.666667,4.428571,...,3.80,4.25,5.000000,4.00,4.000000,1.000,1.285714,1.0,1.000000,1.600000


#### Create metadata

In [4]:
from sdv.metadata import SingleTableMetadata

In [5]:
# Empty single-table metadata object; its columns are filled in by detection in the next cell.
metadata = SingleTableMetadata()

In [6]:
# Start from a fresh metadata object so this cell can be re-run on its own:
# detect_from_dataframe raises if the metadata instance already has columns.
metadata = SingleTableMetadata()
metadata.detect_from_dataframe(data)

# Declare the primary key explicitly instead of relying on auto-detection;
# FormActualKey is the column this notebook treats as the table's primary key.
metadata.set_primary_key(column_name="FormActualKey")

# Show the resulting metadata as a plain dict for inspection.
data_dict = metadata.to_dict()
data_dict

{'METADATA_SPEC_VERSION': 'SINGLE_TABLE_V1',
 'columns': {'ESI_Key': {'sdtype': 'numerical'},
  'FormActualKey': {'sdtype': 'id'},
  'BMK_S01_Strategic': {'sdtype': 'numerical'},
  'BMK_S02_Quick': {'sdtype': 'numerical'},
  'BMK_S03_Decisive': {'sdtype': 'numerical'},
  'BMK_S04_Change': {'sdtype': 'numerical'},
  'BMK_S05_Leading': {'sdtype': 'numerical'},
  'BMK_S06_Confront': {'sdtype': 'numerical'},
  'BMK_S07_Participative': {'sdtype': 'numerical'},
  'BMK_S08_Build': {'sdtype': 'numerical'},
  'BMK_S09_Compassion': {'sdtype': 'numerical'},
  'BMK_S10_Putting': {'sdtype': 'numerical'},
  'BMK_S11_Respect': {'sdtype': 'numerical'},
  'BMK_S12_Taking': {'sdtype': 'numerical'},
  'BMK_S13_Composure': {'sdtype': 'numerical'},
  'BMK_S14_Balance': {'sdtype': 'numerical'},
  'BMK_S15_SelfAware': {'sdtype': 'numerical'},
  'BMK_S16_Career': {'sdtype': 'numerical'},
  'BMK_D01_Interper': {'sdtype': 'numerical'},
  'BMK_D02_DiffBuild': {'sdtype': 'numerical'},
  'BMK_D03_DiffChange': {'sd

In [7]:
# Check that the dataframe conforms to the metadata; an error here means the two disagree.
metadata.validate_data(data=data)

### 1. Train Synthesizer
#### Using Gaussian-Copula Synthesizer

In [8]:
from sdv.single_table import GaussianCopulaSynthesizer

In [9]:
# Step 1: build a Gaussian-copula synthesizer from the table metadata.
synthesizer = GaussianCopulaSynthesizer(metadata=metadata)

# Step 2: fit the synthesizer on the selected columns.
synthesizer.fit(data)

In [10]:
# Step 3: draw synthetic rows from the fitted synthesizer.
n_synthetic_rows = 100
synthetic_data = synthesizer.sample(num_rows=n_synthetic_rows)

In [11]:
# Display the sampled synthetic rows.
synthetic_data

Unnamed: 0,ESI_Key,FormActualKey,BMK_S01_Strategic,BMK_S02_Quick,BMK_S03_Decisive,BMK_S04_Change,BMK_S05_Leading,BMK_S06_Confront,BMK_S07_Participative,BMK_S08_Build,...,BMK_S12_Taking,BMK_S13_Composure,BMK_S14_Balance,BMK_S15_SelfAware,BMK_S16_Career,BMK_D01_Interper,BMK_D02_DiffBuild,BMK_D03_DiffChange,BMK_D04_Failure,BMK_D05_Narrow
0,4374574,0,1.029331,1.039974,1.039974,4.603170,4.599143,4.636597,4.889388,1.029331,...,1.031663,4.900231,1.039974,1.049902,4.782678,1.121117,1.001666,1.240840,1.097566,1.201790
1,7669710,1,1.029331,1.039974,1.039974,3.677680,3.065344,3.303015,4.204428,1.029331,...,1.031663,3.054364,1.039974,1.049902,3.544342,2.079154,,4.244909,5.000000,5.000000
2,7669710,2,1.029331,1.039974,1.039974,3.648098,3.700152,4.698304,4.335813,1.029331,...,1.031663,4.720456,1.039974,1.049902,4.207253,1.237126,2.434837,1.452103,2.829142,1.240697
3,7669710,3,1.029331,1.039974,1.039974,3.412347,3.709853,2.934645,4.087325,1.029331,...,1.031663,4.396786,,1.049902,4.014887,1.104933,1.001053,1.132675,1.440376,4.576034
4,4374574,4,1.029331,1.039974,1.039974,4.661862,4.206557,4.165276,4.478656,1.029331,...,1.031663,4.317007,1.039974,,4.401503,1.034386,1.009928,1.141318,1.188633,1.200000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,4374574,95,1.029331,1.039974,1.039974,4.186289,4.656971,,3.678430,,...,1.031663,2.904952,1.039974,1.049902,3.198420,1.306258,1.000067,1.926271,1.515531,1.494775
96,7669710,96,1.029331,1.039974,1.039974,4.318706,4.705421,4.191163,4.937016,1.029331,...,1.031663,4.993496,1.039974,1.049902,4.640826,1.036109,1.000002,1.108388,1.024060,1.218147
97,7669710,97,1.029331,1.039974,1.039974,4.500184,4.101040,3.399947,4.855174,1.029331,...,1.031663,4.849977,,1.049902,,1.037737,1.007078,1.177480,1.176611,1.625731
98,4374574,98,1.029331,1.039974,1.039974,3.791567,4.175934,3.610912,3.831439,1.029331,...,1.031663,3.949713,,1.049902,3.292267,2.060220,1.488827,2.276461,3.178869,2.211884


### 2. Train Synthesizer
#### Using CTGAN

In [13]:
from sdv.single_table import CTGANSynthesizer

In [None]:
# Training is expensive (the recorded run shows roughly 55 s per epoch for 500
# epochs), so the fitted model is persisted and reloaded on later runs.
# Delete the file to force retraining after the data, metadata or
# hyper-parameters change.
ctgan_model_path = "./data/ctgan_bmk2018.pkl"

if os.path.exists(ctgan_model_path):
    # NOTE(security): the saved synthesizer is a pickle; only load files
    # produced by this notebook.
    synthesizer = CTGANSynthesizer.load(filepath=ctgan_model_path)
else:
    synthesizer = CTGANSynthesizer(
        metadata,
        epochs=500,
        enforce_rounding=True,
        enforce_min_max_values=True,
        verbose=True,   # print per-epoch generator/discriminator loss
        cuda=True,
    )
    synthesizer.fit(data)

    # Persist the trained model so a kernel restart does not repeat training.
    os.makedirs(os.path.dirname(ctgan_model_path), exist_ok=True)
    synthesizer.save(filepath=ctgan_model_path)

Gen. (-1.09) | Discrim. (0.14):  32%|███▏      | 160/500 [2:23:07<5:10:48, 54.85s/it] 