In [1]:
# import required packages
import datetime as dt
import polars as pl
from metasynth import MetaDataset

# 1. Load data

In [2]:
import pandas as pd

# Step 1: Reading and converting your dataset to a pandas DataFrame
clomd_fp = "/Users/simonelewenhaupt/Desktop/masdv/clomd/clomd.csv"
df = pd.read_csv(clomd_fp, dtype={
    "sex": pd.CategoricalDtype(),
    "race": pd.CategoricalDtype(),
    "c_charge_degree": pd.CategoricalDtype(),
    "days_b_screening_arrest": float,
    "c_days_from_compas": float
})

In [3]:
df.head(50)

Unnamed: 0,sex,age,race,juv_fel_count,decile_score,juv_misd_count,juv_other_count,priors_count,days_b_screening_arrest,c_days_from_compas,c_charge_degree,v_decile_score,two_year_recid,days_in_jail,days_in_custody
0,Male,67,Other,0,1,0,0,0,-1.0,1.0,F,1,0,0,7
1,Male,34,African-American,0,3,0,0,0,-1.0,1.0,F,1,1,10,10
2,Male,44,Other,0,1,0,0,0,0.0,0.0,M,1,0,1,1
3,Male,43,Other,0,4,0,0,3,-1.0,1.0,F,3,0,0,12
4,Male,41,African-American,0,4,0,0,0,-1.0,1.0,F,2,0,0,1
5,Male,37,Hispanic,0,1,0,0,0,0.0,0.0,M,1,0,0,0
6,Male,24,Hispanic,0,4,0,0,1,0.0,1.0,F,5,1,0,1
7,Male,32,Other,0,3,0,0,0,-1.0,1.0,M,4,0,1,2
8,Male,30,Caucasian,0,9,0,0,9,0.0,0.0,F,10,1,44,1
9,Male,49,Other,0,3,0,0,7,-1.0,0.0,F,2,1,3,1


In [4]:
dict(zip(df.columns, df.dtypes))

{'sex': CategoricalDtype(categories=['Female', 'Male'], ordered=False),
 'age': dtype('int64'),
 'race': CategoricalDtype(categories=['African-American', 'Asian', 'Caucasian', 'Hispanic',
                   'Native American', 'Other'],
 , ordered=False),
 'juv_fel_count': dtype('int64'),
 'decile_score': dtype('int64'),
 'juv_misd_count': dtype('int64'),
 'juv_other_count': dtype('int64'),
 'priors_count': dtype('int64'),
 'days_b_screening_arrest': dtype('float64'),
 'c_days_from_compas': dtype('float64'),
 'c_charge_degree': CategoricalDtype(categories=['F', 'M'], ordered=False),
 'v_decile_score': dtype('int64'),
 'two_year_recid': dtype('int64'),
 'days_in_jail': dtype('int64'),
 'days_in_custody': dtype('int64')}

# Step 2: Creating a MetaDataset object from a DataFrame

In [5]:
meta_dataset = MetaDataset.from_dataframe(df)

In [6]:
print(meta_dataset)

# Rows: 6162
# Columns: 15

{'name': 'sex', 'description': None, 'type': 'categorical', 'dtype': 'Categorical', 'prop_missing': 0.0, 'distribution': "{'name': 'MultinoulliDistribution', 'parameters': {'labels': array(['Female', 'Male'], dtype='<U6'), 'probs': array([0.18987342, 0.81012658])}}"}

{'name': 'age', 'description': None, 'type': 'discrete', 'dtype': 'Int64', 'prop_missing': 0.0, 'distribution': "{'name': 'DiscreteUniformDistribution', 'parameters': {'low': 20, 'high': 68}}"}

{'name': 'race', 'description': None, 'type': 'categorical', 'dtype': 'Categorical', 'prop_missing': 0.0, 'distribution': "{'name': 'MultinoulliDistribution', 'parameters': {'labels': array(['African-American', 'Asian', 'Caucasian', 'Hispanic',\n       'Native American', 'Other'], dtype='<U16'), 'probs': array([0.5150925 , 0.00503083, 0.33998702, 0.08260305, 0.00178513,\n       0.05550146])}}"}

{'name': 'juv_fel_count', 'description': None, 'type': 'discrete', 'dtype': 'Int64', 'prop_missing': 0.0, 'di

# Step 3: Saving the metadata in a file

In [7]:
file_path = "demonstration_metadata.json"
meta_dataset.to_json(file_path)

# Step 4: Generating synthetic data from the metadata

In [8]:
new_meta_dataset = MetaDataset.from_json(file_path)
new_meta_dataset.synthesize(20)

sex,age,race,juv_fel_count,decile_score,juv_misd_count,juv_other_count,priors_count,days_b_screening_arrest,c_days_from_compas,c_charge_degree,v_decile_score,two_year_recid,days_in_jail,days_in_custody
cat,i64,cat,i64,i64,i64,i64,i64,f64,f64,cat,i64,i64,i64,i64
"""Male""",34,"""Caucasian""",0,1,0,0,0,-9.396424,1.87073,"""F""",2,0,196,249
"""Male""",55,"""African-Americ…",0,3,0,0,0,0.613765,18.251884,"""F""",2,0,43,61
"""Male""",42,"""African-Americ…",0,4,0,0,15,1.094885,8.09657,"""M""",5,0,81,155
"""Male""",28,"""Caucasian""",0,4,0,1,15,0.508859,1.791039,"""M""",6,1,75,366
"""Male""",42,"""African-Americ…",0,2,0,0,14,0.632312,3.918542,"""F""",9,1,183,391
"""Male""",58,"""African-Americ…",0,6,0,0,7,-4.778446,0.926732,"""F""",10,0,147,19
"""Male""",30,"""Caucasian""",0,10,0,1,6,-5.216228,1.737038,"""M""",2,0,231,85
"""Male""",55,"""African-Americ…",0,10,0,0,8,-10.856068,15.085779,"""F""",10,0,35,361
"""Male""",46,"""African-Americ…",0,5,0,0,11,1.47302,14.470072,"""F""",3,0,238,31
"""Male""",59,"""African-Americ…",0,10,0,0,8,1.050759,18.594473,"""M""",10,0,25,318


In [9]:
new_meta_dataset = MetaDataset.from_json(file_path)
synthetic_data = new_meta_dataset.synthesize(6162)
synthetic_data.head(10)

sex,age,race,juv_fel_count,decile_score,juv_misd_count,juv_other_count,priors_count,days_b_screening_arrest,c_days_from_compas,c_charge_degree,v_decile_score,two_year_recid,days_in_jail,days_in_custody
cat,i64,cat,i64,i64,i64,i64,i64,f64,f64,cat,i64,i64,i64,i64
"""Male""",35,"""African-Americ…",0,4,0,0,17,-12.439753,7.874705,"""F""",2,0,120,426
"""Female""",64,"""African-Americ…",0,7,0,0,11,2.718607,8.151616,"""F""",8,0,100,333
"""Male""",65,"""African-Americ…",0,6,0,0,5,-6.669923,2.56537,"""F""",9,0,25,384
"""Male""",54,"""Caucasian""",0,1,0,0,11,-1.419049,10.001256,"""F""",10,0,147,455
"""Male""",60,"""African-Americ…",0,9,0,0,7,3.317399,0.103648,"""F""",2,0,1,317
"""Male""",62,"""Caucasian""",0,7,0,0,18,0.744694,1.080853,"""F""",1,0,29,202
"""Male""",50,"""African-Americ…",0,9,0,1,5,2.116933,1.465094,"""M""",7,0,211,323
"""Male""",49,"""African-Americ…",0,10,0,0,7,-4.192659,2.321714,"""F""",5,0,142,87
"""Female""",45,"""African-Americ…",0,2,1,0,14,-9.823011,4.061464,"""F""",8,0,119,266
"""Male""",61,"""African-Americ…",0,10,0,0,7,-3.378701,1.444145,"""F""",2,1,167,291


# Step 5 save data

In [15]:
# Save the synthetic data to a CSV file with column names
df = pd.DataFrame(synthetic_data, columns=df.columns)
df.to_csv('synthetic_data.csv', index=False)

# Step 6 create and save per race x1000

In [35]:
import pandas as pd

# Load the synthetic data from the CSV file
df = pd.read_csv('synthetic_data.csv')

# Count the occurrences of each race in the synthetic dataset
race_counts = df['race'].value_counts()

# Set the desired number of cases for each race (1000 in this case)
desired_cases = 1000

# Create an empty list to store the synthetic datasets for each race
synthetic_datasets = []

# Iterate over each race and create a synthetic dataset with the desired number of cases
for race in race_counts.index:
    count = race_counts[race]
    if count < desired_cases:
        # Generate additional synthetic cases if the current count is less than the desired number
        additional_cases = desired_cases - count
        synthetic_cases = new_meta_dataset.synthesize(additional_cases)
        synthetic_cases = synthetic_cases.to_pandas()  # Convert to Pandas DataFrame
        synthetic_cases['race'] = race  # Assign the race to the synthetic cases
        synthetic_cases = pd.DataFrame(synthetic_cases, columns=df.columns)  # Convert back to desired DataFrame format
        synthetic_datasets.append(synthetic_cases)
    elif count > desired_cases:
        # Randomly sample cases if the current count is greater than the desired number
        sampled_cases = df[df['race'] == race].sample(n=desired_cases, replace=True)
        synthetic_datasets.append(sampled_cases)

# Concatenate all synthetic datasets into a single DataFrame
synthetic_data_balanced = pd.concat(synthetic_datasets, ignore_index=True)

# Shuffle the synthetic dataset to randomize the order of cases
synthetic_data_balanced = synthetic_data_balanced.sample(frac=1).reset_index(drop=True)

# Print the head of the synthetic dataset
print(synthetic_data_balanced.head(10))

# Save the synthetic balanced dataset to a new CSV file
synthetic_data_balanced.to_csv('synthetic_data_balanced.csv', index=False)


      sex  age              race  juv_fel_count  decile_score  juv_misd_count  \
0    Male   46   Native American              0             5               0   
1  Female   23  African-American              0             6               0   
2    Male   55  African-American              0            10               0   
3    Male   25          Hispanic              0             9               0   
4    Male   22             Asian              0             2               0   
5    Male   54          Hispanic              0             9               0   
6    Male   58         Caucasian              0             8               0   
7    Male   40             Asian              0             1               0   
8    Male   45          Hispanic              0             7               0   
9    Male   58         Caucasian              0            10               0   

   juv_other_count  priors_count  days_b_screening_arrest  c_days_from_compas  \
0                0         

In [37]:
synthetic_data_balanced.head(6500)

Unnamed: 0,sex,age,race,juv_fel_count,decile_score,juv_misd_count,juv_other_count,priors_count,days_b_screening_arrest,c_days_from_compas,c_charge_degree,v_decile_score,two_year_recid,days_in_jail,days_in_custody
0,Male,46,Native American,0,5,0,0,4,-8.155968,21.673898,F,7,0,194,301
1,Female,23,African-American,0,6,0,0,15,3.800742,0.874302,M,6,1,215,226
2,Male,55,African-American,0,10,0,0,22,-0.794987,3.222684,F,3,0,111,238
3,Male,25,Hispanic,0,9,0,0,12,0.281596,2.908143,F,1,1,122,347
4,Male,22,Asian,0,2,0,0,6,4.934344,2.271158,M,6,0,124,229
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5133,Female,64,Caucasian,0,2,0,0,13,3.205603,10.247607,F,10,1,26,459
5134,Female,66,African-American,0,1,0,0,11,-4.462303,9.597530,F,6,1,42,456
5135,Male,21,Native American,0,9,0,0,18,-0.094375,10.439059,M,3,1,106,332
5136,Male,27,Hispanic,0,10,0,0,10,0.626227,3.211233,F,6,0,44,130


In [38]:
import pandas as pd

# Load the synthetic data from the CSV file
df = pd.read_csv('synthetic_data.csv')

# Count the occurrences of each race in the synthetic dataset
race_counts = df['race'].value_counts()

# Set the desired number of cases for each race (1000 in this case)
desired_cases = 1000

# Create an empty list to store the synthetic datasets for each race
synthetic_datasets = []

# Iterate over each race and create a synthetic dataset with the desired number of cases
for race in race_counts.index:
    count = race_counts[race]
    if count < desired_cases:
        # Oversample cases for minority categories by duplicating existing cases
        oversampled_cases = df[df['race'] == race]
        while oversampled_cases.shape[0] < desired_cases:
            oversampled_cases = pd.concat([oversampled_cases, oversampled_cases], ignore_index=True)
        oversampled_cases = oversampled_cases.sample(n=desired_cases, replace=True)
        synthetic_datasets.append(oversampled_cases)
    elif count > desired_cases:
        # Randomly sample cases if the current count is greater than the desired number
        sampled_cases = df[df['race'] == race].sample(n=desired_cases, replace=True)
        synthetic_datasets.append(sampled_cases)

# Concatenate all synthetic datasets into a single DataFrame
synthetic_data_balanced = pd.concat(synthetic_datasets, ignore_index=True)

# Shuffle the synthetic dataset to randomize the order of cases
synthetic_data_balanced = synthetic_data_balanced.sample(frac=1).reset_index(drop=True)

# Save the synthetic balanced dataset to a new CSV file
synthetic_data_balanced.to_csv('synthetic_data_balanced2.csv', index=False)

# Step 7 Create and save 100 per race

In [10]:
import pandas as pd

# Load the synthetic data from the CSV file
df = pd.read_csv('synthetic_data.csv')

# Count the occurrences of each race in the synthetic dataset
race_counts = df['race'].value_counts()

# Set the desired number of cases for each race (1000 in this case)
desired_cases = 100

# Create an empty list to store the synthetic datasets for each race
synthetic_datasets = []

# Iterate over each race and create a synthetic dataset with the desired number of cases
for race in race_counts.index:
    count = race_counts[race]
    if count < desired_cases:
        # Oversample cases for minority categories by duplicating existing cases
        oversampled_cases = df[df['race'] == race]
        while oversampled_cases.shape[0] < desired_cases:
            oversampled_cases = pd.concat([oversampled_cases, oversampled_cases], ignore_index=True)
        oversampled_cases = oversampled_cases.sample(n=desired_cases, replace=True)
        synthetic_datasets.append(oversampled_cases)
    elif count > desired_cases:
        # Randomly sample cases if the current count is greater than the desired number
        sampled_cases = df[df['race'] == race].sample(n=desired_cases, replace=True)
        synthetic_datasets.append(sampled_cases)

# Concatenate all synthetic datasets into a single DataFrame
synthetic_data_balanced = pd.concat(synthetic_datasets, ignore_index=True)

# Shuffle the synthetic dataset to randomize the order of cases
synthetic_data_balanced = synthetic_data_balanced.sample(frac=1).reset_index(drop=True)

# Save the synthetic balanced dataset to a new CSV file
synthetic_data_balanced.to_csv('sdb_100.csv', index=False)

In [11]:
synthetic_data_balanced.head(700)

Unnamed: 0,sex,age,race,juv_fel_count,decile_score,juv_misd_count,juv_other_count,priors_count,days_b_screening_arrest,c_days_from_compas,c_charge_degree,v_decile_score,two_year_recid,days_in_jail,days_in_custody
0,Male,30,Native American,0,5,0,0,9,0.066272,9.413483,M,10,1,107,9
1,Male,42,Other,0,1,0,0,7,-15.152596,4.752637,M,4,0,35,245
2,Male,20,African-American,0,1,0,0,12,-0.488641,0.641734,F,7,1,110,282
3,Male,62,Asian,0,9,0,0,1,-5.137740,3.737987,M,8,1,195,458
4,Male,46,Native American,0,3,0,0,2,4.361641,9.561705,M,3,1,12,249
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
595,Male,40,Other,0,1,0,0,13,-3.322206,2.722950,F,6,0,181,45
596,Male,39,Caucasian,0,8,0,0,17,2.090334,6.203848,F,3,1,121,313
597,Male,46,Native American,0,3,0,0,2,4.361641,9.561705,M,3,1,12,249
598,Male,51,Hispanic,0,2,0,0,10,-2.413088,6.350607,M,5,0,127,22


# Step 8 Create and save 10 000 per race

In [12]:
import pandas as pd

# Load the synthetic data from the CSV file
df = pd.read_csv('synthetic_data.csv')

# Count the occurrences of each race in the synthetic dataset
race_counts = df['race'].value_counts()

# Set the desired number of cases for each race (1000 in this case)
desired_cases = 10000

# Create an empty list to store the synthetic datasets for each race
synthetic_datasets = []

# Iterate over each race and create a synthetic dataset with the desired number of cases
for race in race_counts.index:
    count = race_counts[race]
    if count < desired_cases:
        # Oversample cases for minority categories by duplicating existing cases
        oversampled_cases = df[df['race'] == race]
        while oversampled_cases.shape[0] < desired_cases:
            oversampled_cases = pd.concat([oversampled_cases, oversampled_cases], ignore_index=True)
        oversampled_cases = oversampled_cases.sample(n=desired_cases, replace=True)
        synthetic_datasets.append(oversampled_cases)
    elif count > desired_cases:
        # Randomly sample cases if the current count is greater than the desired number
        sampled_cases = df[df['race'] == race].sample(n=desired_cases, replace=True)
        synthetic_datasets.append(sampled_cases)

# Concatenate all synthetic datasets into a single DataFrame
synthetic_data_balanced = pd.concat(synthetic_datasets, ignore_index=True)

# Shuffle the synthetic dataset to randomize the order of cases
synthetic_data_balanced = synthetic_data_balanced.sample(frac=1).reset_index(drop=True)

# Save the synthetic balanced dataset to a new CSV file
synthetic_data_balanced.to_csv('sdb_10000.csv', index=False)

In [13]:
synthetic_data_balanced.head(65000)

Unnamed: 0,sex,age,race,juv_fel_count,decile_score,juv_misd_count,juv_other_count,priors_count,days_b_screening_arrest,c_days_from_compas,c_charge_degree,v_decile_score,two_year_recid,days_in_jail,days_in_custody
0,Male,39,African-American,0,4,0,0,15,-2.921104,10.929692,F,1,0,139,281
1,Male,20,Hispanic,0,4,0,0,1,-2.411887,3.309705,F,3,1,63,123
2,Male,48,Other,0,1,0,0,18,-6.832035,0.427581,F,5,1,30,428
3,Female,64,Native American,0,7,0,0,10,-6.397897,4.384874,F,6,0,68,237
4,Male,24,Other,0,3,0,0,1,-6.637167,1.803205,F,4,1,114,25
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59995,Male,23,Other,0,9,0,0,10,-1.435060,5.453714,F,10,1,35,160
59996,Female,51,African-American,0,5,0,0,22,-7.731239,0.823479,M,7,0,41,98
59997,Female,32,Other,0,4,0,0,13,-3.025999,11.074741,M,7,1,1,318
59998,Male,59,Other,0,2,0,0,6,-12.328107,2.313381,F,8,0,145,451
