### Creating a fake case-control data set for arbitrarly sized case-control groups
This notebook builds upon the fake_dataset notebook to create datasets where we may have multiple case samples within the same case-control group. This can be the situation when whe have multiple sample time points of the same case object. An example is when the case objects is sampled before, around and after diagnosis X, and we match the case with a control sample taken around the time of the first sample time point of the case


### Setup libs

In [42]:
import pandas as pd
import numpy as np

import faker
from faker_biology.physiology import CellType, Organ, Organelle

from dateutil.relativedelta import relativedelta


# init fake object and load plugins
fake = faker.Faker()

# add organ data plugin 
fake.add_provider(Organ)

# or organelle and celtype plugin
# fake.add_provider(Organelle)
# fake.add_provider(CellType)

### Set parameters

In [43]:
# save
saveop = True

# number of case control pairs in fake study 
n_pairs = 523 

# age of subjects
min_age = 15 # years
max_age = 40 # years

# max age difference between case and control
max_diff_years = 0
max_diff_months = 0
max_diff_days = 7

# number of sub groups; here organs to simulate some attribute of the disease/condition
n_groups = 5
organs = [fake.organ() for _ in range(0,n_groups)]

variables = ["pair_ID", "specimen_ID", "object", "date_of_birth", "year", "barcode", "organ"]

# create dict to hold the fake data 
fake_data = {}
for v in variables:
    fake_data.setdefault(v, [])

In [1]:
import pandas as pd
import numpy as np
import faker
from faker_biology.physiology import CellType, Organ, Organelle
from dateutil.relativedelta import relativedelta

# init fake object and load plugins
fake = faker.Faker()
fake.add_provider(Organ)

# save
saveop = True

# Group configuration
max_cases_per_group = 6  # Maximum number of cases per group

# if randomize group size
random_size = True
min_cases_per_group = 3  # Minimum  number of cases per group


# Total number of groups in the study
n_groups_total = 100

# Age of subjects
min_age = 15  # years
max_age = 40  # years

# Max age difference between case and control within the same group
max_diff_years = 0
max_diff_months = 0
max_diff_days = 7

# Number of subgroups; here organs to simulate some attribute of the disease/condition
n_subgroups = 5
organs = [fake.organ() for _ in range(n_subgroups)]

variables = ["group_ID", "specimen_ID", "object", "date_of_birth", "year", "barcode", "organ"]

# create dict to hold the fake data
fake_data = {v: [] for v in variables}

# Assuming each group has one control and a variable number of cases
spec_id = list(np.arange(1, n_groups_total * (max_cases_per_group + 1) + 1))

# Generating groups
for group_id in range(n_groups_total):
    group_organ = np.random.choice(organs)

    if random_size:
        # Randomly determine the number of cases in each group between 3 to 6
        n_cases = np.random.randint(min_cases_per_group, max_cases_per_group + 1)
    else:
        n_cases = max_cases_per_group

    # Total group size (cases + 1 control)
    group_size = n_cases + 1

    # Determine base date of birth for cases
    base_dob_cases = fake.date_of_birth(minimum_age=min_age, maximum_age=max_age)

    # Determine base date of birth for control
    base_dob_control = fake.date_between_dates(base_dob_cases, 
                                               base_dob_cases + relativedelta(years=max_diff_years, 
                                                                              months=max_diff_months, 
                                                                              days=max_diff_days))

    for member_id in range(group_size):
        # Assign specimen ID
        spec_id_current = spec_id[group_id * (max_cases_per_group + 1) + member_id]

        # Determine if current member is a case or control
        object_type = "Case" if member_id < n_cases else "Control"

        # Assign date of birth based on the object type
        dob = base_dob_cases if object_type == "Case" else base_dob_control

        # Append data to the fake data dictionary
        fake_data["group_ID"].append(group_id)
        fake_data["object"].append(object_type)
        fake_data["specimen_ID"].append(spec_id_current)
        fake_data["date_of_birth"].append(dob)
        fake_data["year"].append(dob.year)
        fake_data["barcode"].append(fake.ean8())
        fake_data["organ"].append(group_organ)

# Convert the dictionary to a DataFrame (if needed)
fake_df = pd.DataFrame(fake_data)


In [2]:
fake_df.head(20)

Unnamed: 0,group_ID,specimen_ID,object,date_of_birth,year,barcode,organ
0,0,1,Case,2002-07-24,2002,9016361,Fallopian tubes
1,0,2,Case,2002-07-24,2002,69686641,Fallopian tubes
2,0,3,Case,2002-07-24,2002,36531073,Fallopian tubes
3,0,4,Case,2002-07-24,2002,45545023,Fallopian tubes
4,0,5,Case,2002-07-24,2002,78336636,Fallopian tubes
5,0,6,Control,2002-07-28,2002,6834210,Fallopian tubes
6,1,8,Case,1997-01-10,1997,40172316,Stomach
7,1,9,Case,1997-01-10,1997,82117399,Stomach
8,1,10,Case,1997-01-10,1997,53768414,Stomach
9,1,11,Case,1997-01-10,1997,50945894,Stomach
