In [2]:
import pandas as pd
# Ignore warnings
import warnings
warnings.filterwarnings("ignore")

# Load the data
file_path = r"C:\Users\Lenovo\OneDrive - University of Leeds\Project\mimic-iv-clinical-database-demo-2.2\mimic-iv-clinical-database-demo-2.2\microbiologyevents.csv"
data = pd.read_csv(file_path)

# Display the first few rows of the dataset
print("First few rows of the dataset:")
print(data.head())

First few rows of the dataset:
   microevent_id  subject_id     hadm_id  micro_specimen_id order_provider_id  \
0             36    10000032  25742920.0            7814634               NaN   
1             15    10000032  22595853.0            5717063               NaN   
2             32    10000032  29079034.0            5901894               NaN   
3           7013    10020944  29974575.0            4646730               NaN   
4          12898    10037975  27617929.0            1636367               NaN   

             chartdate            charttime  spec_itemid spec_type_desc  \
0  2180-08-06 00:00:00  2180-08-06 20:35:00        70070           SWAB   
1  2180-05-07 00:00:00  2180-05-07 00:19:00        70070           SWAB   
2  2180-07-24 00:00:00  2180-07-24 00:55:00        70070           SWAB   
3  2131-02-27 00:00:00  2131-02-27 17:41:00        70070           SWAB   
4  2185-01-17 00:00:00  2185-01-17 21:32:00        70070           SWAB   

   test_seq  ...          org_n

In [3]:
# Calculate the number of null values in each column
null_values = data.isnull().sum()

# Display the total number of columns and rows
total_columns = data.shape[1]
total_rows = data.shape[0]
print(f"Total columns: {total_columns}")
print(f"Total rows: {total_rows}")
print("-------------")

# Display the null values in each column
print(null_values)


Total columns: 25
Total rows: 2899
-------------
microevent_id             0
subject_id                0
hadm_id                 971
micro_specimen_id         0
order_provider_id      2596
chartdate                 0
charttime               183
spec_itemid               0
spec_type_desc            0
test_seq                  0
storedate                16
storetime                16
test_itemid               0
test_name                 0
org_itemid             1641
org_name               1641
isolate_num            1641
quantity               2899
ab_itemid              1863
ab_name                1863
dilution_text          1900
dilution_comparison    1901
dilution_value         1901
interpretation         1863
comments                811
dtype: int64


In [4]:
# Deal with null values
data['charttime'].fillna(data['chartdate'], inplace=True)

data['org_itemid'].fillna(value=0, inplace=True)
data['isolate_num'].fillna(value=0, inplace=True)
data['ab_itemid'].fillna(value=0, inplace=True)

data['org_name'].fillna(value='Na', inplace=True)
data['ab_name'].fillna(value='Na', inplace=True)
data['dilution_comparison'].fillna(value='Na', inplace=True)
data['dilution_text'].fillna(value='Na', inplace=True)

data['dilution_value'].fillna(value=999, inplace=True)


In [5]:
# Calculate the number of null values in each column
null_values = data.isnull().sum()

# Display the total number of columns and rows
total_columns = data.shape[1]
total_rows = data.shape[0]
print(f"Total columns: {total_columns}")
print(f"Total rows: {total_rows}")
print("-------------")

# Display the null values in each column
print(null_values)


Total columns: 25
Total rows: 2899
-------------
microevent_id             0
subject_id                0
hadm_id                 971
micro_specimen_id         0
order_provider_id      2596
chartdate                 0
charttime                 0
spec_itemid               0
spec_type_desc            0
test_seq                  0
storedate                16
storetime                16
test_itemid               0
test_name                 0
org_itemid                0
org_name                  0
isolate_num               0
quantity               2899
ab_itemid                 0
ab_name                   0
dilution_text             0
dilution_comparison       0
dilution_value            0
interpretation         1863
comments                811
dtype: int64


In [6]:
data.dtypes

microevent_id            int64
subject_id               int64
hadm_id                float64
micro_specimen_id        int64
order_provider_id       object
chartdate               object
charttime               object
spec_itemid              int64
spec_type_desc          object
test_seq                 int64
storedate               object
storetime               object
test_itemid              int64
test_name               object
org_itemid             float64
org_name                object
isolate_num            float64
quantity               float64
ab_itemid              float64
ab_name                 object
dilution_text           object
dilution_comparison     object
dilution_value         float64
interpretation          object
comments                object
dtype: object

In [7]:
data['charttime'] = pd.to_datetime(data['charttime'])

In [8]:
data.dtypes

microevent_id                   int64
subject_id                      int64
hadm_id                       float64
micro_specimen_id               int64
order_provider_id              object
chartdate                      object
charttime              datetime64[ns]
spec_itemid                     int64
spec_type_desc                 object
test_seq                        int64
storedate                      object
storetime                      object
test_itemid                     int64
test_name                      object
org_itemid                    float64
org_name                       object
isolate_num                   float64
quantity                      float64
ab_itemid                     float64
ab_name                        object
dilution_text                  object
dilution_comparison            object
dilution_value                float64
interpretation                 object
comments                       object
dtype: object

In [13]:
from sdv.metadata import SingleTableMetadata
from sdv.single_table import CTGANSynthesizer
import time

In [9]:
df_data = data.head(1000)


In [11]:
# Define the metadata
metadata = SingleTableMetadata()
metadata.detect_from_dataframe(data=df_data)


In [14]:
# Initialize the CTGANSynthesizer
custom_synthesizer = CTGANSynthesizer(metadata, epochs=1000)

# Start timing
start_time = time.time()

# Train the synthesizer on the real data
custom_synthesizer.fit(df_data)

# End timing
end_time = time.time()

# Calculate elapsed time
elapsed_time = end_time - start_time
print(f"Training time: {elapsed_time} seconds")



Training time: 440.2530782222748 seconds


In [15]:
# Generate synthetic data
synthetic_data = custom_synthesizer.sample(1000)  # Generate 1000 rows of synthetic data

# Display the first few rows of synthetic data
print(synthetic_data.head())

   microevent_id  subject_id     hadm_id  micro_specimen_id order_provider_id  \
0      841127031    10000699  24928736.0            7182374               NaN   
1      499772491    10000277  28939759.0            1744635               NaN   
2       47667669    10018199         NaN            7664106               NaN   
3      587376802    10008505  25949319.0            7870782            P70R8X   
4      351342487    10008841         NaN            5467954               NaN   

             chartdate           charttime  spec_itemid     spec_type_desc  \
0  2200-09-22 10:55:27 2147-04-29 09:26:15        70016      BLOOD CULTURE   
1  2153-03-21 08:58:58 2149-08-16 13:17:44        70012      BLOOD CULTURE   
2  2196-02-28 23:47:09 2169-05-11 04:57:45        70012  Staph aureus swab   
3  2158-03-24 17:15:22 2148-07-07 16:03:17        70090      BLOOD CULTURE   
4  2186-07-01 15:12:10 2201-11-09 01:15:00        70015      BLOOD CULTURE   

   test_seq  ...             org_name isolat

In [17]:
# Update the 'subject_id' to start from 400000
synthetic_data['subject_id'] = range(400000, 400000 + len(synthetic_data))
synthetic_data['hadm_id'] = synthetic_data['subject_id'].apply(lambda x: f"{x}{str(synthetic_data.index.get_loc(synthetic_data.index[synthetic_data['subject_id'] == x][0])).zfill(2)}")



In [18]:
synthetic_data.to_csv('synthetic_microbiology_events.csv', index=False)

print("Synthetic data generated, updated, and saved successfully.")

Synthetic data generated, updated, and saved successfully.


In [19]:
# Calculate the number of null values in each column
null_values = synthetic_data.isnull().sum()

# Display the total number of columns and rows
total_columns = synthetic_data.shape[1]
total_rows = synthetic_data.shape[0]
print(f"Total columns: {total_columns}")
print(f"Total rows: {total_rows}")
print("-------------")

# Display the null values in each column
print(null_values)


Total columns: 25
Total rows: 1000
-------------
microevent_id             0
subject_id                0
hadm_id                   0
micro_specimen_id         0
order_provider_id       820
chartdate                 0
charttime                 0
spec_itemid               0
spec_type_desc            0
test_seq                  0
storedate                 0
storetime                 0
test_itemid               0
test_name                 0
org_itemid                0
org_name                  0
isolate_num               0
quantity               1000
ab_itemid                 0
ab_name                   0
dilution_text             0
dilution_comparison       0
dilution_value            0
interpretation          640
comments                369
dtype: int64


In [20]:
from sdv.evaluation.single_table import run_diagnostic

diagnostic = run_diagnostic(
    real_data=data,
    synthetic_data=synthetic_data,
    metadata=metadata
)


Generating report ...

(1/2) Evaluating Data Validity: |████████████████████████████████████████████████████| 25/25 [00:00<00:00, 756.31it/s]|
Data Validity Score: 95.65%

(2/2) Evaluating Data Structure: |█████████████████████████████████████████████████████| 1/1 [00:00<00:00, 223.41it/s]|
Data Structure Score: 100.0%

Overall Score (Average): 97.83%



In [21]:
from sdv.evaluation.single_table import evaluate_quality

quality_report = evaluate_quality(
    data,
    synthetic_data,
    metadata
)

Generating report ...

(1/2) Evaluating Column Shapes: |████████████████████████████████████████████████████| 25/25 [00:00<00:00, 176.25it/s]|
Column Shapes Score: 72.54%

(2/2) Evaluating Column Pair Trends: |██████████████████████████████████████████████| 300/300 [00:05<00:00, 52.99it/s]|
Column Pair Trends Score: 65.92%

Overall Score (Average): 69.23%



In [22]:
quality_report.get_details('Column Shapes')

Unnamed: 0,Column,Metric,Score,Error
0,subject_id,KSComplement,0.0,
1,hadm_id,KSComplement,,TypeError: '<' not supported between instances...
2,micro_specimen_id,KSComplement,0.953791,
3,order_provider_id,TVComplement,0.572497,
4,chartdate,KSComplement,0.908277,
5,charttime,KSComplement,0.851153,
6,spec_itemid,KSComplement,0.680166,
7,spec_type_desc,TVComplement,0.483357,
8,test_seq,TVComplement,0.765301,
9,storedate,KSComplement,0.917398,


In [25]:
from sdv.evaluation.single_table import get_column_plot

fig = get_column_plot(
    real_data=data,
    synthetic_data=synthetic_data,
    column_name='ab_name',
    metadata=metadata
)

fig.show()