In [4]:
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

# Loading the data
file_path = r"C:\Users\Lenovo\OneDrive - University of Leeds\Project\mimic-iv-clinical-database-demo-2.2\mimic-iv-clinical-database-demo-2.2\microbiologyevents.csv"
data = pd.read_csv(file_path)

# Displaying the first few rows of the dataset
print("First few rows of the dataset:")
print(data.head())

First few rows of the dataset:
   microevent_id  subject_id     hadm_id  micro_specimen_id order_provider_id  \
0             36    10000032  25742920.0            7814634               NaN   
1             15    10000032  22595853.0            5717063               NaN   
2             32    10000032  29079034.0            5901894               NaN   
3           7013    10020944  29974575.0            4646730               NaN   
4          12898    10037975  27617929.0            1636367               NaN   

             chartdate            charttime  spec_itemid spec_type_desc  \
0  2180-08-06 00:00:00  2180-08-06 20:35:00        70070           SWAB   
1  2180-05-07 00:00:00  2180-05-07 00:19:00        70070           SWAB   
2  2180-07-24 00:00:00  2180-07-24 00:55:00        70070           SWAB   
3  2131-02-27 00:00:00  2131-02-27 17:41:00        70070           SWAB   
4  2185-01-17 00:00:00  2185-01-17 21:32:00        70070           SWAB   

   test_seq  ...          org_n

In [5]:
null_values = data.isnull().sum()

total_columns = data.shape[1]
total_rows = data.shape[0]
print(f"Total columns: {total_columns}")
print(f"Total rows: {total_rows}")
print("-------------")

print(null_values)


Total columns: 25
Total rows: 2899
-------------
microevent_id             0
subject_id                0
hadm_id                 971
micro_specimen_id         0
order_provider_id      2596
chartdate                 0
charttime               183
spec_itemid               0
spec_type_desc            0
test_seq                  0
storedate                16
storetime                16
test_itemid               0
test_name                 0
org_itemid             1641
org_name               1641
isolate_num            1641
quantity               2899
ab_itemid              1863
ab_name                1863
dilution_text          1900
dilution_comparison    1901
dilution_value         1901
interpretation         1863
comments                811
dtype: int64


In [6]:
# Dealing with null values
data['charttime'].fillna(data['chartdate'], inplace=True)

data['org_itemid'].fillna(value=0, inplace=True)
data['isolate_num'].fillna(value=0, inplace=True)
data['ab_itemid'].fillna(value=0, inplace=True)

data['org_name'].fillna(value='Na', inplace=True)
data['ab_name'].fillna(value='Na', inplace=True)
data['dilution_comparison'].fillna(value='Na', inplace=True)
data['dilution_text'].fillna(value='Na', inplace=True)

data['dilution_value'].fillna(value=999, inplace=True)


In [7]:
null_values = data.isnull().sum()

total_columns = data.shape[1]
total_rows = data.shape[0]
print(f"Total columns: {total_columns}")
print(f"Total rows: {total_rows}")
print("-------------")

print(null_values)


Total columns: 25
Total rows: 2899
-------------
microevent_id             0
subject_id                0
hadm_id                 971
micro_specimen_id         0
order_provider_id      2596
chartdate                 0
charttime                 0
spec_itemid               0
spec_type_desc            0
test_seq                  0
storedate                16
storetime                16
test_itemid               0
test_name                 0
org_itemid                0
org_name                  0
isolate_num               0
quantity               2899
ab_itemid                 0
ab_name                   0
dilution_text             0
dilution_comparison       0
dilution_value            0
interpretation         1863
comments                811
dtype: int64


In [8]:
data.dtypes

microevent_id            int64
subject_id               int64
hadm_id                float64
micro_specimen_id        int64
order_provider_id       object
chartdate               object
charttime               object
spec_itemid              int64
spec_type_desc          object
test_seq                 int64
storedate               object
storetime               object
test_itemid              int64
test_name               object
org_itemid             float64
org_name                object
isolate_num            float64
quantity               float64
ab_itemid              float64
ab_name                 object
dilution_text           object
dilution_comparison     object
dilution_value         float64
interpretation          object
comments                object
dtype: object

In [9]:
data['charttime'] = pd.to_datetime(data['charttime'])

In [10]:
data.dtypes

microevent_id                   int64
subject_id                      int64
hadm_id                       float64
micro_specimen_id               int64
order_provider_id              object
chartdate                      object
charttime              datetime64[ns]
spec_itemid                     int64
spec_type_desc                 object
test_seq                        int64
storedate                      object
storetime                      object
test_itemid                     int64
test_name                      object
org_itemid                    float64
org_name                       object
isolate_num                   float64
quantity                      float64
ab_itemid                     float64
ab_name                        object
dilution_text                  object
dilution_comparison            object
dilution_value                float64
interpretation                 object
comments                       object
dtype: object

In [11]:
from sdv.metadata import SingleTableMetadata
from sdv.single_table import CTGANSynthesizer
import time

In [12]:
df_data = data.head(1000)


In [13]:
# Define the metadata
metadata = SingleTableMetadata()
metadata.detect_from_dataframe(data=df_data)


In [14]:
# Initialising the CTGANSynthesizer
custom_synthesizer = CTGANSynthesizer(metadata, epochs=1000)

start_time = time.time()

custom_synthesizer.fit(df_data)

end_time = time.time()

elapsed_time = end_time - start_time
print(f"Training time: {elapsed_time} seconds")



Training time: 439.2030072212219 seconds


In [15]:
synthetic_data = custom_synthesizer.sample(1000)  # Generate 1000 rows of synthetic data

print(synthetic_data.head())

   microevent_id  subject_id     hadm_id  micro_specimen_id order_provider_id  \
0      841127031    10037466  20204263.0            5749432               NaN   
1      499772491    10002544  28248083.0            5738973               NaN   
2       47667669    10017042         NaN            2249863               NaN   
3      587376802    10017335  27224838.0            9136138            P70R8X   
4      351342487    10013312         NaN            7882883               NaN   

             chartdate           charttime  spec_itemid     spec_type_desc  \
0  2110-01-04 00:00:00 2118-07-09 20:20:28        70059  Staph aureus swab   
1  2183-05-14 04:54:24 2170-08-19 12:37:22        70012      BLOOD CULTURE   
2  2182-03-08 21:14:27 2137-03-23 15:45:31        70012  Staph aureus swab   
3  2135-07-23 01:31:24 2138-09-10 16:10:44        70084      BLOOD CULTURE   
4  2127-10-29 21:32:28 2114-08-12 12:31:30        70012        MRSA SCREEN   

   test_seq  ...             org_name isolat

In [16]:
synthetic_data['subject_id'] = range(400000, 400000 + len(synthetic_data))
synthetic_data['hadm_id'] = synthetic_data['subject_id'].apply(lambda x: f"{x}{str(synthetic_data.index.get_loc(synthetic_data.index[synthetic_data['subject_id'] == x][0])).zfill(2)}")



In [17]:
synthetic_data.to_csv('synthetic_microbiology_events.csv', index=False)

print("Synthetic data generated, updated, and saved successfully.")

Synthetic data generated, updated, and saved successfully.


In [18]:
null_values = synthetic_data.isnull().sum()

total_columns = synthetic_data.shape[1]
total_rows = synthetic_data.shape[0]
print(f"Total columns: {total_columns}")
print(f"Total rows: {total_rows}")
print("-------------")

print(null_values)


Total columns: 25
Total rows: 1000
-------------
microevent_id             0
subject_id                0
hadm_id                   0
micro_specimen_id         0
order_provider_id       772
chartdate                 0
charttime                 0
spec_itemid               0
spec_type_desc            0
test_seq                  0
storedate                 0
storetime                 0
test_itemid               0
test_name                 0
org_itemid                0
org_name                  0
isolate_num               0
quantity               1000
ab_itemid                 0
ab_name                   0
dilution_text             0
dilution_comparison       0
dilution_value            0
interpretation          651
comments                366
dtype: int64


In [19]:
from sdv.evaluation.single_table import run_diagnostic

diagnostic = run_diagnostic(
    real_data=data,
    synthetic_data=synthetic_data,
    metadata=metadata
)


Generating report ...

(1/2) Evaluating Data Validity: |███████████████████████████████████████████████████| 25/25 [00:00<00:00, 1202.83it/s]|
Data Validity Score: 95.65%

(2/2) Evaluating Data Structure: |█████████████████████████████████████████████████████| 1/1 [00:00<00:00, 198.36it/s]|
Data Structure Score: 100.0%

Overall Score (Average): 97.83%



In [20]:
from sdv.evaluation.single_table import evaluate_quality

quality_report = evaluate_quality(
    data,
    synthetic_data,
    metadata
)

Generating report ...

(1/2) Evaluating Column Shapes: |████████████████████████████████████████████████████| 25/25 [00:00<00:00, 187.36it/s]|
Column Shapes Score: 71.25%

(2/2) Evaluating Column Pair Trends: |██████████████████████████████████████████████| 300/300 [00:04<00:00, 60.77it/s]|
Column Pair Trends Score: 65.89%

Overall Score (Average): 68.57%



In [21]:
quality_report.get_details('Column Shapes')

Unnamed: 0,Column,Metric,Score,Error
0,subject_id,KSComplement,0.0,
1,hadm_id,KSComplement,,TypeError: '<' not supported between instances...
2,micro_specimen_id,KSComplement,0.883702,
3,order_provider_id,TVComplement,0.581249,
4,chartdate,KSComplement,0.833999,
5,charttime,KSComplement,0.754654,
6,spec_itemid,KSComplement,0.75564,
7,spec_type_desc,TVComplement,0.490372,
8,test_seq,TVComplement,0.816301,
9,storedate,KSComplement,0.812398,


In [22]:
from sdv.evaluation.single_table import get_column_plot

fig = get_column_plot(
    real_data=data,
    synthetic_data=synthetic_data,
    column_name='ab_name',
    metadata=metadata
)

fig.show()

In [23]:
import pandas as pd

# Loading the provided data
data = [
    {'Column': 'subject_id', 'Metric': 'KSComplement', 'Score': 0.0},
    {'Column': 'hadm_id', 'Metric': 'KSComplement', 'Score': None, 'Error': "TypeError: '<' not supported between instances"},
    {'Column': 'micro_specimen_id', 'Metric': 'KSComplement', 'Score': 0.953791},
    {'Column': 'order_provider_id', 'Metric': 'TVComplement', 'Score': 0.572497},
    {'Column': 'chartdate', 'Metric': 'KSComplement', 'Score': 0.908277},
    {'Column': 'charttime', 'Metric': 'KSComplement', 'Score': 0.851153},
    {'Column': 'spec_itemid', 'Metric': 'KSComplement', 'Score': 0.680166},
    {'Column': 'spec_type_desc', 'Metric': 'TVComplement', 'Score': 0.483357},
    {'Column': 'test_seq', 'Metric': 'TVComplement', 'Score': 0.765301},
    {'Column': 'storedate', 'Metric': 'KSComplement', 'Score': 0.917398},
    {'Column': 'storetime', 'Metric': 'KSComplement', 'Score': 0.912398},
    {'Column': 'test_itemid', 'Metric': 'KSComplement', 'Score': 0.575716},
    {'Column': 'test_name', 'Metric': 'TVComplement', 'Score': 0.349431},
    {'Column': 'org_itemid', 'Metric': 'KSComplement', 'Score': 0.715943},
    {'Column': 'org_name', 'Metric': 'TVComplement', 'Score': 0.813541},
    {'Column': 'isolate_num', 'Metric': 'TVComplement', 'Score': 0.979648},
    {'Column': 'quantity', 'Metric': 'KSComplement', 'Score': None, 'Error': "ValueError: Data passed to ks_2samp must not be"},
    {'Column': 'ab_itemid', 'Metric': 'KSComplement', 'Score': 0.565365},
    {'Column': 'ab_name', 'Metric': 'TVComplement', 'Score': 0.903788},
    {'Column': 'dilution_text', 'Metric': 'TVComplement', 'Score': 0.896558},
    {'Column': 'dilution_comparison', 'Metric': 'TVComplement', 'Score': 0.975628},
    {'Column': 'dilution_value', 'Metric': 'KSComplement', 'Score': 0.594257},
    {'Column': 'interpretation', 'Metric': 'TVComplement', 'Score': 0.989640},
    {'Column': 'comments', 'Metric': 'TVComplement', 'Score': 0.554425}
]

df = pd.DataFrame(data)

thresholds = {
    'KSComplement': 0.7,
    'TVComplement': 0.7,
    'critical_columns': {
        'admittime': 0.8,
        'dischtime': 0.8,
        'admission_type': 0.9,
        'admission_location': 0.9,
        'discharge_location': 0.9,
        'insurance': 0.9,
        'marital_status': 0.9,
    },
    'non_critical_columns': {
        'subject_id': 0.0,
        'language': 0.0,
        'race': 0.0,
        'edregtime': 0.7,
        'edouttime': 0.7,
    }
}

def check_thresholds(df, thresholds):
    results = []
    for _, row in df.iterrows():
        column_name = row['Column']
        metric = row['Metric']
        score = row['Score']

        if column_name in thresholds['critical_columns']:
            threshold = thresholds['critical_columns'][column_name]
        else:
            threshold = thresholds['non_critical_columns'].get(column_name, thresholds.get(metric, 0))

        if pd.isna(score):
            results.append(f"{column_name} does not have a valid score. Error: {row['Error']}")
        elif score >= threshold:
            results.append(f"{column_name} meets the threshold with a score of {score:.2e}.")
        else:
            results.append(f"{column_name} does not meet the threshold. Score: {score:.2e}, Threshold: {threshold:.2e}")
    
    return results

results = check_thresholds(df, thresholds)

for result in results:
    print(result)


subject_id meets the threshold with a score of 0.00e+00.
hadm_id does not have a valid score. Error: TypeError: '<' not supported between instances
micro_specimen_id meets the threshold with a score of 9.54e-01.
order_provider_id does not meet the threshold. Score: 5.72e-01, Threshold: 7.00e-01
chartdate meets the threshold with a score of 9.08e-01.
charttime meets the threshold with a score of 8.51e-01.
spec_itemid does not meet the threshold. Score: 6.80e-01, Threshold: 7.00e-01
spec_type_desc does not meet the threshold. Score: 4.83e-01, Threshold: 7.00e-01
test_seq meets the threshold with a score of 7.65e-01.
storedate meets the threshold with a score of 9.17e-01.
storetime meets the threshold with a score of 9.12e-01.
test_itemid does not meet the threshold. Score: 5.76e-01, Threshold: 7.00e-01
test_name does not meet the threshold. Score: 3.49e-01, Threshold: 7.00e-01
org_itemid meets the threshold with a score of 7.16e-01.
org_name meets the threshold with a score of 8.14e-01.
