In [1]:
import pandas as pd
# Ignore warnings
import warnings
warnings.filterwarnings("ignore")

# Load the data
file_path = r"C:\Users\Lenovo\OneDrive - University of Leeds\Project\mimic-iv-clinical-database-demo-2.2\mimic-iv-clinical-database-demo-2.2\inputevents.csv"
data = pd.read_csv(file_path)

# Display the first few rows of the dataset
print("First few rows of the dataset:")
print(data.head())

First few rows of the dataset:
   subject_id   hadm_id   stay_id  caregiver_id            starttime  \
0    10005817  20626031  32604416          4793  2132-12-16 19:50:00   
1    10005817  20626031  32604416         92805  2132-12-15 20:15:00   
2    10005817  20626031  32604416         20310  2132-12-17 09:15:00   
3    10005817  20626031  32604416         79166  2132-12-16 09:36:00   
4    10005817  20626031  32604416         92805  2132-12-15 20:10:00   

               endtime            storetime  itemid  amount amountuom  ...  \
0  2132-12-16 19:51:00  2132-12-16 19:50:00  225798     1.0      dose  ...   
1  2132-12-15 20:16:00  2132-12-15 20:11:00  225798     1.0      dose  ...   
2  2132-12-17 09:16:00  2132-12-17 09:28:00  225798     1.0      dose  ...   
3  2132-12-16 09:37:00  2132-12-16 09:37:00  225798     1.0      dose  ...   
4  2132-12-15 21:10:00  2132-12-15 20:10:00  221456     2.0     grams  ...   

                       ordercomponenttypedescription ordercategoryd

In [2]:
# Calculate the number of null values in each column
null_values = data.isnull().sum()

# Display the total number of columns and rows
total_columns = data.shape[1]
total_rows = data.shape[0]
print(f"Total columns: {total_columns}")
print(f"Total rows: {total_rows}")
print("-------------")

# Display the null values in each column
print(null_values)


Total columns: 26
Total rows: 20404
-------------
subject_id                          0
hadm_id                             0
stay_id                             0
caregiver_id                        0
starttime                           0
endtime                             0
storetime                           0
itemid                              0
amount                              0
amountuom                           0
rate                             9366
rateuom                          9366
orderid                             0
linkorderid                         0
ordercategoryname                   0
secondaryordercategoryname       6260
ordercomponenttypedescription       0
ordercategorydescription            0
patientweight                       0
totalamount                      3314
totalamountuom                   3312
isopenbag                           0
continueinnextdept                  0
statusdescription                   0
originalamount                      0


In [4]:
# Convert 'starttime', 'endtime', and 'storetime' to datetime
data['starttime'] = pd.to_datetime(data['starttime'], errors='coerce')
data['endtime'] = pd.to_datetime(data['endtime'], errors='coerce')
data['storetime'] = pd.to_datetime(data['storetime'], errors='coerce')

In [5]:
data.drop(columns=['secondaryordercategoryname'], inplace=True)

In [6]:
data['rate'].fillna(data['rate'].median(), inplace=True)
data['rateuom'].fillna('unknown', inplace=True)


In [7]:
data['totalamount'].fillna(0, inplace=True)
data['totalamountuom'].fillna('unknown', inplace=True)

In [8]:
# Calculate the number of null values in each column
null_values = data.isnull().sum()

# Display the total number of columns and rows
total_columns = data.shape[1]
total_rows = data.shape[0]
print(f"Total columns: {total_columns}")
print(f"Total rows: {total_rows}")
print("-------------")

# Display the null values in each column
print(null_values)


Total columns: 25
Total rows: 20404
-------------
subject_id                       0
hadm_id                          0
stay_id                          0
caregiver_id                     0
starttime                        0
endtime                          0
storetime                        0
itemid                           0
amount                           0
amountuom                        0
rate                             0
rateuom                          0
orderid                          0
linkorderid                      0
ordercategoryname                0
ordercomponenttypedescription    0
ordercategorydescription         0
patientweight                    0
totalamount                      0
totalamountuom                   0
isopenbag                        0
continueinnextdept               0
statusdescription                0
originalamount                   0
originalrate                     0
dtype: int64


In [9]:
data.dtypes

subject_id                                int64
hadm_id                                   int64
stay_id                                   int64
caregiver_id                              int64
starttime                        datetime64[ns]
endtime                          datetime64[ns]
storetime                        datetime64[ns]
itemid                                    int64
amount                                  float64
amountuom                                object
rate                                    float64
rateuom                                  object
orderid                                   int64
linkorderid                               int64
ordercategoryname                        object
ordercomponenttypedescription            object
ordercategorydescription                 object
patientweight                           float64
totalamount                             float64
totalamountuom                           object
isopenbag                               

In [10]:
from sdv.metadata import SingleTableMetadata
from sdv.single_table import CTGANSynthesizer
import time

In [11]:
df_data = data.head(2000)


In [12]:
# Define the metadata
metadata = SingleTableMetadata()
metadata.detect_from_dataframe(data=df_data)


In [13]:
# Initialize the CTGANSynthesizer
custom_synthesizer = CTGANSynthesizer(metadata, epochs=1000)

# Start timing
start_time = time.time()

# Train the synthesizer on the real data
custom_synthesizer.fit(df_data)

# End timing
end_time = time.time()

# Calculate elapsed time
elapsed_time = end_time - start_time
print(f"Training time: {elapsed_time} seconds")


Training time: 995.6860482692719 seconds


In [14]:
# Generate synthetic data
synthetic_data = custom_synthesizer.sample(1000)  # Generate 1000 rows of synthetic data

# Display the first few rows of synthetic data
print(synthetic_data.head())

   subject_id   hadm_id   stay_id  caregiver_id           starttime  \
0    10005610  28901884  34935706         30687 2126-03-06 19:43:30   
1    10005348  22072354  34800351          7738 2131-11-03 17:57:23   
2    10015840  21737367  34934312         87507 2179-05-07 11:38:56   
3    10022362  20183057  35349005         19319 2117-03-30 11:32:13   
4    10009749  28672713  35200436         52284 2151-07-18 10:54:05   

              endtime           storetime  itemid      amount amountuom  ...  \
0 2112-03-19 01:15:16 2185-12-06 08:27:03  225889  312.062346        ml  ...   
1 2132-04-24 07:09:30 2114-11-09 15:08:23  225054   34.461984        ml  ...   
2 2178-10-22 09:28:34 2178-05-07 02:32:20  223280   17.969414     units  ...   
3 2115-09-30 22:38:20 2112-10-31 12:56:46  223262   23.717545     units  ...   
4 2150-01-26 07:55:17 2149-09-21 14:42:22  225229   55.480863        ml  ...   

   ordercomponenttypedescription ordercategorydescription  patientweight  \
0           Main

In [15]:
# Update the 'subject_id' to start from 400000
synthetic_data['subject_id'] = range(400000, 400000 + len(synthetic_data))
synthetic_data['hadm_id'] = synthetic_data['subject_id'].apply(lambda x: f"{x}{str(synthetic_data.index.get_loc(synthetic_data.index[synthetic_data['subject_id'] == x][0])).zfill(2)}")



In [16]:
synthetic_data.to_csv('synthetic_inputevents.csv', index=False)

print("Synthetic data generated, updated, and saved successfully.")

Synthetic data generated, updated, and saved successfully.


In [17]:
# Calculate the number of null values in each column
null_values = synthetic_data.isnull().sum()

# Display the total number of columns and rows
total_columns = synthetic_data.shape[1]
total_rows = synthetic_data.shape[0]
print(f"Total columns: {total_columns}")
print(f"Total rows: {total_rows}")
print("-------------")

# Display the null values in each column
print(null_values)


Total columns: 25
Total rows: 1000
-------------
subject_id                       0
hadm_id                          0
stay_id                          0
caregiver_id                     0
starttime                        0
endtime                          0
storetime                        0
itemid                           0
amount                           0
amountuom                        0
rate                             0
rateuom                          0
orderid                          0
linkorderid                      0
ordercategoryname                0
ordercomponenttypedescription    0
ordercategorydescription         0
patientweight                    0
totalamount                      0
totalamountuom                   0
isopenbag                        0
continueinnextdept               0
statusdescription                0
originalamount                   0
originalrate                     0
dtype: int64


In [21]:
from sdv.evaluation.single_table import run_diagnostic

diagnostic = run_diagnostic(
    real_data=data,
    synthetic_data=synthetic_data,
    metadata=metadata
)


Generating report ...

(1/2) Evaluating Data Validity: |████████████████████████████████████████████████████| 25/25 [00:00<00:00, 218.41it/s]|
Data Validity Score: 95.83%

(2/2) Evaluating Data Structure: |█████████████████████████████████████████████████████| 1/1 [00:00<00:00, 199.25it/s]|
Data Structure Score: 100.0%

Overall Score (Average): 97.92%



In [22]:
from sdv.evaluation.single_table import evaluate_quality

quality_report = evaluate_quality(
    df_data,
    synthetic_data,
    metadata
)

Generating report ...

(1/2) Evaluating Column Shapes: |█████████████████████████████████████████████████████| 25/25 [00:00<00:00, 94.73it/s]|
Column Shapes Score: 84.28%

(2/2) Evaluating Column Pair Trends: |██████████████████████████████████████████████| 300/300 [00:07<00:00, 39.41it/s]|
Column Pair Trends Score: 88.23%

Overall Score (Average): 86.25%



In [23]:
quality_report.get_details('Column Shapes')

Unnamed: 0,Column,Metric,Score,Error
0,subject_id,KSComplement,0.0,
1,hadm_id,KSComplement,,TypeError: '<' not supported between instances...
2,stay_id,KSComplement,0.866,
3,caregiver_id,KSComplement,0.8375,
4,starttime,KSComplement,0.8375,
5,endtime,KSComplement,0.9205,
6,storetime,KSComplement,0.9035,
7,itemid,KSComplement,0.8605,
8,amount,KSComplement,0.7995,
9,amountuom,TVComplement,0.9335,


In [25]:
from sdv.evaluation.single_table import get_column_plot

fig = get_column_plot(
    real_data=df_data,
    synthetic_data=synthetic_data,
    column_name='ordercategoryname',
    metadata=metadata
)

fig.show()

In [26]:
from sdv.evaluation.single_table import get_column_plot

fig = get_column_plot(
    real_data=df_data,
    synthetic_data=synthetic_data,
    column_name='ordercategorydescription',
    metadata=metadata
)

fig.show()

In [27]:
import pandas as pd

# Load the provided data
data = [
    {'Column': 'subject_id', 'Metric': 'KSComplement', 'Score': 0.0, 'Error': None},
    {'Column': 'hadm_id', 'Metric': 'KSComplement', 'Score': None, 'Error': "TypeError: '<' not supported between instances"},
    {'Column': 'stay_id', 'Metric': 'KSComplement', 'Score': 0.8660, 'Error': None},
    {'Column': 'caregiver_id', 'Metric': 'KSComplement', 'Score': 0.8375, 'Error': None},
    {'Column': 'starttime', 'Metric': 'KSComplement', 'Score': 0.8375, 'Error': None},
    {'Column': 'endtime', 'Metric': 'KSComplement', 'Score': 0.9205, 'Error': None},
    {'Column': 'storetime', 'Metric': 'KSComplement', 'Score': 0.9035, 'Error': None},
    {'Column': 'itemid', 'Metric': 'KSComplement', 'Score': 0.8605, 'Error': None},
    {'Column': 'amount', 'Metric': 'KSComplement', 'Score': 0.7995, 'Error': None},
    {'Column': 'amountuom', 'Metric': 'TVComplement', 'Score': 0.9335, 'Error': None},
    {'Column': 'rate', 'Metric': 'KSComplement', 'Score': 0.7695, 'Error': None},
    {'Column': 'rateuom', 'Metric': 'TVComplement', 'Score': 0.9140, 'Error': None},
    {'Column': 'orderid', 'Metric': 'KSComplement', 'Score': 0.8075, 'Error': None},
    {'Column': 'linkorderid', 'Metric': 'KSComplement', 'Score': 0.8215, 'Error': None},
    {'Column': 'ordercategoryname', 'Metric': 'TVComplement', 'Score': 0.9305, 'Error': None},
    {'Column': 'ordercomponenttypedescription', 'Metric': 'TVComplement', 'Score': 0.9560, 'Error': None},
    {'Column': 'ordercategorydescription', 'Metric': 'TVComplement', 'Score': 0.9580, 'Error': None},
    {'Column': 'patientweight', 'Metric': 'KSComplement', 'Score': 0.8860, 'Error': None},
    {'Column': 'totalamount', 'Metric': 'KSComplement', 'Score': 0.7810, 'Error': None},
    {'Column': 'totalamountuom', 'Metric': 'TVComplement', 'Score': 0.9645, 'Error': None},
    {'Column': 'isopenbag', 'Metric': 'TVComplement', 'Score': 1.0000, 'Error': None},
    {'Column': 'continueinnextdept', 'Metric': 'TVComplement', 'Score': 1.0000, 'Error': None},
    {'Column': 'statusdescription', 'Metric': 'TVComplement', 'Score': 0.9270, 'Error': None},
    {'Column': 'originalamount', 'Metric': 'KSComplement', 'Score': 0.6860, 'Error': None},
    {'Column': 'originalrate', 'Metric': 'KSComplement', 'Score': 0.8675, 'Error': None},
]

# Create a DataFrame from the provided data
df = pd.DataFrame(data)

# Define the thresholds
thresholds = {
    'KSComplement': 0.7,
    'TVComplement': 0.7,
    'critical_columns': {
        'admittime': 0.8,
        'dischtime': 0.8,
        'admission_type': 0.9,
        'admission_location': 0.9,
        'discharge_location': 0.9,
        'insurance': 0.9,
        'marital_status': 0.9,
    },
    'non_critical_columns': {
        'subject_id': 0.0,
        'language': 0.0,
        'race': 0.0,
        'edregtime': 0.7,
        'edouttime': 0.7,
    }
}

# Function to check if columns meet the threshold
def check_thresholds(df, thresholds):
    results = []
    for _, row in df.iterrows():
        column_name = row['Column']
        metric = row['Metric']
        score = row['Score']

        if column_name in thresholds['critical_columns']:
            threshold = thresholds['critical_columns'][column_name]
        else:
            threshold = thresholds['non_critical_columns'].get(column_name, thresholds.get(metric, 0))

        if pd.isna(score):
            results.append(f"{column_name} does not have a valid score. Error: {row['Error']}")
        elif score >= threshold:
            results.append(f"{column_name} meets the threshold with a score of {score:.2e}.")
        else:
            results.append(f"{column_name} does not meet the threshold. Score: {score:.2e}, Threshold: {threshold:.2e}")
    
    return results

# Call the function with the data
results = check_thresholds(df, thresholds)

# Print the results
for result in results:
    print(result)


subject_id meets the threshold with a score of 0.00e+00.
hadm_id does not have a valid score. Error: TypeError: '<' not supported between instances
stay_id meets the threshold with a score of 8.66e-01.
caregiver_id meets the threshold with a score of 8.38e-01.
starttime meets the threshold with a score of 8.38e-01.
endtime meets the threshold with a score of 9.20e-01.
storetime meets the threshold with a score of 9.03e-01.
itemid meets the threshold with a score of 8.61e-01.
amount meets the threshold with a score of 7.99e-01.
amountuom meets the threshold with a score of 9.33e-01.
rate meets the threshold with a score of 7.69e-01.
rateuom meets the threshold with a score of 9.14e-01.
orderid meets the threshold with a score of 8.07e-01.
linkorderid meets the threshold with a score of 8.22e-01.
ordercategoryname meets the threshold with a score of 9.30e-01.
ordercomponenttypedescription meets the threshold with a score of 9.56e-01.
ordercategorydescription meets the threshold with a sc