In [25]:
import pandas as pd
# Ignore warnings
import warnings
warnings.filterwarnings("ignore")

# Load the data
file_path = r"D:\coursework\Main Project\mimic-iv-2.2\mimic-iv-2.2\icustays.csv"
data = pd.read_csv(file_path)

# Display the first few rows of the dataset
print("First few rows of the dataset:")
print(data.head())

First few rows of the dataset:
   subject_id   hadm_id   stay_id  \
0    10000032  29079034  39553978   
1    10000980  26913865  39765666   
2    10001217  24597018  37067082   
3    10001217  27703517  34592300   
4    10001725  25563031  31205490   

                                     first_careunit  \
0                Medical Intensive Care Unit (MICU)   
1                Medical Intensive Care Unit (MICU)   
2               Surgical Intensive Care Unit (SICU)   
3               Surgical Intensive Care Unit (SICU)   
4  Medical/Surgical Intensive Care Unit (MICU/SICU)   

                                      last_careunit               intime  \
0                Medical Intensive Care Unit (MICU)  2180-07-23 14:00:00   
1                Medical Intensive Care Unit (MICU)  2189-06-27 08:42:00   
2               Surgical Intensive Care Unit (SICU)  2157-11-20 19:18:02   
3               Surgical Intensive Care Unit (SICU)  2157-12-19 15:42:24   
4  Medical/Surgical Intensive Care 

In [26]:
missing_values = data.isnull().sum()
missing_values


subject_id        0
hadm_id           0
stay_id           0
first_careunit    0
last_careunit     0
intime            0
outtime           0
los               0
dtype: int64

In [27]:
data.dtypes

subject_id          int64
hadm_id             int64
stay_id             int64
first_careunit     object
last_careunit      object
intime             object
outtime            object
los               float64
dtype: object

In [28]:
data['intime'] = pd.to_datetime(data['intime'])
data['outtime'] = pd.to_datetime(data['outtime'])

In [29]:
data.dtypes

subject_id                 int64
hadm_id                    int64
stay_id                    int64
first_careunit            object
last_careunit             object
intime            datetime64[ns]
outtime           datetime64[ns]
los                      float64
dtype: object

In [30]:
from sdv.metadata import SingleTableMetadata
from sdv.single_table import CTGANSynthesizer
import time

In [31]:
df_data = data.head(2000)


In [32]:
# Define the metadata
metadata = SingleTableMetadata()
metadata.detect_from_dataframe(data=df_data)


In [33]:
# Initialize the CTGANSynthesizer
custom_synthesizer = CTGANSynthesizer(metadata, epochs=1000)

# Start timing
start_time = time.time()

# Train the synthesizer on the real data
custom_synthesizer.fit(df_data)

# End timing
end_time = time.time()

# Calculate elapsed time
elapsed_time = end_time - start_time
print(f"Training time: {elapsed_time} seconds")



Training time: 286.9878821372986 seconds


In [41]:
# Generate synthetic data
synthetic_data = custom_synthesizer.sample(1000)  # Generate 1000 rows of synthetic data

# Display the first few rows of synthetic data
print(synthetic_data.head())

   subject_id   hadm_id    stay_id  \
0    10044823  27208906  284992628   
1    10081871  27068547  617653576   
2    10248415  27547578  477559727   
3    10047893  20285494   64786691   
4    10024181  28155571  254917403   

                                     first_careunit  \
0                               Trauma SICU (TSICU)   
1                               Trauma SICU (TSICU)   
2                               Trauma SICU (TSICU)   
3                          Coronary Care Unit (CCU)   
4  Medical/Surgical Intensive Care Unit (MICU/SICU)   

                                      last_careunit              intime  \
0  Medical/Surgical Intensive Care Unit (MICU/SICU) 2159-04-18 10:55:55   
1                               Trauma SICU (TSICU) 2175-12-11 12:08:52   
2                               Trauma SICU (TSICU) 2142-08-31 00:56:01   
3                          Coronary Care Unit (CCU) 2177-01-01 17:54:35   
4  Medical/Surgical Intensive Care Unit (MICU/SICU) 2152-05-14 17

In [42]:
# Update the 'subject_id' to start from 400000
synthetic_data['subject_id'] = range(400000, 400000 + len(synthetic_data))
synthetic_data['hadm_id'] = synthetic_data['subject_id'].apply(lambda x: f"{x}{str(synthetic_data.index.get_loc(synthetic_data.index[synthetic_data['subject_id'] == x][0])).zfill(2)}")



In [43]:
synthetic_data.to_csv('synthetic_icu_events.csv', index=False)

print("Synthetic data generated, updated, and saved successfully.")

Synthetic data generated, updated, and saved successfully.


In [44]:
# Calculate the number of null values in each column
null_values = synthetic_data.isnull().sum()

# Display the total number of columns and rows
total_columns = synthetic_data.shape[1]
total_rows = synthetic_data.shape[0]
print(f"Total columns: {total_columns}")
print(f"Total rows: {total_rows}")
print("-------------")

# Display the null values in each column
print(null_values)


Total columns: 8
Total rows: 1000
-------------
subject_id        0
hadm_id           0
stay_id           0
first_careunit    0
last_careunit     0
intime            0
outtime           0
los               0
dtype: int64


In [45]:
from sdv.evaluation.single_table import run_diagnostic

diagnostic = run_diagnostic(
    real_data=data,
    synthetic_data=synthetic_data,
    metadata=metadata
)


Generating report ...


|                                                                                               | 0/8 [00:00<?, ?it/s]|[A
(1/2) Evaluating Data Validity: |██████████████████████████████████████████████████████| 8/8 [00:00<00:00, 164.07it/s]|[A
Data Validity Score: 85.71%


|                                                                                               | 0/1 [00:00<?, ?it/s]|[A
(2/2) Evaluating Data Structure: |█████████████████████████████████████████████████████| 1/1 [00:00<00:00, 249.72it/s]|[A
Data Structure Score: 100.0%

Overall Score (Average): 92.86%



In [47]:
from sdv.evaluation.single_table import evaluate_quality

quality_report = evaluate_quality(
    data,
    synthetic_data,
    metadata
)

Generating report ...


|                                                                                               | 0/8 [00:00<?, ?it/s]|[A
(1/2) Evaluating Column Shapes: |                                                               | 0/8 [00:00<?, ?it/s]|[A
(1/2) Evaluating Column Shapes: |███████████████████████████████████████████████████████| 8/8 [00:00<00:00, 55.98it/s]|[A
Column Shapes Score: 72.45%


|                                                                                              | 0/28 [00:00<?, ?it/s]|[A
(2/2) Evaluating Column Pair Trends: |                                                         | 0/28 [00:00<?, ?it/s]|[A
(2/2) Evaluating Column Pair Trends: |███████                                          | 4/28 [00:00<00:00, 28.71it/s]|[A
(2/2) Evaluating Column Pair Trends: |████████████████████████████████▌               | 19/28 [00:00<00:00, 79.07it/s]|[A
(2/2) Evaluating Column Pair Trends: |███████████████████████████████████████████████

In [48]:
quality_report.get_details('Column Shapes')

Unnamed: 0,Column,Metric,Score,Error
0,subject_id,KSComplement,0.0,
1,hadm_id,KSComplement,,TypeError: '<' not supported between instances...
2,first_careunit,TVComplement,0.819987,
3,last_careunit,TVComplement,0.810145,
4,intime,KSComplement,0.921659,
5,outtime,KSComplement,0.917455,
6,los,KSComplement,0.877598,


In [49]:
from sdv.evaluation.single_table import get_column_plot

fig = get_column_plot(
    real_data=data,
    synthetic_data=synthetic_data,
    column_name='last_careunit',
    metadata=metadata
)

fig.show()

In [50]:
from sdv.evaluation.single_table import get_column_plot

fig = get_column_plot(
    real_data=data,
    synthetic_data=synthetic_data,
    column_name='first_careunit',
    metadata=metadata
)

fig.show()

In [51]:
import pandas as pd

# Load the new provided data
updated_data = [
    {'Column': 'subject_id', 'Metric': 'KSComplement', 'Score': 0.0, 'Error': 'None'},
    {'Column': 'hadm_id', 'Metric': 'KSComplement', 'Score': None, 'Error': "TypeError: '<' not supported between instances"},
    {'Column': 'first_careunit', 'Metric': 'TVComplement', 'Score': 0.819987, 'Error': 'None'},
    {'Column': 'last_careunit', 'Metric': 'TVComplement', 'Score': 0.810145, 'Error': 'None'},
    {'Column': 'intime', 'Metric': 'KSComplement', 'Score': 0.921659, 'Error': 'None'},
    {'Column': 'outtime', 'Metric': 'KSComplement', 'Score': 0.917455, 'Error': 'None'},
    {'Column': 'los', 'Metric': 'KSComplement', 'Score': 0.877598, 'Error': 'None'}
]

# Define the updated thresholds
thresholds_updated = {
    'KSComplement': 0.7,
    'TVComplement': 0.7,
    'critical_columns': {
        'first_careunit': 0.8,
        'last_careunit': 0.8,
        'intime': 0.9,
        'outtime': 0.9,
        'los': 0.8,
    },
    'non_critical_columns': {
        'subject_id': 0.0,
        'hadm_id': 0.0,
    }
}

# Function to check if columns meet the threshold
def check_thresholds_updated(quality_report_details, thresholds):
    for column_detail in quality_report_details:
        column_name = column_detail['Column']
        metric = column_detail['Metric']
        score = column_detail['Score']

        if column_name in thresholds['critical_columns']:
            threshold = thresholds['critical_columns'][column_name]
        else:
            threshold = thresholds['non_critical_columns'].get(column_name, thresholds[metric])

        if score is None:
            print(f"{column_name} does not have a valid score. Error: {column_detail['Error']}")
        elif score >= threshold:
            print(f"{column_name} meets the threshold with a score of {score:.2e}.")
        else:
            print(f"{column_name} does not meet the threshold. Score: {score:.2e}, Threshold: {threshold:.2e}")

# Call the function with the updated data
check_thresholds_updated(updated_data, thresholds_updated)


subject_id meets the threshold with a score of 0.00e+00.
hadm_id does not have a valid score. Error: TypeError: '<' not supported between instances
first_careunit meets the threshold with a score of 8.20e-01.
last_careunit meets the threshold with a score of 8.10e-01.
intime meets the threshold with a score of 9.22e-01.
outtime meets the threshold with a score of 9.17e-01.
los meets the threshold with a score of 8.78e-01.
