In [1]:
import pandas as pd
# Ignore warnings
import warnings
warnings.filterwarnings("ignore")

# Load the data
file_path = r"C:\Users\Lenovo\OneDrive - University of Leeds\Project\mimic-iv-clinical-database-demo-2.2\mimic-iv-clinical-database-demo-2.2\outputevents.csv"
data = pd.read_csv(file_path)

# Display the first few rows of the dataset
print("First few rows of the dataset:")
print(data.head())

First few rows of the dataset:
   subject_id   hadm_id   stay_id  caregiver_id            charttime  \
0    10002428  23473524  35479615         29441  2156-05-15 18:00:00   
1    10002428  23473524  35479615         29441  2156-05-15 12:00:00   
2    10002428  23473524  35479615         29441  2156-05-15 13:00:00   
3    10002428  23473524  35479615         29441  2156-05-15 08:00:00   
4    10002428  23473524  35479615         29441  2156-05-15 14:00:00   

             storetime  itemid  value valueuom  
0  2156-05-15 17:42:00  226583    600       ml  
1  2156-05-15 12:08:00  226559     60       ml  
2  2156-05-15 13:00:00  226559     45       ml  
3  2156-05-15 08:39:00  226559    125       ml  
4  2156-05-15 13:56:00  226559     60       ml  


In [2]:
# Calculate the number of null values in each column
null_values = data.isnull().sum()

# Display the total number of columns and rows
total_columns = data.shape[1]
total_rows = data.shape[0]
print(f"Total columns: {total_columns}")
print(f"Total rows: {total_rows}")
print("-------------")

# Display the null values in each column
print(null_values)


Total columns: 9
Total rows: 9362
-------------
subject_id      0
hadm_id         0
stay_id         0
caregiver_id    0
charttime       0
storetime       0
itemid          0
value           0
valueuom        0
dtype: int64


In [3]:
data.dtypes

subject_id       int64
hadm_id          int64
stay_id          int64
caregiver_id     int64
charttime       object
storetime       object
itemid           int64
value            int64
valueuom        object
dtype: object

In [4]:
data['charttime'] = pd.to_datetime(data['charttime'], errors='coerce')
data['storetime'] = pd.to_datetime(data['storetime'], errors='coerce')

In [5]:
data.dtypes

subject_id               int64
hadm_id                  int64
stay_id                  int64
caregiver_id             int64
charttime       datetime64[ns]
storetime       datetime64[ns]
itemid                   int64
value                    int64
valueuom                object
dtype: object

In [6]:
from sdv.metadata import SingleTableMetadata
from sdv.single_table import CTGANSynthesizer
import time

In [7]:
df_data = data.head(2000)


In [8]:
# Define the metadata
metadata = SingleTableMetadata()
metadata.detect_from_dataframe(data=df_data)


In [9]:
# Initialize the CTGANSynthesizer
custom_synthesizer = CTGANSynthesizer(metadata, epochs=1000)

# Start timing
start_time = time.time()

# Train the synthesizer on the real data
custom_synthesizer.fit(df_data)

# End timing
end_time = time.time()

# Calculate elapsed time
elapsed_time = end_time - start_time
print(f"Training time: {elapsed_time} seconds")



Training time: 530.9363780021667 seconds


In [10]:
# Generate synthetic data
synthetic_data = custom_synthesizer.sample(1000)  # Generate 1000 rows of synthetic data

# Display the first few rows of synthetic data
print(synthetic_data.head())

   subject_id   hadm_id   stay_id  caregiver_id           charttime  \
0    10031120  27544731  32375292         25603 2201-12-13 06:00:00   
1    10026993  28690972  32526900         68570 2201-12-13 06:00:00   
2    10005849  22681533  35461496         15564 2146-08-31 03:48:05   
3    10027261  27695598  32668617          2826 2191-12-09 13:02:49   
4    10030321  27352183  35078296         26669 2201-12-13 06:00:00   

            storetime  itemid  value valueuom  
0 2201-12-13 05:49:00  226559    149       ml  
1 2149-06-05 14:15:07  226561     45       ml  
2 2150-10-09 18:20:31  226565     55       ml  
3 2192-05-14 08:22:50  226562     48       ml  
4 2201-05-13 07:32:24  226559      0       ml  


In [11]:
# Update the 'subject_id' to start from 400000
synthetic_data['subject_id'] = range(400000, 400000 + len(synthetic_data))
synthetic_data['hadm_id'] = synthetic_data['subject_id'].apply(lambda x: f"{x}{str(synthetic_data.index.get_loc(synthetic_data.index[synthetic_data['subject_id'] == x][0])).zfill(2)}")



In [12]:
synthetic_data.to_csv('synthetic_outputevents.csv', index=False)

print("Synthetic data generated, updated, and saved successfully.")

Synthetic data generated, updated, and saved successfully.


In [13]:
# Calculate the number of null values in each column
null_values = synthetic_data.isnull().sum()

# Display the total number of columns and rows
total_columns = synthetic_data.shape[1]
total_rows = synthetic_data.shape[0]
print(f"Total columns: {total_columns}")
print(f"Total rows: {total_rows}")
print("-------------")

# Display the null values in each column
print(null_values)


Total columns: 9
Total rows: 1000
-------------
subject_id      0
hadm_id         0
stay_id         0
caregiver_id    0
charttime       0
storetime       0
itemid          0
value           0
valueuom        0
dtype: int64


In [14]:
from sdv.evaluation.single_table import run_diagnostic

diagnostic = run_diagnostic(
    real_data=df_data,
    synthetic_data=synthetic_data,
    metadata=metadata
)


Generating report ...

(1/2) Evaluating Data Validity: |██████████████████████████████████████████████████████| 9/9 [00:00<00:00, 470.08it/s]|
Data Validity Score: 87.5%

(2/2) Evaluating Data Structure: |█████████████████████████████████████████████████████| 1/1 [00:00<00:00, 143.13it/s]|
Data Structure Score: 100.0%

Overall Score (Average): 93.75%



In [15]:
from sdv.evaluation.single_table import evaluate_quality

quality_report = evaluate_quality(
    df_data,
    synthetic_data,
    metadata
)

Generating report ...

(1/2) Evaluating Column Shapes: |██████████████████████████████████████████████████████| 9/9 [00:00<00:00, 100.32it/s]|
Column Shapes Score: 76.23%

(2/2) Evaluating Column Pair Trends: |████████████████████████████████████████████████| 36/36 [00:00<00:00, 46.73it/s]|
Column Pair Trends Score: 92.21%

Overall Score (Average): 84.22%



In [16]:
quality_report.get_details('Column Shapes')

Unnamed: 0,Column,Metric,Score,Error
0,subject_id,KSComplement,0.0,
1,hadm_id,KSComplement,,TypeError: '<' not supported between instances...
2,stay_id,KSComplement,0.9005,
3,caregiver_id,KSComplement,0.903,
4,charttime,KSComplement,0.9155,
5,storetime,KSComplement,0.8865,
6,itemid,KSComplement,0.5815,
7,value,KSComplement,0.911,
8,valueuom,TVComplement,1.0,


In [17]:
from sdv.evaluation.single_table import get_column_plot

fig = get_column_plot(
    real_data=data,
    synthetic_data=synthetic_data,
    column_name='value',
    metadata=metadata
)

fig.show()

In [20]:
import pandas as pd

# Load the provided data
data = [
    {'Column': 'subject_id', 'Metric': 'KSComplement', 'Score': 0.0, 'Error': None},
    {'Column': 'hadm_id', 'Metric': 'KSComplement', 'Score': None, 'Error': "TypeError: '<' not supported between instances"},
    {'Column': 'stay_id', 'Metric': 'KSComplement', 'Score': 0.9005, 'Error': None},
    {'Column': 'caregiver_id', 'Metric': 'KSComplement', 'Score': 0.9030, 'Error': None},
    {'Column': 'charttime', 'Metric': 'KSComplement', 'Score': 0.9155, 'Error': None},
    {'Column': 'storetime', 'Metric': 'KSComplement', 'Score': 0.8865, 'Error': None},
    {'Column': 'itemid', 'Metric': 'KSComplement', 'Score': 0.5815, 'Error': None},
    {'Column': 'value', 'Metric': 'KSComplement', 'Score': 0.9110, 'Error': None},
    {'Column': 'valueuom', 'Metric': 'TVComplement', 'Score': 1.0000, 'Error': None}
]

# Create a DataFrame from the provided data
df = pd.DataFrame(data)

# Define the thresholds
thresholds = {
    'KSComplement': 0.7,
    'TVComplement': 0.7,
    'critical_columns': {
        'admittime': 0.8,
        'dischtime': 0.8,
        'admission_type': 0.9,
        'admission_location': 0.9,
        'discharge_location': 0.9,
        'insurance': 0.9,
        'marital_status': 0.9,
    },
    'non_critical_columns': {
        'subject_id': 0.0,
        'language': 0.0,
        'race': 0.0,
        'edregtime': 0.7,
        'edouttime': 0.7,
    }
}

# Function to check if columns meet the threshold
def check_thresholds(df, thresholds):
    results = []
    for _, row in df.iterrows():
        column_name = row['Column']
        metric = row['Metric']
        score = row['Score']

        if column_name in thresholds['critical_columns']:
            threshold = thresholds['critical_columns'][column_name]
        else:
            threshold = thresholds['non_critical_columns'].get(column_name, thresholds.get(metric, 0))

        if pd.isna(score):
            results.append(f"{column_name} does not have a valid score. Error: {row['Error']}")
        elif score >= threshold:
            results.append(f"{column_name} meets the threshold with a score of {score:.2e}.")
        else:
            results.append(f"{column_name} does not meet the threshold. Score: {score:.2e}, Threshold: {threshold:.2e}")
    
    return results

# Call the function with the data
results = check_thresholds(df, thresholds)

# Print the results
for result in results:
    print(result)


subject_id meets the threshold with a score of 0.00e+00.
hadm_id does not have a valid score. Error: TypeError: '<' not supported between instances
stay_id meets the threshold with a score of 9.00e-01.
caregiver_id meets the threshold with a score of 9.03e-01.
charttime meets the threshold with a score of 9.15e-01.
storetime meets the threshold with a score of 8.86e-01.
itemid does not meet the threshold. Score: 5.82e-01, Threshold: 7.00e-01
value meets the threshold with a score of 9.11e-01.
valueuom meets the threshold with a score of 1.00e+00.
