In [14]:
import pandas as pd
# Ignore warnings
import warnings
warnings.filterwarnings("ignore")

# Load the data
file_path = r"C:\Users\Lenovo\OneDrive - University of Leeds\Project\mimic-iv-clinical-database-demo-2.2\mimic-iv-clinical-database-demo-2.2\labevents.csv"
data = pd.read_csv(file_path)

# Display the first few rows of the dataset
print("First few rows of the dataset:")
print(data.head())

First few rows of the dataset:
   labevent_id  subject_id     hadm_id  specimen_id  itemid order_provider_id  \
0       172061    10014354  29600294.0      1808066   51277               NaN   
1       172062    10014354  29600294.0      1808066   51279               NaN   
2       172068    10014354  29600294.0      1808066   52172               NaN   
3       172063    10014354  29600294.0      1808066   51301               NaN   
4       172050    10014354  29600294.0      1808066   51249               NaN   

             charttime            storetime value  valuenum valueuom  \
0  2148-08-16 00:00:00  2148-08-16 01:30:00  15.4     15.40        %   
1  2148-08-16 00:00:00  2148-08-16 01:30:00  3.35      3.35     m/uL   
2  2148-08-16 00:00:00  2148-08-16 01:30:00  49.7     49.70       fL   
3  2148-08-16 00:00:00  2148-08-16 01:30:00  20.3     20.30     K/uL   
4  2148-08-16 00:00:00  2148-08-16 01:30:00  31.1     31.10     g/dL   

   ref_range_lower  ref_range_upper      flag pri

In [15]:
# Calculate the number of null values in each column
null_values = data.isnull().sum()

# Display the total number of columns and rows
total_columns = data.shape[1]
total_rows = data.shape[0]
print(f"Total columns: {total_columns}")
print(f"Total rows: {total_rows}")
print("-------------")

# Display the null values in each column
print(null_values)


Total columns: 16
Total rows: 107727
-------------
labevent_id              0
subject_id               0
hadm_id              28420
specimen_id              0
itemid                   0
order_provider_id    90897
charttime                0
storetime              992
value                 9588
valuenum             12481
valueuom             16203
ref_range_lower      18728
ref_range_upper      18728
flag                 67452
priority              9329
comments             89273
dtype: int64


In [16]:
data.dtypes

labevent_id            int64
subject_id             int64
hadm_id              float64
specimen_id            int64
itemid                 int64
order_provider_id     object
charttime             object
storetime             object
value                 object
valuenum             float64
valueuom              object
ref_range_lower      float64
ref_range_upper      float64
flag                  object
priority              object
comments              object
dtype: object

In [17]:
data['charttime'] = pd.to_datetime(data['charttime'])

In [18]:
data['storetime'] = pd.to_datetime(data['storetime'])

In [19]:
data.dtypes

labevent_id                   int64
subject_id                    int64
hadm_id                     float64
specimen_id                   int64
itemid                        int64
order_provider_id            object
charttime            datetime64[ns]
storetime            datetime64[ns]
value                        object
valuenum                    float64
valueuom                     object
ref_range_lower             float64
ref_range_upper             float64
flag                         object
priority                     object
comments                     object
dtype: object

In [20]:
from sdv.metadata import SingleTableMetadata
from sdv.single_table import CTGANSynthesizer
import time

In [21]:
df_data = data.head(2000)


In [22]:
# Define the metadata
metadata = SingleTableMetadata()
metadata.detect_from_dataframe(data=df_data)


In [23]:
# Initialize the CTGANSynthesizer
custom_synthesizer = CTGANSynthesizer(metadata, epochs=1000)

# Start timing
start_time = time.time()

# Train the synthesizer on the real data
custom_synthesizer.fit(df_data)

# End timing
end_time = time.time()

# Calculate elapsed time
elapsed_time = end_time - start_time
print(f"Training time: {elapsed_time} seconds")



Training time: 494.44371914863586 seconds


In [24]:
# Generate synthetic data
synthetic_data = custom_synthesizer.sample(1000)  # Generate 1000 rows of synthetic data

# Display the first few rows of synthetic data
print(synthetic_data.head())

   labevent_id  subject_id     hadm_id  specimen_id  itemid order_provider_id  \
0    883134354    10038959  28523098.0       629482   51554               NaN   
1    531875589    10020575  24852670.0     51033801   51286               NaN   
2    306336250    10021242  24844635.0     48309049   52117            P89ZCW   
3    558234971    10035673         NaN     85201794   51247            P82TOD   
4    916898341    10005663  25001306.0     52886273   50918               NaN   

            charttime           storetime          value  valuenum valueuom  \
0 2185-03-22 11:07:20 2150-07-21 03:32:14            NaN    42.165    mEq/L   
1 2183-04-28 11:45:01 2184-10-28 21:00:23  sdv-pii-gae51     5.969       fL   
2 2162-02-12 03:42:04 2149-01-07 03:12:00  sdv-pii-y1y9g     1.828    mg/dL   
3 2117-08-19 16:16:11 2117-08-15 11:02:03  sdv-pii-p0uzv    44.379      NaN   
4 2150-12-21 23:47:40 2149-09-14 04:12:44  sdv-pii-im7f2     1.602     IU/L   

   ref_range_lower  ref_range_upper fl

In [25]:
# Update the 'subject_id' to start from 400000
synthetic_data['subject_id'] = range(400000, 400000 + len(synthetic_data))
synthetic_data['hadm_id'] = synthetic_data['subject_id'].apply(lambda x: f"{x}{str(synthetic_data.index.get_loc(synthetic_data.index[synthetic_data['subject_id'] == x][0])).zfill(2)}")



In [26]:
synthetic_data.to_csv('synthetic_labevents.csv', index=False)

print("Synthetic data generated, updated, and saved successfully.")

Synthetic data generated, updated, and saved successfully.


In [27]:
# Calculate the number of null values in each column
null_values = synthetic_data.isnull().sum()

# Display the total number of columns and rows
total_columns = synthetic_data.shape[1]
total_rows = synthetic_data.shape[0]
print(f"Total columns: {total_columns}")
print(f"Total rows: {total_rows}")
print("-------------")

# Display the null values in each column
print(null_values)


Total columns: 16
Total rows: 1000
-------------
labevent_id            0
subject_id             0
hadm_id                0
specimen_id            0
itemid                 0
order_provider_id    749
charttime              0
storetime              3
value                102
valuenum             108
valueuom             234
ref_range_lower      167
ref_range_upper      150
flag                 557
priority              47
comments             704
dtype: int64


In [28]:
from sdv.evaluation.single_table import run_diagnostic

diagnostic = run_diagnostic(
    real_data=df_data,
    synthetic_data=synthetic_data,
    metadata=metadata
)


Generating report ...

(1/2) Evaluating Data Validity: |████████████████████████████████████████████████████| 16/16 [00:00<00:00, 752.68it/s]|
Data Validity Score: 92.86%

(2/2) Evaluating Data Structure: |█████████████████████████████████████████████████████| 1/1 [00:00<00:00, 246.90it/s]|
Data Structure Score: 100.0%

Overall Score (Average): 96.43%



In [29]:
from sdv.evaluation.single_table import evaluate_quality

quality_report = evaluate_quality(
    df_data,
    synthetic_data,
    metadata
)

Generating report ...

(1/2) Evaluating Column Shapes: |████████████████████████████████████████████████████| 16/16 [00:00<00:00, 280.16it/s]|
Column Shapes Score: 77.08%

(2/2) Evaluating Column Pair Trends: |██████████████████████████████████████████████| 120/120 [00:01<00:00, 86.68it/s]|
Column Pair Trends Score: 79.92%

Overall Score (Average): 78.5%



In [30]:
quality_report.get_details('Column Shapes')

Unnamed: 0,Column,Metric,Score,Error
0,subject_id,KSComplement,0.0,
1,hadm_id,KSComplement,,TypeError: '<' not supported between instances...
2,specimen_id,KSComplement,0.909,
3,itemid,KSComplement,0.862,
4,order_provider_id,TVComplement,0.871572,
5,charttime,KSComplement,0.8615,
6,storetime,KSComplement,0.860782,
7,valuenum,KSComplement,0.776132,
8,valueuom,TVComplement,0.861771,
9,ref_range_lower,KSComplement,0.748837,


In [32]:
from sdv.evaluation.single_table import get_column_plot

fig = get_column_plot(
    real_data=df_data,
    synthetic_data=synthetic_data,
    column_name='valuenum',
    metadata=metadata
)

fig.show()

In [35]:
from sdv.evaluation.single_table import get_column_plot

fig = get_column_plot(
    real_data=df_data,
    synthetic_data=synthetic_data,
    column_name='ref_range_upper',
    metadata=metadata
)

fig.show()

In [36]:
import pandas as pd

# Load the provided data
data = [
    {'Column': 'subject_id', 'Metric': 'KSComplement', 'Score': 0.000000},
    {'Column': 'hadm_id', 'Metric': 'KSComplement', 'Score': None, 'Error': "TypeError: '<' not supported between instances"},
    {'Column': 'specimen_id', 'Metric': 'KSComplement', 'Score': 0.909000},
    {'Column': 'itemid', 'Metric': 'KSComplement', 'Score': 0.862000},
    {'Column': 'order_provider_id', 'Metric': 'TVComplement', 'Score': 0.871572},
    {'Column': 'charttime', 'Metric': 'KSComplement', 'Score': 0.861500},
    {'Column': 'storetime', 'Metric': 'KSComplement', 'Score': 0.860782},
    {'Column': 'valuenum', 'Metric': 'KSComplement', 'Score': 0.776132},
    {'Column': 'valueuom', 'Metric': 'TVComplement', 'Score': 0.861771},
    {'Column': 'ref_range_lower', 'Metric': 'KSComplement', 'Score': 0.748837},
    {'Column': 'ref_range_upper', 'Metric': 'KSComplement', 'Score': 0.698618},
    {'Column': 'flag', 'Metric': 'TVComplement', 'Score': 1.000000},
    {'Column': 'priority', 'Metric': 'TVComplement', 'Score': 0.927246},
    {'Column': 'comments', 'Metric': 'TVComplement', 'Score': 0.642692}
]

# Create a DataFrame from the provided data
df = pd.DataFrame(data)

# Define the thresholds
thresholds = {
    'KSComplement': 0.7,
    'TVComplement': 0.7,
    'critical_columns': {
        'admittime': 0.8,
        'dischtime': 0.8,
        'admission_type': 0.9,
        'admission_location': 0.9,
        'discharge_location': 0.9,
        'insurance': 0.9,
        'marital_status': 0.9,
    },
    'non_critical_columns': {
        'subject_id': 0.0,
        'language': 0.0,
        'race': 0.0,
        'edregtime': 0.7,
        'edouttime': 0.7,
    }
}

# Function to check if columns meet the threshold
def check_thresholds(df, thresholds):
    results = []
    for _, row in df.iterrows():
        column_name = row['Column']
        metric = row['Metric']
        score = row['Score']

        if column_name in thresholds['critical_columns']:
            threshold = thresholds['critical_columns'][column_name]
        else:
            threshold = thresholds['non_critical_columns'].get(column_name, thresholds.get(metric, 0))

        if pd.isna(score):
            results.append(f"{column_name} does not have a valid score. Error: {row['Error']}")
        elif score >= threshold:
            results.append(f"{column_name} meets the threshold with a score of {score:.2e}.")
        else:
            results.append(f"{column_name} does not meet the threshold. Score: {score:.2e}, Threshold: {threshold:.2e}")
    
    return results

# Call the function with the data
results = check_thresholds(df, thresholds)

# Print the results
for result in results:
    print(result)


subject_id meets the threshold with a score of 0.00e+00.
hadm_id does not have a valid score. Error: TypeError: '<' not supported between instances
specimen_id meets the threshold with a score of 9.09e-01.
itemid meets the threshold with a score of 8.62e-01.
order_provider_id meets the threshold with a score of 8.72e-01.
charttime meets the threshold with a score of 8.62e-01.
storetime meets the threshold with a score of 8.61e-01.
valuenum meets the threshold with a score of 7.76e-01.
valueuom meets the threshold with a score of 8.62e-01.
ref_range_lower meets the threshold with a score of 7.49e-01.
ref_range_upper does not meet the threshold. Score: 6.99e-01, Threshold: 7.00e-01
flag meets the threshold with a score of 1.00e+00.
priority meets the threshold with a score of 9.27e-01.
comments does not meet the threshold. Score: 6.43e-01, Threshold: 7.00e-01
