In [7]:
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

import os
from pathlib import Path

# Folder that holds the MIMIC-IV demo CSV files. The default is the location
# this notebook was originally run from; set the MIMIC_DEMO_DIR environment
# variable to point at another copy without editing this cell.
DEFAULT_MIMIC_DEMO_DIR = r"C:\Users\Lenovo\OneDrive - University of Leeds\Project\mimic-iv-clinical-database-demo-2.2\mimic-iv-clinical-database-demo-2.2"
file_path = Path(os.environ.get("MIMIC_DEMO_DIR", DEFAULT_MIMIC_DEMO_DIR)) / "chartevents.csv"

# Read the whole chartevents table into memory.
data = pd.read_csv(file_path)

print("First few rows of the dataset:")
print(data.head())

First few rows of the dataset:
   subject_id   hadm_id   stay_id  caregiver_id            charttime  \
0    10005817  20626031  32604416        6770.0  2132-12-16 00:00:00   
1    10005817  20626031  32604416        6770.0  2132-12-16 00:00:00   
2    10005817  20626031  32604416        6770.0  2132-12-16 00:00:00   
3    10005817  20626031  32604416        6770.0  2132-12-16 00:00:00   
4    10005817  20626031  32604416        6770.0  2132-12-16 00:00:00   

0  2132-12-15 23:45:00  225054            On        NaN      NaN      0.0  
1  2132-12-15 23:43:00  223769            100     100.0        %      0.0  
2  2132-12-15 23:47:00  223956  Atrial demand       NaN      NaN      0.0  
3  2132-12-15 23:47:00  224866            Yes       NaN      NaN      0.0  
4  2132-12-15 23:45:00  227341             No       0.0      NaN      0.0  


In [8]:
# Report the table dimensions and the number of missing values per column
# for the freshly loaded data.
total_rows, total_columns = data.shape
null_values = data.isna().sum()

print(f"Total columns: {total_columns}")
print(f"Total rows: {total_rows}")
print("-------------")
print(null_values)


Total columns: 11
Total rows: 668862
-------------
subject_id           0
hadm_id              0
stay_id              0
caregiver_id     24240
charttime            0
storetime         1159
itemid               0
value            20730
valuenum        411388
valueuom        506291
dtype: int64


In [9]:
# Remove the unit-of-measure column. Rebinding `data` (rather than dropping
# in place) together with errors='ignore' keeps this cell safe to re-run:
# a second execution no longer raises KeyError because 'valueuom' is gone.
data = data.drop(columns=['valueuom'], errors='ignore')


In [10]:
# Fill missing values with explicit placeholders. The filled Series is assigned
# back to the column instead of calling fillna(..., inplace=True) on
# data[...]: the chained in-place form operates on an intermediate object,
# which pandas deprecates and which leaves `data` unchanged under copy-on-write.
data['caregiver_id'] = data['caregiver_id'].fillna(-1)  # -1 is the placeholder for a missing caregiver
data['warning'] = data['warning'].fillna(0)

In [11]:
# Re-check the table dimensions and per-column missing counts after the
# cleaning cells above.
n_rows, n_cols = data.shape
total_columns = n_cols
total_rows = n_rows
null_values = data.isna().sum()

for summary_line in (
    f"Total columns: {total_columns}",
    f"Total rows: {total_rows}",
    "-------------",
):
    print(summary_line)

print(null_values)


Total columns: 10
Total rows: 668862
-------------
subject_id           0
hadm_id              0
stay_id              0
caregiver_id         0
charttime            0
storetime         1159
itemid               0
value            20730
valuenum        411388
dtype: int64


In [12]:
from sdv.metadata import SingleTableMetadata
from sdv.single_table import CTGANSynthesizer
import time

In [13]:
# Training subset: the first 3000 rows of `data`, in file order.
# NOTE(review): this is a positional slice, not a random sample - confirm the
# leading rows are representative enough for the model being trained.
df_data = data.iloc[:3000]


In [14]:
# Create an empty single-table metadata object and let SDV infer the column
# types from the training subset.
metadata = SingleTableMetadata()
metadata.detect_from_dataframe(df_data)


In [15]:
# CTGAN synthesizer built from the detected metadata and trained for 1000 epochs.
custom_synthesizer = CTGANSynthesizer(metadata, epochs=1000)

# perf_counter() is a monotonic, high-resolution clock meant for measuring
# intervals; unlike time.time() it is unaffected by system clock adjustments
# that may occur during a long training run.
start_time = time.perf_counter()

custom_synthesizer.fit(df_data)

end_time = time.perf_counter()

elapsed_time = end_time - start_time
print(f"Training time: {elapsed_time} seconds")



Training time: 2445.5108625888824 seconds


In [16]:
# Draw 1000 synthetic rows from the fitted synthesizer and preview the first few.
synthetic_data = custom_synthesizer.sample(num_rows=1000)

preview = synthetic_data.head()
print(preview)

   subject_id   hadm_id   stay_id  caregiver_id            charttime  \
0    10005817  20626031  32604416        6376.0  2132-12-15 22:58:09   
1    10005817  20626031  32604416       20870.0  2132-12-17 07:14:51   
2    10005817  20626031  32604416        5441.0  2132-12-17 13:41:22   
3    10005817  20626031  32604416        7822.0  2132-12-17 09:50:23   
4    10005817  20626031  32604416       21820.0  2132-12-17 05:56:59   

0  2132-12-16 04:34:46  220045          Dry        NaN      0.0  
1  2132-12-17 07:44:08  224159  Not assessed      37.7      0.0  
2  2132-12-17 10:50:39  224076          36.5       NaN      0.0  
3  2132-12-17 09:59:25  224107        Normal       NaN      0.0  
4  2132-12-17 12:35:14  224062            54       NaN      0.0  


In [17]:
# Replace the sampled identifiers with fresh ones.
# subject_id: consecutive integers starting at 400000, one per synthetic row.
synthetic_data['subject_id'] = range(400000, 400000 + len(synthetic_data))

# hadm_id: the subject_id followed by the row's position, zero-padded to at
# least two digits (a string, e.g. "40000000" for the first row). A single
# enumerate pass yields each row's position directly, replacing the previous
# per-row boolean scan + index lookup, which was quadratic in the row count.
synthetic_data['hadm_id'] = [
    f"{subject_id}{str(position).zfill(2)}"
    for position, subject_id in enumerate(synthetic_data['subject_id'])
]



In [18]:
# Write the synthetic table to disk without the DataFrame index column,
# then print a confirmation message.
synthetic_data.to_csv(path_or_buf='synthetic_chartevents.csv', index=False)

print("Synthetic data generated, updated, and saved successfully.")

Synthetic data generated, updated, and saved successfully.


In [19]:
# Dimensions and per-column missing counts of the synthetic table.
total_rows = len(synthetic_data.index)
total_columns = len(synthetic_data.columns)
null_values = synthetic_data.isna().sum()

print(f"Total columns: {total_columns}")
print(f"Total rows: {total_rows}")
print("-------------")
print(null_values)


Total columns: 10
Total rows: 1000
-------------
subject_id        0
hadm_id           0
stay_id           0
caregiver_id      0
charttime         0
storetime         0
itemid            0
value            10
valuenum        591
dtype: int64


In [21]:
from sdv.evaluation.single_table import run_diagnostic

# Run SDV's diagnostic report on the synthetic rows.
# The real data passed here is `df_data`, the same subset the metadata was
# detected from, the synthesizer was fitted on, and the quality report uses,
# so all evaluations share one reference table. It also keeps this cell
# re-runnable after the later cell that rebinds the name `data`.
diagnostic = run_diagnostic(
    real_data=df_data,
    synthetic_data=synthetic_data,
    metadata=metadata
)


Generating report ...

(1/2) Evaluating Data Validity: |█████████████████████████████████████████████████████| 10/10 [00:00<00:00, 28.53it/s]|
Data Validity Score: 80.0%

(2/2) Evaluating Data Structure: |█████████████████████████████████████████████████████| 1/1 [00:00<00:00, 250.36it/s]|
Data Structure Score: 100.0%

Overall Score (Average): 90.0%



In [23]:
from sdv.evaluation.single_table import evaluate_quality

# Score how closely the synthetic rows follow the training subset
# (column shapes and column pair trends).
quality_report = evaluate_quality(
    real_data=df_data,
    synthetic_data=synthetic_data,
    metadata=metadata,
)

Generating report ...

(1/2) Evaluating Column Shapes: |█████████████████████████████████████████████████████| 10/10 [00:00<00:00, 95.61it/s]|
Column Shapes Score: 68.98%

(2/2) Evaluating Column Pair Trends: |████████████████████████████████████████████████| 45/45 [00:18<00:00,  2.50it/s]|
Column Pair Trends Score: 48.2%

Overall Score (Average): 58.59%



In [24]:
# Per-column breakdown of the 'Column Shapes' property; shown as the cell output.
quality_report.get_details(property_name='Column Shapes')

Unnamed: 0,Column,Metric,Score
0,subject_id,TVComplement,3.333332e-07
1,hadm_id,TVComplement,3.333332e-07
2,stay_id,TVComplement,1.0
3,caregiver_id,KSComplement,0.837
4,charttime,KSComplement,0.9433333
5,storetime,KSComplement,0.9306667
6,itemid,KSComplement,0.918
7,value,TVComplement,0.7161637
8,valuenum,KSComplement,0.6271543
9,warning,TVComplement,0.926


In [26]:
from sdv.evaluation.single_table import get_column_plot

# Plot the real vs. synthetic distribution of the 'value' column.
column_plot_kwargs = dict(
    real_data=df_data,
    synthetic_data=synthetic_data,
    column_name='value',
    metadata=metadata,
)
fig = get_column_plot(**column_plot_kwargs)

fig.show()

In [27]:
import pandas as pd

# 'Column Shapes' scores, transcribed by hand from the quality report details
# printed earlier in the notebook. The list has its own name so that the
# chartevents DataFrame bound to `data` is not replaced by a list of dicts
# (which would break any earlier cell that is re-run after this one).
column_shape_scores = [
    {'Column': 'subject_id', 'Metric': 'TVComplement', 'Score': 3.333332e-07},
    {'Column': 'hadm_id', 'Metric': 'TVComplement', 'Score': 3.333332e-07},
    {'Column': 'stay_id', 'Metric': 'TVComplement', 'Score': 1.000000e+00},
    {'Column': 'caregiver_id', 'Metric': 'KSComplement', 'Score': 8.370000e-01},
    {'Column': 'charttime', 'Metric': 'KSComplement', 'Score': 9.433333e-01},
    {'Column': 'storetime', 'Metric': 'KSComplement', 'Score': 9.306667e-01},
    {'Column': 'itemid', 'Metric': 'KSComplement', 'Score': 9.180000e-01},
    {'Column': 'value', 'Metric': 'TVComplement', 'Score': 7.161637e-01},
    {'Column': 'valuenum', 'Metric': 'KSComplement', 'Score': 6.271543e-01},
    {'Column': 'warning', 'Metric': 'TVComplement', 'Score': 9.260000e-01}
]

df = pd.DataFrame(column_shape_scores)

# Minimum acceptable score, looked up per column first and per metric second.
thresholds = {
    # Default threshold for each metric.
    'KSComplement': 0.7,
    'TVComplement': 0.7,
    # Per-column overrides. None of the names under 'critical_columns' occur in
    # the scores above, so these entries have no effect on this table.
    'critical_columns': {
        'admittime': 0.8,
        'dischtime': 0.8,
        'admission_type': 0.9,
        'admission_location': 0.9,
        'discharge_location': 0.9,
        'insurance': 0.9,
        'marital_status': 0.9,
    },
    # Of these, only 'subject_id' occurs in the scores above.
    # NOTE(review): 'hadm_id' is regenerated in the same cell as 'subject_id'
    # but has no 0.0 override here, so it is held to the 0.7 metric default -
    # confirm whether that is intended.
    'non_critical_columns': {
        'subject_id': 0.0,
        'language': 0.0,
        'race': 0.0,
        'edregtime': 0.7,
        'edouttime': 0.7,
    }
}

def check_thresholds(df, thresholds):
    """Compare each row's score with its threshold and describe the outcome.

    The threshold for a row is looked up in this order:
    ``thresholds['critical_columns'][column]``, then
    ``thresholds['non_critical_columns'][column]``, then
    ``thresholds[metric]``, and finally 0 when none of them is defined.

    Args:
        df: DataFrame with 'Column', 'Metric' and 'Score' columns, one row per
            scored column.
        thresholds: dict mapping metric names to default thresholds. It may
            also contain 'critical_columns' and 'non_critical_columns' dicts
            of per-column overrides; either group may be omitted.

    Returns:
        list of str: one message per row of ``df``, in row order. A row whose
        score is NaN/None is reported as having no valid score.
    """
    # Treat a missing override group as empty instead of raising KeyError.
    critical = thresholds.get('critical_columns', {})
    non_critical = thresholds.get('non_critical_columns', {})

    results = []
    # Iterate the three needed columns directly; cheaper than iterrows(), which
    # builds a Series for every row.
    for column_name, metric, score in zip(df['Column'], df['Metric'], df['Score']):
        if column_name in critical:
            threshold = critical[column_name]
        else:
            threshold = non_critical.get(column_name, thresholds.get(metric, 0))

        if pd.isna(score):
            results.append(f"{column_name} does not have a valid score.")
        elif score >= threshold:
            results.append(f"{column_name} meets the threshold with a score of {score:.2e}.")
        else:
            results.append(f"{column_name} does not meet the threshold. Score: {score:.2e}, Threshold: {threshold:.2e}")

    return results

# Evaluate every scored column against its threshold and print one message per line.
results = check_thresholds(df, thresholds)

for message in results:
    print(message)


subject_id meets the threshold with a score of 3.33e-07.
hadm_id does not meet the threshold. Score: 3.33e-07, Threshold: 7.00e-01
stay_id meets the threshold with a score of 1.00e+00.
caregiver_id meets the threshold with a score of 8.37e-01.
charttime meets the threshold with a score of 9.43e-01.
storetime meets the threshold with a score of 9.31e-01.
itemid meets the threshold with a score of 9.18e-01.
value meets the threshold with a score of 7.16e-01.
valuenum does not meet the threshold. Score: 6.27e-01, Threshold: 7.00e-01
