In [None]:
import pandas as pd

# Load datasets
# The input directory is named once so it can be repointed without editing every
# read_csv call; the resulting paths are identical to the previously hard-coded ones.
DATA_DIR = '/mnt/data'

# dynamic.csv is read into `dynamic_data`; the "[static] id_adm_y.csv" file into `static_data`.
dynamic_data = pd.read_csv(f'{DATA_DIR}/dynamic.csv')
static_data = pd.read_csv(f'{DATA_DIR}/[static] id_adm_y.csv')

In [None]:
# Number of rows each patient ('id') contributes to the dynamic table
records_per_patient = dynamic_data.groupby('id').size()

# Summary statistics of those per-patient row counts
distribution_summary = records_per_patient.describe()

# Display the summary together with how many patients have each row count
# (only the ten smallest row counts are shown)
(
    distribution_summary,
    records_per_patient.value_counts().sort_index().head(10),
)

In [None]:
# Build the inputs for the 6-hour time slots

# Parse the timestamp columns so they can be subtracted from each other
dynamic_data['charttime'] = pd.to_datetime(dynamic_data['charttime'])
static_data['icu_intime'] = pd.to_datetime(static_data['icu_intime'])

# Attach each record's ICU admission time via a left join on 'id'
# (records whose 'id' has no match in static_data get a missing icu_intime)
merged_data = dynamic_data.merge(static_data[['id', 'icu_intime']], on='id', how='left')

# Hours elapsed between ICU admission and the chart time (negative when charted before admission)
merged_data['time_diff'] = (
    (merged_data['charttime'] - merged_data['icu_intime']).dt.total_seconds() / 3600
)
# Define time slots based on time difference
def time_slot(time_diff):
    """Map an offset in hours to a 6-hour time-slot number.

    Non-negative offsets map to 1-based slots: [0, 6) -> 1, [6, 12) -> 2, and so on,
    always returned as an ``int``.

    Any other input (negative offsets, and NaN, which fails the ``>= 0`` test) returns
    the negated floor of ``-time_diff / 6`` in the input's own numeric type. Offsets in
    (-6, 0) therefore give 0 (``-0.0`` for floats), [-12, -6] side values give -1, etc.,
    and NaN stays NaN.
    """
    if not time_diff >= 0:
        # Unary minus binds tighter than //, so this floors the magnitude, then negates it.
        return -((-time_diff) // 6)
    return int(time_diff // 6) + 1

merged_data['time_slot'] = merged_data['time_diff'].apply(time_slot)

# Collapse each (patient, time slot) group into a single row.
# GroupBy.last() returns, per column, the last non-null value in the group, so after
# sorting by charttime each output row holds the most recent *available* value of every
# variable within the slot, not necessarily one verbatim source row.
# Rows whose time_slot is NaN (no icu_intime matched in the merge) are presumably dropped
# here by groupby's default handling of missing keys - verify if those rows matter.
final_data = merged_data.sort_values(by='charttime').groupby(['id', 'time_slot']).last().reset_index()

# Keep only time slots 1-4, i.e. offsets of 0 to <24 hours after ICU admission.
# This drops slot 0 / negative slots (charted before admission) and slots 5 and later.
# The previous expression `> 0 & < 5` was not valid Python; each comparison must be
# spelled out and parenthesised because `&` binds tighter than `>` and `<`.
# NOTE(review): the old comment said "above 5" while the condition was `< 5`;
# confirm whether slot 5 should be kept.
in_first_four_slots = (final_data['time_slot'] > 0) & (final_data['time_slot'] < 5)
positive_below5_time_slots = final_data[in_first_four_slots]

In [None]:
# Drop sparsely populated columns, then drop the raw timestamp

# Share of missing values in every column
missing_fraction = positive_below5_time_slots.isna().mean()

# Only columns whose missing share is strictly below 0.3 are kept
# (a column at exactly 30% missing is removed as well)
keep_column = missing_fraction < 0.3

# Apply the column filter and remove "charttime" in one chain
filtered_data = (
    positive_below5_time_slots
    .loc[:, keep_column]
    .drop(columns=['charttime'])
)

In [None]:
# Forward imputation for each individual: a missing value is replaced by the same
# patient's most recent earlier value; values never cross from one patient to another.
# NOTE(review): this cell previously read from `df`, which no visible cell defines;
# `filtered_data` (the output of the missingness filter) is assumed to be the intended
# input - confirm.

# Order rows chronologically within each patient so "forward" means "later time slot",
# independent of how the upstream frame happened to be ordered.
ordered_data = filtered_data.sort_values(['id', 'time_slot']).reset_index(drop=True)

# GroupBy.ffill() replaces the deprecated fillna(method='ffill') inside groupby.apply.
# It does not return the grouping column, so fill every other column and write the
# result back into a copy that still carries 'id'.
columns_to_fill = [column for column in ordered_data.columns if column != 'id']
df_imputed = ordered_data.copy()
df_imputed[columns_to_fill] = ordered_data.groupby('id')[columns_to_fill].ffill()


In [None]:
# MICE for the remaining missing values (those with no earlier value to carry forward)

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.linear_model import LinearRegression

# Iterative imputer: linear-regression estimator, at most 10 rounds, fixed seed
mice_imputer = IterativeImputer(
    estimator=LinearRegression(),
    max_iter=10,
    random_state=0,
)

# Every column except the identifiers / time keys is imputed
# ('charttime' is listed for safety; difference() ignores labels that are absent)
columns_to_impute = df_imputed.columns.difference(['id', 'time_slot', 'charttime'])

# Fit and transform on the selected columns, then write the result into a copy
# so that df_imputed itself is left untouched
imputed_values = mice_imputer.fit_transform(df_imputed[columns_to_impute])
df_mice_imputed = df_imputed.copy()
df_mice_imputed[columns_to_impute] = imputed_values


In [None]:
## Deal with outliers

# Plot the distributions of the original values

import matplotlib.pyplot as plt
import seaborn as sns

# Set the style of the visualization
sns.set(style="whitegrid")

# One histogram (with KDE overlay) per variable, three panels per row.
# NOTE(review): `columns_to_standardize` and `df_new` are not defined in the cells
# shown here - confirm they are created earlier in the notebook.
# Ceiling division gives a final, partially filled row its own axes (floor division
# raised IndexError whenever the column count was not a multiple of 3), and
# squeeze=False keeps `axes` two-dimensional even when there is a single row.
n_grid_rows = max(1, -(-len(columns_to_standardize) // 3))
fig, axes = plt.subplots(nrows=n_grid_rows, ncols=3, figsize=(18, 20), squeeze=False)
fig.tight_layout(pad=5.0)

for i, col in enumerate(columns_to_standardize):
    ax = axes[i // 3, i % 3]
    sns.histplot(df_new[col], kde=True, ax=ax)
    ax.set_title(col)

# Hide the panels of the last row that received no variable
for ax in axes.flat[len(columns_to_standardize):]:
    ax.set_visible(False)

# Adjust layout to make room for titles and prevent overlap
plt.subplots_adjust(top=0.95)
plt.suptitle('Distribution of Variables in the Dataset', fontsize=16)
plt.show()


In [None]:
# Plot, for each variable, how far its outliers lie from the mean (in standard deviations)
# NOTE(review): `outliers_dict` and `df_new` are not defined in the cells shown here;
# the loop assumes a mapping of column name -> DataFrame of that column's outlier rows - confirm.

# The grid is sized from the dictionary that is actually iterated, using ceiling
# division (floor division raised IndexError for counts that are not a multiple of 3);
# squeeze=False keeps `axes` two-dimensional even when there is a single row.
n_grid_rows = max(1, -(-len(outliers_dict) // 3))
fig, axes = plt.subplots(nrows=n_grid_rows, ncols=3, figsize=(18, 20), squeeze=False)
fig.tight_layout(pad=5.0)

for i, (column, outliers) in enumerate(outliers_dict.items()):
    if not outliers.empty:
        # Express each outlier as a number of standard deviations from the column mean
        mean = df_new[column].mean()
        std = df_new[column].std()
        deviations = (outliers[column] - mean) / std

        # Separate positive and negative deviations
        pos_dev = deviations[deviations > 0]
        neg_dev = deviations[deviations < 0]

        # Plot both signs on the panel reserved for this variable
        ax = axes[i // 3, i % 3]
        sns.histplot(pos_dev, ax=ax, bins=30, kde=False, color='blue', label='Positive Deviation')
        sns.histplot(neg_dev, ax=ax, bins=30, kde=False, color='red', label='Negative Deviation')
        ax.set_title(f'Deviation Distribution for {column}')
        ax.set_xlabel('Deviation (number of standard deviations)')
        ax.legend()

# Hide panels that received no data (variables without outliers and spare grid cells)
for ax in axes.flat:
    if not ax.has_data():
        ax.set_visible(False)

# Adjust layout to make room for titles and prevent overlap
plt.subplots_adjust(top=0.95)
plt.suptitle('Distribution of Positive and Negative Deviations for Outliers in Each Variable', fontsize=16)
plt.show()


# Function to calculate the fraction of outliers for each variable
def calculate_outlier_fractions(dataframe, outliers_dict):
    """Return the share of rows flagged as outliers for each variable.

    Parameters
    ----------
    dataframe : object supporting ``len()``
        The full dataset; its row count is the denominator.
    outliers_dict : dict
        Maps a column name to the collection of outlier rows for that column.
        Each value must expose ``.empty`` and support ``len()``.

    Returns
    -------
    dict
        ``{column: len(outliers) / len(dataframe)}`` for every entry whose outlier
        collection is non-empty; columns without outliers are omitted.
    """
    total_count = len(dataframe)
    return {
        column: len(outliers) / total_count
        for column, outliers in outliers_dict.items()
        if not outliers.empty
    }

# Share of rows flagged as outliers, per variable (variables without outliers are omitted)
outlier_fractions = calculate_outlier_fractions(dataframe=df_new, outliers_dict=outliers_dict)
outlier_fractions

In [None]:
# Function to mark outliers as NaN that exceed a threshold of standard deviations from the mean
def mark_outliers_as_nan(dataframe, columns, threshold=5):
    """Return a copy of ``dataframe`` with extreme values replaced by NaN.

    For each listed column, a value is treated as an outlier when it lies strictly
    more than ``threshold`` standard deviations below or above that column's mean.
    The mean and standard deviation are computed from the column as passed in,
    outliers included.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Input data; it is not modified.
    columns : iterable of str
        Names of the columns to screen.
    threshold : float, default 5
        Allowed distance from the mean, in standard deviations.

    Returns
    -------
    pandas.DataFrame
        A copy in which the outlying entries of ``columns`` are NaN.
    """
    result = dataframe.copy()
    for column in columns:
        values = result[column]
        center = values.mean()
        spread = values.std()

        # Flag each side separately. When the bounds are NaN (e.g. a single-row column)
        # both comparisons are False, so nothing gets masked.
        too_low = values < center - threshold * spread
        too_high = values > center + threshold * spread

        result[column] = values.mask(too_low | too_high)

    return result

# Replace values lying more than 5 standard deviations from their column mean with NaN
df_marked_outliers = mark_outliers_as_nan(df_new, columns_to_standardize, threshold=5)


In [None]:
# Re-impute the values that were blanked out as outliers.
# Same imputer configuration as the first MICE pass: linear-regression estimator,
# at most 10 rounds, fixed seed.
mice_imputer_with_params = IterativeImputer(estimator=LinearRegression(), max_iter=10, random_state=0)

# The outlier-marking cell stores its result in `df_marked_outliers`; this cell used to
# read `df_with_marked_outliers`, which no visible cell defines (NameError on a fresh kernel).
# Impute on a copy so the marked frame stays available for inspection.
df_mice_imputed_with_params = df_marked_outliers.copy()
df_mice_imputed_with_params[columns_to_standardize] = mice_imputer_with_params.fit_transform(
    df_marked_outliers[columns_to_standardize]
)

In [None]:
# Remove individuals that have only one record
# NOTE(review): the original header also mentioned making the time step consistent,
# but nothing in this cell does that. `df` and its 'patient' column are not defined in
# the cells shown here (earlier cells key on 'id') - confirm which frame is meant.

# Rows per patient
grouped_all = df.groupby('patient').size()

# Patients that appear exactly once
patients_with_one_row = grouped_all.loc[grouped_all == 1]

# How many such patients there are
num_individuals_one_row = len(patients_with_one_row)
print(num_individuals_one_row)

# Remember which patients are dropped, then keep only the rows of everyone else
removed_individuals_ids = patients_with_one_row.index.tolist()
is_single_record_patient = df['patient'].isin(patients_with_one_row.index)
df_cleaned = df[~is_single_record_patient]

# Size of the cleaned dataset and the first five removed IDs as a sample
len(df_cleaned), removed_individuals_ids[:5]




