In [None]:
import sys
import os
import pandas as pd
from pprint import pprint

# Import your custom modules. Adjust the module paths as needed.
from data.load_data import loadTrainingData
from plots.feature_plots import plot_missingness


In [None]:
# Data-loading configuration. Edit these values by hand to change what is loaded.
directories = ['../../training_setA/', '../../training_setB/']
max_files = None  # Change this to a number (e.g., 1000) if you want to limit the number of files
# NOTE(review): ignore_columns is not referenced by any cell visible in this notebook;
# it is kept in case other code expects it.
ignore_columns = ['Age', 'Gender', 'Unit1', 'Unit2', 'HospAdmTime']

dfs = []           # one concatenated DataFrame per directory
patient_dict = {}  # merged mapping returned by loadTrainingData across all directories

for directory in directories:
    # Build the path pattern for .psv files in the directory.
    pattern = os.path.join(directory, "*.psv")
    print(f"\nLoading data from: {pattern} with max_files={max_files}")
    patient_data = loadTrainingData(pattern, max_files)

    # pd.concat raises an opaque "No objects to concatenate" on an empty list;
    # fail with a message that names the pattern that produced nothing.
    if not patient_data:
        raise ValueError(f"No patient data was loaded from: {pattern}")

    # dict.update silently overwrites duplicate keys, which would leave patient_dict
    # with fewer patients than all_data. Surface that instead of hiding it.
    duplicate_keys = patient_dict.keys() & patient_data.keys()
    if duplicate_keys:
        print(f"Warning: {len(duplicate_keys)} keys from {pattern} overwrite earlier entries in patient_dict")
    patient_dict.update(patient_data)

    # Concatenate all DataFrames from the current directory.
    combined_df = pd.concat(list(patient_data.values()), ignore_index=True)
    dfs.append(combined_df)

# Concatenate the combined DataFrames from all directories.
all_data = pd.concat(dfs, ignore_index=True)
print(f"\nCombined training set shape: {all_data.shape}")


In [None]:
# Baseline missingness plot of the combined data, before any fill step is applied.
# plot_missingness is a project helper (plots.feature_plots); presumably it draws
# per-column missing-value rates — confirm there.
plot_missingness(all_data, title="Missing Data")

In [None]:
# Preview the first five rows of the combined training data.
all_data.head(5)

In [None]:
# List the column labels of the combined training data.
all_data.columns

In [None]:
# Reduce missingness step by step, re-plotting after each step to see its effect.
# Step 1: run the project's forwardFillMAP helper (its fill logic lives in data.clean_data).
# NOTE(review): all_data is rebound to the helper's result, so re-running this cell feeds
# already-processed data back in — confirm the helper is idempotent.
from data.clean_data import forwardFillMAP

all_data = forwardFillMAP(all_data)
plot_missingness(all_data, title="Missing Data + Map")

In [None]:
# Next fill step: run the project's forwardFillDBP helper on the current all_data,
# then re-plot missingness to compare with the previous plot.
# NOTE(review): all_data is rebound in place of the previous version; re-running this
# cell applies the helper again — confirm that is harmless.
from data.clean_data import forwardFillDBP

all_data = forwardFillDBP(all_data)
plot_missingness(all_data, title="Missing Data + Map + DBP")

In [None]:
# Next fill step: run the project's forwardFillSBP helper on the current all_data,
# then re-plot missingness to compare with the previous plot.
# NOTE(review): all_data is rebound in place of the previous version; re-running this
# cell applies the helper again — confirm that is harmless.
from data.clean_data import forwardFillSBP

all_data = forwardFillSBP(all_data)
plot_missingness(all_data, title="Missing Data + Map + DBP + SBP")

In [None]:
import matplotlib.pyplot as plt

# Work on a copy of the current data. The following cell also passes `df` to
# test_pH_equation_accuracy, so the name is kept.
df = all_data.copy()

# Correlate only the rows where all three acid-base measurements are present.
complete = df[['pH', 'PaCO2', 'HCO3']].dropna()
corr_matrix = complete.corr()
print("Correlation Matrix for pH, PaCO2, and HCO3:")
print(corr_matrix)

# Visualize the correlation matrix.
# Pass this figure's own number to matshow rather than a hard-coded 1: if another
# figure is still open, plt.figure() returns a higher number and fignum=1 would draw
# the heat map on a different figure than the one sized here.
fig = plt.figure(figsize=(6, 4))
plt.matshow(corr_matrix, fignum=fig.number)
plt.xticks(range(len(corr_matrix.columns)), corr_matrix.columns, rotation=45)
plt.yticks(range(len(corr_matrix.columns)), corr_matrix.columns)
plt.colorbar()
plt.title("Acid-Base Correlation Matrix", pad=20)
plt.show()


In [None]:
# Run the project's pH-equation accuracy check on `df`.
# NOTE(review): `df` is the copy of all_data made in the correlation cell, but later cells
# reuse `df` as a loop variable over per-patient frames. Run this cell directly after the
# correlation cell, otherwise it receives whatever `df` was last bound to.
from data.correlation_data import test_pH_equation_accuracy

test_pH_equation_accuracy(df)

In [None]:
# Next fill step: run the project's forwardFillHasselbalch helper and re-plot missingness.
# NOTE(review): presumably derives missing pH / PaCO2 / HCO3 values from one another via the
# Henderson-Hasselbalch relation — confirm in data.clean_data.
# all_data is rebound, so re-running this cell applies the helper again.
from data.clean_data import forwardFillHasselbalch

all_data = forwardFillHasselbalch(all_data)
plot_missingness(all_data, title="Missing Data + Hassel")

In [None]:
# Summary statistics for the Bilirubin_total column of the current all_data.
all_data['Bilirubin_total'].describe()

In [None]:
# Summary statistics for the Bilirubin_direct column of the current all_data.
all_data['Bilirubin_direct'].describe()

In [None]:
# Run the project's general forwardFillData helper and bind the result to a new name
# (all_data itself is not rebound here).
# NOTE(review): clean_df is not used by any later cell visible in this notebook; the
# encoding/modelling cells start again from patient_dict — confirm that is intended.
from data.clean_data import forwardFillData

clean_df = forwardFillData(all_data)
# Optional visual check, currently disabled:
# plot_missingness(clean_df, title="Cleaned Data")

In [None]:
# backShiftSepsisLabel is imported here and called in the following cell.
from data.clean_data import backShiftSepsisLabel

# Pick the first patient frame that contains at least one positive SepsisLabel.
# next() with a None default replaces the loop-and-break and makes the "no septic
# patient" case explicit instead of failing later with an AttributeError on None.
patient_record = next(
    (record for record in patient_dict.values() if record['SepsisLabel'].sum() > 0),
    None,
)
if patient_record is None:
    raise ValueError("No patient in patient_dict has a positive SepsisLabel.")

# Show the last ten rows of that patient's record.
patient_record.tail(10)


In [None]:
# Apply backShiftSepsisLabel (imported in the previous cell) to the example septic patient
# and show its last ten rows, for comparison with the unshifted tail shown above.
# NOTE(review): presumably moves the positive labels earlier in time — confirm in data.clean_data.
shifted_patient_record = backShiftSepsisLabel(patient_record)

shifted_patient_record.tail(10)

In [None]:
# Plan: summarize SepsisLabel counts for septic patients.
# - Take a dictionary of patient DataFrames.
# - Keep only the patients that have at least one SepsisLabel == 1 row.
# - For those patients, collect the SepsisLabel column in a list to compute the average
#   number of rows per septic patient and the average number of SepsisLabel == 1 rows
#   per septic patient.
# - Also record the minimum and maximum number of SepsisLabel == 1 rows among septic patients.

In [None]:
# Run the project's count_sepsis_labels helper over the per-patient dictionary.
# NOTE(review): presumably reports SepsisLabel count statistics for septic patients —
# confirm in data.helper_data. Any return value is shown as the cell output.
from data.helper_data import count_sepsis_labels

count_sepsis_labels(patient_dict)

In [None]:
# Encode every patient frame with the project's encode_dict_deltas helper.
# NOTE(review): the input is patient_dict as loaded, not the filled all_data / clean_df
# produced by the cells above — confirm the fill steps are meant to be skipped here.
from models.mice.encode_data import encode_dict_deltas

encoded_patient_dict = encode_dict_deltas(patient_dict)

In [None]:
# Planning notes kept as a bare string literal. It has no effect on later cells; as the
# cell's only expression it is echoed as the cell output when run.
"""
Function takes a dict of patient dataframes
Create Sliding Window feature vectors or some many of feature vectors
Add Engineered features

RunLGBM

Eval outputs

Grid Search - Machine Learning library to run 10k combos with json input

"""

In [None]:
# Show every column and every row when frames are displayed.
# NOTE: these are global pandas options, so they also affect the output of all later cells.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# Pick the first encoded patient frame that contains at least one positive SepsisLabel.
# next() with a None default makes the "no septic patient" case explicit instead of
# failing below with an AttributeError on None.
patient_record = next(
    (record for record in encoded_patient_dict.values() if record['SepsisLabel'].sum() > 0),
    None,
)
if patient_record is None:
    raise ValueError("No patient in encoded_patient_dict has a positive SepsisLabel.")

# Show the first ten rows of that patient's encoded record.
patient_record.head(10)

In [None]:
# Merge the encoded per-patient frames into one frame with the project's merge_patient_dict.
# NOTE(review): the role of the 'SepsisLabel_delta' argument is not visible from this
# notebook — confirm in models.mice.encode_data.
from models.mice.encode_data import merge_patient_dict

encoded_df = merge_patient_dict(encoded_patient_dict, 'SepsisLabel_delta')

In [None]:
# Missingness plot of the merged, encoded frame (before imputation).
plot_missingness(encoded_df, title="Encoded Missing Values")

In [None]:
# Impute the encoded frame with the project's impute_delta_features helper.
# NOTE(review): the module path suggests a MICE-style imputer — confirm in
# models.mice.encode_data; the method and its runtime are not visible from here.
from models.mice.encode_data import impute_delta_features

imputed_df = impute_delta_features(encoded_df)

In [None]:
# Preview the first ten rows of the imputed frame.
imputed_df.head(10)

In [None]:
from models.mice.encode_data import split_and_restructure

# Split the imputed frame with the project's split_and_restructure helper; the result is
# iterated by later cells as a mapping of patient id -> DataFrame.
# A bare `imputed_df.head(10)` used to precede this call. It was dropped because only a
# cell's final expression is displayed by default, so the preview was computed and discarded.
imputed_dict = split_and_restructure(imputed_df)

In [None]:
# Missingness plot of the imputed frame, to check what imputation left unfilled.
plot_missingness(imputed_df, title="Imputed Missing Values")

In [None]:
# Summary statistics for the EtCO2 column of all_data (the filled frame, not the imputed one).
all_data["EtCO2"].describe()

In [None]:
# Take the first patient frame from imputed_dict for a quick visual check.
# next(iter(...)) replaces the loop-and-break. The previous `temp = []` sentinel meant an
# empty dict ended in "'list' object has no attribute 'head'"; fail with a clear message instead.
temp = next(iter(imputed_dict.values()), None)
if temp is None:
    raise ValueError("imputed_dict is empty; there is no patient frame to preview.")

# Show up to the first 100 rows of that patient's frame.
temp.head(100)

In [None]:
from models.mice.encode_data import reconstruct_vitals
from tqdm import tqdm

# Build patient id -> reconstructed frame in one pass, with a progress bar.
# Each frame is copied before being handed to reconstruct_vitals, so the entries of
# imputed_dict are never passed to the helper directly.
reconstructed_impute_dict = {
    patient_id: reconstruct_vitals(patient_frame.copy(), max_iter=50)
    for patient_id, patient_frame in tqdm(imputed_dict.items(), desc='reconstructing')
}

In [None]:
# Train and evaluate the model on the reconstructed per-patient frames using the project's
# train_and_evaluate_lgbm helper (presumably LightGBM — confirm in models.lgbm_impl).
from models.lgbm_impl import train_and_evaluate_lgbm
print("Starting")
# NOTE(review): window=6 is an unexplained constant; its unit (rows? hours?) is defined by
# the helper — confirm and consider lifting it into a named setting.
result = train_and_evaluate_lgbm(reconstructed_impute_dict, window=6)

# Report the metrics; `result` is indexed by the keys "accuracy" and "classification_report".
print("Accuracy:", result["accuracy"])
print("\nClassification Report:\n", result["classification_report"])

In [None]:
# Take the first patient frame from reconstructed_impute_dict for a quick visual check.
# next(iter(...)) replaces the loop-and-break; an empty dict now fails with a clear
# message instead of an AttributeError on None.
patient_record = next(iter(reconstructed_impute_dict.values()), None)
if patient_record is None:
    raise ValueError("reconstructed_impute_dict is empty; there is no patient frame to preview.")

# Show up to the first 100 rows of that patient's reconstructed frame.
patient_record.head(100)