# Data input

In [3]:
import tensorflow as tf
import numpy as np
import pandas as pd
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Input
from sklearn.metrics import fbeta_score

In [4]:
import numpy as np
X = np.load('embeddings_1.npy')
X_2 = np.load('embeddings_2.npy')
X = np.concatenate((X, X_2), axis=0)

In [5]:
num_rows = len(X)           # Number of rows
num_columns = len(X[0])     # Number of columns (assuming non-empty and rectangular)
print("X dimensions:", (num_rows, num_columns)) 

X dimensions: (198982, 1024)


In [6]:
import tensorflow as tf

# Step 1: Read label data from files (assuming you have already defined this part)
label_data = []
file_names = ['icd_codes_1.txt', 'icd_codes_2.txt']  # Update with actual filenames
for file_name in file_names:
    with open(file_name, 'r') as file:
        label_data.extend(line.strip() for line in file if line.strip())

# Step 2: Create a set of unique ICD-10 codes for efficient lookup
unique_codes = set()
for labels in label_data:
    unique_codes.update(labels.split(";"))
unique_codes = sorted(unique_codes)  # Convert to a sorted list at the end

# Step 3: Initialize the StringLookup layer
lookup_layer = tf.keras.layers.StringLookup(vocabulary=unique_codes, output_mode="multi_hot", mask_token=None,num_oov_indices=0)

# Step 4: Create a tf.data.Dataset to handle large data efficiently
label_data_ds = tf.data.Dataset.from_tensor_slices(label_data)

# Step 5: Define a function to encode each label set
def encode_labels(labels):
    return lookup_layer(tf.strings.split(labels, sep=";"))

# Step 6: Map encoding function over the dataset and batch it
# Batch processing reduces memory usage
multi_hot_labels_ds = label_data_ds.map(encode_labels, num_parallel_calls=tf.data.AUTOTUNE).batch(1000)

# Step 7: Concatenate all batches to get the final `y` tensor
y = tf.concat(list(multi_hot_labels_ds), axis=0)

# Ensure the correct shape of `y`
print("Shape of y:", y.shape)  # Should output: (200000, 1400)

y = y.numpy()


Shape of y: (198982, 1400)


In [None]:
print("Shape of X:", X.shape)
print("Shape of y:", y.shape)

Shape of X: (198982, 1024)
Shape of y: (198982, 1400)


array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [8]:
from sklearn.model_selection import train_test_split


X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
print("X_test shape:", X_val.shape)
print("y_test shape:", y_val.shape)

X_train shape: (159185, 1024)
y_train shape: (159185, 1400)
X_test shape: (39797, 1024)
y_test shape: (39797, 1400)


In [None]:
import tensorflow as tf

@tf.keras.saving.register_keras_serializable()
class MicroF2Score(tf.keras.metrics.Metric):
    def __init__(self, name='micro_f2_score', beta=2, **kwargs):
        super(MicroF2Score, self).__init__(name=name, **kwargs)
        self.beta = beta
        self.tp = self.add_weight(name='tp', initializer='zeros')
        self.fp = self.add_weight(name='fp', initializer='zeros')
        self.fn = self.add_weight(name='fn', initializer='zeros')

    def update_state(self, y_true, y_pred, sample_weight=None):
        # Threshold y_pred to get binary predictions
        y_pred = tf.cast(y_pred > 0.5, tf.float32)
        
        # Cast y_true to float32 to ensure compatibility
        y_true = tf.cast(y_true, tf.float32)

        # Calculate true positives, false positives, and false negatives
        true_positive = tf.reduce_sum(y_true * y_pred)
        false_positive = tf.reduce_sum(y_pred * (1 - y_true))
        false_negative = tf.reduce_sum((1 - y_pred) * y_true)

        # Update the corresponding weights
        self.tp.assign_add(true_positive)
        self.fp.assign_add(false_positive)
        self.fn.assign_add(false_negative)

    def result(self):
        precision = self.tp / (self.tp + self.fp + tf.keras.backend.epsilon())
        recall = self.tp / (self.tp + self.fn + tf.keras.backend.epsilon())
        f_beta = (1 + self.beta**2) * (precision * recall) / (self.beta**2 * precision + recall + tf.keras.backend.epsilon())
        return f_beta

    def reset_states(self):
        self.tp.assign(0)
        self.fp.assign(0)
        self.fn.assign(0)



AttributeError: module 'tensorflow.keras' has no attribute 'saving'

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization, Input, LayerNormalization
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from tensorflow.keras.optimizers import Adam
import numpy as np
from sklearn.metrics import fbeta_score

# Enhanced model architecture with larger layers and increased dropout
def create_model(input_shape):
    model = Sequential([
        Input(shape=(input_shape,)),
        Dense(2048, activation='relu'),
        LayerNormalization(),
        Dropout(0.5),
        Dense(1024, activation='relu'),
        BatchNormalization(),
        Dropout(0.5),
        Dense(512, activation='relu'),
        BatchNormalization(),
        Dropout(0.4),
        Dense(256, activation='relu'),
        BatchNormalization(),
        Dropout(0.4),
        Dense(1400, activation='sigmoid')
    ])
    return model

# Initialize the model
model = create_model(X_train.shape[1])

# Compile model with Adam optimizer and custom focal loss
optimizer = Adam(learning_rate=0.0003)
model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=[MicroF2Score()])

# Callbacks for early stopping and learning rate reduction
early_stopping = EarlyStopping(monitor='val_loss', patience=8, restore_best_weights=True, mode='min')
model_checkpoint = ModelCheckpoint('best_model.keras', save_best_only=True, monitor='val_loss', mode='min')
lr_scheduler = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=4, mode='min')

# Train the model
history = model.fit(X_train, y_train, 
                    batch_size=32,  # Optimized batch size
                    epochs=100,
                    validation_split=0.1,
                    callbacks=[early_stopping, model_checkpoint, lr_scheduler],
                    verbose=1)


Epoch 1/100
[1m4478/4478[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 10ms/step - loss: 0.2011 - micro_f2_score: 0.0186 - val_loss: 0.0052 - val_micro_f2_score: 0.3503 - learning_rate: 3.0000e-04
Epoch 2/100
[1m4478/4478[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m49s[0m 11ms/step - loss: 0.0057 - micro_f2_score: 0.2722 - val_loss: 0.0034 - val_micro_f2_score: 0.5474 - learning_rate: 3.0000e-04
Epoch 3/100
[1m4478/4478[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 10ms/step - loss: 0.0040 - micro_f2_score: 0.4220 - val_loss: 0.0026 - val_micro_f2_score: 0.6313 - learning_rate: 3.0000e-04
Epoch 4/100
[1m4478/4478[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 10ms/step - loss: 0.0033 - micro_f2_score: 0.5189 - val_loss: 0.0023 - val_micro_f2_score: 0.6915 - learning_rate: 3.0000e-04
Epoch 5/100
[1m4478/4478[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 10ms/step - loss: 0.0029 - micro_f2_score: 0.5705 - val_loss: 0.0022 - val_micro_f2_score: 

In [None]:
# from tensorflow.keras.models import Sequential
# from tensorflow.keras.layers import Dense, Dropout, BatchNormalization, Input, LeakyReLU
# from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
# from tensorflow.keras.optimizers import Adam
# import tensorflow as tf
# import numpy as np
# from sklearn.metrics import fbeta_score

# # Define the model using the Sequential API with added complexity and batch normalization
# def create_model(input_shape):
#     model = Sequential([
#         Input(shape=(input_shape,)),  # Specify the input shape directly
#         Dense(1024, activation='relu'),
#         BatchNormalization(),
#         Dropout(0.4),  # Increased dropout rate for better regularization
#         Dense(512, activation='relu'),
#         BatchNormalization(),
#         Dropout(0.4),
#         Dense(256, activation='relu'),
#         BatchNormalization(),
#         Dropout(0.4),
#         Dense(1400, activation='sigmoid')  # Sigmoid activation for multi-label classification
#     ])
#     return model

# # Create the model
# model = create_model(X_train.shape[1])

# # Compile the model with Adam optimizer, focal loss, and custom F2Score metric
# optimizer = Adam(learning_rate=0.0005)
# model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=[MicroF2Score()])

# # EarlyStopping to monitor the validation F2 score, aiming to stop training when F2 score stops improving
# early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True, mode='min')

# # ModelCheckpoint to save the model weights only when the validation F2 score improves
# model_checkpoint = ModelCheckpoint('best_model.keras', save_best_only=True, monitor='val_loss', mode='min')

# # Learning rate scheduler to reduce the learning rate when validation loss plateaus
# lr_scheduler = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5, mode='min')

# # Fit the model on the training data
# history = model.fit(X_train, y_train, 
#                     batch_size=32,  # Increased batch size for stability
#                     epochs=70,      # Increased number of epochs
#                     validation_split=0.1,  # Use part of the training set for validation
#                     callbacks=[early_stopping, model_checkpoint, lr_scheduler],
#                     verbose=1)

# # Post-training: Optimize threshold for F2-score
# def tune_thresholds(y_true, y_pred, beta=2.0, num_thresholds=50):
#     best_thresholds = []
#     for i in range(y_true.shape[1]):
#         best_f2 = 0
#         best_threshold = 0.5
#         for threshold in np.linspace(0.1, 0.9, num_thresholds):
#             y_pred_bin = (y_pred[:, i] >= threshold).astype(int)
#             f2 = fbeta_score(y_true[:, i], y_pred_bin, beta=beta, average='micro', zero_division=1)
#             if f2 > best_f2:
#                 best_f2 = f2
#                 best_threshold = threshold
#         best_thresholds.append(best_threshold)
#     return best_thresholds

# # Get predictions on validation set
# y_val_pred = model.predict(X_val)

# # Find optimal thresholds for each label
# best_thresholds = tune_thresholds(y_val, y_val_pred)

# # Apply thresholds to validation predictions
# y_val_pred_bin = (y_val_pred >= best_thresholds).astype(int)

# # Calculate validation F2 score with optimized thresholds
# validation_f2 = fbeta_score(y_val, y_val_pred_bin, beta=2, average='micro', zero_division=1)
# print("Optimized Micro-F2 Score on validation set:", validation_f2)

# # Inference function
# def predict_with_thresholds(model, X, thresholds):
#     y_pred = model.predict(X)
#     y_pred_bin = (y_pred >= thresholds).astype(int)
#     return y_pred_bin

# # Example usage for test set
# # y_test_pred = predict_with_thresholds(model, X_test, best_thresholds)


Epoch 1/70
[1m2239/2239[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 7ms/step - loss: 0.2161 - micro_f2_score: 0.0218 - val_loss: 0.0053 - val_micro_f2_score: 0.3289 - learning_rate: 5.0000e-04
Epoch 2/70
[1m2239/2239[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 9ms/step - loss: 0.0057 - micro_f2_score: 0.3030 - val_loss: 0.0034 - val_micro_f2_score: 0.5288 - learning_rate: 5.0000e-04
Epoch 3/70
[1m2239/2239[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 12ms/step - loss: 0.0040 - micro_f2_score: 0.4505 - val_loss: 0.0025 - val_micro_f2_score: 0.6488 - learning_rate: 5.0000e-04
Epoch 4/70
[1m2239/2239[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 8ms/step - loss: 0.0032 - micro_f2_score: 0.5452 - val_loss: 0.0022 - val_micro_f2_score: 0.6946 - learning_rate: 5.0000e-04
Epoch 5/70
[1m2239/2239[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 8ms/step - loss: 0.0028 - micro_f2_score: 0.6003 - val_loss: 0.0020 - val_micro_f2_score: 0.7330 - 

## Neural Network

In [None]:
# from tensorflow.keras.models import Sequential
# from tensorflow.keras.layers import Dense, Dropout, BatchNormalization, Input, LeakyReLU
# from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
# from tensorflow.keras.optimizers import Adam
# from sklearn.metrics import fbeta_score
# import tensorflow as tf
# import numpy as np

# # Define the model using the Sequential API with added complexity and batch normalization
# model = Sequential([
#     Input(shape=(X_train.shape[1],)),  # Specify the input shape directly
#     Dense(1024, activation='relu'),
#     BatchNormalization(),
#     Dropout(0.3),
#     Dense(512, activation='relu'),
#     BatchNormalization(),
#     Dropout(0.3),
#     Dense(256, activation='relu'),
#     BatchNormalization(),
#     Dropout(0.3),
#     Dense(1400, activation='sigmoid')  # Sigmoid activation for multi-label classification
# ])

# # Compile the model with Adam optimizer and binary crossentropy loss
# optimizer = Adam(learning_rate=0.0005)

# class F2Score(tf.keras.metrics.Metric):
#     def __init__(self, name='f2_score', beta=2, **kwargs):
#         super(F2Score, self).__init__(name=name, **kwargs)
#         self.beta = beta
#         self.tp = self.add_weight(name='tp', initializer='zeros')
#         self.actual_positive = self.add_weight(name='actual_positive', initializer='zeros')
#         self.predicted_positive = self.add_weight(name='predicted_positive', initializer='zeros')

#     def update_state(self, y_true, y_pred, sample_weight=None):
#         y_true = tf.cast(y_true, tf.bool)
#         y_pred = tf.cast(y_pred > 0.5, tf.bool)  # Threshold can be adjusted

#         true_positive = tf.logical_and(tf.equal(y_true, True), tf.equal(y_pred, True))
#         self.tp.assign_add(tf.reduce_sum(tf.cast(true_positive, self.dtype)))
#         self.actual_positive.assign_add(tf.reduce_sum(tf.cast(y_true, self.dtype)))
#         self.predicted_positive.assign_add(tf.reduce_sum(tf.cast(y_pred, self.dtype)))

#     def result(self):
#         precision = self.tp / (self.predicted_positive + tf.keras.backend.epsilon())
#         recall = self.tp / (self.actual_positive + tf.keras.backend.epsilon())
#         f_beta = (1 + self.beta**2) * (precision * recall) / (self.beta**2 * precision + recall + tf.keras.backend.epsilon())
#         return f_beta

#     def reset_states(self):
#         self.tp.assign(0)
#         self.actual_positive.assign(0)
#         self.predicted_positive.assign(0)

# # Use F2Score in model compilation
# model.compile(optimizer='adam', loss='binary_crossentropy', metrics=[F2Score()])

# # EarlyStopping to monitor the validation F2 score, aiming to stop training when F2 score stops improving
# early_stopping = EarlyStopping(monitor='val_f2_score', patience=10, restore_best_weights=True, mode='max')

# # ModelCheckpoint to save the model weights only when the validation F2 score improves
# model_checkpoint = ModelCheckpoint('best_model.keras', save_best_only=True, monitor='val_f2_score', mode='max')

# # Fit the model on the training data
# history = model.fit(X_train, y_train, 
#                     batch_size=32,  # Batch size can be adjusted based on system capabilities
#                     epochs=100,      # Increased number of epochs
#                     validation_split=0.1,  # Use part of the training set for validation
#                     callbacks=[early_stopping, model_checkpoint],
#                     verbose=1)


# output


In [None]:
import tensorflow as tf
import numpy as np
import pandas as pd

# Load test data
X_test = np.load('test_data.npy')

# Predict probabilities and convert to binary
y_pred_probs = model.predict(X_test)
y_pred = (y_pred_probs > 0.5).astype(int)

# Prepare reverse lookup
lookup_layer = tf.keras.layers.StringLookup(
    vocabulary=unique_codes, invert=True, output_mode="int", mask_token=None, num_oov_indices=0
)

# Convert predictions to ICD10 codes
predicted_indices = [np.where(pred_row == 1)[0] for pred_row in y_pred]
predicted_codes = [lookup_layer(indices).numpy() for indices in predicted_indices]
predicted_codes = [[code.decode('utf-8') for code in row] for row in predicted_codes]
predicted_labels = [';'.join(row) for row in predicted_codes]

# Create and save submission DataFrame
submission_df = pd.DataFrame({
    'id': range(1, len(predicted_labels) + 1),
    'labels': predicted_labels
})
submission_df.to_csv('submission.csv', index=False)

In [57]:
import tensorflow as tf

@tf.keras.utils.register_keras_serializable()  # Register the class
class MicroF2Score(tf.keras.metrics.Metric):
    def __init__(self, name='micro_f2_score', beta=2, **kwargs):
        super(MicroF2Score, self).__init__(name=name, **kwargs)
        self.beta = beta
        self.tp = self.add_weight(name='tp', initializer='zeros')
        self.fp = self.add_weight(name='fp', initializer='zeros')
        self.fn = self.add_weight(name='fn', initializer='zeros')

    def update_state(self, y_true, y_pred, sample_weight=None):
        # Threshold y_pred to get binary predictions
        y_pred = tf.cast(y_pred > 0.5, tf.float32)
        
        # Cast y_true to float32 to ensure compatibility
        y_true = tf.cast(y_true, tf.float32)

        # Calculate true positives, false positives, and false negatives
        true_positive = tf.reduce_sum(y_true * y_pred)
        false_positive = tf.reduce_sum(y_pred * (1 - y_true))
        false_negative = tf.reduce_sum((1 - y_pred) * y_true)

        # Update the corresponding weights
        self.tp.assign_add(true_positive)
        self.fp.assign_add(false_positive)
        self.fn.assign_add(false_negative)

    def result(self):
        precision = self.tp / (self.tp + self.fp + tf.keras.backend.epsilon())
        recall = self.tp / (self.tp + self.fn + tf.keras.backend.epsilon())
        f_beta = (1 + self.beta**2) * (precision * recall) / (self.beta**2 * precision + recall + tf.keras.backend.epsilon())
        return f_beta

    def reset_states(self):
        self.tp.assign(0)
        self.fp.assign(0)
        self.fn.assign(0)


In [112]:
model = tf.keras.models.load_model('model.keras')

In [46]:
# Function to find optimal threshold for each label
def tune_thresholds(y_true, y_pred, beta=2.0, num_thresholds=50):
    best_thresholds = []
    for i in range(y_true.shape[1]):
        best_f2 = 0
        best_threshold = 0.5
        for threshold in np.linspace(0.1, 0.9, num_thresholds):
            y_pred_bin = (y_pred[:, i] >= threshold).astype(int)
            f2 = fbeta_score(y_true[:, i], y_pred_bin, beta=beta, average='micro', zero_division=1)
            if f2 > best_f2:
                best_f2 = f2
                best_threshold = threshold
        best_thresholds.append(best_threshold)
    return best_thresholds

# Get predictions on the validation set
y_val_pred = model.predict(X_val)

# Find optimal thresholds for each label
best_thresholds = tune_thresholds(y_val, y_val_pred)

# Apply optimized thresholds to validation predictions
y_val_pred_bin = np.array([(y_val_pred[:, i] >= best_thresholds[i]).astype(int) for i in range(y_val_pred.shape[1])]).T

# Calculate and print the validation F2 score with optimized thresholds
validation_f2 = fbeta_score(y_val, y_val_pred_bin, beta=2, average='micro', zero_division=1)
print("Optimized Micro-F2 Score on validation set:", validation_f2)

# Inference function with optimized thresholds for test data
def predict_with_thresholds(model, X, thresholds):
    y_pred = model.predict(X)
    y_pred_bin = np.array([(y_pred[:, i] >= thresholds[i]).astype(int) for i in range(y_pred.shape[1])]).T
    return y_pred_bin

# Example usage for test set predictions
# y_test_pred = predict_with_thresholds(model, X_test, best_thresholds)

[1m1244/1244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step
Optimized Micro-F2 Score on validation set: 0.8773769625331844


In [47]:
import tensorflow as tf
import numpy as np
import pandas as pd

# Load best model

# Load test data
X_test = np.load('test_data.npy')

# Predict probabilities and convert to binary using the tuned thresholds
y_test_pred = predict_with_thresholds(model, X_test, best_thresholds)

# Prepare reverse lookup
lookup_layer = tf.keras.layers.StringLookup(
    vocabulary=unique_codes, invert=True, output_mode="int", mask_token=None, num_oov_indices=0
)

# Convert predictions to ICD10 codes
predicted_indices = [np.where(pred_row == 1)[0] for pred_row in y_test_pred]
predicted_codes = [lookup_layer(indices).numpy() for indices in predicted_indices]
predicted_codes = [[code.decode('utf-8') for code in row] for row in predicted_codes]
predicted_labels = [';'.join(row) for row in predicted_codes]

# Create and save submission DataFrame
submission_df = pd.DataFrame({
    'id': range(1, len(predicted_labels) + 1),
    'labels': predicted_labels
})
submission_df.to_csv('submission.csv', index=False)


[1m3110/3110[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 2ms/step


In [None]:
from skmultilearn.problem_transform import ClassifierChain
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import fbeta_score
import numpy as np

# Initialize ClassifierChain with Gaussian Naive Bayes
classifier = ClassifierChain(LogisticRegression())

# Train the model
classifier.fit(X, y)



  predicted_indices = [np.where(pred_row == 1)[0] for pred_row in y_test_pred]


ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all().

In [32]:

## Predict on the test set
y_test_pred = classifier.predict(X_test)

# Convert sparse matrix to dense if necessary
y_test_pred_dense = y_test_pred.toarray() if hasattr(y_test_pred, "toarray") else y_test_pred

# Prepare reverse lookup
lookup_layer = tf.keras.layers.StringLookup(
    vocabulary=unique_codes, invert=True, output_mode="int", mask_token=None, num_oov_indices=0
)

# Convert y_test_pred_dense to ICD10 codes
predicted_indices = [np.where(pred_row == 1)[0] for pred_row in y_test_pred_dense]
predicted_codes = [lookup_layer(indices).numpy() for indices in predicted_indices]
predicted_codes = [[code.decode('utf-8') for code in row] for row in predicted_codes]
predicted_labels = [';'.join(row) for row in predicted_codes]

# Create and save submission DataFrame
submission_df = pd.DataFrame({
    'id': range(1, len(predicted_labels) + 1),
    'labels': predicted_labels
})
submission_df.to_csv('submission.csv', index=False)