In [1]:
import numpy as np
# Load both embedding shards as float32 and stack them row-wise into the
# training matrix; the test matrix is loaded with the same dtype.
X1, X2 = (
    np.load(shard_path).astype(np.float32)
    for shard_path in ('embeddings_1.npy', 'embeddings_2.npy')
)
X_train = np.concatenate((X1, X2), axis=0)
X_test = np.load('test_data.npy').astype(np.float32)

In [2]:
# Report the training matrix size as (rows, columns).
# X_train[0] requires at least one row; a rectangular matrix is assumed.
num_rows, num_columns = len(X_train), len(X_train[0])
print("X dimensions:", (num_rows, num_columns))

X dimensions: (198982, 1024)


In [3]:
# Report the test matrix size as (rows, columns).
# X_test[0] requires at least one row; a rectangular matrix is assumed.
num_rows, num_columns = len(X_test), len(X_test[0])
print("X dimensions:", (num_rows, num_columns))

X dimensions: (99490, 1024)


In [4]:
import tensorflow as tf

# Step 1: Read one ';'-separated label string per sample from the ICD code files.
# Blank lines are skipped, so a sample with no codes would shift every later
# label up by one row; the alignment check at the end of this cell catches that.
label_data = []
file_names = ['icd_codes_1.txt', 'icd_codes_2.txt']  # Update with actual filenames
for file_name in file_names:
    with open(file_name, 'r') as file:
        label_data.extend(line.strip() for line in file if line.strip())

# Step 2: Create a set of unique ICD-10 codes for efficient lookup
unique_codes = set()
for labels in label_data:
    unique_codes.update(labels.split(";"))
# Sorting fixes the vocabulary order, and therefore the column order of `y`.
unique_codes = sorted(unique_codes)

# Step 3: Initialize the StringLookup layer.
# With mask_token=None and num_oov_indices=0 the output has exactly one
# column per vocabulary entry.
lookup_layer = tf.keras.layers.StringLookup(vocabulary=unique_codes, output_mode="multi_hot", mask_token=None, num_oov_indices=0)

# Step 4: Create a tf.data.Dataset to handle large data efficiently
label_data_ds = tf.data.Dataset.from_tensor_slices(label_data)

# Step 5: Define a function to encode each label set
def encode_labels(labels):
    """Turn one ';'-separated label string into a multi-hot vector.

    Args:
        labels: Scalar string tensor holding the codes of a single sample.

    Returns:
        The result of applying `lookup_layer` to the split codes.
    """
    return lookup_layer(tf.strings.split(labels, sep=";"))

# Step 6: Map encoding function over the dataset and batch it
# Batch processing reduces memory usage
multi_hot_labels_ds = label_data_ds.map(encode_labels, num_parallel_calls=tf.data.AUTOTUNE).batch(1000)

# Step 7: Concatenate all batches to get the final `y` tensor
y = tf.concat(list(multi_hot_labels_ds), axis=0)

# Ensure the correct shape of `y`: (number of samples, number of unique codes)
print("Shape of y:", y.shape)

# Fail fast if labels and features are no longer aligned row-for-row.
assert y.shape[0] == X_train.shape[0], (
    f"label rows ({y.shape[0]}) do not match feature rows ({X_train.shape[0]})"
)
assert y.shape[1] == len(unique_codes), (
    f"label columns ({y.shape[1]}) do not match vocabulary size ({len(unique_codes)})"
)

y_train = y.numpy()


Shape of y: (198982, 1400)


In [5]:
# Show feature and label matrix shapes side by side; the row counts should agree.
print(f"Shape of X: {X_train.shape}")
print(f"Shape of y: {y_train.shape}")


Shape of X: (198982, 1024)
Shape of y: (198982, 1400)


In [7]:
# from sklearn.model_selection import train_test_split

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# print("X_train shape:", X_train.shape)
# print("y_train shape:", y_train.shape)
# print("X_test shape:", X_test.shape)
# print("y_test shape:", y_test.shape)

In [6]:
from sklearn.preprocessing import StandardScaler

# Standardize the features: the scaler's statistics are learned from the
# training rows only and then reused unchanged for the test rows.
# NOTE(review): the later cells visible in this notebook train and predict on
# the unscaled X_train / X_test; confirm whether the scaled matrices are used.
scaler = StandardScaler()
X_train_scaled = scaler.fit(X_train).transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [7]:
# Display the element dtype of the test matrix (it was cast with
# astype(np.float32) when loaded).
X_test.dtype

dtype('float32')

In [14]:
import pandas as pd
import os
import tensorflow as tf

# Define output folder for CSV files
output_folder = "label_files"
os.makedirs(output_folder, exist_ok=True)

# Save each label column of y_train as its own CSV file, with no header and
# no index, so a single label can later be loaded without the full matrix.
# The per-column frame has its own name so that the notebook-level
# `label_data` list of raw label strings is not overwritten with a DataFrame.
for label_index in range(y_train.shape[1]):
    label_column = pd.DataFrame(y_train[:, label_index], columns=[f'label_{label_index}'])
    label_column.to_csv(f"{output_folder}/label_{label_index}.csv", index=False, header=False)

print("CSV files created in the 'label_files' folder.")


CSV files created in the 'label_files' folder.


In [12]:
import numpy as np
import pandas as pd
import lightgbm as lgb
from joblib import Parallel, delayed
import joblib
import os

# Folder paths
output_folder = "label_files"
model_folder = "models"
os.makedirs(model_folder, exist_ok=True)

# Define the function for training a LightGBM model for each label
def train_lightgbm_for_label(label_index):
    """Fit and save a binary LightGBM classifier for one label column.

    Args:
        label_index: Index of the label; selects
            ``{output_folder}/label_{label_index}.csv`` as the target vector.

    Returns:
        The fitted ``lgb.LGBMClassifier``. The same model is also written to
        ``{model_folder}/lgb_model_{label_index}.pkl``.
    """
    # Load the binary target data for this label. The local name is kept
    # distinct from the notebook-level `y_train` label matrix.
    label_targets = pd.read_csv(f"{output_folder}/label_{label_index}.csv", header=None).values.ravel()

    # Initialize and train the LightGBM model.
    # n_jobs=1: the joblib driver below already runs 16 fits at once, so each
    # fit is kept single-threaded to avoid oversubscribing the CPU.
    print(f"Training LightGBM model for label {label_index}")
    model = lgb.LGBMClassifier(n_estimators=100, learning_rate=0.05, n_jobs=1)
    model.fit(X_train, label_targets)

    # Save the model
    model_path = f"{model_folder}/lgb_model_{label_index}.pkl"
    joblib.dump(model, model_path)
    print(f"Model for label {label_index} saved at {model_path}")

    return model

# Train each label model in parallel (one task per label column)
print("Starting training of LightGBM models...")
Parallel(n_jobs=16)(delayed(train_lightgbm_for_label)(i) for i in range(y_train.shape[1]))
print("Training complete. LightGBM models saved in the 'models' folder.")


Starting training of LightGBM models...
Training complete. LightGBM models saved in the 'models' folder.


In [10]:
import os

# Sanity check: count the entries in the per-label CSV folder.
# Every directory entry is counted, not only .csv files.
output_folder = "label_files"
label_files = os.listdir(output_folder)
print("Number of label files found: {}".format(len(label_files)))


Number of label files found: 1400


In [11]:
# Confirm the training matrix shape before fitting.
print(f"Shape of X_train: {X_train.shape}")


Shape of X_train: (198982, 1024)


In [17]:
import tensorflow as tf
import numpy as np
import pandas as pd
import joblib
import os
import lightgbm as lgb


num_labels = len(unique_codes)

# Folder where models are saved
model_folder = "models"

# Ensemble settings, named here so they can be tuned in one place.
LGB_WEIGHT = 0.3           # weight of the LightGBM probability when both models exist
LR_WEIGHT = 0.7            # weight of the second model's probability when both exist
ENSEMBLE_THRESHOLD = 0.7   # a code is predicted only when its probability exceeds this

# Array to store binary predictions for each label.
# Columns with no saved model stay all-zero, so that code is never predicted.
y_pred = np.zeros((X_test.shape[0], num_labels), dtype=int)

# Predict for each label using the corresponding model
print("Starting predictions for each label...")
for label_index in range(num_labels):
    print(f"Processing predictions for label {label_index}")

    # Initialize predictions from both models if available
    lgb_pred = None
    lr_pred = None

    # Check if the LightGBM model exists
    lgb_model_path = f"{model_folder}/lgb_model_{label_index}.pkl"
    if os.path.exists(lgb_model_path):
        print(f"Loading LightGBM model for label {label_index}")
        lgb_model = joblib.load(lgb_model_path)
        lgb_pred = lgb_model.predict_proba(X_test)[:, 1]  # Positive class probabilities

    # Check if the Logistic Regression model exists.
    # NOTE(review): model_{i}.pkl files are not produced by any cell visible
    # here; confirm they were trained on the same (unscaled) features.
    lr_model_path = f"{model_folder}/model_{label_index}.pkl"
    if os.path.exists(lr_model_path):
        print(f"Loading Logistic Regression model for label {label_index}")
        lr_model = joblib.load(lr_model_path)
        lr_pred = lr_model.predict_proba(X_test)[:, 1]

    # Combine predictions (weighted average if both models are available)
    if lgb_pred is not None and lr_pred is not None:
        y_pred_probs = LGB_WEIGHT * lgb_pred + LR_WEIGHT * lr_pred
    elif lgb_pred is not None:
        y_pred_probs = lgb_pred
    elif lr_pred is not None:
        y_pred_probs = lr_pred
    else:
        print(f"No model found for label {label_index}, skipping...")
        continue

    # Apply threshold to get binary predictions
    y_pred[:, label_index] = (y_pred_probs > ENSEMBLE_THRESHOLD).astype(int)
    print(f"Predictions complete for label {label_index}")

# Convert binary predictions to ICD-10 codes.
# Column i of y_pred stands for unique_codes[i], so the sorted vocabulary is
# indexed directly. This avoids one Keras layer call per test row and leaves
# the notebook-level `lookup_layer` (used by `encode_labels`) untouched.
print("Converting binary predictions to ICD-10 codes...")
predicted_indices = [np.where(pred_row == 1)[0] for pred_row in y_pred]
predicted_codes = [[unique_codes[i] for i in indices] for indices in predicted_indices]

# Join ICD-10 codes with semicolons for each test instance
# (rows with no predicted code become an empty string)
predicted_labels = [';'.join(row) for row in predicted_codes]

# Create the final submission DataFrame; ids are 1-based row positions
print("Creating submission DataFrame...")
submission_df = pd.DataFrame({
    'id': range(1, len(predicted_labels) + 1),
    'labels': predicted_labels
})

# Save to CSV file
submission_df.to_csv('submission.csv', index=False)
print("Predictions saved to 'submission.csv'.")

Starting predictions for each label...
Processing predictions for label 0
Loading LightGBM model for label 0
Loading Logistic Regression model for label 0
Predictions complete for label 0
Processing predictions for label 1
Loading LightGBM model for label 1
Loading Logistic Regression model for label 1
Predictions complete for label 1
Processing predictions for label 2
Loading LightGBM model for label 2
Loading Logistic Regression model for label 2
Predictions complete for label 2
Processing predictions for label 3
Loading LightGBM model for label 3
Loading Logistic Regression model for label 3
Predictions complete for label 3
Processing predictions for label 4
Loading LightGBM model for label 4
Loading Logistic Regression model for label 4
Predictions complete for label 4
Processing predictions for label 5
Loading LightGBM model for label 5
Loading Logistic Regression model for label 5
Predictions complete for label 5
Processing predictions for label 6
Loading LightGBM model for label

In [16]:
import tensorflow as tf
import numpy as np
import pandas as pd
import joblib
import os
import lightgbm as lgb


num_labels = len(unique_codes)

# Folder where models are saved
model_folder = "models"

# Array to store binary predictions for each label.
# Columns with no saved model stay all-zero, so that code is never predicted.
y_pred = np.zeros((X_test.shape[0], num_labels), dtype=int)

# Prediction threshold: a code is predicted only when its positive-class
# probability exceeds this value. 0.9 is a strict cutoff; lower it if too few
# codes are predicted.
threshold = 0.9

# Predict for each label using the corresponding LightGBM model
print("Starting predictions for each label with LightGBM...")
for label_index in range(num_labels):
    print(f"Processing predictions for label {label_index}")

    # Check if the LightGBM model exists
    lgb_model_path = f"{model_folder}/lgb_model_{label_index}.pkl"
    if os.path.exists(lgb_model_path):
        # Load LightGBM model and predict
        lgb_model = joblib.load(lgb_model_path)
        y_pred_probs = lgb_model.predict_proba(X_test)[:, 1]  # Positive class probabilities

        # Apply threshold to get binary predictions
        y_pred[:, label_index] = (y_pred_probs > threshold).astype(int)
        print(f"Predictions for label {label_index}, sample probabilities: {y_pred_probs[:5]}")
    else:
        print(f"No LightGBM model found for label {label_index}, skipping...")

# Convert binary predictions to ICD-10 codes.
# Column i of y_pred stands for unique_codes[i], so the sorted vocabulary is
# indexed directly. This avoids one Keras layer call per test row and leaves
# the notebook-level `lookup_layer` (used by `encode_labels`) untouched.
print("Converting binary predictions to ICD-10 codes...")
predicted_indices = [np.where(pred_row == 1)[0] for pred_row in y_pred]
predicted_codes = [[unique_codes[i] for i in indices] for indices in predicted_indices]

# Join ICD-10 codes with semicolons for each test instance
# (rows with no predicted code become an empty string)
predicted_labels = [';'.join(row) for row in predicted_codes]

# Create the final submission DataFrame; ids are 1-based row positions
submission_df = pd.DataFrame({
    'id': range(1, len(predicted_labels) + 1),
    'labels': predicted_labels
})

# Debug: Print sample output for verification
print("Sample predictions for verification:")
print(submission_df.head(10))

# Save to CSV file
submission_df.to_csv('submission.csv', index=False)
print("Predictions saved to 'submission.csv'.")


Starting predictions for each label with LightGBM...
Processing predictions for label 0
Predictions for label 0, sample probabilities: [4.05975585e-06 3.74206549e-06 3.90012826e-06 2.66988883e-06
 3.94098671e-06]
Processing predictions for label 1
Predictions for label 1, sample probabilities: [5.54523394e-09 5.52302354e-09 6.83318921e-27 2.08267266e-52
 1.07055666e-08]
Processing predictions for label 2
Predictions for label 2, sample probabilities: [3.11424900e-06 1.87158933e-06 2.65943820e-06 2.22688355e-06
 2.19524361e-06]
Processing predictions for label 3
Predictions for label 3, sample probabilities: [3.67984454e-07 3.67984454e-07 3.65500170e-07 0.00000000e+00
 3.67984454e-07]
Processing predictions for label 4
Predictions for label 4, sample probabilities: [3.70006763e-178 0.00000000e+000 0.00000000e+000 0.00000000e+000
 0.00000000e+000]
Processing predictions for label 5
Predictions for label 5, sample probabilities: [4.83166431e-06 3.67628732e-06 2.66949570e-06 2.65575679e-06