In [None]:
import numpy as np
X1 = np.load('embeddings_1.npy')
X2 = np.load('embeddings_2.npy')
X_train = np.concatenate((X1, X2), axis=0)
X_test = np.load('test_data.npy')

In [3]:
num_rows = len(X_train)           # Number of rows
num_columns = len(X_train[0])     # Number of columns (assuming non-empty and rectangular)
print("X dimensions:", (num_rows, num_columns)) 

X dimensions: (198982, 1024)


In [4]:
num_rows = len(X_test)           # Number of rows
num_columns = len(X_test[0])     # Number of columns (assuming non-empty and rectangular)
print("X dimensions:", (num_rows, num_columns)) 

X dimensions: (99490, 1024)


In [5]:
import tensorflow as tf

# Step 1: Read label data from files (assuming you have already defined this part)
label_data = []
file_names = ['icd_codes_1.txt', 'icd_codes_2.txt']  # Update with actual filenames
for file_name in file_names:
    with open(file_name, 'r') as file:
        label_data.extend(line.strip() for line in file if line.strip())

# Step 2: Create a set of unique ICD-10 codes for efficient lookup
unique_codes = set()
for labels in label_data:
    unique_codes.update(labels.split(";"))
unique_codes = sorted(unique_codes)  # Convert to a sorted list at the end

# Step 3: Initialize the StringLookup layer
lookup_layer = tf.keras.layers.StringLookup(vocabulary=unique_codes, output_mode="multi_hot", mask_token=None,num_oov_indices=0)

# Step 4: Create a tf.data.Dataset to handle large data efficiently
label_data_ds = tf.data.Dataset.from_tensor_slices(label_data)

# Step 5: Define a function to encode each label set
def encode_labels(labels):
    return lookup_layer(tf.strings.split(labels, sep=";"))

# Step 6: Map encoding function over the dataset and batch it
# Batch processing reduces memory usage
multi_hot_labels_ds = label_data_ds.map(encode_labels, num_parallel_calls=tf.data.AUTOTUNE).batch(1000)

# Step 7: Concatenate all batches to get the final `y` tensor
y = tf.concat(list(multi_hot_labels_ds), axis=0)

# Ensure the correct shape of `y`
print("Shape of y:", y.shape)  # Should output: (200000, 1400)

y_train = y.numpy()


Shape of y: (198982, 1400)


In [6]:
print("Shape of X:", X_train.shape)
print("Shape of y:", y_train.shape)


Shape of X: (198982, 1024)
Shape of y: (198982, 1400)


In [7]:
# from sklearn.model_selection import train_test_split

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# print("X_train shape:", X_train.shape)
# print("y_train shape:", y_train.shape)
# print("X_test shape:", X_test.shape)
# print("y_test shape:", y_test.shape)

In [14]:
import pandas as pd
import os
import tensorflow as tf

# Define output folder for CSV files
output_folder = "label_files"
os.makedirs(output_folder, exist_ok=True)

# Save each column as a separate CSV file
for label_index in range(y_train.shape[1]):
    # Extract each column (label) and save as a binary CSV
    label_data = pd.DataFrame(y_train[:, label_index], columns=[f'label_{label_index}'])
    label_data.to_csv(f"{output_folder}/label_{label_index}.csv", index=False, header=False)

print("CSV files created in the 'label_files' folder.")


CSV files created in the 'label_files' folder.


In [None]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from joblib import Parallel, delayed
import joblib

# Folder for saving models
model_folder = "models"
os.makedirs(model_folder, exist_ok=True)

# Function to train a model for each label
def train_model_for_label(label_index):
    # Load the binary target data for this label
    y_train = pd.read_csv(f"{output_folder}/label_{label_index}.csv", header=None).values.ravel()
    
    # Initialize and train the model
    model = LogisticRegression(max_iter=100)
    model.fit(X_train, y_train)
    
    # Save the model
    model_path = f"{model_folder}/model_{label_index}.pkl"
    joblib.dump(model, model_path)
    
    return model

# Train each model in parallel
Parallel(n_jobs=8)(delayed(train_model_for_label)(i) for i in range(y_train.shape[1]))

print("Training complete. Models saved in the 'models' folder.")

Training complete. Models saved in the 'models' folder.


In [None]:
from sklearn.cluster import DBSCAN

# Initialize and fit DBSCAN
dbscan = DBSCAN(eps=0.5, min_samples=5, metric='cosine')  # cosine or other metric if data is sparse
labels = dbscan.fit_predict(X_train)

# Analyze clusters
num_clusters = len(set(labels)) - (1 if -1 in labels else 0)  # -1 represents noise
print(f"Number of clusters: {num_clusters}")


In [None]:
import tensorflow as tf
import numpy as np
import pandas as pd
import joblib
import os

num_labels = len(unique_codes)  # Total number of labels

# Initialize the StringLookup layer for reverse lookup
lookup_layer = tf.keras.layers.StringLookup(
    vocabulary=unique_codes, invert=True, output_mode="int", mask_token=None, num_oov_indices=0
)

# Folder where models are saved
model_folder = "models"

# Array to store binary predictions for each label
y_pred = np.zeros((X_test.shape[0], num_labels), dtype=int)

# Predict for each label using the corresponding model
for label_index in range(num_labels):
    # Load the saved model for this label
    model_path = f"{model_folder}/model_{label_index}.pkl"
    model = joblib.load(model_path)
    
    # Predict probabilities for the test set and convert to binary labels
    y_pred_probs = model.predict_proba(X_test)[:, 1]  # Take probabilities for the positive class
    y_pred[:, label_index] = (y_pred_probs > 0.5).astype(int)

# Convert binary predictions to ICD-10 codes
predicted_indices = [np.where(pred_row == 1)[0] for pred_row in y_pred]
predicted_codes = [lookup_layer(indices).numpy() for indices in predicted_indices]
predicted_codes = [[code.decode('utf-8') for code in row] for row in predicted_codes]

# Join ICD-10 codes with semicolons for each test instance
predicted_labels = [';'.join(row) for row in predicted_codes]

# Create the final submission DataFrame
submission_df = pd.DataFrame({
    'id': range(1, len(predicted_labels) + 1),
    'labels': predicted_labels
})

# Save to CSV file
submission_df.to_csv('submission.csv', index=False)

print("Predictions saved to 'submission.csv'.")

Predictions saved to 'submission.csv'.


In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Visualize the distribution of the number of labels per sample
num_labels_per_sample = np.sum(y, axis=1)
plt.hist(num_labels_per_sample, bins=range(1, 10))
plt.xlabel("Number of Labels")
plt.ylabel("Frequency")
plt.title("Distribution of Labels per Sample")
plt.show()

# Visualize the most common labels
label_counts = np.sum(y, axis=0)
plt.bar(range(len(label_counts)), label_counts)
plt.xlabel("Labels")
plt.ylabel("Frequency")
plt.title("Label Frequency Distribution")
plt.show()

NameError: name 'y' is not defined

In [1]:
import tensorflow as tf

print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))


Num GPUs Available:  0
