In [21]:
import tensorflow as tf
import numpy as np
import pandas as pd
from glob import glob
import os

# Specify the current directory as the location for both reading and saving files
file_directory = "."  # This points to the folder containing both the notebook and CSV files

# glob makes no ordering guarantee (the order depends on the file system), so the
# paths are sorted: the position of each model inside `predictions` -- and therefore
# the meaning of any per-model weight vector -- stays the same from run to run.
# NOTE(review): every *.csv in this folder is treated as a model prediction file;
# keep unrelated CSVs (including previously written submissions) out of it.
file_paths = sorted(glob(os.path.join(file_directory, "*.csv")))
predictions = [pd.read_csv(file) for file in file_paths]

In [22]:
# Display the loaded DataFrames (one per CSV file) to check their columns and row counts
predictions

[          id                          labels
 0          1                          G56.21
 1          2                  M65.9;S83.242A
 2          3                          G56.01
 3          4                         M65.312
 4          5               S83.241A;S83.281A
 ...      ...                             ...
 99485  99486  D12.0;D12.5;K57.30;K63.5;K64.9
 99486  99487      K20.90;K29.50;K31.89;K90.0
 99487  99488        D12.2;D12.5;K64.8;Z12.11
 99488  99489             B96.81;K21.9;K29.50
 99489  99490              D12.2;D12.3;Z12.11
 
 [99490 rows x 2 columns],
           id                           labels
 0          1                           G56.21
 1          2                   M65.9;S83.242A
 2          3                           G56.01
 3          4                          M65.312
 4          5                S83.241A;S83.281A
 ...      ...                              ...
 99485  99486   D12.0;D12.5;K57.30;K63.5;K64.9
 99486  99487      K20.90;K29.50;K31.89;Z87

In [23]:
# Extract unique ICD10 codes, handling cases where labels might be missing
unique_labels = set()
for pred in predictions:
    # Files without a 'labels' column contribute nothing to the vocabulary.
    if 'labels' in pred.columns:
        # NaN rows become '' so that .str.split never sees a missing value.
        for label_list in pred['labels'].fillna('').str.split(';'):
            unique_labels.update(label_list)

# Check that at least one real code was found.
# ''.split(';') yields [''], so the empty string lands in the set whenever a row has
# no labels (or contains a stray ';'). It must not count as a label here, otherwise
# this guard could never fire for files whose label rows are all empty.
if not (unique_labels - {''}):
    raise ValueError("No labels found in any of the CSV files. Please ensure your files contain label data.")

# The vocabulary itself is left untouched (including '' if present): with
# num_oov_indices=0 the layer raises on unknown tokens, so every token the
# encoding step may look up has to stay in the vocabulary.
unique_codes = sorted(unique_labels)
lookup_layer = tf.keras.layers.StringLookup(vocabulary=unique_codes, mask_token=None, num_oov_indices=0)

In [24]:
# Helper function to convert labels back to binary array (y values), filling missing predictions with zeroes
def labels_to_binary(pred_df, lookup_layer):
    """Encode the ';'-separated label strings of ``pred_df`` as a multi-hot matrix.

    Args:
        pred_df: DataFrame whose optional 'labels' column holds strings such as
            "D12.0;K57.30". NaN entries are treated as "no labels".
        lookup_layer: object exposing ``get_vocabulary()`` (e.g. a Keras
            ``StringLookup``); the position of a code in that list is the
            column it is assigned to.

    Returns:
        Integer array of shape (len(pred_df), len(vocabulary)) with 1 where the
        row carries the corresponding code and 0 elsewhere. A frame without a
        'labels' column yields an all-zero matrix.

    Raises:
        KeyError: if a non-empty code is not part of the layer's vocabulary.
    """
    # Build the code -> column mapping once instead of calling the layer for every
    # row; the width comes from the layer itself rather than from a notebook global.
    vocabulary = list(lookup_layer.get_vocabulary())
    column_of = {code: column for column, code in enumerate(vocabulary)}

    y_matrix = np.zeros((len(pred_df), len(vocabulary)), dtype=int)
    if 'labels' not in pred_df.columns:
        # Same tolerance as the vocabulary-building step: no column, no labels.
        return y_matrix

    for i, label_str in enumerate(pred_df['labels'].fillna('')):  # Fill NaN with empty strings
        # Empty tokens (blank rows, doubled or trailing ';') are not labels.
        indices = [column_of[code] for code in label_str.split(';') if code]
        if indices:
            y_matrix[i, indices] = 1
    return y_matrix

# Encode every prediction file as a multi-hot matrix, then stack the matrices
# along a new leading axis (one slice per file).
# NOTE(review): rows are combined by position, not by 'id' -- this assumes all
# files list the same ids in the same order; confirm for new inputs.
binary_predictions = []
for pred in predictions:
    binary_predictions.append(labels_to_binary(pred, lookup_layer))
stacked_preds = np.stack(binary_predictions, axis=0)

In [25]:
# Ensemble methods
def majority_voting(preds):
    """Combine predictions by vote along axis 0.

    Each entry of ``preds`` above 0.5 counts as one vote. A position is
    labelled 1 when at least half of the entries along axis 0 voted for it
    (an exact tie therefore counts as positive), otherwise 0.

    Returns an integer array with the shape of ``preds`` minus its first axis.
    """
    votes = (preds > 0.5).astype(int)
    vote_share = votes.mean(axis=0)
    return (vote_share >= 0.5).astype(int)

def weighted_average(preds, weights):
    """Combine predictions by a weighted mean along axis 0.

    ``weights`` holds one weight per entry along axis 0 of ``preds``. A
    position is labelled 1 when its weighted mean is strictly greater than
    0.5, otherwise 0.

    Returns an integer array with the shape of ``preds`` minus its first axis.
    """
    blended = np.average(preds, weights=weights, axis=0)
    is_positive = blended > 0.5
    return is_positive.astype(int)

def max_pooling(preds):
    """Combine predictions by taking the maximum along axis 0.

    A position is labelled 1 when the largest value along axis 0 is strictly
    greater than 0.5 (for binary inputs: when any entry predicted it),
    otherwise 0.

    Returns an integer array with the shape of ``preds`` minus its first axis.
    """
    strongest = np.max(preds, axis=0)
    return np.greater(strongest, 0.5).astype(int)

def threshold_adjustment(preds, threshold=0.4):
    """Combine predictions by an unweighted mean along axis 0 and a custom cut-off.

    A position is labelled 1 when its mean along axis 0 is strictly greater
    than ``threshold`` (0.4 unless overridden), otherwise 0.

    Returns an integer array with the shape of ``preds`` minus its first axis.
    """
    mean_score = np.mean(preds, axis=0)
    exceeds_cutoff = mean_score > threshold
    return exceeds_cutoff.astype(int)

In [26]:
# Apply ensemble methods
# Uniform weights: every prediction file contributes equally to the weighted average.
n_models = len(predictions)
weights = np.ones(n_models) / n_models  # Adjust weights if needed

majority_vote_ensemble = majority_voting(stacked_preds)
weighted_avg_ensemble = weighted_average(stacked_preds, weights)
max_pooling_ensemble = max_pooling(stacked_preds)

In [27]:
# Mean-and-threshold ensemble; 0.4 is the function's default cut-off, spelled out here.
threshold_adjusted_ensemble = threshold_adjustment(stacked_preds, threshold=0.4)

In [28]:
# Sanity check: the stacking axis has been averaged away, so this is the shape of a single encoded file
weighted_avg_ensemble.shape

(99490, 1000)

In [29]:
# Prepare reverse lookup (column index -> ICD10 code string)
# NOTE: this rebinds `lookup_layer`, replacing the forward (code -> index) layer
# created earlier. Re-running the encoding cell after this one would therefore
# use the inverted layer; restart and run all cells in order to stay consistent.
lookup_layer = tf.keras.layers.StringLookup(
    vocabulary=unique_codes,
    num_oov_indices=0,
    mask_token=None,
    output_mode="int",
    invert=True,
)

In [None]:
# Convert predictions to ICD10 codes
# Column j of the ensemble matrix stands for unique_codes[j]: the vocabulary was
# built from `unique_codes` with no mask or OOV slots. Indexing a NumPy array of
# the codes gives the same result as one lookup-layer call per row, without the
# per-row overhead and without depending on which layer `lookup_layer` is bound to.
code_array = np.asarray(unique_codes, dtype=object)
predicted_indices = [np.flatnonzero(pred_row == 1) for pred_row in threshold_adjusted_ensemble]
predicted_codes = [code_array[indices].tolist() for indices in predicted_indices]
predicted_labels = [';'.join(row) for row in predicted_codes]

# Create and save submission DataFrame
# NOTE(review): ids are assumed to run 1..N in row order, as in the input files shown above.
submission_df = pd.DataFrame({
    'id': range(1, len(predicted_labels) + 1),
    'labels': predicted_labels
})

# Write to a sub-folder of the working directory instead of a user-specific
# absolute path, so the notebook runs on any machine. A sub-folder is used
# because the loading cell globs "*.csv" in `file_directory` itself and would
# otherwise pick this file up as another model on the next run.
output_dir = os.path.join(file_directory, "ensemble_output")
os.makedirs(output_dir, exist_ok=True)
submission_df.to_csv(os.path.join(output_dir, "submission_srini.csv"), index=False)

: 

In [None]:
from sklearn.metrics import classification_report

# Convert the weighted-average ensemble to ICD10 codes
# Column j stands for unique_codes[j], so the codes are read directly from a
# NumPy array of the vocabulary. This avoids one lookup-layer call per row and
# does not rely on `lookup_layer` having been rebound to the inverted layer.
code_array = np.asarray(unique_codes, dtype=object)
predicted_indices = [np.flatnonzero(pred_row == 1) for pred_row in weighted_avg_ensemble]
predicted_codes = [code_array[indices].tolist() for indices in predicted_indices]
predicted_labels = [';'.join(row) for row in predicted_codes]
