**Name**: Anirudh Rao

**Roll No.**: BE21B004



---



In [None]:
import numpy as np
import pandas as pd
import warnings

In [None]:
# Silence every warning for the rest of the session. Note that this also hides
# deprecation and convergence warnings raised by the libraries used below.
warnings.filterwarnings('ignore')

In [None]:
# Load both embedding files and stack them (first file's rows first) into a
# single array.
embeddings1, embeddings2 = (
    np.load(npy_path)
    for npy_path in (
        '/content/drive/MyDrive/da5401-2024-ml-challenge/embeddings_1.npy',
        '/content/drive/MyDrive/da5401-2024-ml-challenge/embeddings_2.npy',
    )
)
embeddings = np.concatenate((embeddings1, embeddings2))

In [None]:
def _read_code_lists(path):
    """Read the ICD code lists stored in the text file at *path*.

    Each line of the file is stripped of newline characters and split on
    ';'. The result is a list with one entry per line, each entry being the
    list of strings produced by that split.
    """
    # The `with` block closes the file on exit, so no explicit close() is
    # needed; iterating the handle avoids building an intermediate line list.
    with open(path, 'rt') as f:
        return [line.strip('\n').split(';') for line in f]


codes1 = _read_code_lists('/content/drive/MyDrive/da5401-2024-ml-challenge/icd_codes_1.txt')
codes2 = _read_code_lists('/content/drive/MyDrive/da5401-2024-ml-challenge/icd_codes_2.txt')

In [None]:
# Concatenate the two label lists in the same order in which the two
# embedding files were stacked (file 1 first, then file 2).
codes = codes1 + codes2

In [None]:
# Sorted list of every distinct ICD code that appears in any record.
all_codes = sorted({code for code_list in codes for code in code_list})

# Exploratory Data Analysis

In [None]:
# Dimensions of the stacked embedding array.
embeddings.shape

In [None]:
# Number of distinct ICD codes, i.e. the number of labels to predict.
len(all_codes)

In [None]:
# Reduce every ICD code to its first character, keeping the per-record
# list structure of `codes`.
alpha_codes = [[icd_code[0] for icd_code in record] for record in codes]

# Sorted list of the distinct first characters.
all_alpha = sorted({letter for letters in alpha_codes for letter in letters})

len(all_alpha)

In [None]:
# Show all distinct first characters on one line, separated by '-'.
print("-".join(all_alpha))

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer
# Multi-hot encode the first-letter labels and count, per letter, how many
# records carry it; the table is ordered from most to least frequent.
mlb = MultiLabelBinarizer()
alpha_codes_binarized = mlb.fit_transform(alpha_codes)

alpha_totals = np.sum(alpha_codes_binarized, axis=0)
alpha_counts = (
    pd.DataFrame(dict(zip(all_alpha, alpha_totals)), index=[0])
    .T
    .sort_values(by=0, ascending=False)
)
alpha_counts

In [None]:
# Refit the binarizer on the full ICD codes to obtain the multi-hot target
# matrix, then tabulate how often each code occurs (most frequent first).
encoded_codes = mlb.fit_transform(codes)

code_totals = np.sum(encoded_codes, axis=0)
counts = (
    pd.DataFrame(dict(zip(all_codes, code_totals)), index=[0])
    .T
    .sort_values(by=0, ascending=False)
)

In [None]:
# First ten rows of the count table, i.e. the ten most frequent codes
# (the table is sorted in descending order of count).
counts.head(10)

In [None]:
# Last ten rows of the count table, i.e. the ten least frequent codes.
counts.tail(10)

In [None]:
# Average number of occurrences per ICD code.
counts.mean()

In [None]:
# Number of ICD codes that occur exactly once.
int((counts[0] == 1).sum())

In [None]:
# Number of ICD codes that occur at most 100 times.
int((counts[0] <= 100).sum())

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Histogram (with a KDE overlay) of how often each ICD code occurs.
count_fig, count_ax = plt.subplots(dpi=150)
sns.histplot(counts[0], kde=True, ax=count_ax)
count_ax.set_xlabel("Number of occurrences of ICD code")
count_ax.set_ylabel("Frequency")
plt.show()

In [None]:
from sklearn.preprocessing import StandardScaler
# Standardise the embeddings before PCA; the scaler is fitted on the full
# embedding array.
scaler = StandardScaler()
scaled_embeddings = scaler.fit_transform(embeddings)

from sklearn.decomposition import PCA
# Project the standardised embeddings onto their first two principal
# components for 2-D visualisation.
pca = PCA(n_components=2)
pca_data = pca.fit_transform(scaled_embeddings)

In [None]:
# Fraction of the total variance captured by each of the two components.
pca.explained_variance_ratio_

In [None]:
# Scatter plot of all samples in the plane of the first two principal
# components; points are semi-transparent so dense regions stand out.
pca_fig, pca_ax = plt.subplots(dpi=150)
pca_ax.scatter(pca_data[:, 0], pca_data[:, 1], alpha=0.25)
pca_ax.set_xlabel('PC1')
pca_ax.set_ylabel('PC2')
plt.show()

In [None]:
# One PCA scatter panel per first-letter category; in each panel the points
# are coloured by whether the sample carries a code starting with that letter.
n_cols = 3
# Ceiling division so every letter gets a panel regardless of how many there
# are (a fixed 8x3 grid fails with more than 24 letters); keep at least one
# row so plt.subplots always receives a valid grid.
n_rows = max(1, -(-len(all_alpha) // n_cols))

# 2.5 inches per row reproduces the 15x20 figure for an 8-row grid.
# squeeze=False keeps `ax` two-dimensional even when there is a single row.
fig, ax = plt.subplots(n_rows, n_cols, dpi=150, figsize=(15, 2.5 * n_rows), squeeze=False)
panels = ax.ravel()

for i, alphabet in enumerate(all_alpha):
    panel = panels[i]
    panel.scatter(pca_data[:,0], pca_data[:,1], alpha = 0.25, c = alpha_codes_binarized[:,i])
    panel.set_title(alphabet)
    panel.set_xlabel('PC1')
    panel.set_ylabel('PC2')

# Hide any trailing panels that have no letter assigned to them.
for unused_panel in panels[len(all_alpha):]:
    unused_panel.set_visible(False)

plt.tight_layout()
plt.show()

In [None]:
import networkx

from itertools import combinations

# Undirected co-occurrence graph: one node per ICD code, and an edge between
# two codes whenever they appear together in a record. The edge attribute
# 'weight' holds the number of records in which the pair co-occurs.
# The keyword arguments are only stored as graph-level attributes.
code_co_occurrence = networkx.Graph(directed=False, weighted=True)

for code_list in codes:
    # De-duplicate within the record so a repeated code cannot inflate counts.
    unique_codes = sorted(set(code_list))
    # Codes that never co-occur with another code still become (isolated) nodes.
    code_co_occurrence.add_nodes_from(unique_codes)
    # Visit each unordered pair exactly once; iterating over ordered pairs
    # would count every co-occurrence twice on an undirected graph.
    for code, other_code in combinations(unique_codes, 2):
        if code_co_occurrence.has_edge(code, other_code):
            code_co_occurrence[code][other_code]['weight'] += 1
        else:
            code_co_occurrence.add_edge(code, other_code, weight=1)

In [None]:
# Number of nodes (distinct ICD codes) in the co-occurrence graph.
code_co_occurrence.number_of_nodes()

In [None]:
# Number of edges, i.e. distinct pairs of codes that co-occur at least once.
code_co_occurrence.number_of_edges()

In [None]:
# Map every code to its degree in the co-occurrence graph.
degrees = {node: degree for node, degree in code_co_occurrence.degree}

In [None]:
# Average node degree across the co-occurrence graph.
np.mean(list(degrees.values()))

In [None]:
# Five codes with the highest degree, shown as a one-column table in
# descending order of degree.
ranked_by_degree = sorted(degrees.items(), key=lambda pair: pair[1], reverse=True)
top_5_nodes = dict(ranked_by_degree[:5])

top_5_nodes_table = pd.DataFrame(top_5_nodes, index=[0]).T
top_5_nodes_table.sort_values(by=0, ascending=False)

In [None]:
# Histogram (with a KDE overlay) of the node degrees.
degree_fig, degree_ax = plt.subplots(dpi=150)
sns.histplot(list(degrees.values()), kde=True, ax=degree_ax)
degree_ax.set_xlabel("Degree")
degree_ax.set_ylabel("Frequency")
plt.show()

In [None]:
# Codes with no edges, i.e. codes that never appear alongside another code.
nodes_with_0_degree = [
    node for node, degree in degrees.items() if degree == 0
]
nodes_with_0_degree

In [None]:
# Look up the stored weight of every edge, then list the five heaviest edges
# as a one-column table in descending order of weight.
edge_weights = {
    (u, v): code_co_occurrence[u][v]['weight']
    for u, v in code_co_occurrence.edges
}

ranked_by_weight = sorted(edge_weights.items(), key=lambda pair: pair[1], reverse=True)
top_5_edges = dict(ranked_by_weight[:5])

top_5_edges_table = pd.DataFrame(top_5_edges, index=[0]).T
top_5_edges_table.sort_values(by=0, ascending=False)

In [None]:
# Draw the co-occurrence graph with nodes coloured by the first character of
# their ICD code.

# Look the colormap up once, via pyplot: matplotlib.cm.get_cmap is deprecated
# and no longer available in recent Matplotlib releases.
hsv_cmap = plt.get_cmap('hsv')

# Assign each first character the next colour along the HSV wheel, in order of
# first appearance. Dividing by 26 spaces the colours for up to 26 letters.
node_colors = {}
for node in code_co_occurrence.nodes():
    first_letter = node[0]
    if first_letter not in node_colors:
        node_colors[first_letter] = hsv_cmap(len(node_colors) / 26)

# Per-node colour list in the graph's node iteration order, as draw() expects.
node_color_list = [node_colors[node[0]] for node in code_co_occurrence.nodes()]

plt.figure(figsize=(20, 20))
# A fixed seed makes the force-directed layout reproducible between runs.
pos = networkx.spring_layout(code_co_occurrence, seed=42, k=1, iterations=100)
networkx.draw(code_co_occurrence, pos, with_labels=True, node_color=node_color_list, node_size=500, font_size=8, alpha=0.7, width=0.5)
plt.show()

# Model Selection

In [None]:
import tensorflow as tf

# Use the GPU when TensorFlow reports one (non-empty device name);
# otherwise fall back to the CPU device.
device_name = tf.test.gpu_device_name()
if not device_name:
    device_name = "/device:CPU:0"
    print("No GPU, using {}.".format(device_name))
else:
    print("Found GPU at: {}".format(device_name))

In [None]:
# Features are the raw (unscaled) embeddings; targets are the multi-hot
# encoded ICD codes, with one column per code.
X = embeddings
y = encoded_codes

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 10% of the samples for validation; the fixed random_state makes
# the split reproducible.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1, random_state=5401)

**Model 1 - Decision Tree**

In [None]:
from sklearn.multioutput import MultiOutputClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score

# One shallow, class-balanced decision tree per label column.
tree_estimator = DecisionTreeClassifier(class_weight='balanced', max_depth=2)
model1 = MultiOutputClassifier(tree_estimator)
model1.fit(X_train, y_train)

# Micro-averaged F1 on the training and validation splits.
train_f1 = f1_score(y_train, model1.predict(X_train), average='micro')
print(f"Training score: {train_f1}")
val_f1 = f1_score(y_val, model1.predict(X_val), average='micro')
print(f"Validation score: {val_f1}")

**Model 2 - Logistic Regression**

In [None]:
from sklearn.linear_model import LogisticRegression

# One class-balanced logistic regression per label column.
logreg_estimator = LogisticRegression(class_weight='balanced', max_iter=1000)
model2 = MultiOutputClassifier(logreg_estimator)
model2.fit(X_train, y_train)

# Micro-averaged F1 on the training and validation splits.
train_f1 = f1_score(y_train, model2.predict(X_train), average='micro')
print(f"Training score: {train_f1}")
val_f1 = f1_score(y_val, model2.predict(X_val), average='micro')
print(f"Validation score: {val_f1}")

**Model 3 - Single Layer Neural Network**

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.metrics import F1Score

# Model 3: one hidden ReLU layer followed by a sigmoid output unit per label,
# trained with binary cross-entropy (independent yes/no decision per code).
tf.keras.backend.clear_session()

# Take the output width from the target matrix instead of hard-coding it, so
# the network always matches the number of label columns.
n_labels = y_train.shape[1]

with tf.device(device_name):
    model3 = Sequential()
    model3.add(Dense(1024, activation='relu'))
    model3.add(Dense(n_labels, activation='sigmoid'))

    # threshold=0.5 makes the logged F1 binarise each sigmoid output
    # independently, matching the `> 0.5` rule used for evaluation. Without a
    # threshold the metric marks only the arg-max label as positive, which is
    # not meaningful for multi-label targets.
    metric = F1Score(average='micro', threshold=0.5)

    model3.compile(optimizer='adam',loss='binary_crossentropy', metrics=[metric])

# NOTE(review): early stopping tracks the training loss because fit() is given
# no validation data; it stops when training stalls, not when the model starts
# to overfit.
early_stopping = EarlyStopping(
    monitor='loss',
    patience=3,
    restore_best_weights=True,
    verbose=1
)

history = model3.fit(
    X_train, y_train,
    epochs=200,
    callbacks=[early_stopping],
    verbose=1
)

In [None]:
# Micro-averaged F1 for model 3, with sigmoid outputs binarised at 0.5.
train_f1 = f1_score(y_train, model3.predict(X_train) > 0.5, average='micro')
print(f"Training score: {train_f1}")
val_f1 = f1_score(y_val, model3.predict(X_val) > 0.5, average='micro')
print(f"Validation score: {val_f1}")

**Model 4 - Neural Network with Batch Normalization**

In [None]:
from tensorflow.keras.layers import BatchNormalization

# Model 4: same as the single-hidden-layer network, with batch normalisation
# inserted between the hidden layer and the sigmoid output layer.
tf.keras.backend.clear_session()

# Take the output width from the target matrix instead of hard-coding it, so
# the network always matches the number of label columns.
n_labels = y_train.shape[1]

with tf.device(device_name):
    model4 = Sequential()
    model4.add(Dense(1024, activation='relu'))
    model4.add(BatchNormalization())
    model4.add(Dense(n_labels, activation='sigmoid'))

    # threshold=0.5 makes the logged F1 binarise each sigmoid output
    # independently, matching the `> 0.5` rule used for evaluation. Without a
    # threshold the metric marks only the arg-max label as positive, which is
    # not meaningful for multi-label targets.
    metric = F1Score(average='micro', threshold=0.5)

    model4.compile(optimizer='adam',loss='binary_crossentropy', metrics=[metric])

# NOTE(review): early stopping tracks the training loss because fit() is given
# no validation data; it stops when training stalls, not when the model starts
# to overfit.
early_stopping = EarlyStopping(
    monitor='loss',
    patience=3,
    restore_best_weights=True,
    verbose=1
)

history = model4.fit(
    X_train, y_train,
    epochs=200,
    callbacks=[early_stopping],
    verbose=1
)

In [None]:
# Micro-averaged F1 for model 4, with sigmoid outputs binarised at 0.5.
train_f1 = f1_score(y_train, model4.predict(X_train) > 0.5, average='micro')
print(f"Training score: {train_f1}")
val_f1 = f1_score(y_val, model4.predict(X_val) > 0.5, average='micro')
print(f"Validation score: {val_f1}")

**Model 5 - Neural Network with Min-Max Scaling**

In [None]:
from sklearn.preprocessing import MinMaxScaler
# Model 5: the batch-normalised network trained on min-max scaled inputs.
# The scaler is fitted on the training split only and then applied to the
# validation split, so no validation statistics leak into training.
scaler = MinMaxScaler()
scaled_X_train = scaler.fit_transform(X_train)
scaled_X_val = scaler.transform(X_val)

tf.keras.backend.clear_session()

# Take the output width from the target matrix instead of hard-coding it, so
# the network always matches the number of label columns.
n_labels = y_train.shape[1]

with tf.device(device_name):
    model5 = Sequential()
    model5.add(Dense(1024, activation='relu'))
    model5.add(BatchNormalization())
    model5.add(Dense(n_labels, activation='sigmoid'))

    # threshold=0.5 makes the logged F1 binarise each sigmoid output
    # independently, matching the `> 0.5` rule used for evaluation. Without a
    # threshold the metric marks only the arg-max label as positive, which is
    # not meaningful for multi-label targets.
    metric = F1Score(average='micro', threshold=0.5)

    model5.compile(optimizer='adam',loss='binary_crossentropy', metrics=[metric])

# NOTE(review): early stopping tracks the training loss because fit() is given
# no validation data; it stops when training stalls, not when the model starts
# to overfit.
early_stopping = EarlyStopping(
    monitor='loss',
    patience=3,
    restore_best_weights=True,
    verbose=1
)

history = model5.fit(
    scaled_X_train, y_train,
    epochs=200,
    callbacks=[early_stopping],
    verbose=1
)

In [None]:
# Micro-averaged F1 for model 5 on the min-max scaled splits, with sigmoid
# outputs binarised at 0.5.
train_f1 = f1_score(y_train, model5.predict(scaled_X_train) > 0.5, average='micro')
print(f"Training score: {train_f1}")
val_f1 = f1_score(y_val, model5.predict(scaled_X_val) > 0.5, average='micro')
print(f"Validation score: {val_f1}")

**Model 6 - Neural Network with Additional Layer**

In [None]:
# Model 6: two hidden ReLU layers (1024 then 512 units), each followed by
# batch normalisation, ahead of the sigmoid output layer. Trained on the
# unscaled embeddings.
tf.keras.backend.clear_session()

# Take the output width from the target matrix instead of hard-coding it, so
# the network always matches the number of label columns.
n_labels = y_train.shape[1]

with tf.device(device_name):
    model6 = Sequential()
    model6.add(Dense(1024, activation='relu'))
    model6.add(BatchNormalization())
    model6.add(Dense(512, activation='relu'))
    model6.add(BatchNormalization())
    model6.add(Dense(n_labels, activation='sigmoid'))

    # threshold=0.5 makes the logged F1 binarise each sigmoid output
    # independently, matching the `> 0.5` rule used for evaluation. Without a
    # threshold the metric marks only the arg-max label as positive, which is
    # not meaningful for multi-label targets.
    metric = F1Score(average='micro', threshold=0.5)

    model6.compile(optimizer='adam',loss='binary_crossentropy', metrics=[metric])

# NOTE(review): early stopping tracks the training loss because fit() is given
# no validation data; it stops when training stalls, not when the model starts
# to overfit.
early_stopping = EarlyStopping(
    monitor='loss',
    patience=3,
    restore_best_weights=True,
    verbose=1
)

history = model6.fit(
    X_train, y_train,
    epochs=200,
    callbacks=[early_stopping],
    verbose=1
)

In [None]:
# Micro-averaged F1 for model 6, with sigmoid outputs binarised at 0.5.
train_f1 = f1_score(y_train, model6.predict(X_train) > 0.5, average='micro')
print(f"Training score: {train_f1}")
val_f1 = f1_score(y_val, model6.predict(X_val) > 0.5, average='micro')
print(f"Validation score: {val_f1}")

# Final Prediction

In [None]:
# Load the embeddings of the test set for which predictions are submitted.
test_embeddings = np.load('/content/drive/MyDrive/da5401-2024-ml-challenge/test_data.npy')

In [None]:
# Per-label sigmoid outputs from model 4 (the batch-normalised network
# trained on unscaled embeddings), so the test embeddings are passed unscaled.
test_predictions = model4.predict(test_embeddings)

In [None]:
# Binarise the outputs at 0.5, the same rule used when scoring on validation.
binary_test_predictions = (test_predictions > 0.5).astype(int)

In [None]:
# Turn each binary prediction row back into a ';'-separated string of ICD
# codes. Iterates over `binary_test_predictions`, the binarised test output;
# the previously referenced name `binary_predictions` is never defined.
# Assumes `all_codes` is in the same order as the label columns the model was
# trained on.
codes_predicted = [
    ";".join(sorted(code for code, flag in zip(all_codes, prediction) if flag == 1))
    for prediction in binary_test_predictions
]

In [None]:
# Write the submission file: a 1-based `id` column followed by the predicted
# `labels` string for each test sample.
submission = pd.DataFrame(
    {
        'id': range(1, len(codes_predicted) + 1),
        'labels': codes_predicted,
    }
)
submission.to_csv('be21b004_prediction.csv', index=False)