<a href="https://colab.research.google.com/github/salman7636/Clinical-Notes-Classification/blob/main/notes.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
# Deep Learning Clinical Notes Classification
# Name: Salman Firdous

import pandas as pd
import numpy as np
import re
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional, Dropout
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
import joblib


# 1. Load dataset

# Read the clinical-notes dataset from the working directory. Later steps
# use its "text" and "label" columns.
dataset_path = "clinical_notes_final_dataset.csv"
data = pd.read_csv(dataset_path)



# 2. Text cleaning


def clean_text(text):
    """Normalise a raw note into lowercase, letters-only, single-spaced text.

    Args:
        text: The raw note. Non-string values are converted with ``str()``.
            ``None`` and float NaN (e.g. an empty cell in a loaded table)
            are treated as an empty note.

    Returns:
        The cleaned string: lowercased, every character that is not an
        ASCII letter or whitespace replaced by a space, runs of whitespace
        collapsed to a single space, and leading/trailing whitespace removed.
    """
    # Without this guard, str(None) / str(nan) would turn a missing note into
    # the literal words "none" / "nan" and feed them on as real tokens.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    text = str(text).lower()
    # The text is already lowercase, so a-z covers every letter that is kept.
    text = re.sub(r'[^a-z\s]', ' ', text)
    return re.sub(r'\s+', ' ', text).strip()

# Store a cleaned copy of every note next to the raw "text" column.
data["clean_text"] = data["text"].map(clean_text)



# 3. Label Encoding


# Fit the encoder on the label column, then store the integer code of each row.
label_encoder = LabelEncoder().fit(data["label"])
data["encoded_label"] = label_encoder.transform(data["label"])

# Write the fitted encoder to disk (presumably for reuse outside this
# notebook; nothing below reads the file back).
joblib.dump(label_encoder, "label_encoder.pkl")

# Report how many rows fall into each encoded class.
class_counts = data["encoded_label"].value_counts()
print("\nClass Distribution:")
print(class_counts)


# 4. Train-Test Split


# Features are the cleaned notes; targets are the integer-encoded labels.
X, y = data["clean_text"], data["encoded_label"]

# Hold out 30% of the rows for testing. The split is stratified on the label
# and uses a fixed seed so it is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=42, test_size=0.3
)

train_count, test_count = len(X_train), len(X_test)
print("\nTrain size:", train_count)
print("Test size:", test_count)


# 5. Tokenization


# Cap on the tokenizer vocabulary; also the upper bound for the embedding size.
vocab_limit = 5000

# The vocabulary is fitted on the training notes only, so the test notes
# stay unseen until evaluation.
tokenizer = Tokenizer(num_words=vocab_limit, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)

# Write the fitted tokenizer to disk alongside the label encoder.
joblib.dump(tokenizer, "tokenizer.pkl")

# +1 accounts for index 0, which the word index never assigns to a word.
vocab_size = min(vocab_limit, len(tokenizer.word_index) + 1)

# Every sequence is padded (at the end) or cut to this many tokens.
max_len = 80

X_train_seq, X_test_seq = (
    tokenizer.texts_to_sequences(notes) for notes in (X_train, X_test)
)
X_train_pad, X_test_pad = (
    pad_sequences(seqs, maxlen=max_len, padding="post")
    for seqs in (X_train_seq, X_test_seq)
)

# 6. LSTM Model


# Assemble the classifier one layer at a time:
# embedding -> bidirectional LSTM -> dropout -> dense ReLU -> dropout -> softmax.
model = Sequential()
model.add(Embedding(vocab_size, 128))
model.add(Bidirectional(LSTM(64)))
model.add(Dropout(0.3))
model.add(Dense(64, activation="relu"))
model.add(Dropout(0.3))

# One output unit per class known to the fitted label encoder.
num_classes = len(label_encoder.classes_)
model.add(Dense(num_classes, activation="softmax"))

# The sparse loss matches the integer-encoded (not one-hot) targets.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

model.summary()



# 7. Train model


# Train for 25 epochs in mini-batches of 4. A fifth of the training rows is
# held out by Keras as a validation set and reported after each epoch.
history = model.fit(
    x=X_train_pad,
    y=y_train,
    batch_size=4,
    epochs=25,
    validation_split=0.2,
    verbose=1,
)


# 8. Evaluation


# Predict class probabilities for the held-out notes and keep the most
# likely class index for each row.
y_pred = np.argmax(model.predict(X_test_pad), axis=1)

print("\nAccuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n")

# Label the report rows with the original class names instead of the opaque
# encoded integers. Passing `labels` explicitly keeps `target_names` aligned
# even if some class never appears in y_test or y_pred.
class_ids = np.arange(len(label_encoder.classes_))
class_names = [str(name) for name in label_encoder.classes_]
print(
    classification_report(
        y_test,
        y_pred,
        labels=class_ids,
        target_names=class_names,
        zero_division=0,
    )
)


# 9. Save model (NEW FORMAT)


# Write the trained model to a single file in the native ".keras" format.
model_path = "clinical_notes_lstm_model.keras"
model.save(model_path)
print("\nModel saved successfully as clinical_notes_lstm_model.keras")


# 10. Prediction Function


def predict_note(text):
    """Classify a single clinical note and return its label.

    The note goes through the same steps as the training data: cleaning with
    ``clean_text``, tokenising with the fitted ``tokenizer``, and padding to
    ``max_len``. The class with the highest predicted score is mapped back
    to its original label through ``label_encoder``.

    Args:
        text: The raw note to classify.

    Returns:
        The predicted label, as stored in the fitted label encoder.
    """
    token_ids = tokenizer.texts_to_sequences([clean_text(text)])
    padded = pad_sequences(token_ids, maxlen=max_len, padding='post')
    scores = model.predict(padded)
    # The batch holds a single note, so the flat argmax is its class index.
    best_class = np.argmax(scores)
    return label_encoder.inverse_transform([best_class])[0]

# Example test
# Run one hand-written note through the full prediction path as a smoke test.
example = "patient advised insulin for diabetes management"
predicted_label = predict_note(example)
print("\nExample Prediction:", predicted_label)



Class Distribution:
encoded_label
0    100
1    100
3    100
4    100
2    100
Name: count, dtype: int64

Train size: 350
Test size: 150


Epoch 1/25
[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 60ms/step - accuracy: 0.2564 - loss: 1.6092 - val_accuracy: 0.4714 - val_loss: 1.5598
Epoch 2/25
[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 65ms/step - accuracy: 0.5114 - loss: 1.4831 - val_accuracy: 0.9143 - val_loss: 0.6408
Epoch 3/25
[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 49ms/step - accuracy: 0.8646 - loss: 0.4909 - val_accuracy: 1.0000 - val_loss: 0.0305
Epoch 4/25
[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 77ms/step - accuracy: 1.0000 - loss: 0.0334 - val_accuracy: 1.0000 - val_loss: 0.0067
Epoch 5/25
[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 50ms/step - accuracy: 1.0000 - loss: 0.0235 - val_accuracy: 1.0000 - val_loss: 0.0024
Epoch 6/25
[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 51ms/step - accuracy: 1.0000 - loss: 0.0064 - val_accuracy: 1.0000 - val_loss: 0.0013
Epoch 7/25
[1m70/70[0m [32m━━━━