In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Load the CSV file
def load_data(csv_path):
    df = pd.read_csv(csv_path)
    texts = df['text'].astype(str).tolist()
    labels = df['label'].astype(str).tolist()
    return texts, labels

# Preprocess the data
def preprocess_data(texts, labels, num_words=10000, max_len=100):
    tokenizer = Tokenizer(num_words=num_words, oov_token="<OOV>")
    tokenizer.fit_on_texts(texts)
    sequences = tokenizer.texts_to_sequences(texts)
    padded_sequences = pad_sequences(sequences, maxlen=max_len, padding='post')

    label_encoder = LabelEncoder()
    encoded_labels = label_encoder.fit_transform(labels)

    return padded_sequences, np.array(encoded_labels), tokenizer, label_encoder


In [4]:
csv_path = '/content/medical_data_large.csv'
texts, labels = load_data(csv_path)

X, y, tokenizer, label_encoder = preprocess_data(texts, labels)

print("Texts:", texts[:2])
print("Encoded Labels:", y[:2])
print("Vocabulary Size:", len(tokenizer.word_index))
print("Classes:", label_encoder.classes_)


Texts: ['Experiencing shortness of breath', 'WBC count is abnormally high']
Encoded Labels: [0 1]
Vocabulary Size: 110
Classes: ['symptom' 'test' 'treatment']


In [5]:
#Model Training
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GlobalAveragePooling1D, Dense

model = Sequential([
    Embedding(input_dim=10000, output_dim=16, input_length=100),
    GlobalAveragePooling1D(),
    Dense(16, activation='relu'),
    Dense(len(label_encoder.classes_), activation='softmax')
])

model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(X, y, epochs=100, validation_split=0.2)


Epoch 1/100




[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 54ms/step - accuracy: 0.3882 - loss: 1.0980 - val_accuracy: 0.2619 - val_loss: 1.1053
Epoch 2/100
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step - accuracy: 0.3545 - loss: 1.0971 - val_accuracy: 0.2619 - val_loss: 1.1046
Epoch 3/100
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - accuracy: 0.3524 - loss: 1.0982 - val_accuracy: 0.3333 - val_loss: 1.1043
Epoch 4/100
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step - accuracy: 0.4016 - loss: 1.0967 - val_accuracy: 0.2619 - val_loss: 1.1016
Epoch 5/100
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step - accuracy: 0.3520 - loss: 1.0973 - val_accuracy: 0.2619 - val_loss: 1.0992
Epoch 6/100
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step - accuracy: 0.3497 - loss: 1.0955 - val_accuracy: 0.2619 - val_loss: 1.0991
Epoch 7/100
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0

<keras.src.callbacks.history.History at 0x7a8fe9add4d0>

In [6]:
#Save
import pickle

with open('tokenizer.pkl', 'wb') as f:
    pickle.dump(tokenizer, f)

with open('label_encoder.pkl', 'wb') as f:
    pickle.dump(label_encoder, f)


In [7]:
model.save("Medical text Classifier.h5")



In [8]:
import tensorflow as tf
import pickle
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Load the saved model
model = tf.keras.models.load_model("Medical text Classifier.h5")

# Load the tokenizer and label encoder
with open('tokenizer.pkl', 'rb') as f:
    tokenizer = pickle.load(f)

with open('label_encoder.pkl', 'rb') as f:
    label_encoder = pickle.load(f)

# Define a function to predict the class of new text
def predict_medical_class(text, model, tokenizer, label_encoder, max_len=100):
    # Preprocess the new text
    sequence = tokenizer.texts_to_sequences([text])
    padded_sequence = pad_sequences(sequence, maxlen=max_len, padding='post')

    # Make the prediction
    prediction = model.predict(padded_sequence)
    predicted_class_index = tf.argmax(prediction, axis=1).numpy()[0]
    predicted_class = label_encoder.inverse_transform([predicted_class_index])[0]

    return predicted_class

# Example usage:
new_text = "Patient is experiencing chest pain and shortness of breath."
predicted_class = predict_medical_class(new_text, model, tokenizer, label_encoder)
print(f"The predicted medical class for the text '{new_text}' is: {predicted_class}")




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 111ms/step
The predicted medical class for the text 'Patient is experiencing chest pain and shortness of breath.' is: symptom
