In [26]:
import numpy as np
from nltk.tokenize import WordPunctTokenizer
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense
from keras.callbacks import EarlyStopping
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
import string
import nltk

In [17]:
# Make sure the NLTK 'punkt' package is present (presumably required by nltk.word_tokenize below).
nltk.download('punkt')
# NOTE(review): absolute, machine-specific path; the notebook only runs where this file exists.
data = pd.read_csv(r'D:\\Code\\Projects\\medbuddyAPI\\datasets\\symptom_checker\\disease_sympts_prec_full.csv')
# Normalise each row: lower-case, tokenize, drop tokens found in string.punctuation,
# then re-join with single spaces so a plain .split() recovers the tokens later.
data['symptoms'] = data['symptoms'].apply(
    lambda x: ' '.join(
        word for word in nltk.word_tokenize(x.lower()) if word not in string.punctuation
    )
)
tokenized_texts = data['symptoms'].tolist()

# Collect every distinct token that appears in any row.
symptom_vocab = set()
for text in tokenized_texts:
    symptom_vocab.update(text.split())
# +1 because indices are assigned starting at 1, leaving index 0 unassigned.
symptom_vocab_size = len(symptom_vocab) + 1
# Enumerate the vocabulary in sorted order. Iterating the raw set would give a
# different token -> index mapping in every interpreter session (string hashes
# are randomised), so a re-run would no longer match a previously saved
# model or pickle.
word_to_index = {word: index for index, word in enumerate(sorted(symptom_vocab), start=1)}
# Encode every row as the list of its token indices.
sequences = [[word_to_index[word] for word in text.split()] for text in tokenized_texts]

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Rohan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [18]:
# Expand each encoded sequence into (prefix, next-token) training pairs:
# [a, b, c] contributes ([a], b) and ([a, b], c). Sequences with fewer than
# two tokens contribute nothing.
prefix_target_pairs = [
    (token_ids[:cut], token_ids[cut])
    for token_ids in sequences
    for cut in range(1, len(token_ids))
]
X = [prefix for prefix, _ in prefix_target_pairs]
y = [target for _, target in prefix_target_pairs]

In [19]:
# Longest prefix length; every input is padded to this width.
max_seq_len = max(map(len, X))
# Pad the variable-length prefixes into one 2-D array (pad_sequences defaults).
X = np.asarray(pad_sequences(X, maxlen=max_seq_len))
# One-hot encode the target indices over the full index range (0 .. vocab size).
y = np.asarray(to_categorical(y, num_classes=symptom_vocab_size))

In [22]:
# Next-token classifier: token embedding -> single LSTM layer -> softmax over
# every index in the vocabulary range.
model = Sequential([
    Embedding(symptom_vocab_size, 64, input_length=max_seq_len),
    LSTM(72),
    Dense(symptom_vocab_size, activation='softmax'),
])
# Targets are one-hot vectors, hence categorical cross-entropy.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [23]:
# Stop once val_loss has not improved for 10 consecutive epochs.
# restore_best_weights=True rolls the model back to the epoch with the best
# val_loss; without it the model keeps the weights of the final epoch, i.e.
# `patience` epochs past the best one.
early_stop = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

In [24]:
# Train for at most 100 epochs; early_stop may end training sooner.
# NOTE(review): Keras takes validation_split from the tail of the arrays
# before shuffling, so the validation set is the last 20% of (X, y) as built.
model.fit(
    X,
    y,
    validation_split=0.2,
    batch_size=32,
    epochs=100,
    callbacks=[early_stop],
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100


<keras.callbacks.History at 0x1a7208fdff0>

In [10]:
# NOTE(review): X and y are the same arrays that were passed to model.fit, so
# this figure is accuracy on the training/validation data, not on a held-out
# test set.
loss_and_metrics = model.evaluate(X, y, verbose=0)
# Entry 1 is the 'accuracy' metric requested at compile time (entry 0 is the loss).
accuracy_percent = loss_and_metrics[1] * 100
print("Model Accuracy: %.2f%%" % accuracy_percent)

Model Accuracy: 93.15%


In [28]:
import pickle

# Persist what inference-time preprocessing needs: the token -> index mapping
# and the padded input width, stored together as one tuple.
preprocessing_state = (word_to_index, max_seq_len)
with open("nextSymptomData.pickle", "wb") as pickle_file:
    pickle.dump(preprocessing_state, pickle_file)

In [25]:
new_sequence = "itching, skin_rashs, headache"
new_sequence = new_sequence.lower()
# Apply the same tokenization and punctuation filtering used on the training text.
tokenized_sequence = [word for word in nltk.word_tokenize(new_sequence) if word not in string.punctuation]
# Out-of-vocabulary tokens fall back to 0, an index no word is assigned
# (the vocabulary is enumerated from 1).
sequence_indices = [word_to_index.get(word, 0) for word in tokenized_sequence]
padded_sequence = pad_sequences([sequence_indices], maxlen=max_seq_len)

prediction = model.predict(padded_sequence)
# Choose the most probable class among real vocabulary entries only. Class 0
# has no word, so letting it win the argmax would make the reverse lookup fail.
# The +1 undoes the offset introduced by slicing off class 0.
next_symptom_index = int(np.argmax(prediction[0][1:])) + 1
# Invert the mapping once instead of scanning the keys/values lists.
index_to_word = {index: word for word, index in word_to_index.items()}
next_symptom_word = index_to_word[next_symptom_index]

print("Given the sequence: ", new_sequence)
print("The predicted next symptom is: ", next_symptom_word)

Given the sequence:  itching, skin_rashs, headache
The predicted next symptom is:  nausea


In [27]:
# Convert the trained Keras model to TensorFlow Lite.
converter = tf.lite.TFLiteConverter.from_keras_model(model)
# Allow both the built-in TFLite op set and the Select TF op set in the
# converted model.
converter.target_spec.supported_ops = [
    tf.lite.OpsSet.TFLITE_BUILTINS,
    tf.lite.OpsSet.SELECT_TF_OPS,
]
tflite_model = converter.convert()

# Write the converted model to disk in binary mode.
with open('diseasePredV1.tflite', 'wb') as tflite_file:
    tflite_file.write(tflite_model)



INFO:tensorflow:Assets written to: C:\Users\Rohan\AppData\Local\Temp\tmpg1d58gh_\assets


INFO:tensorflow:Assets written to: C:\Users\Rohan\AppData\Local\Temp\tmpg1d58gh_\assets
