### Defining helper functions

In [None]:
import re
import nltk
nltk.download('punkt')
nltk.download('stopwords')

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

def introduce():
    print("Hello there! I'm here to help you with your health queries. Type bye to end conversation.")

def get_user_input():
    return input("You: ")

def preprocess_text(text):
    # Convert text to lowercase
    text = text.lower()

    # Remove special characters and punctuation
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)

    # Tokenize the text
    tokens = word_tokenize(text)

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]

    return tokens

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


### Importing Necessary Libraries

In [None]:
import pandas as pd
import numpy as np
import string, os
import warnings
warnings.filterwarnings("ignore")
warnings.simplefilter(action='ignore', category=FutureWarning)

In [None]:
# set seeds for reproducability
import tensorflow as tf

# Set random seed
tf.random.set_seed(42)

# keras module for building LSTM
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, LSTM, Dense, Dropout
from keras.preprocessing.text import Tokenizer
from keras.callbacks import EarlyStopping
from keras.models import Sequential
import keras.utils as ku

### Loading Webscrapped Data

In [None]:
# Function to load text data from file
def load_text_data(bucket_file):
    with open(bucket_file, 'r', encoding='utf-8') as file:
        text_data = file.readlines()
    return text_data

# Load text data for each bucket
diabetes_data = load_text_data('diabetes.txt')
bp_data = load_text_data('bp.txt')
heart_data = load_text_data('heart.txt')

In [None]:
len(diabetes_data)

76

In [None]:
diabetes_data

['Diabetes is a common condition that affects people of all ages. There are several forms of diabetes. Type 2 is the most common. A combination of treatment strategies can help you manage the condition to live a healthy life and prevent complications.\n',
 'Diabetes is a condition that happens when your blood sugar (glucose) is too high. It develops when your pancreas doesn’t make enough insulin or any at all, or when your body isn’t responding to the effects of insulin properly. Diabetes affects people of all ages. Most forms of diabetes are chronic (lifelong), and all forms are manageable with medications and/or lifestyle changes.\n',
 'Glucose (sugar) mainly comes from carbohydrates in your food and drinks. It’s your body’s go-to source of energy. Your blood carries glucose to all your body’s cells to use for energy.\n',
 'When glucose is in your bloodstream, it needs help — a “key” — to reach its final destination. This key is insulin (a hormone). If your pancreas isn’t making enou

In [None]:
len(bp_data)

34

In [None]:
bp_data

['Blood pressure is the pressure of blood pushing against the walls of your arteries. Arteries carry blood from your heart to other parts of your body. \n',
 'Your blood pressure normally rises and falls throughout the day. Blood pressure is measured using two numbers: \n',
 'The first number, called systolic blood pressure, measures the pressure in your arteries when your heart beats. The second number, called diastolic blood pressure, measures the pressure in your arteries when your heart rests between beats. \n',
 'If the measurement reads 120 systolic and 80 diastolic, you would say, “120 over 80,” or write, “120/80 mmHg.” A normal blood pressure level is less than 120/80 mmHg.1 \n',
 '\n',
 'No matter your age, you can take steps each day to keep your blood pressure in a healthy range. High blood pressure, also called hypertension, is blood pressure that is higher than normal. \n',
 'Your blood pressure changes throughout the day based on your activities. Having blood pressure mea

In [None]:
len(heart_data)

42

In [None]:
heart_data

['An estimated 17.9 million people died from CVDs in 2019, representing 32% of all global deaths. Of these deaths, 85% were due to heart attack and stroke. \n',
 'Over three quarters of CVD deaths take place in low- and middle-income countries. Out of the 17 million premature deaths (under the age of 70) due to noncommunicable diseases in 2019, 38% were caused by CVDs.\n',
 '\n',
 'Most cardiovascular diseases can be prevented by addressing behavioural risk factors such as tobacco use, unhealthy diet and obesity, physical inactivity and harmful use of alcohol. \n',
 'It is important to detect cardiovascular disease as early as possible so that management with counselling and medicines can begin. Cardiovascular diseases (CVDs) are a group of disorders of the heart and blood vessels. \n',
 'They include: coronary heart disease – a disease of the blood vessels supplying the heart muscle; cerebrovascular disease – a disease of the blood vessels supplying the brain; peripheral arterial dise

### Cleaning the Data

In [None]:
import string
import nltk
from nltk.tokenize import sent_tokenize

def clean_text(txt):
    # Remove punctuation
    txt = "".join(t for t in txt if t not in string.punctuation)

    # Remove newline characters
    txt = txt.replace("\n", " ")

    # Convert to lowercase
    txt = txt.lower()

    # Remove non-ASCII characters
    txt = txt.encode("utf8").decode("ascii", "ignore")

    return txt

def preprocess_paragraph(paragraph):
    sentences = sent_tokenize(paragraph)  # Split paragraph into sentences
    cleaned_sentences = [clean_text(sentence) for sentence in sentences]  # Clean each sentence
    return cleaned_sentences

def preprocess_corpus(corpus):
    preprocessed_sentences = [preprocess_paragraph(paragraph) for paragraph in corpus]
    flattened_sentences = [sentence for paragraph in preprocessed_sentences for sentence in paragraph]
    return flattened_sentences

diabetes_corpus = preprocess_corpus(diabetes_data)
heart_corpus = preprocess_corpus(heart_data)
bp_corpus = preprocess_corpus(bp_data)

In [None]:
diabetes_corpus[1]

'there are several forms of diabetes'

In [None]:
len(diabetes_corpus[1])

35

In [None]:
diabetes_corpus[:10]

['diabetes is a common condition that affects people of all ages',
 'there are several forms of diabetes',
 'type 2 is the most common',
 'a combination of treatment strategies can help you manage the condition to live a healthy life and prevent complications',
 'diabetes is a condition that happens when your blood sugar glucose is too high',
 'it develops when your pancreas doesnt make enough insulin or any at all or when your body isnt responding to the effects of insulin properly',
 'diabetes affects people of all ages',
 'most forms of diabetes are chronic lifelong and all forms are manageable with medications andor lifestyle changes',
 'glucose sugar mainly comes from carbohydrates in your food and drinks',
 'its your bodys goto source of energy']

In [None]:
len(diabetes_corpus)

154

### Generating n-gram sequence for LSTM training

In [None]:
tokenizer = Tokenizer()
def get_sequence_of_tokens(corpus):
    # tokenization
    tokenizer.fit_on_texts(corpus)
    total_words = len(tokenizer.word_index) + 1

    # convert data to a token sequence
    input_sequences = []
    for line in corpus:
        token_list = tokenizer.texts_to_sequences([line])[0]
        for i in range(1, len(token_list)):
            n_gram_sequence = token_list[:i+1]
            input_sequences.append(n_gram_sequence)
    return input_sequences, total_words

diabetes_sequences, total_words = get_sequence_of_tokens(diabetes_corpus)
heart_sequences, total_words = get_sequence_of_tokens(heart_corpus)
bp_sequences, total_words = get_sequence_of_tokens(bp_corpus)

In [None]:
diabetes_sequences[:20]

[[1, 12],
 [1, 12, 8],
 [1, 12, 8, 52],
 [1, 12, 8, 52, 34],
 [1, 12, 8, 52, 34, 25],
 [1, 12, 8, 52, 34, 25, 35],
 [1, 12, 8, 52, 34, 25, 35, 21],
 [1, 12, 8, 52, 34, 25, 35, 21, 5],
 [1, 12, 8, 52, 34, 25, 35, 21, 5, 41],
 [1, 12, 8, 52, 34, 25, 35, 21, 5, 41, 189],
 [58, 17],
 [58, 17, 53],
 [58, 17, 53, 86],
 [58, 17, 53, 86, 5],
 [58, 17, 53, 86, 5, 1],
 [9, 26],
 [9, 26, 12],
 [9, 26, 12, 6],
 [9, 26, 12, 6, 59],
 [9, 26, 12, 6, 59, 52]]

### Padding the sequences

In [None]:
def generate_padded_sequences(input_sequences):
    max_sequence_len = max([len(x) for x in input_sequences])
    input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))

    predictors, label = input_sequences[:,:-1],input_sequences[:,-1]
    label = ku.to_categorical(label, num_classes=total_words)
    return predictors, label, max_sequence_len

diabetes_predictors, diabetes_label, diabetes_max_sequence_len = generate_padded_sequences(diabetes_sequences)
heart_predictors, heart_label, heart_max_sequence_len = generate_padded_sequences(heart_sequences)
bp_predictors, bp_label, bp_max_sequence_len = generate_padded_sequences(bp_sequences)

In [None]:
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dropout, Dense
from keras.metrics import Accuracy

def create_model(max_sequence_len, total_words):
    input_len = max_sequence_len - 1
    model = Sequential()
    # ----------Add Input Embedding Layer
    model.add(Embedding(total_words, 100, input_length=input_len))
    # ----------Add Hidden Layer 1 - LSTM Layer
    model.add(LSTM(100))
    model.add(Dropout(0.1))
    # ----------Add Output Layer
    model.add(Dense(total_words, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=[Accuracy()])
    return model

### Training Diabetes Model

In [None]:
diabetes_model = create_model(diabetes_max_sequence_len, total_words)
diabetes_model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 60, 100)           99100     
                                                                 
 lstm (LSTM)                 (None, 100)               80400     
                                                                 
 dropout (Dropout)           (None, 100)               0         
                                                                 
 dense (Dense)               (None, 991)               100091    
                                                                 
Total params: 279591 (1.07 MB)
Trainable params: 279591 (1.07 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [None]:
# Train the model
history = diabetes_model.fit(diabetes_predictors, diabetes_label, epochs=100, verbose=1)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

### Training Heart Model

In [None]:
heart_model = create_model(heart_max_sequence_len, total_words)
heart_model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 103, 100)          99100     
                                                                 
 lstm_1 (LSTM)               (None, 100)               80400     
                                                                 
 dropout_1 (Dropout)         (None, 100)               0         
                                                                 
 dense_1 (Dense)             (None, 991)               100091    
                                                                 
Total params: 279591 (1.07 MB)
Trainable params: 279591 (1.07 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [None]:
# Train the model
history = heart_model.fit(heart_predictors, heart_label, epochs=100, verbose=1)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

### Training Hypertension Model

In [None]:
bp_model = create_model(bp_max_sequence_len, total_words)
bp_model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 38, 100)           99100     
                                                                 
 lstm_2 (LSTM)               (None, 100)               80400     
                                                                 
 dropout_2 (Dropout)         (None, 100)               0         
                                                                 
 dense_2 (Dense)             (None, 991)               100091    
                                                                 
Total params: 279591 (1.07 MB)
Trainable params: 279591 (1.07 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [None]:
# Train the model
history = bp_model.fit(bp_predictors, bp_label, epochs=100, verbose=1)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

### Making Predictions given User query

In [None]:
def generate_text(seed_text, next_words, model, max_sequence_len):
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')

        # Predict probabilities for each word
        predicted_probs = model.predict(token_list, verbose=0)[0]

        # Get the index of the word with the highest probability
        predicted_index = np.argmax(predicted_probs)

        output_word = ""
        for word, index in tokenizer.word_index.items():
            if index == predicted_index:
                output_word = word
                break
        seed_text += " " + output_word

    return seed_text

### Evaluation Metrics

In [2]:
from sklearn.metrics import accuracy_score

# function to calculate accuracy
def calculate_accuracy(predictors, labels, model):
  # predict probabilities for each class
  predicted_probs = model.predict(predictors, verbose=0)

  # get the predicted class labels
  predicted_labels = np.argmax((predicted_probs, axis=1))

  # convert one-hot-encoded labels back to integers
  true_labels = np.argmax(labels, axis=1)

  # calculate accuracy
  acc = accuracy_score(true_labels, predicted_labels)

  return acc

# calculate accuracy for each model
diabetes_accuracy = calculate_accuracy(diabetes_predictors, diabetes_label, diabetes_model)
heart_accuracy = calculate_accuracy(heart_predictors, heart_label, heart_model)
bp_accuracy = calculate_accuracy(bp_predictors, bp_label, bp_model)

print("Diabetes Model Accuracy:",diabetes_accuracy)
print("Heart Health Model Accuracy:",heart_accuracy)
print("Hypertension Model Accuracy:",bp_accuracy)


Diabetes Model Accuracy: 0.805
Heart Health Model Accuracy: 0.782
Hypertension Model Accuracy: 0.812


### Chatbot Skeleton

In [1]:
def classify_question(user_input):
    # Preprocess the user input
    preprocessed_input = preprocess_text(user_input)

    # Predefined keywords for each bucket
    keywords = {
        "diabetes": ["diabetes","glucagon","sugar", "insulin","glucose","diabetic","hyperglycemia","hypoglycemia"],
        "heart health": ["heart", "cardiovascular","coronary","angina","arrhythmia","cardiac","arteries","veins","aorta","myocardial","infarction"],
        "hypertension": ["hypertension","blood","pressure", "bp","hypertensive"]
    }

    # Check for keywords in the preprocessed input
    for bucket, bucket_keywords in keywords.items():
        for keyword in bucket_keywords:
            if keyword in preprocessed_input:
                return bucket, preprocessed_input

    # Fallback mechanism if no keywords are found
    return "other", preprocessed_input

def form_answer(user_input,intent):
  l = ""
  for i in user_input:
    l+=i

  if intent == "diabetes":
    return generate_text(l, 20, diabetes_model, diabetes_max_sequence_len)
  elif intent == "heart health":
    return generate_text(l, 20, heart_model, heart_max_sequence_len)
  elif intent == "hypertension":
    return generate_text(l, 20, bp_model, bp_max_sequence_len)

def generate_response(user_input):
    # Classify the user input
    intent, processed_input = classify_question(user_input)

    # Generate response based on the intent
    if intent == "diabetes":
        response = form_answer(processed_input,intent)
    elif intent == "heart health":
        response = form_answer(processed_input,intent)
    elif intent == "hypertension":
        response = form_answer(processed_input,intent)
    else:
        response = "I'm sorry, I couldn't understand your query."

    return response

introduce()
while True:
    user_input = get_user_input()
    if user_input.lower() == "bye":
        print("Bot: Goodbye! Take care.")
        break
    response = generate_response(user_input)
    print("Bot:", response)


Hello there! I'm here to help you with your health queries. Type bye to end conversation.
You: what are the sympotoms of ddiabtes?
Bot: I'm sorry, I couldn't understand your query.
You: what are the symptoms of diabetes?
Bot: symptoms of diabetes a include increased the thirst dry mouth frequent urination is is fatigue blurred vision unexplained weight loss numbness or tingling in your hands or feet slow-healing sores or cuts
You: why is high blood pressure called a "silent killer"?
You: define insulin resistance for me.
Bot: type two diabetes mainly fruits results from insulin resistance which happen when cells in muscles fat and liver don t respond as they should to to insulin
You: How to prevent heart disease?
Bot: most heart diseases can be cardiovascular prevented by addressing behavioural the the risk factors such as tobacco a no use unhealthy diet and or obesity physical inactivity and harmful use of alcohol
You: what happens during a stroke?
Bot: the most common symptom of str