<a href="https://colab.research.google.com/github/pontonkid/HealthDiagnoSys/blob/main/Healthcare_predictor_system.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Health Diagnosis System**

An AI-powered healthcare diagnosis assistant for real-time disease predictions. Leverages natural language processing and machine learning to guide users based on their symptoms.

***Installing the libraries***

In [None]:
!pip install numpy
!pip install pandas
!pip install matplotlib
!pip install seaborn
!pip install scikit-learn
!pip install tensorflow
!pip install keras
!pip install torch
!pip install transformers
!pip install nltk
!pip install gensim



**Importing the libraries**

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import tensorflow as tf
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from gensim.models import Word2Vec
import matplotlib.pyplot as plt
import seaborn as sns


# for using TensorFlow for deep learning
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import categorical_crossentropy

# for using PyTorch for deep learning
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F


**Loading & exploring the data**

In [None]:
# Absolute location of the symptom/disease CSV file
data_path = "/content/Symptom2Disease.csv"

# Read the CSV into a pandas DataFrame
data = pd.read_csv(filepath_or_buffer=data_path)

# Preview the first five rows as a sanity check on the load
data.head(n=5)


Unnamed: 0.1,Unnamed: 0,label,text
0,0,Psoriasis,I have been experiencing a skin rash on my arm...
1,1,Psoriasis,"My skin has been peeling, especially on my kne..."
2,2,Psoriasis,I have been experiencing joint pain in my fing...
3,3,Psoriasis,"There is a silver like dusting on my skin, esp..."
4,4,Psoriasis,"My nails have small dents or pits in them, and..."


In [None]:
# Data Information
# Print a heading, then let pandas report the columns, their dtypes,
# the non-null counts and the memory usage of the loaded DataFrame
print("\nDataset Information:")
data.info()



Dataset Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1200 entries, 0 to 1199
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  1200 non-null   int64 
 1   label       1200 non-null   object
 2   text        1200 non-null   object
dtypes: int64(1), object(2)
memory usage: 28.2+ KB


In [None]:
# Tally how many rows carry each disease label
disease_counts = data['label'].value_counts()

# Heading and counts in a single call, one per line
print("Number of Each Disease:", disease_counts, sep="\n")


Number of Each Disease:
Psoriasis                          50
Varicose Veins                     50
peptic ulcer disease               50
drug reaction                      50
gastroesophageal reflux disease    50
allergy                            50
urinary tract infection            50
Malaria                            50
Jaundice                           50
Cervical spondylosis               50
Migraine                           50
Hypertension                       50
Bronchial Asthma                   50
Acne                               50
Arthritis                          50
Dimorphic Hemorrhoids              50
Pneumonia                          50
Common Cold                        50
Fungal infection                   50
Dengue                             50
Impetigo                           50
Chicken pox                        50
Typhoid                            50
diabetes                           50
Name: label, dtype: int64


In [None]:
# Missing Values
# Per-column count of null entries
missing_values = data.isnull().sum()

# Heading (preceded by a blank line) followed by the counts
print("\nMissing Values:", missing_values, sep="\n")



Missing Values:
Unnamed: 0    0
label         0
text          0
dtype: int64


**Data Preprocessing**

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# Fetch the NLTK stopword corpus so that stopwords.words() can read it
nltk.download('stopwords')

# Porter stemmer instance, plus the English stopwords held in a set
stemmer = PorterStemmer()
stop_words = {*stopwords.words('english')}

# Text Preprocessing Function
def preprocess_text(text):
    """Normalise one free-text symptom description for vectorisation.

    The text is lowercased and split on whitespace. Punctuation is stripped
    from each token, stop words are dropped, and the remaining tokens are
    stemmed with the module-level ``stemmer``.

    Tokens that carry punctuation (e.g. ``"arm,"`` or ``"knees."``) are kept
    with the punctuation removed instead of being discarded wholesale, so
    symptom words at the end of a clause are not lost.

    Args:
        text: Raw symptom description.

    Returns:
        The processed tokens joined by single spaces.
    """
    kept = []
    for token in text.lower().split():
        # Test the raw token first, in case the stop-word set contains
        # entries with apostrophes that would not match once stripped.
        if token in stop_words:
            continue
        # Strip punctuation from the token rather than dropping the token.
        word = ''.join(ch for ch in token if ch.isalnum())
        if word and word not in stop_words:
            kept.append(stemmer.stem(word))
    return ' '.join(kept)

# Overwrite the 'text' column with its preprocessed form.
# NOTE: this replaces the raw text in `data`, so re-running this cell
# preprocesses already-processed text; reload the CSV before re-running.
data['text'] = data['text'].map(preprocess_text)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
import nltk

# Download the stopwords resource
# (quiet=True is passed to silence the download log; the preprocessing cell
# above already requests the same resource, so this call repeats it)
nltk.download('stopwords', quiet=True)


True

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize the TF-IDF vectorizer (default settings)
tfidf_vectorizer = TfidfVectorizer()

# Learn the vocabulary/IDF weights from the preprocessed text and transform it.
# NOTE(review): the vectorizer is fitted on the full dataset before the
# train/test split below, so the test rows influence the learned vocabulary
# and IDF weights; fit on the training rows only for a leak-free evaluation.
X = tfidf_vectorizer.fit_transform(data['text'])

# X is now the numerical feature matrix
# Feature names are available via tfidf_vectorizer.get_feature_names_out()
# (the older get_feature_names() was removed in scikit-learn 1.2)


**Splitting the data**

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; random_state pins the shuffle
(
    X_train,  # training feature matrix
    X_test,   # testing feature matrix
    y_train,  # training labels
    y_test,   # testing labels
) = train_test_split(
    X,
    data['label'],
    test_size=0.2,
    random_state=42,
)


**Model Training**

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Create the Multinomial Naive Bayes classifier and fit it on the training
# split in one step (fit() returns the estimator itself)
model = MultinomialNB().fit(X_train, y_train)

# Echo the fitted estimator as the cell output
model


**Evaluation**

In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Predict labels for the held-out test rows
y_pred = model.predict(X_test)

# Compute all three evaluation artefacts up front
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)
confusion_mat = confusion_matrix(y_test, y_pred)

# Report them in order: overall accuracy, per-class metrics, confusion matrix
print(f'Accuracy: {accuracy:.2f}')
print('\nClassification Report:\n', classification_rep)
print('\nConfusion Matrix:\n', confusion_mat)


Accuracy: 0.92

Classification Report:
                                  precision    recall  f1-score   support

                           Acne       1.00      1.00      1.00         7
                      Arthritis       0.83      1.00      0.91        10
               Bronchial Asthma       0.85      1.00      0.92        11
           Cervical spondylosis       0.88      1.00      0.93         7
                    Chicken pox       0.92      1.00      0.96        12
                    Common Cold       1.00      1.00      1.00        12
                         Dengue       0.91      0.83      0.87        12
          Dimorphic Hemorrhoids       0.78      1.00      0.88         7
               Fungal infection       1.00      1.00      1.00        13
                   Hypertension       1.00      1.00      1.00        10
                       Impetigo       1.00      1.00      1.00        11
                       Jaundice       0.92      1.00      0.96        11
          

**BUILDING THE SYSTEM**

Now let's build our system. First we'll code up functions for processing user input and making disease predictions with our model.

Then we'll use Streamlit to create a web app interface to display the predictions.

We'll take it step-by-step - first the prediction functions, then the Streamlit UI. This will give us an end-to-end disease prediction system.

**Writing the function for handling user input**

In [None]:
import re
import nltk
from nltk.corpus import stopwords

# Make sure the stopword corpus is present, then collect the English list in a set
nltk.download("stopwords")
stop_words = set()
stop_words.update(stopwords.words("english"))

def preprocess_input(user_input):
    """Clean free-text user input so it lines up with the training text.

    The classifier in this notebook is trained on text that was lowercased,
    stop-word filtered and stemmed by ``preprocess_text``. This function
    applies the same stemming (via the ``stemmer`` created in the Data
    Preprocessing cell) so that, for example, an inflected word in the user
    input maps onto the stemmed form the vectorizer learned.

    Args:
        user_input: Raw symptom description typed by the user.

    Returns:
        The cleaned, stemmed tokens joined by single spaces.
    """
    # Convert to lowercase
    user_input = user_input.lower()
    # Replace (rather than delete) special characters and numbers with a space
    # so "fever,cough" becomes "fever cough" instead of "fevercough"
    user_input = re.sub(r"[^a-zA-Z\s]", " ", user_input)
    # Tokenize the input
    user_tokens = user_input.split()
    # Remove stopwords and stem what remains, mirroring the training pipeline
    user_tokens = [stemmer.stem(word) for word in user_tokens if word not in stop_words]
    # Join the tokens back into a clean text
    user_clean_text = " ".join(user_tokens)
    return user_clean_text

# Example usage: clean a sample complaint and print the cleaned text
user_input = "I have a fever and cough"
cleaned_input = preprocess_input(user_input)
print(cleaned_input)


fever cough


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


**Model Prediction Function**

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

# Fit a fresh TF-IDF vectorizer on the preprocessed text and vectorise it
tfidf_vectorizer = TfidfVectorizer()
X = tfidf_vectorizer.fit_transform(data['text'])

# 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, data['label'], test_size=0.2, random_state=42
)

# Create the Multinomial Naive Bayes model and train it in one step
# (fit() returns the estimator itself)
model = MultinomialNB().fit(X_train, y_train)

# Function to predict diseases based on user input
def predict_diseases(user_clean_text):
    """Predict a disease label for one cleaned symptom description.

    The text is wrapped in a one-element list, vectorised with the
    module-level ``tfidf_vectorizer`` and passed to the module-level
    ``model``.

    Args:
        user_clean_text: Cleaned symptom text.

    Returns:
        Whatever ``model.predict`` returns for the one-row input.
    """
    return model.predict(tfidf_vectorizer.transform([user_clean_text]))

# Example usage: predict from an already-cleaned symptom string and print the result
user_clean_text = "fever cough"  # Stand-in for the output of preprocess_input()
predictions = predict_diseases(user_clean_text)
print(predictions)


['Bronchial Asthma']


**Building the User Interface**

In [None]:
import streamlit as st
import re
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
# train_test_split is used below; import it here so this cell also runs as a
# standalone Streamlit script, like the other names it imports
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

# Load your symptom-disease data
data = pd.read_csv("Symptom2Disease.csv")

# Initialize the TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer()

# Fit the vectorizer on the 'text' column as read from the CSV and vectorise it
X = tfidf_vectorizer.fit_transform(data['text'])

# Split the dataset into a training set and a testing set
X_train, X_test, y_train, y_test = train_test_split(X, data['label'], test_size=0.2, random_state=42)

# Initialize the Multinomial Naive Bayes model
model = MultinomialNB()

# Train the model on the training data
model.fit(X_train, y_train)

# Set Streamlit app title with emojis
st.title("Health Symptom-to-Disease Predictor 🏥👨‍⚕️")

# Define a sidebar
st.sidebar.title("Tool Definition")
st.sidebar.markdown("This tool helps you identify possible diseases based on the symptoms you provide. It is not a substitute for professional medical advice. Always consult a healthcare professional for accurate diagnosis and treatment.")

# Initialize chat history (created once per session, kept across reruns)
if "messages" not in st.session_state:
    st.session_state.messages = []

# Function to preprocess user input
def preprocess_input(user_input):
    """Normalise raw symptom text before it is vectorised.

    Lowercases the text, replaces every character that is not an ASCII
    letter or whitespace with a space, and collapses runs of whitespace.
    Replacing (rather than deleting) keeps words that were separated only
    by punctuation apart: ``"fever,cough"`` becomes ``"fever cough"``,
    not ``"fevercough"``.

    Args:
        user_input: Raw text entered by the user.

    Returns:
        The cleaned text; an empty string if no letters remain.
    """
    user_input = user_input.lower()  # Convert to lowercase
    user_input = re.sub(r"[^a-zA-Z\s]", " ", user_input)  # Special characters and numbers become spaces
    user_input = " ".join(user_input.split())  # Remove extra spaces
    return user_input

# Function to predict diseases based on user input
def predict_diseases(user_clean_text):
    """Run the trained classifier on one cleaned symptom description.

    Args:
        user_clean_text: Cleaned symptom text.

    Returns:
        The result of ``model.predict`` for the one-row TF-IDF matrix built
        from ``user_clean_text``.
    """
    # Wrap the text in a list: the vectorizer is given a collection of documents
    return model.predict(tfidf_vectorizer.transform([user_clean_text]))

# Add user input section
user_input = st.text_area("Enter your symptoms (how you feel):", key="user_input")

# Add button to predict disease
if st.button("Predict Disease"):
    # Display loading message
    with st.spinner("Diagnosing patient..."):
        # Clean first, then validate the cleaned text: input made up solely of
        # whitespace, digits or punctuation cleans to "" and would otherwise
        # still be sent to the model and produce a "diagnosis".
        cleaned_input = preprocess_input(user_input) if user_input else ""
        if cleaned_input:
            predicted_diseases = predict_diseases(cleaned_input)

            # Record the exchange in the session's chat history
            st.session_state.messages.append({"role": "user", "content": user_input})
            st.session_state.messages.append({"role": "assistant", "content": f"Based on your symptoms, you might have {', '.join(predicted_diseases)}."})

            # Display predicted diseases
            st.write("Based on your symptoms, you might have:")
            for disease in predicted_diseases:
                st.write(f"- {disease}")
        else:
            st.warning("Please enter your symptoms before predicting.")

# Display a warning message
st.warning("Please note that this tool is for informational purposes only. Always consult a healthcare professional for accurate medical advice.")

# Add attribution
st.markdown("Created with ❤️ by Joas")


In [None]:
import streamlit as st
import re
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split

# Read the symptom/disease table from the working directory
data = pd.read_csv("Symptom2Disease.csv")

# Fit a TF-IDF vectorizer on the 'text' column and vectorise it
tfidf_vectorizer = TfidfVectorizer()
X = tfidf_vectorizer.fit_transform(data['text'])

# 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, data['label'], test_size=0.2, random_state=42
)

# Create the Multinomial Naive Bayes model and train it in one step
# (fit() returns the estimator itself)
model = MultinomialNB().fit(X_train, y_train)

# Page title (with emojis)
st.title("Health Symptom-to-Disease Predictor 🏥👨‍⚕️")

# Sidebar: heading plus the disclaimer text
st.sidebar.title("Tool Definition")
st.sidebar.markdown("This tool helps you identify possible diseases based on the symptoms you provide. It is not a substitute for professional medical advice. Always consult a healthcare professional for accurate diagnosis and treatment.")

# Sidebar checkbox that toggles the FAQ section
show_faqs = st.sidebar.checkbox("Frequently Asked Questions")

# Create the chat history list the first time the session runs
if "messages" not in st.session_state:
    st.session_state.messages = []

# Function to preprocess user input
def preprocess_input(user_input):
    """Normalise raw symptom text before it is vectorised.

    Lowercases the text, replaces every character that is not an ASCII
    letter or whitespace with a space, and collapses runs of whitespace.
    Replacing (rather than deleting) keeps words that were separated only
    by punctuation apart: ``"fever,cough"`` becomes ``"fever cough"``,
    not ``"fevercough"``.

    Args:
        user_input: Raw text entered by the user.

    Returns:
        The cleaned text; an empty string if no letters remain.
    """
    user_input = user_input.lower()  # Convert to lowercase
    user_input = re.sub(r"[^a-zA-Z\s]", " ", user_input)  # Special characters and numbers become spaces
    user_input = " ".join(user_input.split())  # Remove extra spaces
    return user_input

# Function to predict diseases based on user input
def predict_diseases(user_clean_text):
    """Run the trained classifier on one cleaned symptom description.

    Args:
        user_clean_text: Cleaned symptom text.

    Returns:
        The result of ``model.predict`` for the one-row TF-IDF matrix built
        from ``user_clean_text``.
    """
    # Wrap the text in a list: the vectorizer is given a collection of documents
    return model.predict(tfidf_vectorizer.transform([user_clean_text]))

# Add user input section
user_input = st.text_area("Enter your symptoms (how you feel):", key="user_input")

# Add button to predict disease
if st.button("Predict Disease"):
    # Display loading message
    with st.spinner("Diagnosing patient..."):
        # Clean first, then validate the cleaned text: input made up solely of
        # whitespace, digits or punctuation cleans to "" and would otherwise
        # still be sent to the model and produce a "diagnosis".
        cleaned_input = preprocess_input(user_input) if user_input else ""
        if cleaned_input:
            predicted_diseases = predict_diseases(cleaned_input)

            # Record the exchange in the session's chat history
            st.session_state.messages.append({"role": "user", "content": user_input})
            st.session_state.messages.append({"role": "assistant", "content": f"Based on your symptoms, you might have {', '.join(predicted_diseases)}."})

            # Display predicted diseases
            st.write("Based on your symptoms, you might have:")
            for disease in predicted_diseases:
                st.write(f"- {disease}")
        else:
            st.warning("Please enter your symptoms before predicting.")

# Render the FAQ section when the sidebar checkbox is ticked
if show_faqs:
    st.markdown("## Frequently Asked Questions")

    # (question, answer) pairs, rendered in this order
    faq_entries = (
        (
            "**Q: How does this tool work?**",
            "A: The tool uses a machine learning model to analyze the symptoms you enter and predicts possible diseases based on a pre-trained dataset.",
        ),
        (
            "**Q: Is this a substitute for a doctor's advice?**",
            "A: No, this tool is for informational purposes only. It's essential to consult a healthcare professional for accurate medical advice.",
        ),
        (
            "**Q: Can I trust the predictions?**",
            "A: While the tool provides predictions, it's not a guarantee of accuracy. It's always best to consult a healthcare expert for a reliable diagnosis.",
        ),
    )
    for question, answer in faq_entries:
        st.markdown(question)
        st.markdown(answer)
