In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report
from sklearn.preprocessing import LabelEncoder
import requests
import os

In [2]:
# Load datasets (paths in your Colab environment)
description_df = pd.read_csv('C:\\Users\\Rishitha\\Downloads\\ML\\description.csv')
diets_df = pd.read_csv('C:\\Users\\Rishitha\\Downloads\\ML\\diets.csv')
medication_df = pd.read_csv('C:\\Users\\Rishitha\\Downloads\\ML\\medications.csv')
precautions_df = pd.read_csv('C:\\Users\\Rishitha\\Downloads\\ML\\precautions_df.csv')
symptom_severity_df = pd.read_csv('C:\\Users\\Rishitha\\Downloads\\ML\\Symptom-severity.csv')
symptoms_df = pd.read_csv('C:\\Users\\Rishitha\\Downloads\\ML\\symtoms_df.csv')
training_df = pd.read_csv('C:\\Users\\Rishitha\\Downloads\\ML\\Training.csv')
workout_df = pd.read_csv('C:\\Users\\Rishitha\\Downloads\\ML\\workout_df.csv')


In [3]:

# Function to handle missing data
def handle_missing_data(df):
    numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns
    non_numeric_cols = df.select_dtypes(exclude=['float64', 'int64']).columns
    df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].mean())
    df[non_numeric_cols] = df[non_numeric_cols].fillna('Unknown')
    return df

In [4]:
# Apply missing data handling for all datasets
description_df = handle_missing_data(description_df)
diets_df = handle_missing_data(diets_df)
medication_df = handle_missing_data(medication_df)
precautions_df = handle_missing_data(precautions_df)
symptom_severity_df = handle_missing_data(symptom_severity_df)
symptoms_df = handle_missing_data(symptoms_df)
training_df = handle_missing_data(training_df)
workout_df = handle_missing_data(workout_df)


In [5]:
# Dropping duplicates for all datasets
description_df.drop_duplicates(inplace=True)
diets_df.drop_duplicates(inplace=True)
medication_df.drop_duplicates(inplace=True)
precautions_df.drop_duplicates(inplace=True)
symptom_severity_df.drop_duplicates(inplace=True)
symptoms_df.drop_duplicates(inplace=True)
training_df.drop_duplicates(inplace=True)
workout_df.drop_duplicates(inplace=True)


In [6]:
# Check for missing values
print("Missing values in Training Data:")
print(training_df.isnull().sum())

# Fill or drop missing values as appropriate
training_df = training_df.dropna()  # Example: Dropping rows with missing values

# Check for duplicates
print("Duplicate rows in Training Data:")
print(training_df.duplicated().sum())

# Drop duplicates if any
training_df = training_df.drop_duplicates()

Missing values in Training Data:
itching                 0
skin_rash               0
nodal_skin_eruptions    0
continuous_sneezing     0
shivering               0
                       ..
inflammatory_nails      0
blister                 0
red_sore_around_nose    0
yellow_crust_ooze       0
prognosis               0
Length: 133, dtype: int64
Duplicate rows in Training Data:
0


In [7]:
# Inspect the dataset
print(training_df.columns)
print(training_df.head())

# Check for missing values
print("Missing values in Training Data:")
print(training_df.isnull().sum())

# Fill or drop missing values (example: drop missing values)
training_df = training_df.dropna()

# Check for duplicates
print("Duplicate rows in Training Data:")
display(training_df.duplicated().sum())

# Drop duplicates if any
training_df = training_df.drop_duplicates()

# Encode target variable
le = LabelEncoder()
training_df['prognosis'] = le.fit_transform(training_df['prognosis'])

# Extract features and target variable
X = training_df.drop(columns=['prognosis'])
y = training_df['prognosis']

Index(['itching', 'skin_rash', 'nodal_skin_eruptions', 'continuous_sneezing',
       'shivering', 'chills', 'joint_pain', 'stomach_pain', 'acidity',
       'ulcers_on_tongue',
       ...
       'blackheads', 'scurring', 'skin_peeling', 'silver_like_dusting',
       'small_dents_in_nails', 'inflammatory_nails', 'blister',
       'red_sore_around_nose', 'yellow_crust_ooze', 'prognosis'],
      dtype='object', length=133)
   itching  skin_rash  nodal_skin_eruptions  continuous_sneezing  shivering  \
0        1          1                     1                    0          0   
1        0          1                     1                    0          0   
2        1          0                     1                    0          0   
3        1          1                     0                    0          0   
4        1          1                     1                    0          0   

   chills  joint_pain  stomach_pain  acidity  ulcers_on_tongue  ...  \
0       0           0          

0

In [8]:
y.head()

0    15
1    15
2    15
3    15
4    15
Name: prognosis, dtype: int32

In [9]:
# Split the data into training and test sets (70-30 split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=30)

In [10]:
models_with_params = {
    'Random Forest': (RandomForestClassifier(), {
        'n_estimators': [50, 100, 150],
        'max_depth': [10, 20, 30, None],
        'min_samples_split': [2, 5, 10]
    }),
    'Logistic Regression': (LogisticRegression(max_iter=1000), {
        'C': [0.1, 1, 10, 100],
        'solver': ['liblinear', 'lbfgs']
    }),
    'XGBoost': (XGBClassifier(), {
        'n_estimators': [50, 100, 200],
        'max_depth': [3, 6, 9],
        'learning_rate': [0.01, 0.1, 0.2]
    }),
    'Neural Network': (MLPClassifier(max_iter=1000), {
        'hidden_layer_sizes': [(50,), (100,), (150,)],
        'activation': ['tanh', 'relu'],
        'solver': ['adam', 'sgd'],
    })
}


In [11]:
# Function for tuning and training
def tune_and_train_models(models_with_params, X_train, y_train):
    best_estimators = {}
    for model_name, (model, params) in models_with_params.items():
        print(f"Hyperparameter tuning for {model_name}...")
        grid_search = GridSearchCV(model, params, cv=5, scoring='accuracy', n_jobs=-1)
        grid_search.fit(X_train, y_train)
        best_estimators[model_name] = grid_search.best_estimator_
        print(f"Best Params for {model_name}: {grid_search.best_params_}")
        print(f"Best CV Accuracy for {model_name}: {grid_search.best_score_:.4f}")
    return best_estimators

# Tune and get the best models
best_estimators = tune_and_train_models(models_with_params, X_train, y_train)


Hyperparameter tuning for Random Forest...




Best Params for Random Forest: {'max_depth': 20, 'min_samples_split': 2, 'n_estimators': 100}
Best CV Accuracy for Random Forest: 1.0000
Hyperparameter tuning for Logistic Regression...
Best Params for Logistic Regression: {'C': 1, 'solver': 'liblinear'}
Best CV Accuracy for Logistic Regression: 1.0000
Hyperparameter tuning for XGBoost...




Best Params for XGBoost: {'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 100}
Best CV Accuracy for XGBoost: 0.9104
Hyperparameter tuning for Neural Network...




Best Params for Neural Network: {'activation': 'tanh', 'hidden_layer_sizes': (50,), 'solver': 'adam'}
Best CV Accuracy for Neural Network: 1.0000


In [12]:
# Function to evaluate the models on the test set
def evaluate_models(models, X_test, y_test):
    results = {}
    for model_name, model in models.items():
        y_pred = model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred, average='weighted')
        cm = confusion_matrix(y_test, y_pred)
        results[model_name] = {'Accuracy': accuracy, 'F1 Score': f1, 'Confusion Matrix': cm}
        print(f"\n{model_name} Results:")
        print(f"Accuracy: {accuracy:.4f}")
        print(f"F1 Score: {f1:.4f}")
        print("Confusion Matrix:")
        print(cm)
        print(classification_report(y_test, y_pred))
    return results

# Evaluate models
results = evaluate_models(best_estimators, X_test, y_test)



Random Forest Results:
Accuracy: 1.0000
F1 Score: 1.0000
Confusion Matrix:
[[3 0 0 ... 0 0 0]
 [0 2 0 ... 0 0 0]
 [0 0 1 ... 0 0 0]
 ...
 [0 0 0 ... 1 0 0]
 [0 0 0 ... 0 3 0]
 [0 0 0 ... 0 0 2]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         3
           1       1.00      1.00      1.00         2
           2       1.00      1.00      1.00         1
           3       1.00      1.00      1.00         3
           4       1.00      1.00      1.00         1
           5       1.00      1.00      1.00         4
           6       1.00      1.00      1.00         4
           7       1.00      1.00      1.00         2
           8       1.00      1.00      1.00         2
           9       1.00      1.00      1.00         4
          10       1.00      1.00      1.00         3
          11       1.00      1.00      1.00         5
          12       1.00      1.00      1.00         2
          14       1.00      1.00      1.00    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [13]:
# Find and save the best model based on test accuracy
best_model_name = max(results, key=lambda x: results[x]['Accuracy'])
best_model = best_estimators[best_model_name]

# Save the best model
with open('best_model.pkl', 'wb') as f:
    pickle.dump(best_model, f)

print(f"Best Model: {best_model_name} with accuracy of {results[best_model_name]['Accuracy']:.4f}")

Best Model: Random Forest with accuracy of 1.0000


In [14]:
!pip install transformers
!pip install sentence-transformers



In [15]:
# Feature set (X) and target (y)
X = training_df.drop(columns=['prognosis'])
y = training_df['prognosis']

# Create the symptoms dictionary dynamically based on feature columns
symptoms_dict = {symptom.lower(): index for index, symptom in enumerate(X.columns)}

# Load trained RandomForest model (trained on the same feature set X)
with open('best_model.pkl', 'rb') as file:
    loaded_rfc = pickle.load(file)

In [16]:
y.head()

0    15
1    15
2    15
3    15
4    15
Name: prognosis, dtype: int32

In [17]:
from sentence_transformers import SentenceTransformer
from transformers import pipeline
import numpy as np

  from tqdm.autonotebook import tqdm, trange





In [18]:
pip install pyspellchecker

Note: you may need to restart the kernel to use updated packages.


In [19]:
pip install fuzzywuzzy python-Levenshtein textblob

Note: you may need to restart the kernel to use updated packages.


In [20]:
symptom_embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

In [21]:
# Initialize the conversational agent (GPT-2 or GPT-3)
llm = pipeline('text-generation', model='gpt2')  # Adjust model as needed (e.g., 'gpt3' if using OpenAI API)

In [22]:
user_input = input("Please describe your symptoms: ")

In [None]:
import re
import numpy as np
from fuzzywuzzy import process
from textblob import TextBlob
from sklearn.metrics.pairwise import cosine_similarity

# Function for matching user symptoms with valid symptoms
def embedding_similarity_symptoms(user_input, valid_symptoms, valid_symptom_embeddings, threshold=0.4):
    # Split user input by commas and "and"
    user_symptoms = [sym.strip() for sym in re.split(r',| and ', user_input.lower())]

    corrected_symptoms = []

    for user_symptom in user_symptoms:
        # Correct spelling using TextBlob
        corrected_user_symptom = str(TextBlob(user_symptom).correct())

        # Get embedding for the corrected user input symptom
        user_symptom_embedding = symptom_embedding_model.encode([corrected_user_symptom])

        # Calculate cosine similarity with valid symptom embeddings
        cosine_similarities = cosine_similarity(user_symptom_embedding, valid_symptom_embeddings)
        max_similarity = np.max(cosine_similarities)

        if max_similarity > threshold:
            closest_match_idx = np.argmax(
                
            )
            closest_match = valid_symptoms[closest_match_idx]
            corrected_symptoms.append(closest_match)
        else:
            # Fuzzy matching for possible valid symptoms
            closest_match, score = process.extractOne(corrected_user_symptom, valid_symptoms)
            if score >= 80:  # Setting a threshold for fuzzy matching (80% similarity)
                corrected_symptoms.append(closest_match)

    return corrected_symptoms

In [None]:
#Symptom Correction Using Embeddings and Cosine Similarity
valid_symptoms = list(symptoms_dict.keys())
valid_symptom_embeddings = symptom_embedding_model.encode(valid_symptoms)
corrected_symptoms = embedding_similarity_symptoms(user_input, valid_symptoms, valid_symptom_embeddings)
print(f"Corrected Symptoms: {corrected_symptoms}")

Corrected Symptoms: ['skin_rash', 'nodal_skin_eruptions', 'itching']


In [None]:
import numpy as np

def create_input_vector(corrected_symptoms, symptoms_dict):
    input_vector = np.zeros(len(symptoms_dict))  # Initialize vector with zeros

    corrected_symptoms = list(set(corrected_symptoms))

    # Set the corresponding symptom indices to 1
    for symptom in corrected_symptoms:
        if symptom in symptoms_dict:
            input_vector[symptoms_dict[symptom]] = 1
        else:
            print(f"Warning: Symptom '{symptom}' not found in symptoms dictionary.")

    return input_vector

# Example Usage
input_vector = create_input_vector(corrected_symptoms, symptoms_dict)


In [26]:
y.head()

0    15
1    15
2    15
3    15
4    15
Name: prognosis, dtype: int32

In [27]:
def predict_disease(input_vector, model):
    predicted_disease = model.predict([input_vector])[0]
    return predicted_disease

In [None]:
predicted_disease = predict_disease(input_vector, loaded_rfc)

predicted_disease_name = le.inverse_transform([predicted_disease])[0]

print(f"Predicted Disease: {predicted_disease_name}")


Predicted Disease: Fungal infection




In [None]:
# Function to fetch disease-related information from the datasets
def get_disease_info(disease, description_df, medication_df, diets_df, precautions_df, workout_df):
    # Convert the disease name to lowercase for comparison
    disease = disease.lower()

    description = description_df[description_df['Disease'].str.lower() == disease]['Description'].values
    if len(description) > 0:
        description = description[0]
    else:
        description = "Information about this disease is currently unavailable."


    medications = medication_df[medication_df['Disease'].str.lower() == disease]['Medication'].values
    if len(medications) > 0:
        medications = medications[0]
    else:
        medications = "No specific medications available."

    diet = diets_df[diets_df['Disease'].str.lower() == disease]['Diet'].values
    if len(diet) > 0:
        diet = diet[0]
    else:
        diet = "No specific diet recommendations."


    precautions = precautions_df[precautions_df['Disease'].str.lower() == disease][['Precaution_1', 'Precaution_2', 'Precaution_3', 'Precaution_4']].values
    if len(precautions) > 0:
        precautions = precautions[0]  # Combine all precautions
    else:
        precautions = ["No specific precautions available."]


    workout = workout_df[workout_df['disease'].str.lower() == disease]['workout'].values
    if len(workout) > 0:
        workout = workout[0]
    else:
        workout = "No specific workout recommendations."

    return description, medications, diet, precautions, workout


description, medications, diet, precautions, workout = get_disease_info(predicted_disease_name, description_df, medication_df, diets_df, precautions_df, workout_df)

# Print the fetched disease-related information
print(f"Description: {description}")
print(f"Medications: {medications}")
print(f"Diet Recommendations: {diet}")
print(f"Precautions: {', '.join(precautions)}")
print(f"Workout Recommendations: {workout}")


Description: Fungal infection is a common skin condition caused by fungi.
Medications: ['Antifungal Cream', 'Fluconazole', 'Terbinafine', 'Clotrimazole', 'Ketoconazole']
Diet Recommendations: ['Antifungal Diet', 'Probiotics', 'Garlic', 'Coconut oil', 'Turmeric']
Precautions: bath twice, use detol or neem in bathing water, keep infected area dry, use clean cloths
Workout Recommendations: Avoid sugary foods


In [None]:
# Fetch information for the predicted disease
description, medications, diet, precautions, workout = get_disease_info(predicted_disease_name, description_df, medication_df, diets_df, precautions_df, workout_df)

# Generate Conversational Response Using LLM

# Function for delivering diagnosis via LLM
def llm_diagnosis_response(disease, description, medications, diet, precautions, workout):
    # Check if the variables are lists or strings, and handle accordingly
    if isinstance(medications, list):
        medications = ', '.join(medications)
    if isinstance(diet, list):
        diet = ', '.join(diet)
    if isinstance(precautions, list):
        precautions = ', '.join(precautions)
    if isinstance(workout, list):
        workout = ', '.join(workout)

    # Create the conversation prompt
    conversation_prompt = (
        f"The patient has been diagnosed with {disease}. "
        f"Description: {description}. "
        f"Medications: {medications}. "
        f"Diet Recommendations: {diet}. "
        f"Precautions: {precautions}. "
        f"Workout Recommendations: {workout}."
    )

    # Generate the LLM response
    llm_response = llm(conversation_prompt, max_new_tokens=100, num_return_sequences=1)

    # Extract the generated text
    response_text = llm_response[0]['generated_text']
    return response_text.strip()

# Generate conversational response with mapped disease name
llm_response = llm_diagnosis_response(predicted_disease_name, description, medications, diet, precautions, workout)
print(f"LLM Response: {llm_response}")


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


LLM Response: The patient has been diagnosed with Fungal infection. Description: Fungal infection is a common skin condition caused by fungi.. Medications: ['Antifungal Cream', 'Fluconazole', 'Terbinafine', 'Clotrimazole', 'Ketoconazole']. Diet Recommendations: ['Antifungal Diet', 'Probiotics', 'Garlic', 'Coconut oil', 'Turmeric']. Precautions: ['bath twice' 'use detol or neem in bathing water'
 'keep infected area dry' 'use clean cloths']. Workout Recommendations: Avoid sugary foods. Avoid eating refined carbohydrates. Avoid eating raw foods like eggs. Avoid buying raw vegetables. Do not use caffeine, white sugar, or soy beverages. Use only healthy fruits and vegetables.

The patient has been diagnosed with Fungal infection. Description: Fungal infection is a common skin condition caused by fungi.. Medications: ['Antifungal Diet', 'Probiotics', 'Garlic', 'Coconut oil', 'Turmeric']. Diet Recommendations: ['bath
