In [15]:
# Importing libraries

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
from sklearn.preprocessing import LabelEncoder
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from scipy.stats import mode

In [16]:
# Load the disease-prediction dataset (CSV hosted on GitHub) into a DataFrame
DATASET_URL = 'https://github.com/ybifoundation/Dataset/raw/main/MultipleDiseasePrediction.csv'
data = pd.read_csv(DATASET_URL)


In [17]:
# Convert the "prognosis" column to integer labels with LabelEncoder.
#
# The column is overwritten in place, so running this cell a second time would
# refit the encoder on the already-encoded integers and encoder.classes_ would
# no longer hold the original class names (they are used later to decode
# predictions). The dtype guard makes the cell safe to re-run: once the column
# is numeric, the existing encoder and labels are left untouched.
if not pd.api.types.is_numeric_dtype(data["prognosis"]):
    encoder = LabelEncoder()
    data["prognosis"] = encoder.fit_transform(data["prognosis"])


In [18]:
# Features are every column except the last one; the last column is the target
X, y = data.iloc[:, :-1], data.iloc[:, -1]

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=24,
)

In [19]:
# Sanity check: report the feature/target shapes of each split
for split_name, split_X, split_y in (("Train", X_train, y_train), ("Test", X_test, y_test)):
    print(f"{split_name}: {split_X.shape}, {split_y.shape}")

Train: (3936, 132), (3936,)
Test: (984, 132), (984,)


In [20]:
# Model choice: a random forest classifier, seeded so its randomness is repeatable
rf_model = RandomForestClassifier(
    random_state=18,
)

In [21]:
# Train the model, then predict on both splits.
# The model is fitted on X_train.values (a plain array), so every later
# predict() call is given a plain array as well.
rf_model.fit(X_train.values, y_train)
train_preds = rf_model.predict(X_train.values)
preds = rf_model.predict(X_test.values)

print(f"Accuracy on train data by Random Forest Classifier : {accuracy_score(y_train, train_preds)*100}")
print(f"Accuracy on test data by Random Forest Classifier : {accuracy_score(y_test, preds)*100}")

# Confusion matrix of the test predictions. It is only stored here: a bare
# `cf_matrix` in the middle of a cell is never displayed, so that dead
# expression has been dropped.
cf_matrix = confusion_matrix(y_test, preds)

# Same test-set predictions under the name rf_preds; reusing `preds` avoids
# running the fitted model over X_test a second time for identical values.
rf_preds = preds

Accuracy on train data by Random Forest Classifier : 100.0
Accuracy on test data by Random Forest Classifier : 100.0


In [22]:
# Predict the encoded label of every test row and display the array
y_pred = rf_model.predict(X_test.values)
y_pred

array([ 7, 26, 11, 21, 40, 12, 14, 30,  0, 15, 17, 12, 20, 28,  7, 18,  2,
       35, 28,  8,  0, 18,  8,  9,  9,  8, 10, 38, 13, 17, 15, 34, 36, 23,
       15, 38,  7, 38,  8, 23,  6, 10, 33, 29, 11,  6, 24, 33,  1, 29,  7,
        5, 31, 26, 23, 26,  0, 18, 14, 28, 12, 22,  0,  6,  5, 23, 20, 26,
       18, 37,  5, 14,  2, 23, 32,  2, 15, 32, 37,  0,  1,  4, 32, 38,  6,
        1, 25, 38, 30, 19, 32, 14, 11, 39,  7, 15, 40, 19, 13, 31, 19,  0,
       11, 15, 27,  6, 18, 39,  7, 27,  6, 21, 35, 38,  6, 22, 11, 40, 19,
       10, 12, 26, 10, 26, 34,  6, 35, 20,  8, 14, 17, 39,  6, 10, 11, 37,
       30, 12,  8,  2,  5,  5, 14,  2, 13,  9, 30,  1, 30, 24, 36, 25, 37,
       34, 13, 39, 11, 13,  4,  9,  3, 29, 35,  9,  7, 36,  6,  4, 36, 15,
       30, 13, 31,  1,  9, 10, 17, 32, 16, 38, 32, 20,  6, 28, 19,  1, 14,
       35, 35, 12,  1, 13,  4, 16, 19, 38, 31, 25, 16, 25, 31,  4, 21, 16,
       31, 21, 24, 33, 35, 23,  9, 40, 11, 36, 10, 23,  7, 26, 27,  7,  0,
       13, 26, 39, 12, 27

In [23]:
# Model accuracy: share of test rows whose predicted label equals the true label
accuracy_score(y_true=y_test, y_pred=y_pred)

1.0

In [24]:
# Model confusion matrix: true labels against predicted labels on the test split
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[32,  0,  0, ...,  0,  0,  0],
       [ 0, 21,  0, ...,  0,  0,  0],
       [ 0,  0, 20, ...,  0,  0,  0],
       ...,
       [ 0,  0,  0, ..., 26,  0,  0],
       [ 0,  0,  0, ...,  0, 18,  0],
       [ 0,  0,  0, ...,  0,  0, 22]])

In [25]:
# Model classification report: per-class precision, recall, f1-score and support
report = classification_report(y_true=y_test, y_pred=y_pred)
print(report)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        32
           1       1.00      1.00      1.00        21
           2       1.00      1.00      1.00        20
           3       1.00      1.00      1.00        23
           4       1.00      1.00      1.00        24
           5       1.00      1.00      1.00        29
           6       1.00      1.00      1.00        32
           7       1.00      1.00      1.00        24
           8       1.00      1.00      1.00        29
           9       1.00      1.00      1.00        24
          10       1.00      1.00      1.00        25
          11       1.00      1.00      1.00        17
          12       1.00      1.00      1.00        21
          13       1.00      1.00      1.00        27
          14       1.00      1.00      1.00        20
          15       1.00      1.00      1.00        25
          16       1.00      1.00      1.00        23
          17       1.00    

In [26]:
# Future prediction from user input: build the lookup tables the predictor uses

symptoms = X.columns.values

# Symptom index dictionary: each feature column name is split on "_", every
# part is capitalized and the parts are joined with spaces (the form used for
# input such as "Skin Rash"); the value is the column's position in X.
symptom_index = {
    " ".join(part.capitalize() for part in column.split("_")): position
    for position, column in enumerate(symptoms)
}

# Bundle the symptom lookup with the encoder's class names so that a predicted
# integer label can be turned back into a class name.
data_dict = {
    "symptom_index": symptom_index,
    "predictions_classes": encoder.classes_,
}
 
def predictDisease(symptoms):
    """Predict a disease from a comma-separated string of symptom names.

    Parameters
    ----------
    symptoms : str
        Symptom names separated by commas, e.g. ``"Itching,Skin Rash"``.
        Each name must be a key of ``data_dict["symptom_index"]``. A name is
        looked up exactly as written first; if that fails, surrounding
        whitespace is stripped and the lookup is retried, so
        ``"Itching, Skin Rash"`` also works. Empty entries (for example from
        a trailing comma) are ignored.

    Returns
    -------
    dict
        A single-entry dict mapping ``"Predicted Disease is "`` to the class
        name ``rf_model`` predicts for the given symptoms.

    Raises
    ------
    KeyError
        If no symptom is given, or if any name is not a key of
        ``data_dict["symptom_index"]``.
    """
    symptom_index = data_dict["symptom_index"]

    positions = []
    unknown = []
    for raw_name in symptoms.split(","):
        # Exact match first, so a key that itself carries edge whitespace
        # keeps working; only fall back to the stripped form when needed.
        name = raw_name if raw_name in symptom_index else raw_name.strip()
        if not name:
            continue
        if name in symptom_index:
            positions.append(symptom_index[name])
        else:
            unknown.append(name)

    if unknown:
        raise KeyError(f"unknown symptom(s): {', '.join(unknown)}")
    if not positions:
        raise KeyError("no symptoms were provided")

    # One row with a 0/1 flag per known symptom, shaped (1, n_symptoms) as
    # required for a single-sample predict() call.
    input_data = np.zeros((1, len(symptom_index)), dtype=int)
    input_data[0, positions] = 1

    # The model returns an encoded label; decode it through the stored classes.
    predicted_label = rf_model.predict(input_data)[0]
    rf_prediction = data_dict["predictions_classes"][predicted_label]

    predictions = {
        "Predicted Disease is ": rf_prediction,
    }

    return predictions
 
# Try the function on a sample list of symptoms
sample_symptoms = "Itching,Skin Rash,Fatigue,Lethargy,High Fever,Headache"
print(predictDisease(sample_symptoms))

{'Predicted Disease is ': 'Chicken pox'}
