# **Exploratory Data Analysis (EDA)**

**1. Import Dependencies**

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sn
from sklearn import metrics
from sklearn.model_selection import train_test_split


**1. Load the Dataset**

In [2]:
# Read the disease/symptom table from the CSV file next to the notebook.
dataset = pd.read_csv(filepath_or_buffer='dataset.csv')

In [3]:
# Show the loaded frame (the rendered output is truncated to the first and last rows).
dataset

Unnamed: 0,Disease,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16,Symptom_17
0,Fungal infection,itching,skin_rash,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,
1,Fungal infection,skin_rash,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,,
2,Fungal infection,itching,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,,
3,Fungal infection,itching,skin_rash,dischromic _patches,,,,,,,,,,,,,,
4,Fungal infection,itching,skin_rash,nodal_skin_eruptions,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4915,(vertigo) Paroymsal Positional Vertigo,vomiting,headache,nausea,spinning_movements,loss_of_balance,unsteadiness,,,,,,,,,,,
4916,Acne,skin_rash,pus_filled_pimples,blackheads,scurring,,,,,,,,,,,,,
4917,Urinary tract infection,burning_micturition,bladder_discomfort,foul_smell_of urine,continuous_feel_of_urine,,,,,,,,,,,,,
4918,Psoriasis,skin_rash,joint_pain,skin_peeling,silver_like_dusting,small_dents_in_nails,inflammatory_nails,,,,,,,,,,,


In [4]:
# Count how many rows carry each disease label.
dataset['Disease'].value_counts()

Acne                                       120
Common Cold                                120
Hypothyroidism                             120
Impetigo                                   120
Bronchial Asthma                           120
Chicken pox                                120
Dimorphic hemmorhoids(piles)               120
GERD                                       120
Heart attack                               120
Cervical spondylosis                       120
Migraine                                   120
Hepatitis E                                120
Pneumonia                                  120
Malaria                                    120
Arthritis                                  120
Chronic cholestasis                        120
hepatitis A                                120
Peptic ulcer diseae                        120
Hypoglycemia                               120
Hepatitis C                                120
Gastroenteritis                            120
Fungal infect

**2. Get the Statistical Details**

In [5]:
# Summary statistics per column; every column here is text, so the output
# reports count / unique / top / freq rather than numeric statistics.
dataset.describe()

Unnamed: 0,Disease,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16,Symptom_17
count,4920,4920,4920,4920,4572,3714,2934,2268,1944,1692,1512,1194,744,504,306,240,192,72
unique,41,34,48,54,50,38,32,26,21,22,21,18,11,8,4,3,3,1
top,Acne,vomiting,vomiting,fatigue,high_fever,headache,nausea,abdominal_pain,abdominal_pain,yellowing_of_eyes,yellowing_of_eyes,irritability,malaise,abnormal_menstruation,chest_pain,chest_pain,blood_in_sputum,muscle_pain
freq,120,822,870,726,378,348,390,264,276,228,198,120,126,72,96,144,72,72


**3. Get information about dataset**

In [6]:
# Column names, non-null counts and dtypes; shows how sparse the later Symptom_* columns are.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4920 entries, 0 to 4919
Data columns (total 18 columns):
Disease       4920 non-null object
Symptom_1     4920 non-null object
Symptom_2     4920 non-null object
Symptom_3     4920 non-null object
Symptom_4     4572 non-null object
Symptom_5     3714 non-null object
Symptom_6     2934 non-null object
Symptom_7     2268 non-null object
Symptom_8     1944 non-null object
Symptom_9     1692 non-null object
Symptom_10    1512 non-null object
Symptom_11    1194 non-null object
Symptom_12    744 non-null object
Symptom_13    504 non-null object
Symptom_14    306 non-null object
Symptom_15    240 non-null object
Symptom_16    192 non-null object
Symptom_17    72 non-null object
dtypes: object(18)
memory usage: 692.0+ KB


In [51]:
# NOTE(review): this repeats the full display of the raw frame, and its execution
# count (51) is out of sequence with the neighbouring cells. It looks like a leftover
# from an out-of-order run; confirm whether it is still needed.
dataset


Unnamed: 0,Disease,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16,Symptom_17
0,Fungal infection,itching,skin_rash,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,
1,Fungal infection,skin_rash,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,,
2,Fungal infection,itching,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,,
3,Fungal infection,itching,skin_rash,dischromic _patches,,,,,,,,,,,,,,
4,Fungal infection,itching,skin_rash,nodal_skin_eruptions,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4915,(vertigo) Paroymsal Positional Vertigo,vomiting,headache,nausea,spinning_movements,loss_of_balance,unsteadiness,,,,,,,,,,,
4916,Acne,skin_rash,pus_filled_pimples,blackheads,scurring,,,,,,,,,,,,,
4917,Urinary tract infection,burning_micturition,bladder_discomfort,foul_smell_of urine,continuous_feel_of_urine,,,,,,,,,,,,,
4918,Psoriasis,skin_rash,joint_pain,skin_peeling,silver_like_dusting,small_dents_in_nails,inflammatory_nails,,,,,,,,,,,


**4. Converting Categorical Data into Numerical Values (Severity-Weight Encoding)**

In [7]:
# Swap every underscore for a space in all text columns (Disease and Symptom_*).
# NOTE(review): names that already contain a space next to the underscore
# (e.g. 'dischromic _patches') end up with two consecutive spaces.
dataset = dataset.apply(lambda column: column.str.replace('_', ' '))

In [8]:
# Preview the first rows after the underscore replacement.
dataset.head()

Unnamed: 0,Disease,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16,Symptom_17
0,Fungal infection,itching,skin rash,nodal skin eruptions,dischromic patches,,,,,,,,,,,,,
1,Fungal infection,skin rash,nodal skin eruptions,dischromic patches,,,,,,,,,,,,,,
2,Fungal infection,itching,nodal skin eruptions,dischromic patches,,,,,,,,,,,,,,
3,Fungal infection,itching,skin rash,dischromic patches,,,,,,,,,,,,,,
4,Fungal infection,itching,skin rash,nodal skin eruptions,,,,,,,,,,,,,,


In [9]:
# Trim leading and trailing whitespace from every text cell, one column at a time.
# Missing values (NaN) pass through untouched.
cols = dataset.columns
dataset = dataset[cols].apply(lambda column: column.str.strip())
dataset.head()

Unnamed: 0,Disease,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16,Symptom_17
0,Fungal infection,itching,skin rash,nodal skin eruptions,dischromic patches,,,,,,,,,,,,,
1,Fungal infection,skin rash,nodal skin eruptions,dischromic patches,,,,,,,,,,,,,,
2,Fungal infection,itching,nodal skin eruptions,dischromic patches,,,,,,,,,,,,,,
3,Fungal infection,itching,skin rash,dischromic patches,,,,,,,,,,,,,,
4,Fungal infection,itching,skin rash,nodal skin eruptions,,,,,,,,,,,,,,


Handling the missing values

In [10]:
# Unused symptom slots are NaN; mark them with 0 so they read as "no symptom".
dataset = dataset.fillna(0)

In [11]:
# Inspect the frame after the missing values were filled with 0.
dataset

Unnamed: 0,Disease,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16,Symptom_17
0,Fungal infection,itching,skin rash,nodal skin eruptions,dischromic patches,0,0,0,0,0,0,0,0,0,0,0,0,0
1,Fungal infection,skin rash,nodal skin eruptions,dischromic patches,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,Fungal infection,itching,nodal skin eruptions,dischromic patches,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,Fungal infection,itching,skin rash,dischromic patches,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,Fungal infection,itching,skin rash,nodal skin eruptions,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4915,(vertigo) Paroymsal Positional Vertigo,vomiting,headache,nausea,spinning movements,loss of balance,unsteadiness,0,0,0,0,0,0,0,0,0,0,0
4916,Acne,skin rash,pus filled pimples,blackheads,scurring,0,0,0,0,0,0,0,0,0,0,0,0,0
4917,Urinary tract infection,burning micturition,bladder discomfort,foul smell of urine,continuous feel of urine,0,0,0,0,0,0,0,0,0,0,0,0,0
4918,Psoriasis,skin rash,joint pain,skin peeling,silver like dusting,small dents in nails,inflammatory nails,0,0,0,0,0,0,0,0,0,0,0


# **Symptom Severity**

**1. Load the Dataset**

In [12]:
# Read the symptom -> severity weight lookup table.
severity = pd.read_csv(filepath_or_buffer='Symptom-severity.csv')

In [13]:
# Preview the lookup table: one symptom name and its integer weight per row.
severity.head()

Unnamed: 0,Symptom,weight
0,itching,1
1,skin_rash,3
2,nodal_skin_eruptions,4
3,continuous_sneezing,4
4,shivering,5


In [14]:
# Replace underscores with spaces in the symptom names, mirroring the
# replacement already applied to the main dataset.
symptom_names = severity['Symptom']
severity['Symptom'] = symptom_names.str.replace('_', ' ')

Encode the dataset symptoms with the given severity weights

In [15]:
# Distinct symptom names that have a severity weight.
symptoms = severity['Symptom'].unique()
# Underlying array of the cleaned frame; the encoding loop rewrites its cells.
vals = dataset.values

In [16]:
# Overwrite every occurrence of each known symptom name with its weight.
# If a name appears more than once in the severity table, the first weight is used.
for symptom_name in symptoms:
    matching_weights = severity.loc[severity['Symptom'] == symptom_name, 'weight']
    vals[vals == symptom_name] = matching_weights.values[0]

Convert the encoded array back into a DataFrame

In [17]:
# Inspect the encoded array: symptom names found in the severity table are now integer weights.
vals

array([['Fungal infection', 1, 3, ..., 0, 0, 0],
       ['Fungal infection', 3, 4, ..., 0, 0, 0],
       ['Fungal infection', 1, 4, ..., 0, 0, 0],
       ...,
       ['Urinary tract infection', 6, 4, ..., 0, 0, 0],
       ['Psoriasis', 3, 3, ..., 0, 0, 0],
       ['Impetigo', 3, 7, ..., 0, 0, 0]], dtype=object)

In [18]:
# Wrap the weight-encoded array in a DataFrame again, reusing the original column labels.
cols = dataset.columns
dataset = pd.DataFrame(data=vals, columns=cols)
dataset

Unnamed: 0,Disease,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16,Symptom_17
0,Fungal infection,1,3,4,dischromic patches,0,0,0,0,0,0,0,0,0,0,0,0,0
1,Fungal infection,3,4,dischromic patches,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,Fungal infection,1,4,dischromic patches,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,Fungal infection,1,3,dischromic patches,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,Fungal infection,1,3,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4915,(vertigo) Paroymsal Positional Vertigo,5,3,5,6,4,4,0,0,0,0,0,0,0,0,0,0,0
4916,Acne,3,2,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0
4917,Urinary tract infection,6,4,foul smell of urine,6,0,0,0,0,0,0,0,0,0,0,0,0,0
4918,Psoriasis,3,3,3,2,2,2,0,0,0,0,0,0,0,0,0,0,0


Check for cells that were not encoded

In [19]:
# One value_counts Series per symptom column (Symptom_1 ... Symptom_17).
temp = [dataset[f'Symptom_{i}'].value_counts() for i in range(1, 18)]

In [20]:
# Any text value still listed here is a symptom that did not receive a weight.
temp

[3    1926
 4    1074
 5     972
 1     678
 6     120
 2     120
 7      30
 Name: Symptom_1, dtype: int64, 3                      1698
 5                      1500
 4                      1116
 7                       348
 2                       132
 6                       108
 foul smell of urine      18
 Name: Symptom_2, dtype: int64, 4                      1974
 3                      1014
 5                       852
 7                       378
 2                       360
 6                       222
 foul smell of urine      84
 dischromic  patches      36
 Name: Symptom_3, dtype: int64, 4                      1110
 5                       900
 3                       846
 2                       702
 7                       552
 6                       348
 0                       348
 dischromic  patches      72
 spotting  urination      42
 Name: Symptom_4, dtype: int64, 4                      1344
 0                      1206
 5                       744
 3              

In [21]:
# The three symptom names below were left as text by the encoding step.
# NOTE(review): setting them to 0 makes them indistinguishable from "no symptom",
# so their information is discarded. Confirm this is intended rather than fixing
# the name mismatch with the severity table.
dataset = (
    dataset
    .replace('spotting  urination', 0)
    .replace('dischromic  patches', 0)
    .replace('foul smell of urine', 0)
)
dataset

Unnamed: 0,Disease,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16,Symptom_17
0,Fungal infection,1,3,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,Fungal infection,3,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,Fungal infection,1,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,Fungal infection,1,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,Fungal infection,1,3,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4915,(vertigo) Paroymsal Positional Vertigo,5,3,5,6,4,4,0,0,0,0,0,0,0,0,0,0,0
4916,Acne,3,2,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0
4917,Urinary tract infection,6,4,0,6,0,0,0,0,0,0,0,0,0,0,0,0,0
4918,Psoriasis,3,3,3,2,2,2,0,0,0,0,0,0,0,0,0,0,0


Check the ratio of (Symptom:Disease)

In [22]:
# Distinct symptoms in the severity table versus distinct disease labels in the dataset.
symptom_count = len(severity['Symptom'].unique())
disease_count = len(dataset['Disease'].unique())
print("Number of symptoms used to identify the disease ", symptom_count)
print("Number of diseases that can be identified ", disease_count)

Number of symptoms used to identify the disease  132
Number of diseases that can be identified  41


# **Split Dataset into training and testing dataset**

**Encoded Dataset**

In [23]:
# Preview the encoded frame that feeds the train/test split.
dataset.head()

Unnamed: 0,Disease,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16,Symptom_17
0,Fungal infection,1,3,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,Fungal infection,3,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,Fungal infection,1,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,Fungal infection,1,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,Fungal infection,1,3,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0


**Split the dataset into features (X) and labels (y)**

In [24]:

# Features: every column after 'Disease', i.e. the encoded symptom slots.
# Cast explicitly to int64 instead of relying on pandas to downcast the object
# columns implicitly during the earlier replace() calls (that automatic
# downcasting is deprecated in newer pandas releases). The cast also fails loudly
# if any symptom text was left unencoded, rather than producing an object array.
X = dataset.iloc[:, 1:].astype('int64').values
# Labels: the disease name of each row, kept as strings.
y = dataset['Disease'].values

In [25]:
# Feature matrix: one row per record, one severity weight per symptom slot.
X

array([[1, 3, 4, ..., 0, 0, 0],
       [3, 4, 0, ..., 0, 0, 0],
       [1, 4, 0, ..., 0, 0, 0],
       ...,
       [6, 4, 0, ..., 0, 0, 0],
       [3, 3, 3, ..., 0, 0, 0],
       [3, 7, 4, ..., 0, 0, 0]], dtype=int64)

In [26]:
# Label vector: the disease name for each row.
y

array(['Fungal infection', 'Fungal infection', 'Fungal infection', ...,
       'Urinary tract infection', 'Psoriasis', 'Impetigo'], dtype=object)

In [27]:
# The labels are kept as strings (object dtype); they are not numerically encoded.
y.dtype

dtype('O')

**Split X and y into training and testing sets**

In [28]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [29]:
# Element dtype of each split: features first, then labels.
for split in (X_train, X_test, y_train, y_test):
    print(split.dtype)


int64
int64
object
object


In [30]:
# Report the shape of each split: training pair first, then the test pair.
for split in (X_train, y_train, X_test, y_test):
    print(split.shape)


(3936, 17)
(3936,)
(984, 17)
(984,)


# **Model Selection**

In [34]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import joblib


In [35]:
# Build a 100-tree random forest with a fixed seed and fit it on the training
# split in one step (fit() returns the fitted estimator itself).
model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Predict disease labels for the held-out rows.
y_pred = model.predict(X_test)


In [36]:
# Overall share of test rows whose predicted disease matches the true label.
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

# Per-disease precision / recall / F1 / support table.
report = classification_report(y_test, y_pred)
print("Classification Report:\n", report)


Accuracy: 0.99
Classification Report:
                                          precision    recall  f1-score   support

(vertigo) Paroymsal  Positional Vertigo       1.00      1.00      1.00        18
                                   AIDS       1.00      1.00      1.00        30
                                   Acne       1.00      1.00      1.00        24
                    Alcoholic hepatitis       1.00      1.00      1.00        25
                                Allergy       0.92      1.00      0.96        24
                              Arthritis       1.00      1.00      1.00        23
                       Bronchial Asthma       1.00      1.00      1.00        33
                   Cervical spondylosis       1.00      0.87      0.93        23
                            Chicken pox       1.00      1.00      1.00        21
                    Chronic cholestasis       1.00      1.00      1.00        15
                            Common Cold       1.00      1.00      1.0

In [37]:
# Persist the fitted classifier to disk with joblib.
joblib.dump(value=model, filename='disease_prediction_model.pkl')
print("Model saved successfully!")


Model saved successfully!


In [38]:
# Read the classifier back from the file written by the save step.
# joblib files are pickle-based, so only load files this notebook produced itself.
loaded_model = joblib.load('disease_prediction_model.pkl')

# Use the first test row as a demo input; predict() expects a 2-D array,
# so reshape the single row to shape (1, n_features).
first_test_row = X_test[0]
sample_symptoms = first_test_row.reshape(1, -1)
predicted_disease = loaded_model.predict(sample_symptoms)
print(f'Predicted Disease: {predicted_disease[0]}')


Predicted Disease: Acne
