# Disease Forecasting
This script is designed to forecast diseases based on symptoms using various machine learning algorithms.

The script includes:
- Data loading and preprocessing
- Model training using different algorithms
- Cross-validation and performance metrics calculation
- Prediction of diseases based on user-input symptoms


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn import svm
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import RandomizedSearchCV
from collections import Counter

In [2]:
s_data = pd.read_csv("/content/Original_Dataset.csv")

In [3]:
s_data.head().T

Unnamed: 0,0,1,2,3,4
Disease,Fungal infection,Fungal infection,Fungal infection,Fungal infection,Fungal infection
Symptom_1,itching,skin_rash,itching,itching,itching
Symptom_2,skin_rash,nodal_skin_eruptions,nodal_skin_eruptions,skin_rash,skin_rash
Symptom_3,nodal_skin_eruptions,dischromic _patches,dischromic _patches,dischromic _patches,nodal_skin_eruptions
Symptom_4,dischromic _patches,,,,
Symptom_5,,,,,
Symptom_6,,,,,
Symptom_7,,,,,
Symptom_8,,,,,
Symptom_9,,,,,


In [4]:
s_data.shape

(4920, 18)

In [5]:
col_names = [col for col in s_data.columns if col != 'Disease']

In [6]:
col_names

['Symptom_1',
 'Symptom_2',
 'Symptom_3',
 'Symptom_4',
 'Symptom_5',
 'Symptom_6',
 'Symptom_7',
 'Symptom_8',
 'Symptom_9',
 'Symptom_10',
 'Symptom_11',
 'Symptom_12',
 'Symptom_13',
 'Symptom_14',
 'Symptom_15',
 'Symptom_16',
 'Symptom_17']

In [7]:
columns_to_check = []
for col in s_data.columns:
    if col != 'Disease':
        columns_to_check.append(col)

In [8]:
symptoms = s_data.iloc[:, 1:].values.flatten()
symptoms = list(set(symptoms))

In [9]:
new_s_data = s_data.copy()  # Create a copy of the original DataFrame

for symptom in symptoms:
    new_s_data[symptom] = new_s_data.iloc[:, 1:].apply(lambda row: int(symptom in row.values), axis=1)

new_s_data = new_s_data.drop(columns=columns_to_check)

In [10]:
new_s_data

Unnamed: 0,Disease,internal_itching,prominent_veins_on_calf,mild_fever,loss_of_balance,drying_and_tingling_lips,coma,congestion,scurring,increased_appetite,...,back_pain,nodal_skin_eruptions,skin_rash,swelling_of_stomach,watering_from_eyes,fluid_overload,brittle_nails,unsteadiness,yellow_urine,silver_like_dusting
0,Fungal infection,0,0,0,0,0,0,0,0,0,...,0,1,1,0,0,0,0,0,0,0
1,Fungal infection,0,0,0,0,0,0,0,0,0,...,0,1,1,0,0,0,0,0,0,0
2,Fungal infection,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
3,Fungal infection,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
4,Fungal infection,0,0,0,0,0,0,0,0,0,...,0,1,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4915,(vertigo) Paroymsal Positional Vertigo,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4916,Acne,0,0,0,0,0,0,0,1,0,...,0,0,1,0,0,0,0,0,0,0
4917,Urinary tract infection,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4918,Psoriasis,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1


In [11]:
s_data_v1 = new_s_data.loc[:, new_s_data.columns.notna()]

In [12]:
s_data_v1.columns = s_data_v1.columns.str.strip()

In [13]:
s_data_v1.columns

Index(['Disease', 'internal_itching', 'prominent_veins_on_calf', 'mild_fever',
       'loss_of_balance', 'drying_and_tingling_lips', 'coma', 'congestion',
       'scurring', 'increased_appetite',
       ...
       'back_pain', 'nodal_skin_eruptions', 'skin_rash', 'swelling_of_stomach',
       'watering_from_eyes', 'fluid_overload', 'brittle_nails', 'unsteadiness',
       'yellow_urine', 'silver_like_dusting'],
      dtype='object', length=132)

In [14]:
s_data_v1.head()

Unnamed: 0,Disease,internal_itching,prominent_veins_on_calf,mild_fever,loss_of_balance,drying_and_tingling_lips,coma,congestion,scurring,increased_appetite,...,back_pain,nodal_skin_eruptions,skin_rash,swelling_of_stomach,watering_from_eyes,fluid_overload,brittle_nails,unsteadiness,yellow_urine,silver_like_dusting
0,Fungal infection,0,0,0,0,0,0,0,0,0,...,0,1,1,0,0,0,0,0,0,0
1,Fungal infection,0,0,0,0,0,0,0,0,0,...,0,1,1,0,0,0,0,0,0,0
2,Fungal infection,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
3,Fungal infection,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
4,Fungal infection,0,0,0,0,0,0,0,0,0,...,0,1,1,0,0,0,0,0,0,0


In [15]:
target_variable = ['Disease']
label = LabelEncoder()
for i in target_variable:
    s_data_v1[i] = label.fit_transform(s_data_v1[i])

In [16]:
train_data = s_data_v1.drop(columns="Disease")
target_data = s_data_v1['Disease']

In [17]:
train_data.head()

Unnamed: 0,internal_itching,prominent_veins_on_calf,mild_fever,loss_of_balance,drying_and_tingling_lips,coma,congestion,scurring,increased_appetite,weight_loss,...,back_pain,nodal_skin_eruptions,skin_rash,swelling_of_stomach,watering_from_eyes,fluid_overload,brittle_nails,unsteadiness,yellow_urine,silver_like_dusting
0,0,0,0,0,0,0,0,0,0,0,...,0,1,1,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,1,1,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,1,1,0,0,0,0,0,0,0


In [18]:
!pip install catboost
!pip install xgboost
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier  # Import XGBoost classifier
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, roc_auc_score
from sklearn.model_selection import StratifiedKFold
import numpy as np

algorithms = {
    'Logistic Regression': {"model": LogisticRegression()},
    'Random Forest': {"model": RandomForestClassifier()},
    'xGBoost': {"model": XGBClassifier()},

}

# Perform 5-fold cross-validation with specified metrics
for model_name, values in algorithms.items():
    model = values["model"]
    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    accuracy_scores = []
    f1_scores = []
    recall_scores = []
    precision_scores = []

    for train_index, test_index in kf.split(train_data, target_data):
        X_train, X_test = train_data.iloc[train_index], train_data.iloc[test_index]
        y_train, y_test = target_data.iloc[train_index], target_data.iloc[test_index]

        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        accuracy = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred, average='weighted')
        recall = recall_score(y_test, y_pred, average='weighted')
        precision = precision_score(y_test, y_pred, average='weighted')
        proba = model.predict_proba(X_test)[:, 1]


        accuracy_scores.append(accuracy)
        f1_scores.append(f1)
        recall_scores.append(recall)
        precision_scores.append(precision)


    print(f"{model_name} Results:")
    print(f"Accuracy: {np.mean(accuracy_scores):.3f}")
    print(f"F1 Score: {np.mean(f1_scores):.3f}")
    print(f"Recall: {np.mean(recall_scores):.3f}")
    print(f"Precision: {np.mean(precision_scores):.3f}")



Logistic Regression Results:
Accuracy: 1.000
F1 Score: 1.000
Recall: 1.000
Precision: 1.000
Random Forest Results:
Accuracy: 1.000
F1 Score: 1.000
Recall: 1.000
Precision: 1.000
xGBoost Results:
Accuracy: 1.000
F1 Score: 1.000
Recall: 1.000
Precision: 1.000


In [19]:
doc_data = pd.read_csv("Doctor_Versus_Disease.csv",encoding='latin1', names=['Disease','Specialist'])

In [20]:
des_data = pd.read_csv("Disease_Description.csv")

In [21]:
data = pd.read_csv("/content/Symptom_Weights.csv")

In [22]:
test_col = []
for col in s_data_v1.columns:
    if col != 'Disease':
        test_col.append(col)


test_data = {}
symptoms = []
predicted = []
def test_input():
    symptoms.clear()
    predicted.clear()
    num_inputs = int(input("Enter the number of symptoms you have: "))
    for i in range(num_inputs):
        user_input = input("Enter Symptoms #{}: ".format(i+1))
        symptoms.append(user_input)
    print("Symptoms you have:", symptoms)
    for column in test_col:
        test_data[column] = 1 if column in symptoms else 0
        test_df = pd.DataFrame(test_data, index=[0])
    print("Prediction in progress")
    for model_name, values in algorithms.items():
        predict_disease = values["model"].predict(test_df)
        predict_disease = label.inverse_transform(predict_disease)
        predicted.extend(predict_disease)
    disease_counts = Counter(predicted)
    percentage_per_disease = {disease: (count / 6) * 100 for disease, count in disease_counts.items()}
    result_df = pd.DataFrame({"Disease": list(percentage_per_disease.keys()),
                               "Chances": list(percentage_per_disease.values())})
    result_df = result_df.merge(doc_data, on='Disease', how='left')
    result_df = result_df.merge(des_data, on='Disease', how='left')
    return result_df

In [24]:
test_input()

Enter the number of symptoms you have: 2
Enter Symptoms #1: itching
Enter Symptoms #2: fungal infection
Symptoms you have: ['itching', 'fungal infection']
Prediction in progress


Unnamed: 0,Disease,Chances,Specialist,Description
0,Fungal infection,50.0,Dermatologist,"In humans, fungal infections occur when an inv..."
