In [1]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore')


In [2]:
# Read the symptom/prognosis table (path is relative to the notebook's working
# directory) and preview its first five rows.
csv_path = '../data/symbipredict_2022.csv'
df = pd.read_csv(csv_path)
display(df.head())

Unnamed: 0,itching,skin_rash,nodal_skin_eruptions,continuous_sneezing,shivering,chills,joint_pain,stomach_pain,acidity,ulcers_on_tongue,...,blackheads,scurring,skin_peeling,silver_like_dusting,small_dents_in_nails,inflammatory_nails,blister,red_sore_around_nose,yellow_crust_ooze,prognosis
0,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal Infection
1,0,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal Infection
2,1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal Infection
3,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal Infection
4,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal Infection


In [3]:
# Quick structural overview of the raw frame: shape, dtypes, target column,
# value counts for the first few columns, and per-column missing-value counts.
# The closing summary is derived from the data instead of being hard-coded, so
# it cannot contradict the figures printed above it.

# Examine the dataset's shape
print("Dataset Shape:", df.shape)

# Inspect the data types of each column
print("\nData Types of Columns:")
print(df.dtypes)

# Identify the target variable
target_variable = 'prognosis'
print("\nTarget Variable:", target_variable)

# Analyze the distribution of symptoms
print("\nDistribution of Symptoms (first 5 columns):")
for column in df.columns[:5]:
    print(f"{column}: {df[column].value_counts()}")

# Check for missing values (computed once, reused in the summary)
missing_per_column = df.isnull().sum()
print("\nMissing Values:")
print(missing_per_column)

# Facts the summary reports, measured rather than assumed
total_missing = int(missing_per_column.sum())
symptom_frame = df.drop(columns=[target_variable])
symptoms_are_binary = bool(symptom_frame.isin([0, 1]).all().all())

# Summarize findings
print("\nSummary:")
print("The dataset has", df.shape[0], "rows and", df.shape[1], "columns.")
print(f"The target variable is '{target_variable}', which represents the disease.")
if symptoms_are_binary:
    print("The symptoms are represented by binary features (0 or 1), indicating the presence or absence of a symptom.")
else:
    print("Not all symptom columns are binary (0 or 1); inspect them before modelling.")
print("The distribution of symptoms can be further analyzed using visualizations or statistical measures.")
if total_missing == 0:
    print("There are no missing values in the dataset.")
else:
    print(f"There are {total_missing} missing values in the dataset.")

Dataset Shape: (4961, 133)

Data Types of Columns:
itching                  int64
skin_rash                int64
nodal_skin_eruptions     int64
continuous_sneezing      int64
shivering                int64
                         ...  
inflammatory_nails       int64
blister                  int64
red_sore_around_nose     int64
yellow_crust_ooze        int64
prognosis               object
Length: 133, dtype: object

Target Variable: prognosis

Distribution of Symptoms (first 5 columns):
itching: itching
0    4277
1     684
Name: count, dtype: int64
skin_rash: skin_rash
0    4168
1     793
Name: count, dtype: int64
nodal_skin_eruptions: nodal_skin_eruptions
0    4852
1     109
Name: count, dtype: int64
continuous_sneezing: continuous_sneezing
0    4737
1     224
Name: count, dtype: int64
shivering: shivering
0    4852
1     109
Name: count, dtype: int64

Missing Values:
itching                 0
skin_rash               0
nodal_skin_eruptions    0
continuous_sneezing     0
shivering     

In [4]:
# Treat every column of the raw frame (symptoms and prognosis) as categorical.
# get_dummies replaces each listed column with one indicator column per
# distinct value, named "<column>_<value>".
# NOTE(review): the symptom columns are int64 and appear to hold only 0/1, so
# each of them becomes a complementary `_0` / `_1` pair — confirm this
# doubling of the feature set is intended.
categorical_columns = list(df.columns)

df_encoded = pd.get_dummies(data=df, columns=categorical_columns)

# Preview the encoded frame
display(df_encoded.head())

Unnamed: 0,itching_0,itching_1,skin_rash_0,skin_rash_1,nodal_skin_eruptions_0,nodal_skin_eruptions_1,continuous_sneezing_0,continuous_sneezing_1,shivering_0,shivering_1,...,prognosis_Osteoarthritis,prognosis_Paralysis (brain hemorrhage),prognosis_Peptic Ulcer Disease,prognosis_Pneumonia,prognosis_Psoriasis,prognosis_Tuberculosis,prognosis_Typhoid,prognosis_Urinary Tract Infection,prognosis_Varicose Veins,prognosis_Vertigo
0,False,True,False,True,False,True,True,False,True,False,...,False,False,False,False,False,False,False,False,False,False
1,True,False,False,True,False,True,True,False,True,False,...,False,False,False,False,False,False,False,False,False,False
2,False,True,True,False,False,True,True,False,True,False,...,False,False,False,False,False,False,False,False,False,False
3,False,True,False,True,True,False,True,False,True,False,...,False,False,False,False,False,False,False,False,False,False
4,False,True,False,True,False,True,True,False,True,False,...,False,False,False,False,False,False,False,False,False,False


In [5]:
# Drop any original (pre-encoding) column that is still present in the encoded
# frame; names that are already absent are skipped rather than raising.
# NOTE(review): the recorded preview below matches the previous cell's, which
# suggests nothing is left to drop here — confirm whether this step is needed.
df_encoded = df_encoded.drop(columns=categorical_columns, errors='ignore')
display(df_encoded.head())


Unnamed: 0,itching_0,itching_1,skin_rash_0,skin_rash_1,nodal_skin_eruptions_0,nodal_skin_eruptions_1,continuous_sneezing_0,continuous_sneezing_1,shivering_0,shivering_1,...,prognosis_Osteoarthritis,prognosis_Paralysis (brain hemorrhage),prognosis_Peptic Ulcer Disease,prognosis_Pneumonia,prognosis_Psoriasis,prognosis_Tuberculosis,prognosis_Typhoid,prognosis_Urinary Tract Infection,prognosis_Varicose Veins,prognosis_Vertigo
0,False,True,False,True,False,True,True,False,True,False,...,False,False,False,False,False,False,False,False,False,False
1,True,False,False,True,False,True,True,False,True,False,...,False,False,False,False,False,False,False,False,False,False
2,False,True,True,False,False,True,True,False,True,False,...,False,False,False,False,False,False,False,False,False,False
3,False,True,False,True,True,False,True,False,True,False,...,False,False,False,False,False,False,False,False,False,False
4,False,True,False,True,False,True,True,False,True,False,...,False,False,False,False,False,False,False,False,False,False


In [6]:
# Columns whose name starts with 'prognosis' are the one-hot target indicators;
# everything else is a feature.
is_target_column = df_encoded.columns.str.startswith('prognosis')

# Feature matrix: all non-target columns
X = df_encoded.loc[:, ~is_target_column]

# Target indicator columns, in their original order
y_columns = df_encoded.columns[is_target_column].tolist()

# Collapse the indicators into a single label per row: the name of the column
# holding the row's maximum, so labels keep the 'prognosis_' prefix.
y = df_encoded[y_columns].idxmax(axis=1)


In [7]:
from sklearn.model_selection import train_test_split

# Stage 1: keep 70% of the rows for training and hold out the remaining 30%.
# No `stratify` argument is passed, so class proportions are not enforced.
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Stage 2: cut the held-out rows in half to obtain the test and validation sets.
X_test, X_val, y_test, y_val = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    random_state=42,
)

In [8]:
from sklearn.ensemble import RandomForestClassifier

# Hyper-parameters gathered in one place: 100 trees, each limited to depth 5,
# with a fixed seed so the fit is repeatable.
rf_params = {'n_estimators': 100, 'max_depth': 5, 'random_state': 42}
rf_classifier = RandomForestClassifier(**rf_params)

# Train on the training split only
rf_classifier.fit(X_train, y_train)

In [9]:
from sklearn.metrics import accuracy_score, classification_report

# Score the fitted forest on the held-out test split
y_pred = rf_classifier.predict(X_test)

# Overall fraction of correctly classified test rows
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Per-class precision / recall / F1 / support
report = classification_report(y_test, y_pred)
print("\nClassification Report:\n", report)

Accuracy: 0.9502688172043011

Classification Report:
                                          precision    recall  f1-score   support

                         prognosis_AIDS       1.00      1.00      1.00        24
                         prognosis_Acne       1.00      1.00      1.00        17
          prognosis_Alcoholic Hepatitis       1.00      1.00      1.00        23
                      prognosis_Allergy       1.00      1.00      1.00        13
                    prognosis_Arthritis       1.00      1.00      1.00        21
             prognosis_Bronchial Asthma       1.00      1.00      1.00        21
         prognosis_Cervical Spondylosis       1.00      1.00      1.00        12
                   prognosis_Chickenpox       1.00      1.00      1.00        16
          prognosis_Chronic Cholestasis       0.30      1.00      0.46        16
                  prognosis_Common Cold       1.00      1.00      1.00        15
                       prognosis_Dengue       1.00    

In [10]:
import pickle

# Persist the fitted classifier to disk. The `with` block guarantees the file
# handle is flushed and closed even if pickling raises; the previous bare
# open(...) left the handle open.
filename = 'trainmodel.pkl'
with open(filename, 'wb') as model_file:
    pickle.dump(rf_classifier, model_file)

In [11]:
# Reload the classifier saved by the previous cell. The `with` block closes the
# file handle once unpickling finishes (or fails).
# SECURITY: pickle.load can execute arbitrary code embedded in the file; only
# load model files that this notebook (or another trusted source) produced.
filename = 'trainmodel.pkl'
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)

In [None]:
# Ask for a comma-separated list of symptom names.
symptoms = input("Enter symptoms separated by commas: ")

# Trim whitespace around each entry and drop empty ones, so that input such as
# "itching, skin_rash," yields ['itching', 'skin_rash'] rather than
# ['itching', ' skin_rash', ''] (which would fail to match any column name).
symptoms_list = [entry.strip() for entry in symptoms.split(',') if entry.strip()]

In [None]:
# Build a single-row feature frame for the entered symptoms, encoded and
# ordered exactly like the training matrix `X_train`.

# Symptom columns are every column of the raw frame except the target. Selecting
# by name avoids assuming that 'prognosis' is the last column.
all_symptom_columns = [col for col in df.columns if col != 'prognosis']
known_symptoms = set(all_symptom_columns)

# Normalise the entered names here as well, so this cell gives correct results
# even when `symptoms_list` still carries surrounding whitespace or blanks.
requested_symptoms = {str(entry).strip() for entry in symptoms_list}
requested_symptoms.discard('')

# Surface typos instead of silently predicting from an (almost) empty row.
unknown_symptoms = sorted(requested_symptoms - known_symptoms)
if unknown_symptoms:
    print("Warning: unrecognized symptoms were ignored:", ", ".join(unknown_symptoms))
if not (requested_symptoms & known_symptoms):
    print("Warning: no recognized symptoms were entered; the prediction will not be meaningful.")

# One row: 1 where the symptom was entered, 0 otherwise
input_data = {
    symptom_column: [1 if symptom_column in requested_symptoms else 0]
    for symptom_column in all_symptom_columns
}

# Create a DataFrame from the input data
input_df = pd.DataFrame(input_data)

# Same encoding as training. With a single row each symptom yields only one of
# its "<symptom>_0" / "<symptom>_1" indicator columns.
input_df_encoded = pd.get_dummies(input_df, columns=all_symptom_columns)

# Align with the training columns in one step: indicator columns that are
# absent are added as 0, the training column order is enforced, and the
# resulting mix of bool and int columns is normalised to int.
input_df_encoded = (
    input_df_encoded
    .reindex(columns=X_train.columns, fill_value=0)
    .astype(int)
)

In [None]:
# Run the reloaded model on the single encoded input row. `predict` returns one
# label per row, so the first element is the answer for this input.
prediction = loaded_model.predict(input_df_encoded)
top_label = prediction[0]
print("Predicted Disease:", top_label)

Predicted Disease: prognosis_Malaria


In [None]:
# Report how much probability the model assigns to the label predicted in the
# previous cell (relies on `prediction` from that cell).
# NOTE(review): the recorded score below is only about 0.03, so the model is
# far from confident for this input — check that the entered symptoms actually
# matched column names.

# Class probabilities for the single input row
probabilities = loaded_model.predict_proba(input_df_encoded)

# Pair each class label with its probability, then look up the predicted label
predicted_class = prediction[0]
probability_by_class = dict(zip(loaded_model.classes_, probabilities[0]))
confidence_score = probability_by_class[predicted_class]

# Print the prediction and confidence score
print("Predicted Disease:", predicted_class)
print("Confidence Score:", confidence_score)

Predicted Disease: prognosis_Malaria
Confidence Score: 0.02990191140521497
