## Import Dependencies


In [21]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix

import io
import requests

### Obtain Training and Testing CSV

In [54]:
# Make sure to copy full file path of the CSV files

training = pd.read_csv(r'/Users/ericsun02/Documents/UCD Classes/23-24/Winter Q/141C/Final_Proj/Disease-Prediction/Training.csv')
test = pd.read_csv(r'/Users/ericsun02/Documents/UCD Classes/23-24/Winter Q/141C/Final_Proj/Disease-Prediction/Testing.csv')

In [55]:
# Classes to Predict

len(training['prognosis'].unique())

41

### Check for NAs, Missing Data, Summaries

In [56]:
training.describe()

Unnamed: 0,itching,skin_rash,nodal_skin_eruptions,continuous_sneezing,shivering,chills,joint_pain,stomach_pain,acidity,ulcers_on_tongue,...,blackheads,scurring,skin_peeling,silver_like_dusting,small_dents_in_nails,inflammatory_nails,blister,red_sore_around_nose,yellow_crust_ooze,Unnamed: 133
count,4920.0,4920.0,4920.0,4920.0,4920.0,4920.0,4920.0,4920.0,4920.0,4920.0,...,4920.0,4920.0,4920.0,4920.0,4920.0,4920.0,4920.0,4920.0,4920.0,0.0
mean,0.137805,0.159756,0.021951,0.045122,0.021951,0.162195,0.139024,0.045122,0.045122,0.021951,...,0.021951,0.021951,0.023171,0.023171,0.023171,0.023171,0.023171,0.023171,0.023171,
std,0.34473,0.366417,0.146539,0.207593,0.146539,0.368667,0.346007,0.207593,0.207593,0.146539,...,0.146539,0.146539,0.150461,0.150461,0.150461,0.150461,0.150461,0.150461,0.150461,
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,


In [57]:
# Check for Null Values in Training Set

training.isnull().sum()

itching                    0
skin_rash                  0
nodal_skin_eruptions       0
continuous_sneezing        0
shivering                  0
                        ... 
blister                    0
red_sore_around_nose       0
yellow_crust_ooze          0
prognosis                  0
Unnamed: 133            4920
Length: 134, dtype: int64

In [58]:
# Check for Null Values in Test Set

test.isnull().sum()

itching                 0
skin_rash               0
nodal_skin_eruptions    0
continuous_sneezing     0
shivering               0
                       ..
inflammatory_nails      0
blister                 0
red_sore_around_nose    0
yellow_crust_ooze       0
prognosis               0
Length: 133, dtype: int64

### Clean Data for Feature Selection

In [None]:
training = training.drop(columns = "Unnamed: 133")

In [44]:
training.isnull().sum()

itching                 0
skin_rash               0
nodal_skin_eruptions    0
continuous_sneezing     0
shivering               0
                       ..
inflammatory_nails      0
blister                 0
red_sore_around_nose    0
yellow_crust_ooze       0
prognosis               0
Length: 133, dtype: int64

### Feature Selection

In [None]:
###

In [None]:
###

### Splitting Train and Test

In [49]:
x_train = training.drop(['prognosis'], axis = 1)
y_train = training['prognosis']
x_test = test.drop(['prognosis'], axis = 1)
y_test = test['prognosis']

In [59]:
print(x_train)

      itching  skin_rash  nodal_skin_eruptions  continuous_sneezing  \
0           1          1                     1                    0   
1           0          1                     1                    0   
2           1          0                     1                    0   
3           1          1                     0                    0   
4           1          1                     1                    0   
...       ...        ...                   ...                  ...   
4915        0          0                     0                    0   
4916        0          1                     0                    0   
4917        0          0                     0                    0   
4918        0          1                     0                    0   
4919        0          1                     0                    0   

      shivering  chills  joint_pain  stomach_pain  acidity  ulcers_on_tongue  \
0             0       0           0             0        0         

In [50]:
# Second coordinate of X_train should match with X_test

print(x_train.shape)

(4920, 132)


In [51]:
print(x_test.shape)

(42, 132)


In [52]:
print(len(test['prognosis'].unique()))

41


#### Encoding Response Variable

In [60]:
le = LabelEncoder()
y_train = le.fit_transform(y_train)

In [71]:
y_test = le.fit_transform(y_test)

In [68]:
# Should be equal to number of response categories

len(np.unique(y_train))

41

### Random Forest Classifier

In [74]:
rf = RandomForestClassifier(n_estimators = len(x_train.columns), random_state = 1)
rf.fit(x_train, y_train)

In [76]:
# Use model to predict type of prognosis (Y) on unseen values of symptoms (X)

rf_predictions = rf.predict(x_test)

# Should be equal to 42
print(len(rf_predictions))

42


#### Metrics and Performance Evaluation

In [79]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, classification_report

mae = mean_absolute_error(y_test, rf_predictions)
mse = mean_squared_error(y_test, rf_predictions)
r2 = r2_score(y_test, rf_predictions)

print("Mean Absolute Error:", mae)
print("Mean Squared Error:", mse)
print("R-squared:", r2)

Mean Absolute Error: 0.16666666666666666
Mean Squared Error: 1.1666666666666667
R-squared: 0.9914995559777783


In [81]:
report = classification_report(y_test, rf_predictions, output_dict=True)
pd.DataFrame(report).T

Unnamed: 0,precision,recall,f1-score,support
0,1.0,1.0,1.0,1.0
1,1.0,1.0,1.0,1.0
2,1.0,1.0,1.0,1.0
3,1.0,1.0,1.0,1.0
4,1.0,1.0,1.0,1.0
5,1.0,1.0,1.0,1.0
6,1.0,1.0,1.0,1.0
7,1.0,1.0,1.0,1.0
8,0.5,1.0,0.666667,1.0
9,1.0,1.0,1.0,1.0
