In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory recursively and print the full path of every file found.
input_root = '/kaggle/input'
for dirname, _, filenames in os.walk(input_root):
    for filename in filenames:
        full_path = os.path.join(dirname, filename)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import mode
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

In [None]:
# Read the training table and discard every column that contains at least one
# missing value, then preview the first rows.
training_csv_path = '/kaggle/input/disease-prediction-using-machine-learning/Training.csv'
data_training = pd.read_csv(training_csv_path).dropna(axis="columns")
data_training.head()

In [None]:
# Print a structural summary of the training frame (columns, non-null counts, dtypes, memory use).
data_training.info()

In [None]:
# Per-column count of missing values. Columns containing NaN were already
# dropped when the frame was loaded, so this serves as a sanity check.
data_training.isna().sum()

In [None]:
# Tally how many training rows carry each prognosis label, then reshape the
# tally into a two-column frame ("Disease", "Counts") for plotting.
disease_counts = data_training['prognosis'].value_counts()
temp_df = disease_counts.rename_axis("Disease").reset_index(name="Counts")

In [None]:
# Bar chart of the number of rows per disease; labels are rotated so the
# disease names along the x axis do not overlap.
fig, ax = plt.subplots(figsize=(20, 10))
sns.barplot(data=temp_df, x="Disease", y="Counts", ax=ax)
plt.setp(ax.get_xticklabels(), rotation=90)
plt.show()

In [None]:
# Replace the textual prognosis labels with integer codes, keeping the fitted
# encoder around so the same mapping can be applied to other data later.
# NOTE: the column is overwritten in place, so re-running this cell would refit
# the encoder on the already-encoded integers instead of the original labels.
encoder = LabelEncoder()
encoder.fit(data_training["prognosis"])
data_training['prognosis'] = encoder.transform(data_training["prognosis"])

In [None]:
# Features are every column except the last; the target is the last column.
x, y = data_training.iloc[:, :-1], data_training.iloc[:, -1]

# Hold out 20% of the rows for testing; the seed keeps the split reproducible.
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.2, random_state=24)

# Report the shapes of both splits.
for split_name, split_x, split_y in (("Train", xtrain, ytrain), ("Test", xtest, ytest)):
    print(f'{split_name}: {split_x.shape}, {split_y.shape}')

In [None]:
def cv_scoring(estimator, x, y):
    """Scoring callable: accuracy of ``estimator``'s predictions on ``x`` against ``y``.

    Args:
        estimator: Fitted model exposing ``predict``.
        x: Feature rows to predict on.
        y: True targets for ``x``.

    Returns:
        The value of ``accuracy_score`` for the predictions.
    """
    predicted = estimator.predict(x)
    return accuracy_score(y_true=y, y_pred=predicted)

# Candidate classifiers, keyed by the name printed in the report below.
models = {
    "SVC": SVC(),
    "Gaussian NB": GaussianNB(),
    "Random Forest": RandomForestClassifier(random_state=18),
}

# Cross-validate each candidate with cv=10 on the full (x, y) data, scored by
# cv_scoring, and print the per-fold scores followed by their mean.
separator = "=" * 60
for model_name, model in models.items():
    scores = cross_val_score(model, x, y, cv=10, n_jobs=-1, scoring=cv_scoring)
    print(separator)
    print(model_name)
    print(f"Score: {scores}")
    print(f"Mean Score: {scores.mean()}")

In [None]:
svm_model = SVC()
svm_model.fit(xtrain, ytrain)
preds = svm_model.predict(xtest)

print(f"accuracy (train) of SVM classifier:{accuracy_score(ytrain, svm_model.predict(xtrain))}")
print(f"Acuuracy (test) of svm classifier:{accuracy_score(ytest, preds)}")
cf_matrix = confusion_matrix(ytest, preds)
plt.figure(figsize=(12,8))
sns.heatmap(cf_matrix, annot=True)
plt.title("Confussion Matrix for SVM classifier on test")
plt.show()

In [None]:
nb_model = GaussianNB()
nb_model.fit(xtrain, ytrain)
preds = nb_model.predict(xtest)

print(f"accuracy (train) of NB classifier:{accuracy_score(ytrain, nb_model.predict(xtrain))}")
print(f"Acuuracy (test) of NB classifier:{accuracy_score(ytest, preds)}")
cf_matrix = confusion_matrix(ytest, preds)
plt.figure(figsize=(12,8))
sns.heatmap(cf_matrix, annot=True)
plt.title("Confussion Matrix for NB classifier on test")
plt.show()

In [None]:
rf_model = RandomForestClassifier(random_state=18)
rf_model.fit(xtrain, ytrain)
preds = rf_model.predict(xtest)

print(f"accuracy (train) of RF classifier:{accuracy_score(ytrain, rf_model.predict(xtrain))}")
print(f"Acuuracy (test) of RF classifier:{accuracy_score(ytest, preds)}")
cf_matrix = confusion_matrix(ytest, preds)
plt.figure(figsize=(12,8))
sns.heatmap(cf_matrix, annot=True)
plt.title("Confussion Matrix for RF classifier on test")
plt.show()

In [None]:
# Refit each classifier on the complete training data (x, y) rather than only
# the 80% training split. ``fit`` returns the estimator itself, so construction
# and fitting are chained.
final_svm_model = SVC().fit(x, y)
final_nb_model = GaussianNB().fit(x, y)
final_rf_model = RandomForestClassifier(random_state=18).fit(x, y)
# Trailing expression so the cell still displays the fitted random forest.
final_rf_model

In [None]:
# Read the separate testing table, discarding every column that contains a missing value.
testing_csv_path = '/kaggle/input/disease-prediction-using-machine-learning/Testing.csv'
data_test = pd.read_csv(testing_csv_path).dropna(axis="columns")

In [None]:
# Split the testing table the same positional way as the training table:
# the last column is the label, everything before it is a feature.
test_labels = data_test.iloc[:, -1]
testx = data_test.iloc[:, :-1]
# Encode the labels with the encoder that was fitted on the training labels.
testy = encoder.transform(test_labels)

In [None]:
# Predict the testing features with each of the three fully trained models.
svm_pred, nb_pred, rf_pred = (
    fitted_model.predict(testx)
    for fitted_model in (final_svm_model, final_nb_model, final_rf_model)
)

In [None]:
# Print the raw predictions the fully trained SVM produced for the testing features.
# NOTE(review): these are presumably the integer codes produced by `encoder`, not
# disease names; `encoder.inverse_transform(svm_pred)` would map them back — confirm.
print(svm_pred)