In [39]:
import pandas as pd 
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix


In [41]:
from sklearn.svm import SVC

In [11]:
# Location of the training CSV, relative to the notebook's working directory.
path = '../data/Training.csv'
# Parse the file into a DataFrame using pandas' default CSV options.
df = pd.read_csv(filepath_or_buffer=path)

In [13]:
# Display the (rows, columns) dimensions of the loaded frame.
df.shape

(4920, 133)

In [14]:
# Preview five randomly chosen rows. The fixed seed makes the preview show the
# same rows after a kernel restart instead of a different draw on every run.
df.sample(5, random_state=42)

Unnamed: 0,itching,skin_rash,nodal_skin_eruptions,continuous_sneezing,shivering,chills,joint_pain,stomach_pain,acidity,ulcers_on_tongue,...,blackheads,scurring,skin_peeling,silver_like_dusting,small_dents_in_nails,inflammatory_nails,blister,red_sore_around_nose,yellow_crust_ooze,prognosis
2723,0,1,0,0,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,Dengue
4403,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Chicken pox
2725,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,hepatitis A
4606,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Jaundice
573,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Chicken pox


In [20]:
# Number of distinct values in the target column. dropna=False counts a missing
# value as its own entry, matching what len(unique()) would report.
df['prognosis'].nunique(dropna=False)

41

# Splitting the data

In [23]:
# NOTE: this binds a second name to the same DataFrame object; it is not a copy,
# so any in-place change made through `dataset` is also visible through `df`.
dataset = df

In [25]:
# Features: every column except the target.
X = dataset.drop(columns='prognosis')
# Target: the 'prognosis' column.
y = dataset['prognosis']

In [35]:
# The target holds string class names; replace them with integer codes.
# `le` stays fitted, so codes can be mapped back with le.inverse_transform.
le = LabelEncoder()
y = le.fit_transform(y)

In [36]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=20,
)

In [37]:
# Show the shape of each split as one tuple: (X_train, X_test, y_train, y_test).
tuple(split.shape for split in (X_train, X_test, y_train, y_test))


((3444, 132), (1476, 132), (3444,), (1476,))


# Train some ML models

In [43]:
# Candidate classifiers, keyed by a short display name.
# Seeds are fixed on the ensemble models so repeated runs build the same trees.
models = {
    'SVC': SVC(kernel='linear'),
    'RandomForest': RandomForestClassifier(n_estimators=100, random_state=42),
    'KNeighbors': KNeighborsClassifier(n_neighbors=5),
    'MultinomialNB': MultinomialNB(),
    'GradientBoosting': GradientBoostingClassifier(n_estimators=100, random_state=42),
}

In [48]:
# Fit each candidate on the training split, then report its accuracy and
# confusion matrix on the held-out split.
#
# The dictionary key and the estimator need distinct loop names: unpacking both
# into `model` overwrote the key, so the report was labelled with the
# estimator's repr instead of its name in `models`.
for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)  # fraction of correct predictions
    cm = confusion_matrix(y_test, y_pred)  # rows: true class, columns: predicted class

    print(f'{model_name} accuracy: {accuracy}')
    print(f"{model_name} Confusion matrix: ")
    print(np.array2string(cm, separator=' '))


SVC(kernel='linear') accuracy: 1.0
SVC(kernel='linear') Confusion matrix: 
[[40  0  0 ...  0  0  0]
 [ 0 43  0 ...  0  0  0]
 [ 0  0 28 ...  0  0  0]
 ...
 [ 0  0  0 ... 34  0  0]
 [ 0  0  0 ...  0 41  0]
 [ 0  0  0 ...  0  0 31]]
RandomForestClassifier(random_state=42) accuracy: 1.0
RandomForestClassifier(random_state=42) Confusion matrix: 
[[40  0  0 ...  0  0  0]
 [ 0 43  0 ...  0  0  0]
 [ 0  0 28 ...  0  0  0]
 ...
 [ 0  0  0 ... 34  0  0]
 [ 0  0  0 ...  0 41  0]
 [ 0  0  0 ...  0  0 31]]
KNeighborsClassifier() accuracy: 1.0
KNeighborsClassifier() Confusion matrix: 
[[40  0  0 ...  0  0  0]
 [ 0 43  0 ...  0  0  0]
 [ 0  0 28 ...  0  0  0]
 ...
 [ 0  0  0 ... 34  0  0]
 [ 0  0  0 ...  0 41  0]
 [ 0  0  0 ...  0  0 31]]
MultinomialNB() accuracy: 1.0
MultinomialNB() Confusion matrix: 
[[40  0  0 ...  0  0  0]
 [ 0 43  0 ...  0  0  0]
 [ 0  0 28 ...  0  0  0]
 ...
 [ 0  0  0 ... 34  0  0]
 [ 0  0  0 ...  0 41  0]
 [ 0  0  0 ...  0  0 31]]
GradientBoostingClassifier(random_state=42) 

# All models perform well: every result shown above has accuracy 1.0 on the test split


# Choose one model: SVC

In [49]:
# Train the chosen model (linear-kernel SVC) on its own and score it on the
# held-out split.
svc = SVC(kernel='linear')
svc.fit(X_train, y_train)
y_pred = svc.predict(X_test)
# accuracy_score takes (y_true, y_pred); pass them in that order so the call
# matches the API and the evaluation loop earlier in the notebook.
accuracy = accuracy_score(y_test, y_pred)
accuracy

1.0