# Loading Data

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Read the training CSV into a DataFrame.
training = pd.read_csv(
    filepath_or_buffer="../input/disease-prediction-using-machine-learning/Training.csv"
)
# Bare expression: the notebook renders the frame as this cell's output.
training

In [None]:
# Print a concise summary of the training frame (columns, non-null counts, dtypes).
training.info()

## Dropping the unnecessary column

In [None]:
# Remove the 'Unnamed: 133' column in place.
# NOTE(review): presumably an empty artifact column of the CSV - confirm with
# the training.info() output before relying on that.
training.drop(columns=['Unnamed: 133'], inplace=True)
# Show the remaining column labels as the cell output.
training.columns

### Counts of prognosis in data

In [None]:
# Number of distinct (non-null) prognosis labels in the training data.
training['prognosis'].nunique()

There are 41 different prognoses; these are the labels in our data. To see what they are:

In [None]:
# Number of training rows per prognosis label (value_counts sorts by count, descending).
training['prognosis'].value_counts()

Every prognosis has the same number of samples. Now let's look at the unique values in the other columns:

In [None]:
# For every column except the last one, print its name, the number of distinct
# values and the distinct values themselves.
for column in training.columns[:-1]:
    distinct = training[column].unique()  # computed once, used for count and display
    print(f"{column} ({len(distinct)}) : {distinct}")

In [None]:
# Read the testing CSV into a DataFrame.
testing = pd.read_csv(
    filepath_or_buffer="../input/disease-prediction-using-machine-learning/Testing.csv"
)
# Preview the first five rows as the cell output.
testing.head()

In [None]:
# (rows, columns) of the test frame.
testing.shape

In [None]:
# Number of distinct prognosis values in the test data; dropna=False keeps a
# missing value counted as its own entry, exactly as unique() would.
testing['prognosis'].nunique(dropna=False)

# Train Test Split

In [None]:
# Separate the feature columns from the 'prognosis' label for both splits.
# The labels are stored as column vectors of shape (n_samples, 1).
X_train = training.drop(columns='prognosis')
y_train = np.array(training['prognosis']).reshape(-1, 1)

X_test = testing.drop(columns='prognosis')
y_test = np.array(testing['prognosis']).reshape(-1, 1)

# Report the shapes of the four resulting arrays/frames.
print(
    f"X_train : {X_train.shape} \ny_train : {y_train.shape} "
    f"\nX_test: {X_test.shape} \ny_test : {y_test.shape}"
)

# Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report,accuracy_score

In [None]:
# Fit a decision tree on the training split and score it on the test split.
#
# random_state is fixed because scikit-learn permutes the candidate features at
# every split, so an unseeded tree can yield different feature importances on
# each run. The zero-importance set computed from this tree decides which
# columns are dropped later, so it has to be reproducible.
# NOTE(review): the number of zero-importance features quoted in the notebook
# text came from one unseeded run; re-check it after fixing the seed.
tree = DecisionTreeClassifier(random_state=42)
tree.fit(X_train, y_train)

# Predicted labels (kept for the comparison/report cells) and mean test accuracy.
pred = tree.predict(X_test)
acc = tree.score(X_test, y_test)

print("Acurray on test set: {:.2f}%".format(acc*100))

Comparison of the true labels and the predictions for the first 10 test samples:

In [None]:
# First ten true test labels, flattened from a column vector to 1-D for display.
y_test[:10].ravel()

In [None]:
# First ten predicted labels, flattened to 1-D for display.
pred[:10].ravel()

In [None]:
# Per-class precision / recall / F1 of the tree's predictions on the test split.
report = classification_report(y_true=y_test, y_pred=pred)
print(report)

## Important Features

In [None]:
# One row per feature with the fitted tree's importance expressed in percent,
# ordered from most to least important.
fi = pd.DataFrame(
    {'Importance': tree.feature_importances_ * 100},
    index=X_train.columns,
)
fi = fi.sort_values(by='Importance', ascending=False)
fi

In [None]:
# Names of the features whose importance is exactly zero, as a NumPy array.
unused_mask = fi['Importance'].eq(0)
zeros = np.array(fi.loc[unused_mask].index)
zeros

In [None]:
# One-element tuple: how many features have zero importance.
zeros.shape

In [None]:
# Report how many features ended up with zero importance.
print(f"Count of Features have 0% importance: {len(zeros)}")

I'm deleting the features that have 0% importance in the tree's decisions and creating a new dataframe. I will use the new dataframe for a neural network.

# New Dataframe

We had 133 columns, and after deleting 69 the new dataframe has to consist of 64 columns.

In [None]:
# Copy of the training frame without the zero-importance features.
# `columns=` already names the axis, so no separate `axis` argument is needed.
training_new = training.drop(columns=zeros)
# Number of columns that remain (features plus the label).
len(training_new.columns)

I'm doing the same for the test set.

In [None]:
# Copy of the test frame without the zero-importance features.
# `columns=` already names the axis, so no separate `axis` argument is needed.
testing_new = testing.drop(columns=zeros)
# Number of columns that remain (features plus the label).
len(testing_new.columns)

# Neural Network

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Activation

In [None]:
# Inputs for the neural network: the pruned frames minus the label column.
X_train_new = training_new.drop(columns='prognosis')
X_test_new = testing_new.drop(columns='prognosis')

# Targets: the 'prognosis' column of the original (unpruned) frames.
y_train_new = training['prognosis']
y_test_new = testing['prognosis']

In [None]:
# Report the shapes of the neural-network inputs and targets.
print(
    f"X_train_new : {X_train_new.shape} \ny_train_new : {y_train_new.shape} "
    f"\nX_test_new: {X_test_new.shape} \ny_test_new : {y_test_new.shape}"
)

In [None]:
# One-hot encode the prognosis labels for the neural network.
#
# The dtype is set explicitly so the targets are numeric whatever pandas'
# default dummy dtype is (it differs between versions).
y_train_enc = pd.get_dummies(y_train_new, dtype="float32")

# Encoding the test labels on their own builds columns only from the classes
# present in the test split. Re-align them to the training columns so that
# output unit i means the same class for both splits: classes missing from the
# test split become all-zero columns.
# NOTE(review): a test label never seen in training would be dropped by this
# re-alignment (its row becomes all zeros) - confirm none exist.
y_test_enc = pd.get_dummies(y_test_new, dtype="float32").reindex(
    columns=y_train_enc.columns, fill_value=0.0
)

# Preview the first ten encoded training targets.
y_train_enc.head(10)

In [None]:
# Fully connected classifier: two ReLU hidden layers (32 and 16 units) followed
# by a softmax layer with one unit per one-hot target column.
n_features = X_train_new.shape[1]
n_classes = y_train_enc.shape[1]

model = Sequential([
    Dense(32, input_dim=n_features),
    Activation('relu'),
    Dense(16),
    Activation('relu'),
    Dense(n_classes),
    Activation('softmax'),
])
model.summary()

In [None]:
# The network ends in a softmax over one-hot targets (one class per sample), so
# the matching loss is categorical cross-entropy. Binary cross-entropy treats
# every output unit as an independent yes/no problem, which does not match a
# softmax output and can make the reported loss/accuracy misleading.
model.compile(loss='categorical_crossentropy', metrics=['accuracy'], optimizer='adam')

In [None]:
# Train for 30 epochs in mini-batches of 120, holding out 30% of the training
# rows for validation; `history` keeps the per-epoch metrics.
# NOTE(review): Keras takes the validation split from the last rows before
# shuffling - confirm the tail of the training data covers every class.
history = model.fit(
    x=X_train_new,
    y=y_train_enc,
    epochs=30,
    batch_size=120,
    validation_split=0.3,
)

# Visualization

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Apply matplotlib's 'fivethirtyeight' style sheet to the figures created afterwards.
plt.style.use('fivethirtyeight')

In [None]:
# Plot the training history side by side: accuracy on the left, loss on the right.
f, (ax1, ax2) = plt.subplots(1, 2, figsize=(17,7))

# Left panel: training vs. validation accuracy per epoch.
ax1.plot(history.history['accuracy'])
ax1.plot(history.history['val_accuracy'])
ax1.set_title('Accuracy')
ax1.set_ylabel('accuracy')
ax1.set_xlabel('epochs')
# Use the Axes-level legend: plt.legend() acts on the *current* axes, which
# after plt.subplots is the last one created (ax2), so this panel would
# otherwise get no legend at all.
ax1.legend(['train', 'val'], loc='lower right')

# Right panel: training vs. validation loss per epoch.
ax2.plot(history.history['loss'])
ax2.plot(history.history['val_loss'])
ax2.set_title('Loss')
ax2.set_ylabel('loss')
ax2.set_xlabel('epoch')
ax2.legend(['train', 'val'], loc='upper right')

In [None]:
# Evaluate on the complete test set.
# `steps` caps the number of batches that are evaluated; combined with
# batch_size=1 the former steps=5 scored only five test samples. Leaving it
# unset lets Keras run through every row of X_test_new.
score = model.evaluate(X_test_new, y_test_enc, batch_size=1, verbose=1)

# score[0] is the loss, score[1] the accuracy metric configured at compile time.
print('Loss: ', score[0])
print('Accuracy: ', score[1]*100)