# Task for Today  

***

## Disease Prediction  

Given *data about various symptoms in patients*, let's try to predict which **disease** a given patient has.

We will use a logistic regression model to make our predictions.  
We will use the weights that the model learns as a measure of feature importance and perform feature selection on the data.

# Getting Started

In [None]:
import numpy as np
import pandas as pd
# Show every column when a DataFrame is displayed. The fully qualified option
# name is used because the bare 'max_columns' pattern is ambiguous in newer
# pandas releases (it also matches 'styler.render.max_columns') and raises an
# OptionError there.
pd.set_option('display.max_columns', None)

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression

from sklearn.metrics import confusion_matrix

In [None]:
# Read the training CSV into a DataFrame.
data = pd.read_csv(
    filepath_or_buffer='../input/disease-prediction-using-machine-learning/Training.csv'
)

In [None]:
# Bare expression: the notebook renders the loaded DataFrame as the cell output.
data

In [None]:
# Count how many rows carry each distinct 'prognosis' label.
data['prognosis'].value_counts()

# Preprocessing

In [None]:
def preprocess_inputs(df, train_size=0.7, random_state=1):
    """Split a symptom DataFrame into train/test features and labels.

    Args:
        df: DataFrame containing the feature columns and a 'prognosis' label
            column. The caller's frame is never modified.
        train_size: Fraction of rows assigned to the training set. Defaults
            to 0.7.
        random_state: Seed for the shuffled split, so repeated calls on the
            same frame give the same partition. Defaults to 1.

    Returns:
        A tuple ``(X_train, X_test, y_train, y_test)``.

    Raises:
        KeyError: If ``df`` has no 'prognosis' column.
    """
    # Drop the 'Unnamed: 133' column when it is present (presumably an empty
    # column produced by a trailing delimiter in the CSV -- confirm).
    # errors='ignore' lets the function also accept frames that never had
    # that column. drop() returns a new frame, so the input stays untouched.
    df = df.drop(columns='Unnamed: 133', errors='ignore')

    # Separate the label column from the feature columns.
    y = df['prognosis']
    X = df.drop(columns='prognosis')

    # Shuffled, seeded train-test split.
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        train_size=train_size,
        shuffle=True,
        random_state=random_state,
    )

    return X_train, X_test, y_train, y_test

In [None]:
# Build the train/test features and labels from the full dataset.
X_train, X_test, y_train, y_test = preprocess_inputs(data)

In [None]:
# Bare expression: the notebook renders the training features as the cell output.
X_train

# Training (Original Data)

In [None]:
# Fit a default logistic regression on the full feature set; fit() returns
# the estimator itself, so construction and training are chained.
model = LogisticRegression().fit(X_train, y_train)

# score() gives the mean accuracy on the held-out split as a fraction.
test_accuracy = model.score(X_test, y_test)
print("Test Accuracy: {:.2f}%".format(test_accuracy * 100))

# Feature Selection

In [None]:
# Collapse axis 0 of the learned weight matrix (one row per class for a
# multiclass fit, per scikit-learn's documentation) into a single average
# weight per feature.
coefficients = model.coef_.mean(axis=0)
coefficients

In [None]:
# Cut-off for "low importance": the first quartile of the absolute average
# weights.
importance_threshold = np.quantile(np.abs(coefficients), q=0.25)

# Horizontal bar chart with one bar per feature, coloured by its signed weight.
bar_options = dict(
    x=coefficients,
    y=X_train.columns,
    orientation='h',
    color=coefficients,
    color_continuous_scale=[(0, 'red'), (1, 'blue')],
    labels={'x': "Coefficient Value", 'y': "Feature"},
    title="Feature Importance From Model Weights",
)
fig = px.bar(**bar_options)

# Mark +threshold and -threshold, then shade the band between them.
for boundary in (importance_threshold, -importance_threshold):
    fig.add_vline(x=boundary, line_color='yellow')

fig.add_vrect(
    x0=importance_threshold,
    x1=-importance_threshold,
    line_width=0,
    fillcolor='yellow',
    opacity=0.2,
)

fig.show()

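Any feature whose average weight lies within the yellow box (that is, whose absolute average weight is below the threshold, the first quartile of the absolute average weights) will be removed.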

In [None]:
# Boolean mask over the feature columns: True where the absolute average
# weight is strictly below the threshold.
is_low_importance = np.abs(coefficients) < importance_threshold

low_importance_features = X_train.columns[is_low_importance]
low_importance_features

# Training (Reduced Data)

In [None]:
# Remove the low-importance feature columns from the full dataset.
reduced_data = data.drop(columns=low_importance_features).copy()

# Redo the train/test split on the reduced feature set.
X_train, X_test, y_train, y_test = preprocess_inputs(reduced_data)

In [None]:
# Bare expression: the notebook renders the reduced training features as the cell output.
X_train

In [None]:
# Fit a fresh default logistic regression on the reduced feature set; fit()
# returns the estimator itself, so construction and training are chained.
reduced_data_model = LogisticRegression().fit(X_train, y_train)

# score() gives the mean accuracy on the held-out split as a fraction.
reduced_test_accuracy = reduced_data_model.score(X_test, y_test)
print("Test Accuracy: {:.2f}%".format(reduced_test_accuracy * 100))

# Confusion Matrix

In [None]:
# Predict a label for every held-out sample with the reduced-feature model.
y_pred = reduced_data_model.predict(X_test)

# Pin the row/column order to the model's classes_. Without `labels`, the
# matrix only covers labels that occur in y_test or y_pred, so a class absent
# from both would shrink the matrix and misalign it with tick labels taken
# from classes_.
cm = confusion_matrix(y_test, y_pred, labels=reduced_data_model.classes_)

In [None]:
# Derive the tick labels and their count from the fitted model instead of
# hard-coding the number of classes, so the plot still works if the label
# set changes.
class_labels = reduced_data_model.classes_
# Offset by 0.5 so each tick sits at the centre of its heatmap cell.
tick_positions = np.arange(len(class_labels)) + 0.5

plt.figure(figsize=(30, 30))
sns.heatmap(cm, annot=True, fmt='g', vmin=0, cmap='Blues', cbar=False)
plt.xticks(tick_positions, class_labels, rotation=90)
plt.yticks(tick_positions, class_labels, rotation=0)
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Confusion Matrix")
plt.show()

# Data Every Day  

This notebook is featured on Data Every Day, a YouTube series where I train models on a new dataset each day.  

***

Check it out!  
https://youtu.be/VqQgKfl0lPg