# Task for Today  

***

## Hepatitis Survival Classification  

Given *medical hepatitis patient data*, let's try to predict whether a given patient will **survive** or not.

We will cluster the data using k-means and use a random forest classification model to make our predictions.

# Getting Started

In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('darkgrid')

from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.ensemble import RandomForestClassifier

In [None]:
# Load the hepatitis CSV (path is relative to the notebook's working directory)
# into a DataFrame.
data = pd.read_csv('../input/hepatitis-data/hepatitis_csv.csv')

In [None]:
# Show the raw DataFrame as the cell output.
data

In [None]:
# Print the DataFrame summary (columns, non-null counts, dtypes) to spot
# missing values before preprocessing.
data.info()

# Preprocessing

In [None]:
def preprocess_inputs(df, drop_protime=False):
    """Clean the raw hepatitis data and split it into features and labels.

    The input is copied first, so the caller's DataFrame is not modified.

    Steps, in order:
      1. Fill missing continuous values with the column mean.
      2. Fill missing values in every other column (this includes the
         'class' label column) with that column's mode.
      3. Cast boolean columns to integers.
      4. Encode 'sex' as 0 (female) / 1 (male).
      5. Shuffle the rows with a fixed seed and reset the index.
      6. Rename 'class' to 'label' and optionally drop 'protime'.

    Args:
        df: DataFrame containing at least the columns 'age', 'bilirubin',
            'alk_phosphate', 'sgot', 'albumin', 'protime', 'sex' and 'class'.
        drop_protime: If truthy, remove the 'protime' column from the
            returned features.

    Returns:
        A tuple ``(X, y)``: the feature DataFrame and the 'label' Series,
        both carrying the same shuffled, reset index.
    """
    df = df.copy()

    # Identify the continuous numeric features
    continuous_features = ['age', 'bilirubin', 'alk_phosphate', 'sgot', 'albumin', 'protime']

    # Fill missing continuous values with the column mean
    for column in continuous_features:
        df[column] = df[column].fillna(df[column].mean())

    # Fill missing values in the remaining columns with the mode. When several
    # values tie for the mode, one of them is picked with a fixed seed.
    for column in df.columns.drop(continuous_features):
        fill_value = df[column].mode().sample(1, random_state=1).values[0]
        # A boolean column that contained NaN is stored as object dtype;
        # infer_objects() restores the bool dtype explicitly instead of
        # relying on fillna's deprecated implicit downcasting.
        df[column] = df[column].fillna(fill_value).infer_objects()

    # Convert the boolean columns into integer columns. The builtin int is used
    # because the np.int alias was removed in NumPy 1.24.
    for column in df.select_dtypes('bool'):
        df[column] = df[column].astype(int)

    # Encode the sex column as a binary feature. Values outside the mapping are
    # passed through unchanged.
    sex_codes = {'female': 0, 'male': 1}
    df['sex'] = df['sex'].map(lambda value: sex_codes.get(value, value))

    # Shuffle the data
    df = df.sample(frac=1.0, random_state=1).reset_index(drop=True)

    # Change label name
    df = df.rename(columns={'class': 'label'})

    # Drop protime
    if drop_protime:
        df = df.drop('protime', axis=1)

    # Split df into X and y
    y = df['label']
    X = df.drop('label', axis=1)

    return X, y

In [None]:
# Build the feature matrix X and the label Series y; drop_protime=True removes
# the 'protime' column from the features.
X, y = preprocess_inputs(data, drop_protime=True)

In [None]:
# Show the preprocessed feature matrix as the cell output.
X

In [None]:
# Show the label Series as the cell output.
y

# Clustering

In [None]:
# Group the patients into two clusters with k-means.
# random_state is pinned to the seed used elsewhere in this notebook so the
# cluster assignments are reproducible across runs. n_init is set explicitly
# because its default differs between scikit-learn versions.
kmeans = KMeans(n_clusters=2, n_init=10, random_state=1)
kmeans.fit(X)

# Cluster index assigned to each row of X, in row order
cluster_labels = kmeans.labels_
cluster_labels

In [None]:
# Project the features onto their first two principal components for plotting.
pca = PCA(n_components=2)

X_reduced = pd.DataFrame(pca.fit_transform(X), index=X.index, columns=["PC1", "PC2"])

# Attach the true label and the k-means cluster to each projected row. The
# cluster Series is given X's index explicitly so that concat aligns rows by
# index rather than depending on X happening to have a default RangeIndex.
X_reduced = pd.concat(
    [X_reduced, y, pd.Series(cluster_labels, index=X.index, name='cluster')],
    axis=1,
)

# Map the k-means centroids into the same 2-D PCA space. The centres are
# wrapped in a DataFrame with X's column names because the PCA was fitted on a
# DataFrame; passing a bare array would trigger a feature-name mismatch warning.
centroids = pca.transform(pd.DataFrame(kmeans.cluster_centers_, columns=X.columns))

In [None]:
# Show the 2-D projection together with the label and cluster columns.
X_reduced

In [None]:
# Split the 2-D projection by the cluster k-means assigned to each row
cluster_0_examples = X_reduced[X_reduced['cluster'] == 0]
cluster_1_examples = X_reduced[X_reduced['cluster'] == 1]

plt.figure(figsize=(16, 10))

# One scatter series per cluster, drawn in the same order as before so the
# default colour cycle assigns the same colours
for examples, legend_name in (
    (cluster_0_examples, "Cluster A"),
    (cluster_1_examples, "Cluster B"),
):
    plt.scatter(examples['PC1'], examples['PC2'], label=legend_name)

# Overlay the cluster centres, already projected into PCA space
centroid_pc1, centroid_pc2 = centroids[:, 0], centroids[:, 1]
plt.scatter(centroid_pc1, centroid_pc2, c='lightgreen', s=200, label="Cluster Centers")

plt.title("K-Means Clustering")
plt.xlabel("PC1")
plt.ylabel("PC2")
plt.legend()
plt.show()

In [None]:
# Split the 2-D projection by the true outcome label
live_examples = X_reduced[X_reduced['label'] == 'live']
die_examples = X_reduced[X_reduced['label'] == 'die']

plt.figure(figsize=(16, 10))

# One scatter series per outcome: (rows, colour, legend entry)
for examples, colour, legend_name in (
    (live_examples, 'pink', "Live"),
    (die_examples, 'purple', "Die"),
):
    plt.scatter(examples['PC1'], examples['PC2'], c=colour, label=legend_name)

plt.title("Class Visualization")
plt.xlabel("PC1")
plt.ylabel("PC2")
plt.legend()
plt.show()

# Training

In [None]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, shuffle=True, random_state=1)

# Scale X
scaler = StandardScaler()
scaler.fit(X_train)
X_train = pd.DataFrame(scaler.transform(X_train), index=X_train.index, columns=X_train.columns)
X_test = pd.DataFrame(scaler.transform(X_test), index=X_test.index, columns=X_test.columns)

In [None]:
# Fit a random forest on the scaled training data. random_state is pinned to
# the seed used elsewhere in this notebook so the reported accuracy is
# reproducible from run to run.
model = RandomForestClassifier(random_state=1)
model.fit(X_train, y_train)

# score() returns the mean accuracy on the held-out test split
print("Test Accuracy: {:.2f}%".format(model.score(X_test, y_test) * 100))

# Data Every Day  

This notebook is featured on Data Every Day, a YouTube series where I train models on a new dataset each day.  

***

Check it out!  
https://youtu.be/rWZo8FrIlUQ