# Import Packages

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import sklearn
from sklearn.model_selection import train_test_split
import seaborn as sns
import os

In [None]:
dataset = pd.read_csv('../input/heart-failure-clinical-data/heart_failure_clinical_records_dataset.csv')

# Exploratory data analysis

In [None]:
dataset.head()

In [None]:
dataset.isnull().sum()

In [None]:
dataset.shape

In [None]:
dataset.describe()

In [None]:
def prevalance(dataset, column):
    '''Return the fraction of rows in `dataset` where `column` equals 1.

    dataset: pandas DataFrame holding the records.
    column: name of a binary (0/1) column, e.g. the outcome label.
    '''
    # Rows flagged as positive for the requested column
    positive_rows = dataset[dataset[column] == 1]
    # Ratio of positive rows to all rows
    return len(positive_rows) / len(dataset)

In [None]:
prevalance(dataset, 'DEATH_EVENT')

In [None]:
def columns_histogram(dataset):
    '''Plot a separate histogram for every column of `dataset`.

    dataset: pandas DataFrame; each column is drawn in its own 5x5 inch
        figure, titled with the column name, using automatic binning.
    '''
    for column in dataset.columns:
        # Create the figure inside the loop: plt.show() is called once per
        # column, so a single figure created before the loop would only
        # apply the requested size to the first histogram.
        plt.figure(figsize=(5, 5))
        plt.hist(dataset[column], bins='auto')
        plt.title(column)
        plt.show()

In [None]:
columns_histogram(dataset)

In [None]:
# select which features to standardize for the model
standardize_features = ['age','creatinine_phosphokinase','ejection_fraction','platelets','serum_creatinine','serum_sodium']

# Building Linear Risk Model

For this part we only estimate the likelihood of death due to heart disease, so we do not use the `time` column. We drop it because this is time-to-event data, and the time to death is not known in advance in the real world.

In [None]:
X = dataset.drop(['DEATH_EVENT', 'time'], axis=1)
y = dataset['DEATH_EVENT']
print(f'Shape of X: {X.shape} and Y: {y.shape}')

# Train/Test split

Since the dataset is imbalanced, we use stratification so that the train and test sets keep the same class proportions.

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.75, stratify=y)
print('Train data')
print(f'Shape of X_train: {X_train.shape} and Y_train: {y_train.shape}')
print('Test data')
print(f'Shape of X_test: {X_test.shape} and Y_test: {y_test.shape}')

# Standardize data features

In [None]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train_std = X_train.copy()
X_test_std = X_test.copy()
X_train_std[standardize_features] = sc.fit_transform(X_train[standardize_features])
X_test_std[standardize_features] = sc.transform(X_test[standardize_features])

In [None]:
columns_histogram(X_train_std)
#X_train_std[standardize_features].hist(figsize=(14,14))
#plt.show()

In [None]:
def logistic_regression(X_train, y_train):
    '''Train a logistic regression classifier for the patient death event.

    X_train: training features.
    y_train: training labels.
    Returns the fitted scikit-learn LogisticRegression (default settings).
    '''
    from sklearn.linear_model import LogisticRegression

    # fit() hands back the fitted estimator, so it can be returned directly
    return LogisticRegression().fit(X_train, y_train)

In [None]:
logistic_regression_model = logistic_regression(X_train_std, y_train)

In [None]:
# Test model on a training example
example = 45
print(f'Actual death event of patient with featues {X_train_std.iloc[example,:]} is {y_train.iloc[example,]}')
print()
prediction = logistic_regression_model.predict(X_train_std[example:example+1])
print(f'Predicted death event of patient = {prediction}')

# Evaluate model 

In [None]:
y_pred = logistic_regression_model.predict(X_test_std)

In [None]:
y_pred.shape

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score, plot_confusion_matrix

# Store the result under its own name so that the imported
# confusion_matrix() function is not overwritten by the array it returns.
logistic_confusion_matrix = confusion_matrix(y_test.values, y_pred)
logistic_accuracy = accuracy_score(y_test.values, y_pred)

print(f'Confusion matrix values: {logistic_confusion_matrix}')
plot_confusion_matrix(logistic_regression_model, X_test_std, y_test)
plt.show()
print(f'Accuracy of logistic regression model: {logistic_accuracy}')

Earlier we saw that the classes were imbalanced and the prevalence of the disease was 32%. The F1 score is therefore a more appropriate metric than accuracy.

In [None]:
from sklearn.metrics import classification_report

print(classification_report(y_test.values, y_pred))

# Evaluate using C-Index

The c-index measures the discriminatory power of a risk score.
Intuitively, a higher c-index indicates that the model's prediction is in agreement with the actual outcomes of a pair of patients.
The formula for the c-index is

$$ \mbox{cindex} = \frac{\mbox{concordant} + 0.5 \times \mbox{ties}}{\mbox{permissible}} $$
 
A permissible pair is a pair of patients who have different outcomes.
A concordant pair is a permissible pair in which the patient with the higher risk score also has the worse outcome.
A tie is a permissible pair where the patients have the same risk score.

* `y_true` is the array of actual patient outcomes, 0 if the patient does not eventually get the disease, and 1 if the patient eventually gets the disease.
* `scores` is the risk score of each patient.  These provide relative measures of risk, so they can be any real numbers. By convention, they are always non-negative.
* Here is an example of input data and how to interpret it:
```Python
y_true = [0,1]
scores = [0.45, 1.25]
```
    * There are two patients. Index 0 of each array is associated with patient 0.  Index 1 is associated with patient 1.
    * Patient 0 does not have the disease in the future (`y_true` is 0), and based on past information, has a risk score of 0.45.
    * Patient 1 has the disease at some point in the future (`y_true` is 1), and based on past information, has a risk score of 1.25.

In [None]:
def cindex(y_true, scores):
    '''
    Compute the concordance index (c-index) of risk scores for binary outcomes.

    Input:
    y_true (array-like): a 1-D sequence of true binary outcomes (values of zero or one)
        0: patient does not have a death event
        1: patient has a death event
    scores (array-like): a 1-D sequence of corresponding death scores output by the model

    Output:
    c_index (float): (concordant pairs + 0.5*ties) / number of permissible pairs

    Raises:
    ValueError: if y_true and scores do not have the same length
    ZeroDivisionError: if there is no permissible pair (all outcomes are equal)
    '''
    # Convert to arrays so that the indexing below is always positional.
    # A pandas Series would otherwise be indexed by label, which breaks for
    # Series whose index is not 0..n-1 (e.g. after a shuffled split).
    y_true = np.asarray(y_true)
    scores = np.asarray(scores)

    n = len(y_true)
    if len(scores) != n:
        raise ValueError(
            f"y_true and scores must have the same length, got {n} and {len(scores)}"
        )

    concordant = 0
    permissible = 0
    ties = 0

    for i in range(n):
        for j in range(i + 1, n):
            # A pair is permissible only if the patient outcomes are different
            if y_true[i] == y_true[j]:
                continue
            permissible += 1

            if scores[i] == scores[j]:
                # Same risk score for different outcomes: count as a tie
                ties += 1
            elif y_true[i] == 0 and y_true[j] == 1:
                # patient j has the event, so j should have the higher score
                if scores[i] < scores[j]:
                    concordant += 1
            elif y_true[i] == 1 and y_true[j] == 0:
                # patient i has the event, so i should have the higher score
                if scores[i] > scores[j]:
                    concordant += 1

    if permissible == 0:
        # Same exception type as the plain division, but with an explanation
        raise ZeroDivisionError(
            "c-index is undefined: there are no pairs with different outcomes"
        )

    # combine the counts of concordant, tied and permissible pairs
    c_index = (concordant + 0.5 * ties) / permissible
    return c_index

In [None]:
scores = logistic_regression_model.predict_proba(X_test_std)[:, 1]
c_index_X_test = cindex(y_test.values, scores)
print(f"c-index on test set is {c_index_X_test:.4f}")

Check which features have the largest effect

In [None]:
coeffs = pd.DataFrame(data = logistic_regression_model.coef_, columns = X_train_std.columns)
coeffs.T.plot.bar(legend=None)

# Add interaction terms

* An interaction term is the product of two variables. 
    * For example, if we have data 
    $$ x = [x_1, x_2]$$
    * We could add the product so that:
    $$ \hat{x} = [x_1, x_2, x_1*x_2]$$
    

In [None]:
def add_interactions(X):
    """Return a copy of X extended with the product of every pair of columns.

    For each pair of original columns (a, b), with a appearing before b,
    a new column named "a_x_b" holding a * b is appended. X is not modified.
    """
    base_columns = X.columns
    X_int = X.copy(deep=True)

    for position, left in enumerate(base_columns):
        # pair each column only with the columns that come after it
        for right in base_columns[position + 1:]:
            X_int[f"{left}_x_{right}"] = X_int[left] * X_int[right]

    return X_int

In [None]:
X_train_int = add_interactions(X_train_std)
X_test_int = add_interactions(X_test_std)

In [None]:
X_train_int.head()

In [None]:
logistic_reg_model_interaction = logistic_regression(X_train_int, y_train)

# Evaluate logistic model with feature interactions

In [None]:
y_pred_int = logistic_reg_model_interaction.predict(X_test_int)

In [None]:
plot_confusion_matrix(logistic_reg_model_interaction, X_test_int, y_test)  
plt.show()  
logistic_accuracy_int = accuracy_score(y_test.values, y_pred_int)
print(f'Accuracy of logistic regression model: {logistic_accuracy_int}')

In [None]:
print(classification_report(y_test.values, y_pred_int))

In [None]:
scores_int = logistic_reg_model_interaction.predict_proba(X_test_int)[:, 1]
c_index_X_test_int = cindex(y_test.values, scores_int)
print(f"c-index on test set is {c_index_X_test_int:.4f}")

In [None]:
int_coeffs = pd.DataFrame(data = logistic_reg_model_interaction.coef_, columns = X_train_int.columns)
int_coeffs.T.plot.bar();

# Random Forest Classifier

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.75, stratify=y)
sc = StandardScaler()
X_train_rf = X_train.copy()
X_test_rf = X_test.copy()
X_train_rf[standardize_features] = sc.fit_transform(X_train_rf[standardize_features])
X_test_rf[standardize_features] = sc.transform(X_test_rf[standardize_features])

In [None]:
def holdout_grid_search(clf, X_train_hp, y_train_hp, X_val_hp, y_val_hp, hyperparams, fixed_hyperparams=None):
    '''
    Conduct a hyperparameter grid search scored on a hold-out validation set.

    Every combination of the values in `hyperparams` is used to build an
    estimator, which is fitted on the training data and scored with the
    cindex function on the validation data.

    Input:
    clf: estimator class (not an instance); called as clf(**params)
    X_train_hp, y_train_hp: training features and labels
    X_val_hp, y_val_hp: validation features and labels
    hyperparams (dict): maps each hyperparameter name to the values to try
    fixed_hyperparams (dict, optional): hyperparameters passed unchanged to
        every estimator; defaults to no fixed hyperparameters

    Output:
    best_estimator: fitted estimator with the highest validation c-index
        (the latest one wins on equal scores)
    best_hyperparams (dict): its varied hyperparameters plus the fixed ones
    '''
    # Imported locally because the notebook's import cell does not provide it.
    import itertools

    # Avoid a shared mutable default argument
    if fixed_hyperparams is None:
        fixed_hyperparams = {}

    best_estimator = None
    best_hyperparams = {}
    best_score = 0.0

    # get all param combinations, in the order of the hyperparams dict
    param_names = list(hyperparams)
    param_combinations = list(itertools.product(*hyperparams.values()))
    total_param_combinations = len(param_combinations)

    # cindex indexes its inputs by position, so give it a plain array even
    # when the labels arrive as a pandas Series with a shuffled index
    y_val_array = np.asarray(y_val_hp)

    for i, params in enumerate(param_combinations, 1):
        param_dict = dict(zip(param_names, params))

        # create estimator with specified params
        estimator = clf(**param_dict, **fixed_hyperparams)

        # fit estimator
        estimator.fit(X_train_hp, y_train_hp)

        # get predictions on validation set
        preds = estimator.predict_proba(X_val_hp)

        # compute cindex for the predicted probability of the positive class
        estimator_score = cindex(y_val_array, preds[:, 1])

        print(f'[{i}/{total_param_combinations}] {param_dict}')
        print(f'Val C-Index: {estimator_score}\n')

        # if new high score, update high score, best estimator
        # and best params
        if estimator_score >= best_score:
            best_score = estimator_score
            best_estimator = estimator
            best_hyperparams = param_dict

    # add fixed hyperparameters to best combination of variable hyperparameters
    best_hyperparams.update(fixed_hyperparams)

    return best_estimator, best_hyperparams

In [None]:
def random_forest_grid_search(X_train_dropped, y_train_dropped, X_val_dropped, y_val_dropped):
    '''Tune a RandomForestClassifier with holdout_grid_search.

    Prints the selected hyperparameters and the train/validation c-index of
    the selected model, then returns (best estimator, best hyperparameters).
    '''
    # Values explored for each varied hyperparameter
    search_space = {
        'n_estimators': [100, 1000, 5000],
        'max_depth': [4, 5, 6],
        'min_samples_leaf': [0.1, 0.15, 0.2],
    }
    # Settings shared by every candidate model
    constant_params = {'random_state': 10}

    best_rf, best_hyperparams = holdout_grid_search(
        RandomForestClassifier,
        X_train_dropped, y_train_dropped,
        X_val_dropped, y_val_dropped,
        search_space,
        constant_params,
    )

    print(f"Best hyperparameters:\n{best_hyperparams}")

    # c-index of the selected model on the training data
    train_risk = best_rf.predict_proba(X_train_dropped)[:, 1]
    train_cindex = cindex(y_train_dropped, train_risk)
    print(f"Train C-Index: {train_cindex}")

    # c-index of the selected model on the validation data
    val_risk = best_rf.predict_proba(X_val_dropped)[:, 1]
    val_cindex = cindex(y_val_dropped, val_risk)
    print(f"Val C-Index: {val_cindex}")

    # make sure the fixed settings are part of the reported hyperparameters
    best_hyperparams.update(constant_params)

    return best_rf, best_hyperparams

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Base estimator for the search. It is deliberately not fitted here: the grid
# search fits its own candidates, and `random_forest` is replaced below by the
# best estimator found, so a separate fit would only cost time.
random_forest = RandomForestClassifier(n_estimators=100, random_state=10)

# 5-fold cross-validated search, scored with the weighted F1 score
gscv = GridSearchCV(estimator=random_forest, param_grid={"n_estimators":[100,500,1000,5000],
                                                   'max_depth': [4,6,8],
                                                   "criterion":["gini","entropy"]},
                    cv=5,n_jobs=-1,scoring="f1_weighted")

gscv.fit(X_train_rf,y_train)
random_forest = gscv.best_estimator_

Evaluate model

In [None]:
y_pred_rf = random_forest.predict(X_test_rf)
print(classification_report(y_test, y_pred_rf))

In [None]:
random_forest

Evaluate model using c-index

In [None]:
y_pred_rf_train = random_forest.predict_proba(X_train_rf)[:, 1]
print(f"Train C-Index: {cindex(y_train.values, y_pred_rf_train)}")

y_pred_rf_prob = random_forest.predict_proba(X_test_rf)[:, 1]
print(f"Test C-Index: {cindex(y_test.values, y_pred_rf_prob)}")

In [None]:
from sklearn.metrics import plot_roc_curve, plot_precision_recall_curve

plot_roc_curve(random_forest, X_test_rf, y_test)
plt.show()

In [None]:
plot_precision_recall_curve(random_forest, X_test_rf, y_test)
plt.show()

In [None]:
plot_confusion_matrix(random_forest, X_test_rf, y_test)  
plt.ylabel("Actual")
plt.xlabel("Prediction");

# Risk Models that vary with time

Previously, we dropped the time column to predict the risk of death.

Let's look into probability of survival past any time t for an individual patient. 
This is time-to-event data. In order to model death risk from heart disease, we need to represent the data in a form we can process. The primary challenge is censored observations, which are a particular form of missing data.

When we're dealing with time-to-event data, we want to answer a different question. We want to answer the question, what is the probability of survival? Not just past a fixed amount of years, but past any time t for an individual patient based on his/her features.

The column `time` states how long the patient lived before they died or were censored.

The column `DEATH_EVENT` says whether a death was observed or not. `DEATH_EVENT` is 1 if the event is observed (i.e. the patient died) and 0 if data was censored.

Censorship here means that the observation has ended without any observed event.
For example, let a patient be in a hospital for 100 days at most. If a patient dies after only 44 days, their event will be recorded as `time = 44` and `DEATH_EVENT = 1`. If a patient walks out after 100 days and dies 3 days later (103 days total), this event is not observed in our process and the corresponding row has `time = 100` and `DEATH_EVENT = 0`. If a patient survives for 25 years after being admitted, their data are still `time = 100` and `DEATH_EVENT = 0`.

In [None]:
dataset.head()

# Censored Data

We can plot a histogram of the survival times to see in general how long cases survived before censorship or events.

In [None]:
dataset.time.hist();
plt.xlabel("Observation time before death or censorship (days)");
plt.ylabel("Frequency (number of patients)");

Distribution for censored and uncensored patients

In [None]:
df_censored = dataset[dataset.DEATH_EVENT == 0]
df_uncensored = dataset[dataset.DEATH_EVENT == 1]

df_censored.time.hist()
plt.title("Censored")
plt.xlabel("Time (days)")
plt.ylabel("Frequency")
plt.show()

df_uncensored.time.hist()
plt.title("Uncensored")
plt.xlabel("Time (days)")
plt.ylabel("Frequency")
plt.show()

# Cox Proportional Hazards

Our goal is to build a risk score using the survival data that we have. We'll fit a Cox Proportional Hazards model to the data.

Cox Proportional Hazards model describes the hazard for an individual $i$ at time $t$ as 

$$
\lambda(t, x) = \lambda_0(t)e^{\theta^T X_i}
$$

The $\lambda_0$ term is a baseline hazard and incorporates the risk over time, and the other term incorporates the risk due to the individual's covariates. After fitting the model, we can rank individuals using the person-dependent risk term $e^{\theta^T X_i}$. 

In [None]:
df_dev, df_test = train_test_split(dataset, test_size = 0.2)
df_train, df_val = train_test_split(df_dev, test_size = 0.25) 

In [None]:
df_train.head()

Import Cox proportional hazard model from lifelines package

In [None]:
!pip install lifelines

# Cox proportional hazards model from lifelines
from lifelines import CoxPHFitter
# NOTE(review): this alias rebinds the name `cindex`, which this notebook also
# uses for its own c-index function defined in an earlier cell. After this
# cell runs, `cindex` refers to lifelines' concordance_index instead.
from lifelines.utils import concordance_index as cindex

In [None]:
standardize_features = ['age','creatinine_phosphokinase','ejection_fraction','platelets','serum_creatinine','serum_sodium']

In [None]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
df_train[standardize_features] = sc.fit_transform(df_train[standardize_features])
df_val[standardize_features] = sc.transform(df_val[standardize_features])
df_test[standardize_features] = sc.transform(df_test[standardize_features])

In [None]:
df_train.describe()

In [None]:
cph = CoxPHFitter()
cph.fit(df_train, duration_col = 'time', event_col = 'DEATH_EVENT', step_size=0.01, show_progress = True)

In [None]:
cph.print_summary()

# Evaluate using Harrell's C-Index

To evaluate how well our model performs, we will use the C-index in the survival context: the probability that, given a randomly selected pair of individuals, the one who died sooner has the higher risk score.

However, we need to take into account censoring. Imagine a pair of patients, $A$ and $B$. 

#### Scenario 1
- A was censored at time $t_A$ 
- B died at $t_B$
- $t_A < t_B$. 

Because of censoring, we can't say whether $A$ or $B$ should have a higher risk score. 

#### Scenario 2
Now imagine that $t_A > t_B$.

- A was censored at time $t_A$ 
- B died at $t_B$
- $t_A > t_B$

Now we can definitively say that $B$ should have a higher risk score than $A$, since we know for a fact that $A$ lived longer. 

Therefore, when we compute our C-index
- We should only consider pairs where at most one person is censored
- If they are censored, then their censored time should occur *after* the other person's time of death. 

The metric we get if we use this rule is called **Harrell's C-index**.

Note that in this case, being censored at time $t$ means that the true death time was some time AFTER time $t$ and not at $t$. 
- Therefore if $t_A = t_B$ and A was censored:
    - Then $A$ actually lived longer than $B$. 

In [None]:
def harrell_c(y_true, scores, event):
    '''Compute Harrel C-index given true event/censoring times,
    model output, and event indicators.'''
    
    n = len(y_true)
    assert (len(scores) == n and len(event) == n)
    
    concordant = 0.0
    permissible = 0.0
    ties = 0.0
    result = 0.0
    
    for i in range(n):
        for j in range(i+1, n):
            
            # check if at most one is censored
            if event[i] == 1 or event[j] == 1:
            
                # check if neither are censored
                if event[i] == 1 and event[j] == 1:
                    permissible += 1
                    
                    # check if scores are tied
                    if (scores[i] == scores[j]):
                        ties += 1
                    
                    # check for concordant
                    elif (y_true[i] < y_true[j]) and (scores[i] > scores[j]):
                        concordant += 1
                    elif (y_true[i] > y_true[j]) and (scores[i] < scores[j]):
                        concordant += 1
                
                # check if one is censored
                elif event[i] != event[j]:
                    
                    # get censored index
                    censored = j
                    uncensored = i
                    
                    if event[i] == 0:
                        censored = i
                        uncensored = j
                        
                    # check if permissible
                    if y_true[uncensored] <= y_true[censored]:
                        permissible += 1
                        
                        # check if scores are tied
                        if (scores[censored] == scores[uncensored]):
                            # update ties 
                            ties += 1
                            
                        # check if scores are concordant 
                        if scores[uncensored] > scores[censored]:
                            concordant += 1
   
    result = (concordant + (0.5 * ties)) / permissible    
    return result   

In [None]:
# Train
scores = cph.predict_partial_hazard(df_train)
cox_train_scores = harrell_c(df_train['time'].values, scores.values, df_train['DEATH_EVENT'].values)
# Validation
scores = cph.predict_partial_hazard(df_val)
cox_val_scores = harrell_c(df_val['time'].values, scores.values, df_val['DEATH_EVENT'].values)
# Test
scores = cph.predict_partial_hazard(df_test)
cox_test_scores = harrell_c(df_test['time'].values, scores.values, df_test['DEATH_EVENT'].values)

print("Train:", cox_train_scores)
print("Val:", cox_val_scores)
print("Test:", cox_test_scores)

We can compare the predicted risk curves for any variable such as smoking. 

In [None]:
cph.plot_partial_effects_on_outcome('smoking', values=[0, 1]);

We see that the smoking group has a lower survival rate at all times (the x-axis is time) compared to the non-smoking group.