In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found
for root, _, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Importing and reading the data

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_validate
%matplotlib inline
plt.style.use('fivethirtyeight')

In [None]:
# Read the heart-failure clinical records into a DataFrame and preview the first rows
df = pd.read_csv(
    '/kaggle/input/heart-failure-clinical-data/heart_failure_clinical_records_dataset.csv',
    low_memory=False,
)
df.head()

# Preliminary checks on the data

In [None]:
# Number of rows in the dataset
df.shape[0]

In [None]:
# Print a concise summary of the DataFrame
df.info()

In [None]:
# Show the dtype of each column
df.dtypes

All the columns are numerical

In [None]:
# Count the missing values in each column
df.isnull().sum()

The data has no missing values

In [None]:
# Grouping the columns into numerical and categorical columns
num_cols = ['age', 'creatinine_phosphokinase', 'ejection_fraction', 'platelets',
            'serum_sodium', 'serum_creatinine', 'time']

# Categorical columns: every column not listed as numerical, kept in the frame's
# own column order (a set difference would return them in an arbitrary order
# that can change from one run to the next).
cat_cols = [col for col in df.columns if col not in num_cols]

# Change the data types to categories
for col in cat_cols:
    df[col] = df[col].astype('category')

# Check the dtypes again
df.dtypes

# Visualising the data

## Visualising the continuous variables

In [None]:
# Histogram of patient age
df['age'].hist()

In [None]:
# Histogram of platelet counts
df['platelets'].hist()

In [None]:
# Histogram of serum sodium
df['serum_sodium'].hist()

In [None]:
# Histogram of serum creatinine
df['serum_creatinine'].plot.hist()

## Visualising the categorical variables

In [None]:
# Bar chart of the proportion of patients in each sex category
df['sex'].value_counts(normalize=True).plot.bar(title='Sex', rot=0);

In [None]:
# Bar chart of the proportion of patients in each smoking category
df['smoking'].value_counts(normalize=True).plot.bar(title='Smoking?', rot=0);

In [None]:
# Bar chart of the proportion of patients in each anaemia category
df['anaemia'].value_counts(normalize=True).plot.bar(title='Has anaemia?', rot=0);

## Checking for quick associations between categorical features

In [None]:
# Bar chart of the proportion of patients in each DEATH_EVENT class
df['DEATH_EVENT'].value_counts(normalize=True).plot.bar(title='Death Event', rot=0);

There is a clear class imbalance in the target column (DEATH_EVENT). The majority of the patients did not die.

This will in turn affect the evaluation metrics of interest in machine learning.

In [None]:
# DEATH_EVENT proportions within each sex category (each row sums to 1)
pd.crosstab(
    df['sex'], df['DEATH_EVENT'], normalize='index'
).plot.bar(rot=0);

In [None]:
# DEATH_EVENT proportions within each high_blood_pressure category (each row sums to 1)
pd.crosstab(
    df['high_blood_pressure'], df['DEATH_EVENT'], normalize='index'
).plot.bar(rot=0);

In [None]:
# DEATH_EVENT proportions within each anaemia category (each row sums to 1)
pd.crosstab(
    df['anaemia'], df['DEATH_EVENT'], normalize='index'
).plot.bar(rot=0);

In [None]:
# Joint proportions of diabetes and high blood pressure over all patients
pd.crosstab(df['diabetes'], df['high_blood_pressure'], normalize='all')

The table above shows that only 37.5% of all the patients had neither diabetes nor high blood pressure. The majority had high blood pressure, diabetes, or both.

# Preparing data for ML

In [None]:
# Employ the minmax_scale on the continuous variables.
# NOTE(review): this runs before the train/test split further down, so each
# column's min and max are taken over all rows, including the rows that later
# end up in the test set.
from sklearn.preprocessing import minmax_scale

df = df.assign(**{col: minmax_scale(df[col]) for col in num_cols})

In [None]:
# Preview the first rows of df after the scaling step
df.head()

## Splitting the data

In [None]:
from sklearn.model_selection import train_test_split

# Split the data into features (X) and label (y).
# The target is removed by name rather than by position, so X is correct even
# if DEATH_EVENT is not the last column of the frame.
X = df.drop(columns='DEATH_EVENT')
y = df['DEATH_EVENT']

# Split the data into training and test sets.
# stratify=y keeps the DEATH_EVENT class proportions the same in both sets,
# which matters because the target classes are imbalanced.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=94, stratify=y
)

## Importing models

In [None]:
# Import the models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

## Model performance on cv_scores

In [None]:
from sklearn.model_selection import cross_val_score

# Set the seed of NumPy's global random generator (presumably so that the
# stochastic steps that follow give repeatable results)
np.random.seed(94)

# A function to store the cross-validated scores on the training data
def model_metrics(models, X_train, y_train):
    """Cross-validate each model on the training data and tabulate the mean scores.

    Parameters
    ----------
    models : dict
        Maps a display name to an unfitted estimator.
    X_train, y_train
        Training features and labels, passed unchanged to ``cross_validate``.

    Returns
    -------
    pandas.DataFrame
        One column per model and one row per metric (accuracy, precision,
        recall, f1). Each value is the mean of the fold scores rounded to two
        decimals. When ``models`` is empty the frame has the four metric rows
        and no columns.
    """
    scoring = ['accuracy', 'precision', 'recall', 'f1']

    # Collect {model name: {metric: mean score}} and build the frame once at the end
    metrics_by_model = {}

    for name, model in models.items():
        # A single cross-validation run scores every metric on the same fitted
        # folds, instead of refitting the model once per metric.
        cv_results = cross_validate(model, X_train, y_train, scoring=scoring)

        metrics_by_model[name] = {
            metric: round(np.mean(cv_results['test_' + metric]), 2)
            for metric in scoring
        }

    # index=scoring fixes the row order and also covers the empty-models case
    return pd.DataFrame(metrics_by_model, index=scoring)

In [None]:
# A dictionary to house the models.
# The random forest gets a fixed random_state so that its scores do not depend
# on the global NumPy seed or on which cells were run before this one.
models = {'Logistic Regression' : LogisticRegression(),
          'Random Forest' : RandomForestClassifier(random_state = 94)}

# Apply the model_metrics function on the models
cv_scores = model_metrics(models, X_train, y_train)

# Visualise the metrics dataframe
cv_scores

In [None]:
# Grouped bar chart of the cross-validated metrics, one bar per model
cv_scores.plot.bar(title='Plot of cross_val_scores on the training data', rot=0);

## Assessing models on the test set

In [None]:
# Import evaluation metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Metric name -> scoring function, in the order the rows should appear
scorers = {'accuracy': accuracy_score,
           'precision': precision_score,
           'recall': recall_score,
           'f1': f1_score}

# Per-model results, collected first and turned into a DataFrame afterwards
results = {}

for name, model in models.items():
    # Fit on the training data, then predict the held-out test features
    clf = model.fit(X_train, y_train)
    y_preds = clf.predict(X_test)

    # Score the predictions against the true test labels
    test_dict = {metric: scorer(y_test, y_preds) for metric, scorer in scorers.items()}
    results[name] = test_dict

# One column per model, one row per metric
test_metrics = pd.DataFrame(results, index=list(scorers))

# View the test_metrics dataframe
test_metrics

In [None]:
# Grouped bar chart of the test-set metrics, one bar per model
test_metrics.plot.bar(title='Evaluation metrics on the test set', rot=0);

Summary of the models' performances
