Question#1: Use heart_failure_clinical_records_dataset.csv dataset, try as many classifiers as you can to predict the likelihood of the death event. Similarly, experiment with the different loss functions, regularizers, and hyperparameters. 

In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from csv import reader
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
#from sklearn.metrics import GridSearchCV
from sklearn.metrics import log_loss
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV

In [2]:
#loading file
filename = 'data/heart_failure_clinical_records_dataset.csv' 
dataset = pd.read_csv(filename, header=0)
dataset

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT
0,75.0,0,582,0,20,1,265000.00,1.9,130,1,0,4,1
1,55.0,0,7861,0,38,0,263358.03,1.1,136,1,0,6,1
2,65.0,0,146,0,20,0,162000.00,1.3,129,1,1,7,1
3,50.0,1,111,0,20,0,210000.00,1.9,137,1,0,7,1
4,65.0,1,160,1,20,0,327000.00,2.7,116,0,0,8,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
294,62.0,0,61,1,38,1,155000.00,1.1,143,1,1,270,0
295,55.0,0,1820,0,38,0,270000.00,1.2,139,0,0,271,0
296,45.0,0,2060,1,60,0,742000.00,0.8,138,0,0,278,0
297,45.0,0,2413,0,38,0,140000.00,1.4,140,1,1,280,0


In [3]:
X = dataset.iloc[: , :-1]
y=dataset.iloc[:, -1]

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.2, random_state = 42)
X_train.shape, X_test.shape

((239, 12), (60, 12))

# KNN Algorithm with Grid Search

In [5]:
knn = KNeighborsClassifier()
k_range = list(range(1, 31))
param_grid = dict(n_neighbors=k_range)
grid = GridSearchCV(knn, param_grid, cv=10, scoring='accuracy', return_train_score=False,verbose=1)

In [6]:
grid_search=grid.fit(X_train, y_train)

Fitting 10 folds for each of 30 candidates, totalling 300 fits


In [7]:
print(grid_search.best_params_)

{'n_neighbors': 9}


In [8]:
accuracy = grid_search.best_score_ *100
print("Accuracy for our training dataset with tuning is : {:.2f}%".format(accuracy) )

Accuracy for our training dataset with tuning is : 71.52%


In [13]:
#KNN Algorith
neigh = KNeighborsClassifier(n_neighbors=9).fit(X_train, y_train)

In [14]:
y_pred_knn = neigh.predict(X_test)
accuracy_knn = accuracy_score(y_test,y_pred_knn)
accuracy_knn

0.55

In [15]:
neigh.predict_proba(X_test[:5])

array([[0.33333333, 0.66666667],
       [0.55555556, 0.44444444],
       [0.77777778, 0.22222222],
       [0.77777778, 0.22222222],
       [0.77777778, 0.22222222]])

In [16]:
y_pred_knn[:5]

array([1, 0, 0, 0, 0], dtype=int64)

# SVM with gridSearchCV

In [17]:
param_grid = [{'kernel':['linear', 'poly', 'rbf', 'sigmoid'],
            'C':np.logspace(-4,4,20),
            'gamma':['scale', 'auto'],
            'decision_function_shape':['ovo','ovr'],
               'max_iter':[-1]
            }]

In [18]:
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

clf_svm = make_pipeline(StandardScaler(), 
                    GridSearchCV(SVC(),
                                 param_grid=param_grid,
                                 cv=3,
                                 verbose=True,n_jobs=-1))

In [19]:
best_clf_svm = clf_svm.fit(X_train, y_train)

Fitting 3 folds for each of 320 candidates, totalling 960 fits


In [20]:
y_pred_svm = best_clf_svm.predict(X_test)
y_pred_svm

array([0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0,
       1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1], dtype=int64)

In [21]:
accuracy_svm = accuracy_score(y_test,y_pred_svm)
accuracy_svm

0.7

In [22]:
print(classification_report(y_test,y_pred_svm))

              precision    recall  f1-score   support

           0       0.71      0.83      0.76        35
           1       0.68      0.52      0.59        25

    accuracy                           0.70        60
   macro avg       0.70      0.67      0.68        60
weighted avg       0.70      0.70      0.69        60



# Naive Bayes

In [23]:
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(X_train, y_train)
y_pred_nb = classifier.predict(X_test)
y_proba_nb = classifier.predict_proba(X_test)

In [24]:
y_proba_nb[:10] , y_pred_nb

(array([[0.98134239, 0.01865761],
        [0.99605067, 0.00394933],
        [0.71242568, 0.28757432],
        [0.05336913, 0.94663087],
        [0.87140794, 0.12859206],
        [0.99375994, 0.00624006],
        [0.60198526, 0.39801474],
        [0.14734505, 0.85265495],
        [0.0518336 , 0.9481664 ],
        [0.87735593, 0.12264407]]),
 array([0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
        0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1], dtype=int64))

In [25]:
acc_naive_bayes = accuracy_score(y_test,y_pred_nb)
print("Test set Accuracy: " ,acc_naive_bayes)

Test set Accuracy:  0.7333333333333333


In [26]:
print(classification_report(y_test,y_pred_nb))

              precision    recall  f1-score   support

           0       0.69      0.97      0.81        35
           1       0.91      0.40      0.56        25

    accuracy                           0.73        60
   macro avg       0.80      0.69      0.68        60
weighted avg       0.78      0.73      0.70        60



# Decision Tree

In [27]:
from sklearn import tree
clf_tree = tree.DecisionTreeClassifier(max_depth = 4, random_state = 42).fit(X_train, y_train)

In [28]:
y_pred_tree = clf_tree.predict(X_test)
y_proba_tree = clf_tree.predict_proba(X_test)
y_pred_tree, y_proba_tree[:10]

(array([1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0,
        1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0,
        1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1], dtype=int64),
 array([[0.09090909, 0.90909091],
        [0.98387097, 0.01612903],
        [0.82352941, 0.17647059],
        [0.06976744, 0.93023256],
        [0.98387097, 0.01612903],
        [0.82352941, 0.17647059],
        [0.82352941, 0.17647059],
        [0.98387097, 0.01612903],
        [0.30769231, 0.69230769],
        [0.82352941, 0.17647059]]))

In [29]:
# Accuracy Score
acc_tree = accuracy_score(y_test,y_pred_tree)
acc_tree

0.6833333333333333

#GridSearchCV

In [30]:
tree_para = {'criterion':['gini','entropy'],'max_depth':[4,5,6,7,8,9,10,11,12,15,20,30,40,50,70,90,120,150]}
dt_GS = GridSearchCV(tree.DecisionTreeClassifier(), tree_para, cv=5).fit(X_train, y_train)

In [31]:
print(dt_GS.best_params_)

{'criterion': 'entropy', 'max_depth': 5}


In [32]:
y_pred_tree_GS = dt_GS.predict(X_test)
y_proba_tree_GS = dt_GS.predict_proba(X_test)
y_proba_tree_GS [:10], y_pred_tree_GS

(array([[0.61538462, 0.38461538],
        [0.9921875 , 0.0078125 ],
        [0.9921875 , 0.0078125 ],
        [0.        , 1.        ],
        [0.9921875 , 0.0078125 ],
        [0.9921875 , 0.0078125 ],
        [0.61538462, 0.38461538],
        [0.9921875 , 0.0078125 ],
        [0.        , 1.        ],
        [0.        , 1.        ]]),
 array([0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0,
        0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1], dtype=int64))

In [33]:
# Accuracy Score
acc_tree = accuracy_score(y_test,y_pred_tree_GS)
acc_tree

0.7333333333333333

In [34]:
print(classification_report(y_test,y_pred_tree_GS))

              precision    recall  f1-score   support

           0       0.72      0.89      0.79        35
           1       0.76      0.52      0.62        25

    accuracy                           0.73        60
   macro avg       0.74      0.70      0.71        60
weighted avg       0.74      0.73      0.72        60



# Logistic Regression

In [35]:
#l2 Regulizer on Logistic Regression
log_reg = LogisticRegression(max_iter=100,penalty='l2',C=1,solver='liblinear').fit(X_train, y_train)

In [36]:
#Testing data
y_pred_logistic= log_reg.predict(X_test)
y_proba_logistic = log_reg.predict_proba(X_test)
y_pred_logistic, y_proba_logistic[:10]

(array([0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0,
        0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1], dtype=int64),
 array([[0.96950251, 0.03049749],
        [0.97362874, 0.02637126],
        [0.77876698, 0.22123302],
        [0.19326678, 0.80673322],
        [0.67860752, 0.32139248],
        [0.97219835, 0.02780165],
        [0.54760302, 0.45239698],
        [0.68654099, 0.31345901],
        [0.17668529, 0.82331471],
        [0.8203171 , 0.1796829 ]]))

In [37]:
accuracy_logistic = accuracy_score(y_test,y_pred_logistic)
accuracy_logistic

0.75

In [38]:
print(classification_report(y_test,y_pred_logistic))

              precision    recall  f1-score   support

           0       0.72      0.94      0.81        35
           1       0.86      0.48      0.62        25

    accuracy                           0.75        60
   macro avg       0.79      0.71      0.72        60
weighted avg       0.78      0.75      0.73        60



In [39]:
#with GRIDSEARCHCV

In [40]:
solver_options = ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
multi_class_options = ['ovr', 'multinomial']
class_weight_options = ['None', 'balanced']
penalty=['l2']
max_iter=[100]

param_grid = dict(penalty= penalty, solver = solver_options, multi_class = 
multi_class_options, class_weight = class_weight_options, max_iter = max_iter)

In [41]:
lr = LogisticRegression()
lr_grid = GridSearchCV(lr, param_grid, cv=12, scoring = 'accuracy').fit(X_train, y_train)







STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Traceback (most recent call last):
  File "C:\Users\srija\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\srija\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1406, in fit
    fold_coefs_ = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
  File "C:\Users\srija\anaconda3\lib\site-packages\joblib\parallel.py", line 1043, in __call__
    if self.dispatch_one_batch(iterator):
  File "C:\Users\srija\anaconda3\lib\site-packages\joblib\parallel.py", line 861, in dispatch_one_batch
    self._dispatch(tasks)
  File "C:\Users\srija\anaconda3\lib\site-packages\joblib\parallel.py", line 779, in _dispatch
    job = self._backend.apply_async(batch, callback=cb)
  File "C:\Users\srija\anaconda3\lib\site-packages\joblib\_parallel_backends.py", line 208, in apply_async
    result = ImmediateResult(func)
  File "C:\Users\srija\anaconda3\lib\site-packages\joblib\_pa

Traceback (most recent call last):
  File "C:\Users\srija\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\srija\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1406, in fit
    fold_coefs_ = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
  File "C:\Users\srija\anaconda3\lib\site-packages\joblib\parallel.py", line 1043, in __call__
    if self.dispatch_one_batch(iterator):
  File "C:\Users\srija\anaconda3\lib\site-packages\joblib\parallel.py", line 861, in dispatch_one_batch
    self._dispatch(tasks)
  File "C:\Users\srija\anaconda3\lib\site-packages\joblib\parallel.py", line 779, in _dispatch
    job = self._backend.apply_async(batch, callback=cb)
  File "C:\Users\srija\anaconda3\lib\site-packages\joblib\_parallel_backends.py", line 208, in apply_async
    result = ImmediateResult(func)
  File "C:\Users\srija\anaconda3\lib\site-packages\joblib\_pa

Traceback (most recent call last):
  File "C:\Users\srija\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\srija\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1406, in fit
    fold_coefs_ = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
  File "C:\Users\srija\anaconda3\lib\site-packages\joblib\parallel.py", line 1043, in __call__
    if self.dispatch_one_batch(iterator):
  File "C:\Users\srija\anaconda3\lib\site-packages\joblib\parallel.py", line 861, in dispatch_one_batch
    self._dispatch(tasks)
  File "C:\Users\srija\anaconda3\lib\site-packages\joblib\parallel.py", line 779, in _dispatch
    job = self._backend.apply_async(batch, callback=cb)
  File "C:\Users\srija\anaconda3\lib\site-packages\joblib\_parallel_backends.py", line 208, in apply_async
    result = ImmediateResult(func)
  File "C:\Users\srija\anaconda3\lib\site-packages\joblib\_pa







STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

        nan        nan        nan        nan 0.81140351 0.76929825
 0.82390351 0.69890351 0.70307018 0.81140351 0.77346491        nan
 0.69890351 0.69890351]


In [42]:
print(lr_grid.best_params_)

{'class_weight': 'None', 'max_iter': 100, 'multi_class': 'ovr', 'penalty': 'l2', 'solver': 'newton-cg'}


In [43]:
#Testing data
y_pred_logistic_GS= lr_grid.predict(X_test)
y_proba_logistic_GS = lr_grid.predict_proba(X_test)
y_pred_logistic_GS, y_proba_logistic_GS[:10]

(array([0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
        0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0,
        1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1], dtype=int64),
 array([[9.39047535e-01, 6.09524650e-02],
        [9.95981082e-01, 4.01891785e-03],
        [8.83648224e-01, 1.16351776e-01],
        [3.69154422e-04, 9.99630846e-01],
        [8.60849100e-01, 1.39150900e-01],
        [9.82960877e-01, 1.70391225e-02],
        [4.19718425e-01, 5.80281575e-01],
        [9.26858660e-01, 7.31413400e-02],
        [5.08815454e-02, 9.49118455e-01],
        [8.86615073e-01, 1.13384927e-01]]))

In [44]:
accuracy_logistic = accuracy_score(y_test,y_pred_logistic_GS)
accuracy_logistic

0.8

In [45]:
print(classification_report(y_test,y_pred_logistic_GS))

              precision    recall  f1-score   support

           0       0.77      0.94      0.85        35
           1       0.88      0.60      0.71        25

    accuracy                           0.80        60
   macro avg       0.82      0.77      0.78        60
weighted avg       0.82      0.80      0.79        60



# Random Forest

In [46]:
from sklearn.ensemble import RandomForestClassifier

random_forest_clf = RandomForestClassifier(max_depth=2, random_state=0).fit(X_train, y_train)

In [47]:
#Testing data
y_pred_rf= random_forest_clf.predict(X_test)
y_proba_rf = random_forest_clf.predict_proba(X_test)
y_pred_rf, y_proba_rf[:10]

(array([0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1], dtype=int64),
 array([[0.5872387 , 0.4127613 ],
        [0.85596297, 0.14403703],
        [0.83816535, 0.16183465],
        [0.32718389, 0.67281611],
        [0.83936355, 0.16063645],
        [0.84641041, 0.15358959],
        [0.74238687, 0.25761313],
        [0.82874877, 0.17125123],
        [0.30932033, 0.69067967],
        [0.70767869, 0.29232131]]))

In [48]:
accuracy_rf = accuracy_score(y_test,y_pred_rf)
accuracy_rf

0.75

In [49]:
print(classification_report(y_test,y_pred_rf))

              precision    recall  f1-score   support

           0       0.70      1.00      0.82        35
           1       1.00      0.40      0.57        25

    accuracy                           0.75        60
   macro avg       0.85      0.70      0.70        60
weighted avg       0.82      0.75      0.72        60



#SVM with GridSearchCV Bayes- 70%
#Naive Bayes- 73%
#Decision Tree with GridSearchCV- 73.33%
#Random Forest- 75%

# LOGISTIC REGRESSION gives better accuracy of 80%