### Import packages

In [None]:
import numpy as np 
import pandas as pd 
from sklearn.model_selection import GridSearchCV 
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, plot_confusion_matrix, classification_report

In [None]:
# Load the heart-failure clinical records into a DataFrame.
csv_path = '../input/heart-failure-clinical-data/heart_failure_clinical_records_dataset.csv'
df = pd.read_csv(csv_path)

In [None]:
# Preview the first five rows of the frame.
df.head(n=5)

In [None]:
# Target is the DEATH_EVENT column; features are every column except the last.
y = df['DEATH_EVENT']
X = df.iloc[:, :df.shape[1] - 1]

In [None]:
# Correlation matrix of the features, drawn with the diagonal and the
# upper triangle masked out so each pair appears once.
X_corr = X.corr()
mask = np.triu(np.ones(X_corr.shape))

sns.heatmap(X_corr, mask=mask, annot=True, fmt='.2f')
plt.show()

In [None]:
# True if any cell in the frame is missing.
df.isna().to_numpy().any()

### EDA

In [None]:
# Pairwise Pearson correlation of every column in df; shown as the cell output.
Corr = df.corr(method='pearson')
Corr

In [None]:
# Heatmap of the full correlation matrix.
# Correlations lie in [-1, 1]; with vmin=0 every negative correlation was
# clipped to the same colour as zero. A symmetric range keeps the colormap
# centred on 0 and makes negative relationships visible.
sns.heatmap(Corr, vmin=-1, vmax=1, center=0,
            square=True, linewidths=1, cbar_kws={"shrink": .5})

In [None]:
# Rank the features by their chi-squared score against the target.
# Features are every column except the last (the target), matching the
# earlier X/y split. The previous slice 0:13 takes the first 13 columns,
# which includes the target itself when the frame has 13 columns or fewer.
X = df.iloc[:, :-1]
y = df.iloc[:, -1]
# k='all' keeps every feature: the selector is used only to obtain the scores.
parameters = SelectKBest(score_func=chi2, k='all')
fit = parameters.fit(X, y)
dfscores = pd.DataFrame(fit.scores_)
dfcolumns = pd.DataFrame(X.columns)
# Put names and scores side by side for readability.
featureScores = pd.concat([dfcolumns, dfscores], axis=1)
featureScores.columns = ['Specs', 'Score']
# Print every feature, highest score first.
print(featureScores.nlargest(len(featureScores), 'Score'))

In [None]:
# Fit an extra-trees ensemble only to read its feature importances.
model = ExtraTreesClassifier().fit(X, y)
importances = model.feature_importances_
print(importances)
# Horizontal bar chart of the (up to) 13 most important features.
feat_importances = pd.Series(importances, index=X.columns)
top_features = feat_importances.nlargest(13)
top_features.plot(kind='barh')
plt.show()

In [None]:
# Replace the 0/1 codes of the binary columns with readable labels for the
# plots below.
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on the df[...] selection: that form is chained
# assignment, which pandas warns about and which no longer updates df when
# copy-on-write is enabled.
label_maps = {
    'anaemia': {1: 'Anemic', 0: 'Non-Anemic'},
    'diabetes': {1: 'Diabetic', 0: 'Non-Diabetic'},
    'smoking': {1: 'Smoker', 0: 'Non-Smoker'},
    'high_blood_pressure': {1: 'Hypertension', 0: 'Other'},
    'sex': {1: 'Male', 0: 'Female'},
    'DEATH_EVENT': {1: 'Heart Attack', 0: 'Alive'},
}
for column, labels in label_maps.items():
    df[column] = df[column].replace(labels)

In [None]:
# Histogram of the age column.
plt.hist(x=df['age'])

In [None]:
# Histogram of the ejection_fraction column.
plt.hist(x=df['ejection_fraction'])

In [None]:
# Histogram of the serum_sodium column.
plt.hist(x=df['serum_sodium'])

In [None]:
# Histogram of the serum_creatinine column.
plt.hist(x=df['serum_creatinine'])

In [None]:
#pip install plotly

In [None]:
import plotly.express as px

# Pie chart of the share of rows per DEATH_EVENT value.
fig = px.pie(data_frame=df, names='DEATH_EVENT', title="Number of Deaths")
fig.show()

In [None]:
# Count the rows for every (sex, DEATH_EVENT) combination.
ds = df[['sex', 'DEATH_EVENT']].copy()
dx = ds.value_counts().reset_index()
dx.columns = ['Sex', 'DEATH_EVENT', 'Count']

# One bar per sex, coloured by DEATH_EVENT.
fig = px.bar(dx, x="Sex", y="Count", color="DEATH_EVENT",
             title="Sex and Heart Attack")
fig.show()

In [None]:
# Age histogram (50 bins) with the bars coloured by DEATH_EVENT.
fig = px.histogram(
    df,
    x='age',
    color='DEATH_EVENT',
    nbins=50,
    barmode='relative',
    title='Age & Heart Attack Distribution',
)
fig.update_layout(title_x=0.5)  # centre the title
fig.show()

In [None]:
# Pie chart of the share of rows per diabetes value.
fig = px.pie(data_frame=df, names='diabetes', title="Diabetic Distribution")
fig.show()

In [None]:
import scipy
import warnings
# Suppress every warning from this point on in the session.
warnings.filterwarnings("ignore")

# IPython magic (not plain Python): render matplotlib figures inline in the
# notebook output.
%matplotlib inline

In [None]:
# Per the author's trial and error, leaving the age column out of the
# feature set improved accuracy.
from sklearn.model_selection import train_test_split

feature_columns = ['ejection_fraction', 'serum_creatinine', 'serum_sodium', 'time']
x = df[feature_columns]
y = df['DEATH_EVENT']

# Hold out 20% of the rows for testing; random_state fixes the shuffle.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=1
)

### LOGISTIC REGRESSION

In [None]:
# Baseline: logistic regression with default settings.
lr = LogisticRegression().fit(x_train, y_train)

# Score the held-out split.
p1 = lr.predict(x_test)
s1 = accuracy_score(y_test, p1)
print("Logistic Regression Success Rate :", "{:.2f}%".format(100*s1))

# Confusion matrix plot followed by per-class precision/recall/F1.
plot_confusion_matrix(lr, x_test, y_test)
print(classification_report(y_test, p1))

plt.show()

### Finding best parameters


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import learning_curve, GridSearchCV, validation_curve




In [None]:
import numpy as np
from sklearn import linear_model, decomposition, datasets
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler

# Pipeline: standardise the features, project with PCA, then classify.
std_slc = StandardScaler()
pca = decomposition.PCA()
# The grid below tries both 'l1' and 'l2'. The default lbfgs solver does not
# support 'l1', so every l1 candidate failed to fit; liblinear handles both.
logistic_Reg = linear_model.LogisticRegression(solver='liblinear')
pipe = Pipeline(steps=[('std_slc', std_slc),
                       ('pca', pca),
                       ('logistic_Reg', logistic_Reg)])

# Search space: number of PCA components, inverse regularisation strength C
# (50 values, log-spaced from 1e-4 to 1e4) and the penalty type.
n_components = list(range(1, x.shape[1] + 1))
C = np.logspace(-4, 4, 50)
penalty = ['l1', 'l2']
parameters = dict(pca__n_components=n_components,
                  logistic_Reg__C=C,
                  logistic_Reg__penalty=penalty)

clf_GS = GridSearchCV(pipe, parameters)
clf_GS.fit(x_train, y_train)

# Report the winning combination and the fitted classifier step.
best_found = clf_GS.best_params_
print('Best Penalty:', best_found['logistic_Reg__penalty'])
print('Best C:', best_found['logistic_Reg__C'])
print('Best Number Of Components:', best_found['pca__n_components'])
print(); print(clf_GS.best_estimator_.get_params()['logistic_Reg'])

### Logistic Regression with the Best Parameters

In [None]:
# Logistic regression refitted with a fixed C and an l2 penalty.
lr = LogisticRegression(penalty='l2', C=0.5689866029018293)
lr.fit(x_train, y_train)

# Score the held-out split; s1 is overwritten with this model's accuracy.
pred_best = lr.predict(x_test)
s1 = accuracy_score(y_test, pred_best)
print("Logistic Regression Success Rate :", "{:.2f}%".format(100*s1))

plot_confusion_matrix(lr, x_test, y_test)
print(classification_report(y_test, pred_best))

plt.show()

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import learning_curve, GridSearchCV, validation_curve

# C value this notebook uses for its tuned logistic regression; it is marked
# on the plot for reference.
best_C = 0.5689866029018293
# Sweep C over several orders of magnitude. A one-element range produces a
# single point per series rather than a curve.
param_range = np.logspace(-4, 4, 9)

plt.figure(figsize=(15, 10))

lr = LogisticRegression(C=best_C)

# Cross-validated train/validation accuracy for every C in param_range.
# Each result has shape (len(param_range), n_folds).
train_scores, test_scores = validation_curve(estimator=lr,
                                             X=x,
                                             y=y,
                                             param_name='C',
                                             param_range=param_range)

# Mean and spread across the folds for each C.
train_mean = np.mean(train_scores, axis=1)
train_std = np.std(train_scores, axis=1)
test_mean = np.mean(test_scores, axis=1)
test_std = np.std(test_scores, axis=1)

plt.subplot(2, 2, 1)
plt.plot(param_range, train_mean, color='blue', marker='o', markersize=5,
         label='training accuracy')
plt.fill_between(param_range, train_mean - train_std, train_mean + train_std,
                 color='blue', alpha=0.15)

plt.plot(param_range, test_mean, color='green', marker='x', markersize=5,
         label='test accuracy')
plt.fill_between(param_range, test_mean - test_std, test_mean + test_std,
                 color='green', alpha=0.15)

plt.axvline(best_C, color='grey', linestyle='--', label='tuned C')

# C spans eight decades, so a log axis keeps the points evenly spaced.
plt.xscale('log')
plt.xlabel('C_parameter')
plt.ylabel('Accuracy')
plt.legend(loc='lower right')
plt.ylim([0.5, 1])

### Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier


In [None]:
# Baseline: random forest with default settings.
rfclassifier = RandomForestClassifier()
rfcmodel = rfclassifier.fit(x_train, y_train)

# Score the held-out split.
y_pred = rfcmodel.predict(x_test)
final_acc = accuracy_score(y_test, y_pred)
print("Random Forest Classifier Success Rate :", "{:.6f}%".format(100*final_acc))

plot_confusion_matrix(rfcmodel, x_test, y_test)
print(classification_report(y_test, y_pred))

In [None]:
# Candidate values for the random-forest grid search.
n_estimators = [100, 300, 500, 800, 1200]
max_depth = [5, 8, 15, 25, 30]
min_samples_split = [2, 5, 10, 15, 100]
min_samples_leaf = [1, 2, 5, 10]

hyperparameters = {
    'n_estimators': n_estimators,
    'max_depth': max_depth,
    'min_samples_split': min_samples_split,
    'min_samples_leaf': min_samples_leaf,
}

# Base estimator for the search; n_estimators is also part of the grid above.
rfc = RandomForestClassifier(
    n_estimators=50,
    max_features='sqrt',
    oob_score=True,
    n_jobs=-1,
)


In [None]:
# Exhaustive 3-fold search over the random-forest grid on all CPU cores.
grid_search = GridSearchCV(
    estimator=rfc,
    param_grid=hyperparameters,
    cv=3,
    verbose=1,
    n_jobs=-1,
)
# fit() returns the fitted search object itself, so best_params refers to
# the same object as grid_search.
best_params = grid_search.fit(x_train, y_train)

In [None]:
# Show the winning hyperparameter combination. The bare name best_params
# holds the fitted GridSearchCV object (the return value of fit), so
# displaying it printed the search object rather than the parameters.
grid_search.best_params_

In [None]:
# Random forest with fixed hyperparameters and a fixed seed.
rfclassifier = RandomForestClassifier(n_estimators=10, max_depth=5, random_state=1)
rfcmodel = rfclassifier.fit(x_train, y_train)

# Score the held-out split; final_acc is overwritten with this model's accuracy.
y_pred = rfcmodel.predict(x_test)
final_acc = accuracy_score(y_test, y_pred)
print("Random Forest Classifier Success Rate :", "{:.6f}%".format(100*final_acc))

plot_confusion_matrix(rfcmodel, x_test, y_test)
print(classification_report(y_test, y_pred))

### SUPPORT VECTOR MACHINE WITH HYPER PARAMETERS

In [None]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report

# Baseline: support vector classifier with a linear kernel.
f_svm = SVC(kernel='linear').fit(x_train, y_train)

# Score the held-out split.
predsvc = f_svm.predict(x_test)
f_acc = accuracy_score(y_test, predsvc)
print("Support Vector Machine Success Rate :", "{:.6f}%".format(100*f_acc))

plot_confusion_matrix(f_svm, x_test, y_test)
print(classification_report(y_test, predsvc))
plt.show()

In [None]:
from sklearn.model_selection import GridSearchCV

# Values of the regularisation parameter C tried for both kernels.
C_values = [0.1, 1, 10, 100, 1000]

# gamma is a kernel coefficient that the linear kernel does not use. With a
# single flat grid every linear candidate was refitted once per gamma value
# with identical results, so the linear kernel gets its own sub-grid.
param_grid = [
    {'kernel': ['rbf'], 'C': C_values, 'gamma': [1, 0.1, 0.01, 0.001, 0.0001]},
    {'kernel': ['linear'], 'C': C_values},
]

# refit=True retrains the best candidate on the whole training split.
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=3)

# Run the search on the training split.
grid.fit(x_train, y_train)

In [None]:
# Print the winning parameter combination, then the refitted estimator.
for found in (grid.best_params_, grid.best_estimator_):
    print(found)

In [None]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report

# RBF-kernel support vector classifier with fixed C and gamma.
f_svm = SVC(kernel='rbf', C=10, gamma=0.0001)
f_svm.fit(x_train, y_train)

# Score the held-out split; f_acc is overwritten with this model's accuracy.
predsvc = f_svm.predict(x_test)
f_acc = accuracy_score(y_test, predsvc)
print("Support Vector Machine Success Rate :", "{:.6f}%".format(100*f_acc))

plot_confusion_matrix(f_svm, x_test, y_test)
print(classification_report(y_test, predsvc))
plt.show()

### KNEIGHBORS

### WITHOUT TUNING

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Baseline: k-nearest-neighbours with default settings.
knn = KNeighborsClassifier().fit(x_train, y_train)

# Score the held-out split.
pred_knn = knn.predict(x_test)
knn_accuracy = accuracy_score(y_test, pred_knn)
print("The Model Accuracy is:", knn_accuracy*100, "%")

plot_confusion_matrix(knn, x_test, y_test)
print(classification_report(y_test, pred_knn))
plt.show()

### WITH TUNING

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Candidate values: leaf_size 1..49, n_neighbors 1..29, Minkowski power p 1..3.
leaf_size = list(range(1, 50))
n_neighbors = list(range(1, 30))
p = [1, 2, 3]
hyperparameters = {'leaf_size': leaf_size, 'n_neighbors': n_neighbors, 'p': p}

# 10-fold exhaustive search over the grid with a fresh KNN estimator.
knn_2 = KNeighborsClassifier()
clf = GridSearchCV(knn_2, hyperparameters, cv=10)
best_model = clf.fit(x_train, y_train)

# Report the settings of the refitted best estimator.
tuned = best_model.best_estimator_.get_params()
print('Best leaf_size:', tuned['leaf_size'])
print('Best p:', tuned['p'])
print('Best n_neighbors:', tuned['n_neighbors'])

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# KNN with fixed hyperparameters.
knn = KNeighborsClassifier(n_neighbors=5, p=1, leaf_size=1)
knn.fit(x_train, y_train)

# Score the held-out split; knn_accuracy is overwritten with this model's accuracy.
pred_knn = knn.predict(x_test)
knn_accuracy = accuracy_score(y_test, pred_knn)
print("The Model Accuracy is:", knn_accuracy*100, "%")

plot_confusion_matrix(knn, x_test, y_test)
print(classification_report(y_test, pred_knn))
plt.show()

### DECISION TREE CLASSIFIER

### WITHOUT TUNING

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Baseline: decision tree with default settings.
classifier = DecisionTreeClassifier().fit(x_train, y_train)

# Score the held-out split; DTAcc is already a percentage.
y_pred = classifier.predict(x_test)
DTAcc = 100 * accuracy_score(y_test, y_pred)
print("The Accuracy of model is", DTAcc)

plot_confusion_matrix(classifier, x_test, y_test)
print(classification_report(y_test, y_pred))
plt.show()


### WITH TUNING

In [None]:
from sklearn import decomposition, datasets
from sklearn import tree
from sklearn.pipeline import Pipeline

# Pipeline: standardise the features, project with PCA, then fit a tree.
std_slc = StandardScaler()
pca = decomposition.PCA()
dec_tree = tree.DecisionTreeClassifier()
pipe = Pipeline(steps=[('std_slc', std_slc),
                       ('pca', pca),
                       ('dec_tree', dec_tree)])

# PCA cannot keep more components than the fitted data has columns, so the
# range comes from x (the frame the split below was made from) and not from
# the wider X used in the feature-ranking cells.
n_components = list(range(1, x.shape[1] + 1))
criterion = ['gini', 'entropy']
max_depth = [2, 4, 6, 8, 10, 12]
parameters = dict(pca__n_components=n_components,
                  dec_tree__criterion=criterion,
                  dec_tree__max_depth=max_depth)

clf_GS = GridSearchCV(pipe, parameters)
# Tune on the training split only, as the other grid searches do, so the
# rows in x_test stay unseen during model selection.
clf_GS.fit(x_train, y_train)

# Report the winning combination and the fitted tree step.
best_found = clf_GS.best_params_
print('Best Criterion:', best_found['dec_tree__criterion'])
print('Best max_depth:', best_found['dec_tree__max_depth'])
print('Best Number Of Components:', best_found['pca__n_components'])
print(); print(clf_GS.best_estimator_.get_params()['dec_tree'])

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Decision tree with fixed hyperparameters: entropy criterion, depth 2.
classifier = DecisionTreeClassifier(max_depth=2, criterion='entropy')
classifier.fit(x_train, y_train)

# Score the held-out split; DTAcc is overwritten with this model's percentage.
y_pred = classifier.predict(x_test)
DTAcc = 100 * accuracy_score(y_test, y_pred)
print("The Accuracy of model is", DTAcc)

plot_confusion_matrix(classifier, x_test, y_test)
print(classification_report(y_test, y_pred))
plt.show()


### FINAL ACCURACIES

In [None]:
# Summary of the accuracies computed above. Each variable holds the value
# from the most recently executed cell that assigned it. s1, final_acc, f_acc
# and knn_accuracy are fractions; DTAcc is already a percentage.
print("Logistic Regression Success Rate :", "{:.2f}%".format(100*s1))
print("Random Forest Classifier Success Rate :", "{:.2f}%".format(100*final_acc))
print("Support Vector Machine Success Rate :", "{:.2f}%".format(100*f_acc))
print("Kneighbors Success Rate:","{:.2f}%".format(knn_accuracy*100))
# Fixed the "Sucess" typo in the printed label.
print("Decision Tree Success Rate","{:.2f}%".format(DTAcc))