In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report as rep
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC

In [None]:
# Read the heart-failure clinical records CSV into a DataFrame.
DATA_PATH = "../input/heart-failure-clinical-data/heart_failure_clinical_records_dataset.csv"
heart_data = pd.read_csv(DATA_PATH)
heart_data.head()  # preview the first five rows

In [None]:
# (number of rows, number of columns) of the loaded frame.
heart_data.shape

In [None]:
# Data type pandas inferred for each column when parsing the CSV.
heart_data.dtypes

In [None]:
# Count of missing values per column.
heart_data.isnull().sum()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
heart_data.describe()

In [None]:
# Pairwise correlation matrix of the columns (pandas default: Pearson).
# NOTE(review): assumes every column is numeric — recent pandas versions raise
# on non-numeric columns unless numeric_only=True is passed; confirm via dtypes.
heart_data.corr()

In [None]:
# Class balance of the DEATH_EVENT target.
# The column is passed by keyword: newer seaborn releases treat the first
# positional argument as `data` rather than `x`, so a positional Series either
# triggers a deprecation warning or is plotted as a single wide-form bar.
sns.countplot(x='DEATH_EVENT', data=heart_data)

In [None]:
# Min-max scale the feature columns only. The DEATH_EVENT target is a class
# label, not a feature, so it is kept out of the scaler and re-attached
# unchanged instead of being converted to floats.
# NOTE(review): the scaler is still fit on every row before the train/test
# split, so test-set minima/maxima influence the training features. Fitting on
# the training rows only would also require changing the split cell.
features = heart_data.drop(columns=['DEATH_EVENT'])
sc = MinMaxScaler()
d1 = sc.fit_transform(features)
data = pd.DataFrame(d1, columns=features.columns, index=features.index)
data['DEATH_EVENT'] = heart_data['DEATH_EVENT']
data.head()

In [None]:
# Split the scaled frame into the response column and the predictor matrix.
y = data['DEATH_EVENT']                 # response variable
x = data.drop(columns=['DEATH_EVENT'])  # features
(x.shape, y.shape)

In [None]:
# Hold out a test split, stratified on the target so both parts keep the same
# class ratio; the fixed seed makes the split reproducible.
train_x, test_x, train_y, test_y = train_test_split(
    x,
    y,
    stratify=y,
    random_state=56,
)
tuple(part.shape for part in (train_x, train_y, test_x, test_y))

Logistic Regression

In [None]:
# Fit a logistic-regression baseline and predict on both splits.
logreg = LogisticRegression()
logreg.fit(train_x, train_y)
train_pred = logreg.predict(train_x)
test_pred = logreg.predict(test_x)
# Keep the result under its own name: binding it to `confusion_matrix` would
# shadow the imported sklearn function, and re-running this cell would then
# fail with "'numpy.ndarray' object is not callable".
logreg_cm = confusion_matrix(test_y, test_pred)
print(logreg_cm)

In [None]:
# Per-class precision / recall / F1 of the logistic regression on the test split.
# Predictions are recomputed from `logreg` because the shared name `test_pred`
# is reassigned by every later model cell; reading it here would make the
# report describe whichever model ran last when this cell is re-executed.
print(rep(test_y, logreg.predict(test_x)))

In [None]:
# Train / test accuracy of the logistic regression.
# Predictions come straight from `logreg` rather than from the shared
# `train_pred` / `test_pred` names, which later model cells overwrite.
logreg_train_acc = accuracy_score(train_y, logreg.predict(train_x))
logreg_test_acc = accuracy_score(test_y, logreg.predict(test_x))
logreg_train_acc, logreg_test_acc

In [None]:
# Logistic regression using 10-fold cross-validation.
# Show the mean and standard deviation across folds: the best single fold is
# an optimistic outlier, not an estimate of generalisation performance.
logreg_score = cross_val_score(logreg, X=x, y=y, cv=10)
logreg_score.mean(), logreg_score.std()

In [None]:
# Bar chart of the fitted logistic-regression coefficients, one bar per feature.
plt.figure(figsize=(8,6),dpi=120,facecolor='w',edgecolor='b')
# Use a dedicated name for the bar positions: reusing `x` here would overwrite
# the feature matrix that the cross-validation cells rely on.
positions = list(range(len(train_x.columns)))
c = logreg.coef_.reshape(-1)  # flatten the (1, n_features) coefficient array
plt.bar(positions, c)
plt.xticks(positions, train_x.columns, rotation=90)  # label bars with feature names
plt.xlabel('Variables')
plt.ylabel('Coefficients')
plt.title('coefficient plot')

In [None]:
# Table of absolute coefficient size per feature; `c` is the flattened
# coefficient vector built in the coefficient-plot cell.
coef_magnitude = abs(c)
Coefficients = pd.DataFrame({'Variable': train_x.columns, 'coefficient': coef_magnitude})
Coefficients.head()

K-Nearest Neighbors Classifier

In [None]:
# KNN: pick the neighbour count with 10-fold cross-validation on the training split.
# (cross_val_score is already imported in the notebook's import cell.)
def Val_score(n_neighbors):
    """Cross-validate a KNN classifier for each candidate neighbour count.

    Parameters
    ----------
    n_neighbors : iterable of int
        Candidate values for ``KNeighborsClassifier(n_neighbors=...)``.

    Returns
    -------
    tuple of (list, list)
        Mean and standard deviation of the 10-fold cross-validation score,
        one entry per candidate, in the order the candidates were given.

    Notes
    -----
    Reads the notebook-level ``train_x`` and ``train_y``.
    """
    avg = []
    std = []
    for k in n_neighbors:
        # 10-fold cross-validation for this value of n_neighbors
        score = cross_val_score(KNeighborsClassifier(n_neighbors=k), X=train_x, y=train_y, cv=10)
        avg.append(score.mean())
        std.append(score.std())
    return avg, std


n_neighbors = range(1, 25)
mean, std = Val_score(n_neighbors)

In [None]:
# Mean cross-validation score as a function of the neighbour count.
fig, ax = plt.subplots()
ax.plot(n_neighbors, mean, color='green', label='mean')
ax.set_xlabel('n_neighbors')
ax.set_ylabel('Mean Score')
ax.set_title('Mean validation score')

In [None]:
# Fold-to-fold spread of the cross-validation score for each neighbour count.
fig, ax = plt.subplots()
ax.plot(n_neighbors, std, color='red', label='Standard deviation')
ax.set_xlabel('n_neighbors')
ax.set_ylabel('magnitude')
ax.set_title('Standard deviation of validation score')

In [None]:
# Fit the KNN classifier with k = 13 and measure accuracy on both splits.
# NOTE(review): 13 is presumably read off the validation curves — confirm.
clf = KNeighborsClassifier(n_neighbors=13).fit(train_x, train_y)
train_pred, test_pred = clf.predict(train_x), clf.predict(test_x)
knn_train_acc = accuracy_score(train_y, train_pred)
knn_test_acc = accuracy_score(test_y, test_pred)
(knn_train_acc, knn_test_acc)

In [None]:
# 10-fold cross-validation of the KNN classifier on the full scaled data set.
# x / y are rebuilt from `data` so the cell does not depend on whatever `x`
# was last bound to.
x = data.drop(['DEATH_EVENT'], axis=1)
y = data['DEATH_EVENT']
knn_score = cross_val_score(clf, X=x, y=y, cv=10)
# Mean and spread across folds; the best single fold overstates performance.
knn_score.mean(), knn_score.std()

Support vector machine

In [None]:
# Linear-kernel support vector classifier: fit, predict, and score both splits.
sv = SVC(kernel='linear').fit(train_x, train_y)
train_pred, test_pred = sv.predict(train_x), sv.predict(test_x)
sv_train_acc = accuracy_score(train_y, train_pred)
sv_test_acc = accuracy_score(test_y, test_pred)
(sv_train_acc, sv_test_acc)

In [None]:
# 10-fold cross-validation of the linear SVM on the full scaled data set.
# x / y are rebuilt from `data` so the cell does not depend on whatever `x`
# was last bound to.
x = data.drop(['DEATH_EVENT'], axis=1)
y = data['DEATH_EVENT']
sv_score = cross_val_score(sv, X=x, y=y, cv=10)
# Mean and spread across folds; the best single fold overstates performance.
sv_score.mean(), sv_score.std()

Decision Tree

In [None]:
# Unconstrained decision tree (fixed seed) as a first attempt.
dt_model = DecisionTreeClassifier(random_state=10).fit(train_x, train_y)
train_pred, test_pred = dt_model.predict(train_x), dt_model.predict(test_x)
dt_train_acc = accuracy_score(train_y, train_pred)
dt_test_acc = accuracy_score(test_y, test_pred)
(dt_train_acc, dt_test_acc)

There is a large gap between the train and test accuracy scores, which indicates that the unconstrained tree is overfitting the training data. The decision tree's hyperparameters therefore need to be tuned.

In [None]:
# Sweep max_depth and compare accuracy on the training and held-out splits.
depths = range(1, 5)
train_accuracy = []
validation_accuracy = []
for depth in depths:
    dt_model = DecisionTreeClassifier(max_depth=depth, random_state=10)
    dt_model.fit(train_x, train_y)
    train_accuracy.append(dt_model.score(train_x, train_y))
    # NOTE(review): the "validation" score is computed on the test split, so a
    # depth chosen from this curve is tuned on the same data later used to
    # report test accuracy.
    validation_accuracy.append(dt_model.score(test_x, test_y))

frame = pd.DataFrame({'max depth': depths, 'train_acc': train_accuracy, 'valid_acc': validation_accuracy})
plt.figure(figsize=(12,6))
# Each curve needs a label; without one plt.legend() has nothing to show.
plt.plot(frame['max depth'], frame['train_acc'], marker='o', label='train_acc')
plt.plot(frame['max depth'], frame['valid_acc'], marker='o', label='valid_acc')
plt.xlabel('depth of tree')
plt.ylabel('performance')
plt.legend()

The highest validation score is obtained at a maximum depth of 3.

In [None]:
# Refit the decision tree at the selected depth (max_depth=3) and score both splits.
dt_model = DecisionTreeClassifier(max_depth=3, random_state=10).fit(train_x, train_y)
train_pred, test_pred = dt_model.predict(train_x), dt_model.predict(test_x)
dt_train_acc = accuracy_score(train_y, train_pred)
dt_test_acc = accuracy_score(test_y, test_pred)
(dt_train_acc, dt_test_acc)

In [None]:
# 10-fold cross-validation of the depth-limited tree on the full scaled data set.
# x / y are rebuilt from `data` so the cell does not depend on whatever `x`
# was last bound to.
x = data.drop(['DEATH_EVENT'], axis=1)
y = data['DEATH_EVENT']
dt_score = cross_val_score(dt_model, X=x, y=y, cv=10)
# Mean and spread across folds; the best single fold overstates performance.
dt_score.mean(), dt_score.std()

In [None]:
# Side-by-side test accuracy of the four fitted models.
print("Accuracies :-")
for model_label, test_acc in (
    ("Logistic Regression: ", logreg_test_acc),
    ("KNearestNeighbor classifier: ", knn_test_acc),
    ("SVM classifier: ", sv_test_acc),
    ("Decision Tree: ", dt_test_acc),
):
    print(model_label, test_acc)

In [None]:
# Side-by-side 10-fold cross-validation results.
# The mean over folds is reported; the maximum is the single luckiest fold and
# makes every model look better than it generalises.
print("Cross validation scores :-")
print("Logistic Regression: ",logreg_score.mean())
print("KNearestNeighbor classifier: ",knn_score.mean())
print("SVM classifier: ",sv_score.mean())
print("Decision Tree: ",dt_score.mean())