In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for full_path in (os.path.join(dirname, name) for name in filenames):
        print(full_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the heart-disease CSV from the Kaggle input directory into a DataFrame.
df=pd.read_csv('/kaggle/input/heart-disease-uci/heart.csv')

In [None]:
# Preview the first rows of the loaded frame.
df.head()


In [None]:
# Count missing values per column.
df.isna().sum()

In [None]:
# Class balance: number of rows for each value of the target column.
df['target'].value_counts()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns


In [None]:
# Bar chart of how many rows fall into each target class.
sns.countplot(data=df, x='target')

In [None]:
# Mean of every column, computed separately for each target class.
df.groupby('target').mean()

In [None]:
# Age against maximum heart rate, drawn as one scatter layer per target class.
diseased = df.target == 1
healthy = df.target == 0
plt.scatter(x=df.age[diseased], y=df.thalach[diseased], c="red")
plt.scatter(x=df.age[healthy], y=df.thalach[healthy])
plt.legend(["Disease", "Not Disease"])
plt.xlabel("Age")
plt.ylabel("Maximum Heart Rate")
plt.show()

In [None]:
# One-hot encode the categorical columns; each dummy column is prefixed with
# the name of the column it came from (cp_*, thal_*, slope_*).
a, b, c = (
    pd.get_dummies(df[column], prefix=column)
    for column in ('cp', 'thal', 'slope')
)

In [None]:
# Append the dummy columns to the right of the existing frame.
# Re-running this cell appends them again, so run it once per kernel session.
frames = [df, a, b, c]
df = pd.concat(frames, axis='columns')
df.head()

In [None]:
# The original categorical columns are now redundant with their dummies.
df = df.drop(['cp', 'thal', 'slope'], axis=1)
df.head()

In [None]:
# Column dtypes of the encoded frame.
df.dtypes

In [None]:
# Features are every column except the label; the label is kept as an array.
X = df.drop(columns='target')
y = df.target.values
y

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Min-max scale every feature column to the [0, 1] range.
#
# The per-column reductions are taken with DataFrame.min()/max() rather than
# np.min(X)/np.max(X): the NumPy wrappers forward axis=None, which newer pandas
# treats as "reduce over the whole frame", collapsing the statistics to a single
# scalar and scaling every feature by the global range.
#
# NOTE: the statistics are computed on the full dataset, before the train/test
# split performed later in the notebook, so test rows influence the scaling.
X = X.astype(float)  # dummy columns may be bool/uint8; arithmetic needs a numeric dtype
col_min = X.min()
col_range = (X.max() - col_min).replace(0, 1)  # constant columns map to 0 instead of NaN
X = (X - col_min) / col_range

In [None]:
# Hold out 30% of the rows for testing; random_state fixes the shuffle so the split is repeatable.
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=42)

In [None]:
# (rows, columns) of the training split.
X_train.shape

In [None]:
import random
import os

In [None]:
def seed_all(seed):
    """Seed the random sources this notebook relies on.

    Sets the ``PYTHONHASHSEED`` environment variable to ``str(seed)`` and seeds
    both Python's ``random`` module and NumPy's global random state.

    Args:
        seed: Value passed to each seeding function.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)


seed = 42
seed_all(seed)

In [None]:
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier

In [None]:
from sklearn.linear_model import LogisticRegression
# Baseline logistic regression (liblinear solver) fitted on the training split.
log_reg=LogisticRegression(solver='liblinear')
log_reg.fit(X_train,y_train)

In [None]:
from sklearn.metrics import accuracy_score,precision_score,confusion_matrix
def print_score(clf, X_train, y_train, X_test, y_test, train=True):
    """Print accuracy, precision and the confusion matrix for one data split.

    Args:
        clf: Fitted classifier exposing ``predict``.
        X_train, y_train: Training features and labels.
        X_test, y_test: Test features and labels.
        train: When truthy, evaluate on the training split; otherwise evaluate
            on the test split.

    Returns:
        None. The metrics are written to standard output.
    """
    # Pick the split once so both cases share one reporting path. Any falsy
    # flag selects the test split (previously a non-bool falsy value such as
    # None matched neither branch and nothing was printed).
    features, labels = (X_train, y_train) if train else (X_test, y_test)
    pred = clf.predict(features)
    print(f'Accuracy:{accuracy_score(labels,pred)*100:.2f}%')
    print(f'Precision Score :{precision_score(labels,pred)*100:.2f}%')
    print(f"Confusion Matrix: \n {confusion_matrix(labels, pred)}\n")

In [None]:
# Report the baseline logistic regression on the training split, then the test split.
for on_train in (True, False):
    print_score(log_reg, X_train, y_train, X_test, y_test, train=on_train)

In [None]:
from sklearn.neighbors import KNeighborsClassifier
# k-nearest-neighbours with default settings: fit, then report train and test scores.
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
for on_train in (True, False):
    print_score(knn, X_train, y_train, X_test, y_test, train=on_train)

In [None]:
from sklearn.svm import SVC
# RBF-kernel support vector classifier: fit, then report train and test scores.
svm_model = SVC(C=1.0, kernel='rbf', gamma=0.1)
svm_model.fit(X_train, y_train)
for on_train in (True, False):
    print_score(svm_model, X_train, y_train, X_test, y_test, train=on_train)

In [None]:
from sklearn.tree import DecisionTreeClassifier
# Decision tree with a fixed random_state: fit, then report train and test scores.
tree = DecisionTreeClassifier(random_state=42)
tree.fit(X_train, y_train)
for on_train in (True, False):
    print_score(tree, X_train, y_train, X_test, y_test, train=on_train)

In [None]:
from xgboost import XGBClassifier
# Gradient-boosted trees (XGBoost) with default settings.
xgb = XGBClassifier()
xgb.fit(X_train, y_train)

# Report on the training split first, then on the held-out test split.
for on_train in (True, False):
    print_score(xgb, X_train, y_train, X_test, y_test, train=on_train)

In [None]:
from sklearn.model_selection import GridSearchCV
# Search 20 log-spaced values of the regularisation strength C (1e-4 .. 1e4)
# for a liblinear logistic regression, scored by 5-fold cross-validated accuracy.
param = {'C': np.logspace(-4, 4, 20), 'solver': ['liblinear']}
log_reg = LogisticRegression()
# The `iid` keyword is no longer passed: it was removed from GridSearchCV in
# scikit-learn 0.24, and supplying it raises TypeError on current versions.
grid_search_cv = GridSearchCV(
    log_reg,
    param,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1,
    cv=5,
)

In [None]:
# Run the cross-validated grid search on the training split only.
grid_search_cv.fit(X_train,y_train)

In [None]:
# Show the estimator configuration that scored best in the search.
grid_search_cv.best_estimator_

In [None]:
# Refit logistic regression with the hyper-parameters selected by the grid
# search. They are read from `best_params_` instead of being pasted in as a
# literal, so this cell stays in sync if the data, split or search grid changes.
log_reg = LogisticRegression(**grid_search_cv.best_params_)
log_reg.fit(X_train, y_train)
print_score(log_reg, X_train, y_train, X_test, y_test, train=True)
print_score(log_reg, X_train, y_train, X_test, y_test, train=False)

In [None]:
# Sweep n_neighbors from 1 to 20 and record the accuracy on both splits.
# NOTE(review): the test split is scored for every k here; choosing k from
# these numbers means the test set is no longer an unbiased estimate.
neighbors = range(1, 21)
train_score, test_score = [], []
for k in neighbors:
    model = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)
    train_score.append(accuracy_score(y_train, model.predict(X_train)))
    test_score.append(accuracy_score(y_test, model.predict(X_test)))

In [None]:
# Train and test accuracy as a function of the number of neighbours.
fig, ax = plt.subplots(figsize=(12, 8))
ax.plot(neighbors, train_score, label='Train score')
ax.plot(neighbors, test_score, label='Test score')
ax.set_xticks(np.arange(1, 21, 1))
ax.set_xlabel("Number of neighbors")
ax.set_ylabel("Model score")
ax.legend()

print(f"Maximum KNN score on the test data: {max(test_score)*100:.2f}%")

In [None]:
# KNN with a fixed n_neighbors=18.
# NOTE(review): 18 is presumably taken from the sweep plotted earlier; confirm
# it still matches the best k if the data or split changes.
knn_classifier = KNeighborsClassifier(n_neighbors=18)
knn_classifier.fit(X_train, y_train)

# Report on the training split first, then on the held-out test split.
for on_train in (True, False):
    print_score(knn_classifier, X_train, y_train, X_test, y_test, train=on_train)