In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from matplotlib import pyplot as plt
import seaborn as sns

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Introduction

* Age : Age of the patient

* Sex : Sex of the patient

* exng : exercise-induced angina (1 = yes; 0 = no)

* caa : number of major vessels (0-3)

* cp : chest pain type

   - Value 1: typical angina
   - Value 2: atypical angina
   - Value 3: non-anginal pain
   - Value 4: asymptomatic
* trtbps : resting blood pressure (in mm Hg)

* chol : cholesterol in mg/dl fetched via BMI sensor

* fbs : (fasting blood sugar > 120 mg/dl) (1 = true; 0 = false)

* restecg : resting electrocardiographic results

   - Value 0: normal
   - Value 1: having ST-T wave abnormality (T wave inversions and/or ST elevation or depression of > 0.05 mV)
   - Value 2: showing probable or definite left ventricular hypertrophy by Estes' criteria
* thalachh : maximum heart rate achieved

* output : 0 = less chance of heart attack, 1 = more chance of heart attack

# Import Data

In [None]:
# Read the heart-attack CSV from the Kaggle input directory into a DataFrame.
heart_csv_path = "/kaggle/input/heart-attack-analysis-prediction-dataset/heart.csv"
heart_data = pd.read_csv(heart_csv_path)

In [None]:
# Column names, dtypes, non-null counts and memory usage of the loaded frame.
heart_data.info()


In [None]:
# Peek at the first five rows.
heart_data.head()

In [None]:
# Summary statistics for every numeric column.
# `describe` has to be called: the bare attribute only displays the
# bound-method repr, not the statistics table.
heart_data.describe()

In [None]:
# Size of the dataset as (number of rows, number of columns).
heart_data.shape

In [None]:
# Check for null values: count the missing entries in each column.
heart_data.isna().sum()

* We do not have any null values.

In [None]:
# Show the first ten rows to get a feel for the values in each column.
heart_data.head(10)

* It is important to understand the data

In [None]:
# How many rows fall into each distinct value of the `caa` column.
heart_data["caa"].value_counts()

# Visualization
* To see the distribution of the data better
* To see outliers, if any
* What do I wonder about the data?

In [None]:
# Pairwise scatter plots (with per-column distributions on the diagonal) for all columns.
sns.pairplot(heart_data)

In [None]:
# Cholesterol against age, with points coloured by the `output` class.
sns.scatterplot(
    data=heart_data,
    x="chol",
    y="age",
    hue="output",
)

* We have outliers.

* Writing a separate scatter plot for every column is tedious; a for loop is more practical.

In [None]:
# Scatter every column that remains after the drop below against age,
# colouring the points by the `output` class; one figure per column.
# The column index is stored as `numeric_columns` rather than `list`, so the
# built-in `list` type is not shadowed for the rest of the kernel session.
data_column=heart_data.drop(["age","fbs","sex","cp","restecg","exng","slp","caa","thall","output"],axis=1)
numeric_columns=data_column.columns
for column_name in numeric_columns:
    sns.scatterplot(x="age",y=column_name,data=heart_data,hue="output")
    plt.show()
    


* Looking at the scatter plots, logistic regression is unlikely to be effective on this dataset.

In [None]:
# Re-display the first rows as a reminder of the column names used in the plots below.
heart_data.head()

In [None]:
# Is it more dangerous for women or for men?
# Stacked histogram of `sex`, split by the `output` class.
sns.displot(heart_data,x="sex",hue="output",multiple="stack")

In [None]:
# Number of rows for each value of `sex`, to put the stacked plot above in context.
heart_data["sex"].value_counts()

In [None]:
# Effect on heart attack of fbs above 120 (fbs == 1) and chol.
# NOTE(review): the plot below counts `thall` values, not `chol` - confirm which was intended.
# Use catplot() to combine a countplot() and a FacetGrid. This allows grouping within additional categorical variables.
# Using catplot() is safer than using FacetGrid directly, as it ensures synchronization of variable order across facets:
ax=sns.catplot(x="thall",hue="fbs",col="output",kind="count",data=heart_data)


In [None]:
# Count plots, split by the `output` class, for every column left after
# removing the target and the columns listed below; one figure per column.
excluded_columns = ["output", "age", "trtbps", "chol", "thalachh", "oldpeak"]
hd_columns = heart_data.drop(columns=excluded_columns)
columns = hd_columns.columns
for column_name in columns:
    ax = sns.countplot(data=heart_data, x=column_name, hue="output")
    plt.show()

# Outlier Detection

In [None]:
# Upper outlier cut-off: the 99th percentile of `chol`.
max_threshold=heart_data["chol"].quantile(0.99)
max_threshold

In [None]:
# Rows whose `chol` lies strictly above the upper cut-off.
heart_data[heart_data["chol"]>max_threshold]

In [None]:
# Lower outlier cut-off: the 1st percentile of `chol`.
min_threshold=heart_data["chol"].quantile(0.01)
min_threshold

In [None]:
# Keep only the rows whose `chol` lies strictly between the two cut-offs
# (rows equal to either threshold are excluded as well).
below_upper = heart_data["chol"] < max_threshold
above_lower = heart_data["chol"] > min_threshold
df = heart_data[below_upper & above_lower]
df

In [None]:
# Ten randomly chosen rows of the filtered frame (unseeded, so they differ per run).
df.sample(10)

In [None]:
# First five rows of the filtered frame.
df.head()

# Preprocessing

In [None]:
from sklearn.preprocessing import StandardScaler
# Work on an independent copy so the filtered frame `df` itself is never modified.
df_s=df.copy()

In [None]:
# Separate the target column (`output`) from the feature columns.
Y = df_s["output"]
X = df_s.drop(columns=["output"])

In [None]:
# Hold out 20% of the rows for testing; the seed makes the split repeatable.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Baseline random forest trained on every feature.
# The seed is fixed (42, as for the train/test split) so that the fitted
# forest, and the feature importances read from it later, are identical on
# every Restart & Run All.
model=RandomForestClassifier(n_estimators=700,random_state=42).fit(X_train,y_train)
prdes=model.predict(X_test)


In [None]:
import matplotlib.pyplot as plt
# Pair each importance score from the fitted forest with its column name
# and draw the scores as a horizontal bar chart.
feature_imp = pd.Series(data=model.feature_importances_, index=X.columns)
# With many features, plot only feature_imp.nlargest(10) instead.
feature_imp.plot.barh()

In [None]:
# Disabled experiment: standardize selected columns one by one, or all of X at once.
#list=df[["age","trtbps","chol","thalachh"]]#get multiple columns
#for i in list:
   # df_s[i]=StandardScaler().fit_transform(df_s[[i]])
#scaler=StandardScaler()
#X_s=pd.DataFrame(scaler.fit_transform(X))# without the DataFrame wrapper the result is a plain array.
    
    



## Feature Selection

In [None]:
# Display the training features produced by the split.
X_train

In [None]:
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier


In [None]:
# Names of the eight features with the highest random-forest importance.
best_features=feature_imp.nlargest(8).index
best_features

In [None]:
# Restrict the feature matrix to the selected columns (all rows kept).
X_reduced = X.loc[:, best_features]
X_reduced


In [None]:
# Split first, then fit the scaler on the training rows only.
# Fitting StandardScaler on the full matrix before splitting lets the
# test-set mean/variance leak into preprocessing and makes the test scores
# optimistic. The split arguments are unchanged, so the same rows land in
# train and test as before.
Xrd_train_raw,Xrd_test_raw,yrd_train,yrd_test=train_test_split(X_reduced,Y,test_size=0.2,random_state=42)
rd_scaler=StandardScaler().fit(Xrd_train_raw)
Xrd_train=rd_scaler.transform(Xrd_train_raw)
Xrd_test=rd_scaler.transform(Xrd_test_raw)
# Kept so the name still exists: every row scaled with the train-fitted statistics.
Xrd_scale=rd_scaler.transform(X_reduced)

In [None]:
# Random forest retrained on the reduced, scaled feature set.
# The seed is fixed (42, as for the train/test split) so the fit and its
# predictions are reproducible across runs.
rd_model=RandomForestClassifier(n_estimators=700,random_state=42).fit(Xrd_train,yrd_train)
rpreds=rd_model.predict(Xrd_test)

In [None]:
# Annotated heatmap of the absolute pairwise correlations between the selected features.
# dpi sets the resolution of the figure.
plt.figure(figsize=(10, 10), dpi=400)
abs_corr = X_reduced.corr().abs()
sns.heatmap(abs_corr, annot=True)


# Modelling and Model Fine-Tuning

In [None]:
# KNN baseline: 7 neighbours, trained on the reduced training split.
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=7).fit(Xrd_train, yrd_train)
preds = knn.predict(Xrd_test)

In [None]:
# Accuracy, F1 score and confusion matrix of the KNN predictions on the test split.
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
knn_accuracy = accuracy_score(yrd_test, preds)
knn_f1 = f1_score(yrd_test, preds)
knn_confusion = confusion_matrix(yrd_test, preds)
print(f"Accuracy for knn : {knn_accuracy}")
print(f"F1 score for knn : {knn_f1}")
print(f"confusion matrix for knn : {knn_confusion}")

In [None]:
# Search for a good k: fit one KNN per k in 1..14 and record its accuracy
# on both the training and the test split.
train_score, test_score = [], []

for k in range(1, 15):
    knn = KNeighborsClassifier(n_neighbors=k).fit(Xrd_train, yrd_train)
    train_score.append(knn.score(Xrd_train, yrd_train))
    test_score.append(knn.score(Xrd_test, yrd_test))



In [None]:
# Train vs. test accuracy for each k tried (1..14).
# x and y are passed by keyword: seaborn >= 0.12 no longer accepts them
# positionally and raises a TypeError.
plt.figure(figsize=(12,5))
k_values = list(range(1,15))
p = sns.lineplot(x=k_values,y=train_score,marker='*',label='Train Score')
p = sns.lineplot(x=k_values,y=test_score,marker='o',label='Test Score')
p.set(xlabel='n_neighbors (k)',ylabel='Accuracy');


* The best k is 11

In [None]:
from sklearn.neighbors import KNeighborsClassifier
# Refit KNN with k = 11 and predict the test split.
knn_2 = KNeighborsClassifier(n_neighbors=11)
knn_2.fit(X=Xrd_train, y=yrd_train)
preds = knn_2.predict(X=Xrd_test)

In [None]:
# 10-fold cross-validated accuracy of the k = 11 model on the training split.
from sklearn.model_selection import cross_val_score
scores = cross_val_score(
    estimator=knn_2,
    X=Xrd_train,
    y=yrd_train,
    scoring="accuracy",
    cv=10,
)
scores


In [None]:
# Average accuracy over the cross-validation folds.
scores.mean()

In [None]:
from sklearn.model_selection import GridSearchCV
# Grid search over the KNN hyper-parameters: neighbour count (1..49),
# distance metric and vote weighting, scored by 5-fold accuracy.
param_grid = {
    'n_neighbors': np.arange(1, 50),
    'metric': ['euclidean', 'manhattan'],
    'weights': ['uniform', 'distance'],
}
knn_3 = KNeighborsClassifier()
knn_cv = GridSearchCV(estimator=knn_3, param_grid=param_grid, scoring='accuracy', cv=5)
knn_cv.fit(Xrd_train, yrd_train)

print("Best Score:", knn_cv.best_score_, sep="")
print("Best Parameters: ", knn_cv.best_params_, sep="")

In [None]:
# Mean cross-validated accuracy for every parameter combination tried.
# The search is scored with 'accuracy' (non-negative, higher is better), so
# the value is printed as-is. The previous np.sqrt(-mean_score) is the recipe
# for neg-MSE regression scores and produced NaN for every row here.
cvres = knn_cv.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(mean_score, params)

In [None]:
from sklearn.tree import DecisionTreeClassifier
# Base decision tree handed to the grid search; seeded so results are repeatable.
dt=DecisionTreeClassifier(random_state=42)

In [None]:
# Grid search (5-fold) over the decision tree's size controls.
parameter = {
    'max_leaf_nodes': range(2, 10),
    'max_depth': range(1, 7),
    'min_samples_split': [2, 3, 4],
}
grid_searchDT = GridSearchCV(estimator=dt, param_grid=parameter, cv=5)
#preds3=grid_searchDT.predict(Xrd_test)
grid_searchDT.fit(Xrd_train, yrd_train)

In [None]:
# Parameter combination with the best mean cross-validated score.
grid_searchDT.best_params_

In [None]:
# The tree refit with the best parameter combination.
grid_searchDT.best_estimator_

In [None]:
# Mean cross-validated score achieved by the best parameter combination.
grid_searchDT.best_score_

In [None]:
# Try with cross-validation.
# The GridSearchCV object itself is the estimator here, so the whole search is
# re-run inside each of the 10 outer folds (nested cross-validation).
from sklearn.model_selection import cross_val_score
scores=cross_val_score(grid_searchDT,Xrd_train,yrd_train,cv=10,scoring="accuracy")


In [None]:
# Average accuracy over the 10 outer folds.
scores.mean()

In [None]:
# Fit a single decision tree with explicitly chosen size limits.
tree = DecisionTreeClassifier(
    max_depth=4,
    max_leaf_nodes=8,
    min_samples_split=2,
    random_state=42,
)
tree.fit(Xrd_train, yrd_train)

In [None]:
from sklearn.tree import export_graphviz
# Write the fitted tree to tree.dot.
# The tree was trained on the columns of X[best_features] in that order, so
# those names label the split nodes instead of the anonymous x[i] that
# feature_names=None produces. Split thresholds are in standardized units.
export_graphviz(tree, out_file="tree.dot", class_names=["0", "1"],
                feature_names=list(best_features), impurity=False, filled=True)

In [None]:
import graphviz

# Read the exported DOT description back in and render it inline.
with open("tree.dot") as dot_file:
    dot_graph = dot_file.read()
tree_graph = graphviz.Source(dot_graph)
display(tree_graph)

In [None]:
import sklearn
# List the scorer names accepted by the `scoring=` argument.
# `sklearn.metrics.SCORERS` was removed in scikit-learn 1.3; the supported
# replacement is `get_scorer_names()`. The old dictionary is used only as a
# fallback on releases that predate that function.
try:
    from sklearn.metrics import get_scorer_names
    scorer_names = get_scorer_names()
except ImportError:
    scorer_names = sorted(sklearn.metrics.SCORERS.keys())
scorer_names

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Grid search (5-fold, accuracy) over two random-forest grids: the first with
# default bootstrapping, the second with bootstrap disabled.
rf = RandomForestClassifier(random_state=42)
rf_param_grid = [
    {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},
    {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]},
]

# `params` holds the search object; the cells below read its best_* attributes.
params = GridSearchCV(estimator=rf, param_grid=rf_param_grid, scoring='accuracy', cv=5)
params.fit(Xrd_train, yrd_train)


In [None]:
# Parameter combination with the best mean cross-validated accuracy.
params.best_params_

In [None]:
# The forest refit with the best parameter combination.
params.best_estimator_

In [None]:
# Mean cross-validated accuracy achieved by the best parameter combination.
params.best_score_

In [None]:
from sklearn.model_selection import cross_val_score
# `params` is the GridSearchCV object, so the whole search is re-run inside
# each of the 10 outer folds (nested cross-validation).
scoresRF=cross_val_score(params,Xrd_train,yrd_train,cv=10,scoring="accuracy")

In [None]:
# Average accuracy over the 10 outer folds.
scoresRF.mean()