In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import numpy as np
import warnings; warnings.simplefilter('ignore')

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score,f1_score,confusion_matrix , classification_report
from sklearn.model_selection import KFold , cross_val_score
from numpy import mean
from numpy import std
from sklearn.model_selection import GridSearchCV

import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the heart-attack dataset into a DataFrame and preview its first rows.
data_path = '/kaggle/input/heart-attack-analysis-prediction-dataset/heart.csv'
df = pd.read_csv(data_path)
df.head()


In [None]:
# Pairwise correlation between every pair of columns in the dataset.
cor_mat = df.corr()
top_corr_features = cor_mat.index

# Plot the correlation matrix itself. Calling .corr() again on cor_mat would
# show the correlation *of the correlation coefficients*, not of the features.
fig, ax = plt.subplots(figsize=(15, 10))
sns.heatmap(cor_mat, annot=True, cmap="coolwarm", ax=ax)
ax.set_title("Feature correlation matrix");


In [None]:
# Split the frame into the target column and the remaining predictor columns.
y = df['output']
X = df.drop(columns=['output'])

# Fit a chi-squared univariate selector so each predictor gets a score.
bestfeatures = SelectKBest(score_func=chi2, k=10)
fit = bestfeatures.fit(X, y)

# Put column names and scores side by side, then show the highest scores first.
dfcolumns = pd.DataFrame(X.columns)
dfscores = pd.DataFrame(fit.scores_)
featureScores = pd.concat([dfcolumns, dfscores], axis=1).set_axis(
    ['features', 'Score'], axis=1
)
display(featureScores.nlargest(15, 'Score'))

    

In [None]:
# Splitting the dataframe into training and testing sets.
# Only a hand-picked subset of predictor columns is used from here on.
X = df[['thalachh', 'oldpeak', 'caa', 'cp', 'exng']]

# test_size is the fraction held out for testing: 0.3 keeps 70% of the rows
# for training (0.7 would train on only 30% of the data).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=17, shuffle=True
)

# Evaluating the model using k-fold cross validation.
# Seed the forest so repeated runs of the notebook give the same scores.
rf = RandomForestClassifier(random_state=17)
cv = KFold(n_splits=5, random_state=1, shuffle=True)

# The metric is F1 (scoring='f1'), so the summary is labelled as F1, not accuracy.
scores = cross_val_score(rf, X_train, y_train, scoring='f1', cv=cv, n_jobs=-1)
print(f'Mean F1 : {mean(scores):.2f}  Std Deviation : {std(scores):.2f}')
print(scores)


In [None]:
# Finding the best hyper-parameters with an exhaustive grid search, scored by F1.
# 'auto' is intentionally absent from max_features: for classifiers it was an
# alias of 'sqrt' (a duplicate candidate) and recent scikit-learn releases
# reject it as an invalid value.
parameters = [{
    'n_estimators': [10, 50, 100, 150, 200, 250],
    'criterion': ['gini', 'entropy'],
    'max_features': ['sqrt', 'log2'],
}]
clf = GridSearchCV(rf, parameters, scoring='f1')
clf.fit(X_train, y_train)

In [None]:
# Predict on the held-out split with the tuned model and print per-class metrics.
y_pred = clf.predict(X_test)
report = classification_report(y_test, y_pred)
print(report)

In [None]:
# Confusion matrix and analysis.
# False negatives must be kept low: we cannot risk classifying a positive
# patient as negative. False positives are more acceptable because they can
# be corrected afterwards.
test_predictions = clf.predict(X_test)
cm = confusion_matrix(y_test, test_predictions)
cm