In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# The packages used by this notebook are imported in the cells below


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# import libraries
import numpy as np 
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt


from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report, roc_curve, roc_auc_score, confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV


In [None]:
import warnings

# Silence library warnings so they do not clutter the notebook output.
# NOTE(review): this also hides deprecation and convergence warnings;
# consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

# Apply the whitegrid style and the pastel palette in one call.
# (Assigning a string to `sns.set_palette` would replace the seaborn function
# instead of calling it, so the palette has to be passed as an argument.)
sns.set(style='whitegrid', palette='pastel')

In [None]:
# Load the heart-attack dataset from the Kaggle input directory and preview it.
DATA_PATH = '../input/heart-attack-analysis-prediction-dataset/heart.csv'
df = pd.read_csv(DATA_PATH)
df.head()

In [None]:
# (number of rows, number of columns) of the loaded frame
df.shape

In [None]:
# Print a summary of the frame: column names, dtypes and non-null counts
df.info()

In [None]:
# Summary statistics, transposed so that each row describes one column
df.describe().T

In [None]:
# Number of missing values per column
df.isnull().sum()

In [None]:
# Pairwise correlation between all columns, drawn as an annotated heatmap
fig, ax = plt.subplots(figsize=(14, 8))
corr_matrix = df.corr()
sns.heatmap(
    corr_matrix,
    vmin=-1,
    vmax=1,
    center=0,
    annot=True,
    cmap='Blues',
    ax=ax,
)
plt.show()

In [None]:
# Histogram of patient age.
# `sns.displot` is a figure-level function: it always creates its own figure,
# so a figure opened beforehand with `plt.figure(figsize=...)` is left empty
# and its size is not used. The size is set through `height`/`aspect` instead
# (height 10 in x aspect 1.4 = 14 in wide, i.e. a 14 x 10 figure).
sns.displot(x=df["age"], height=10, aspect=1.4)
plt.title("Distribution of Age", fontsize=20)
plt.show()

In [None]:
# Histogram of the blood pressure column (`trtbps`)
sns.displot(x=df["trtbps"])
hist_ax = plt.gca()  # the axes that displot has just drawn on
hist_ax.set_title("Distribution of Blood Pressure")
hist_ax.set_xlabel("Blood Pressure", fontsize=10)
hist_ax.set_ylabel("Count", fontsize=10)
plt.show()

In [None]:
# Line plot of blood pressure (`trtbps`) against age
fig, ax = plt.subplots(figsize=(10, 7))
sns.lineplot(data=df, x="age", y="trtbps", ax=ax)
ax.set_title("BLOOD PRESSURE WITH AGE", fontsize=10)
ax.set_xlabel("AGE", fontsize=10)
ax.set_ylabel("BLOOD PRESSURE", fontsize=10)
plt.show()

In [None]:
# Number of patients per `sex` value, split by the `output` target
fig, ax = plt.subplots(figsize=(10, 8))
sns.countplot(data=df, x='sex', hue='output', ax=ax)
ax.set_xticklabels(['female', 'male'])
ax.set_xlabel('gender')
ax.legend(
    loc='upper left',
    labels=['less chance of heart attack', 'more chance of heart attack'],
)
plt.show()

In [None]:
# Scatter plot of `chol` against `age`
df.plot.scatter(x='age', y='chol', figsize=(8, 6))
plt.show()

In [None]:
# Number of patients per chest pain type (`cp`), split by the `output` target
fig, ax = plt.subplots(figsize=(10, 8))
sns.countplot(data=df, x='cp', hue='output', ax=ax)
ax.set_xticklabels(['typical angina', 'atypical angina', 'non-anginal pain', 'asymptomatic'])
ax.set_xlabel('chest pain type')
ax.legend(
    loc='upper right',
    labels=['less chance of heart attack', 'more chance of heart attack'],
)
plt.show()

In [None]:
# Categorical feature columns
cat_cols = [
    "sex",
    "exng",
    "cp",
    "fbs",
    "restecg",
    "thall",
    "slp",
    "caa",
]

# Continuous (numeric) feature columns
con_cols = [
    "age",
    "trtbps",
    "chol",
    "thalachh",
    "oldpeak",
]

# Target column
target_col = ["output"]

In [None]:
# Summary statistics of the continuous columns (author's note: there are outliers in the data)
df[con_cols].describe().T

In [None]:
# Replace outliers in every continuous column with that column's mean.
# A value counts as an outlier when it lies more than 1.5 * IQR below the
# first quartile or above the third quartile.
# NOTE: this cell overwrites the columns of `df`, so re-running it recomputes
# the limits on already-cleaned data and may replace additional values.
IQR_FACTOR = 1.5

for col in con_cols:
    values = df[col]

    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    lower_lim = q1 - IQR_FACTOR * iqr
    upper_lim = q3 + IQR_FACTOR * iqr

    is_outlier = (values < lower_lim) | (values > upper_lim)

    # The replacement value is the mean of the whole column, outliers included,
    # taken before anything is replaced.
    col_mean = values.mean()

    # `mask` builds a new Series rather than assigning NaN in place into an
    # object that may share memory with `df` (and whose dtype may be integer).
    # `fillna` also fills any NaN that was already present in the column.
    df[col] = values.mask(is_outlier).fillna(col_mean)
   

In [None]:
# Summary statistics of the continuous columns after outlier replacement
df[con_cols].describe().T

In [None]:
# One-hot encode the categorical columns; `drop_first=True` drops the first
# level of every encoded column.
df1 = pd.get_dummies(df, columns=cat_cols, drop_first=True)

# Target (kept as a single-column frame) and feature matrix
y = df1[['output']]
X = df1.drop(columns=['output'])

# Standardize the continuous columns.
# NOTE(review): the scaler is fitted on all rows before the train/test split,
# so test-set statistics influence the training features. Consider fitting it
# on the training rows only (e.g. inside a Pipeline).
scaler = preprocessing.StandardScaler()
scaled_values = scaler.fit_transform(X[con_cols])
X[con_cols] = scaled_values

X.head()

In [None]:
# Hold out 20% of the rows as the test set (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Report the shape of each part
for label, part in (
    ("The shape of X_train is      ", X_train),
    ("The shape of X_test is       ", X_test),
    ("The shape of y_train is      ", y_train),
    ("The shape of y_test is       ", y_test),
):
    print(label, part.shape)

In [None]:
# Fit several baseline classifiers and compare their accuracy on the test set.
# `random_state` is fixed on the estimators whose training is randomized so
# that a fresh "Restart & Run All" reproduces the same scores.
models = [
    SVC(),
    LogisticRegression(),
    DecisionTreeClassifier(random_state=42),
    RandomForestClassifier(random_state=42),
    GradientBoostingClassifier(random_state=42),
]

# scikit-learn estimators expect a 1-D target; `np.ravel` flattens `y_train`
# whether it is a single-column frame or already 1-D.
y_train_1d = np.ravel(y_train)

score = []
for model in models:
    model.fit(X_train, y_train_1d)
    y_pred = model.predict(X_test)
    score.append(accuracy_score(y_test, y_pred))

# One line per model: estimator repr followed by its test accuracy
for model, acc in zip(models, score):
    print(f"{model}: {acc}")


In [None]:
# Fit a logistic regression and plot its ROC curve on the test set.
logreg = LogisticRegression()
logreg.fit(X_train, np.ravel(y_train))  # estimators expect a 1-D target

# Class-probability estimates; column 1 belongs to the second class in
# `logreg.classes_` and is used as the score for the ROC analysis.
y_pred_proba = logreg.predict_proba(X_test)
y_pred_prob = y_pred_proba[:, 1]

# Compute the AUC from the probabilities, not from `predict()` labels: hard
# labels reduce the curve to a single threshold, so the reported area would
# not match the curve plotted below.
logit_roc_auc = roc_auc_score(y_test, y_pred_prob)
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)

plt.figure()
plt.plot(fpr, tpr, label='AUC (area = %0.2f)' % logit_roc_auc)
plt.plot([0, 1], [0, 1], 'r--')  # chance-level diagonal for reference
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Logistic Regression ROC Curve')
plt.legend(loc = 'lower right')
plt.show()